xref: /aosp_15_r20/external/cronet/third_party/re2/src/re2/tostring.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Format a regular expression structure as a string.
6 // Tested by parse_test.cc
7 
8 #include <string.h>
9 #include <string>
10 
11 #include "absl/strings/str_format.h"
12 #include "util/logging.h"
13 #include "util/utf.h"
14 #include "re2/regexp.h"
15 #include "re2/walker-inl.h"
16 
17 namespace re2 {
18 
19 enum {
20   PrecAtom,
21   PrecUnary,
22   PrecConcat,
23   PrecAlternate,
24   PrecEmpty,
25   PrecParen,
26   PrecToplevel,
27 };
28 
29 // Helper function.  See description below.
30 static void AppendCCRange(std::string* t, Rune lo, Rune hi);
31 
32 // Walker to generate string in s_.
33 // The arg pointers are actually integers giving the
34 // context precedence.
35 // The child_args are always NULL.
36 class ToStringWalker : public Regexp::Walker<int> {
37  public:
ToStringWalker(std::string * t)38   explicit ToStringWalker(std::string* t) : t_(t) {}
39 
40   virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
41   virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
42                         int* child_args, int nchild_args);
ShortVisit(Regexp * re,int parent_arg)43   virtual int ShortVisit(Regexp* re, int parent_arg) {
44     return 0;
45   }
46 
47  private:
48   std::string* t_;  // The string the walker appends to.
49 
50   ToStringWalker(const ToStringWalker&) = delete;
51   ToStringWalker& operator=(const ToStringWalker&) = delete;
52 };
53 
ToString()54 std::string Regexp::ToString() {
55   std::string t;
56   ToStringWalker w(&t);
57   w.WalkExponential(this, PrecToplevel, 100000);
58   if (w.stopped_early())
59     t += " [truncated]";
60   return t;
61 }
62 
63 #define ToString DontCallToString  // Avoid accidental recursion.
64 
65 // Visits re before children are processed.
66 // Appends ( if needed and passes new precedence to children.
PreVisit(Regexp * re,int parent_arg,bool * stop)67 int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
68   int prec = parent_arg;
69   int nprec = PrecAtom;
70 
71   switch (re->op()) {
72     case kRegexpNoMatch:
73     case kRegexpEmptyMatch:
74     case kRegexpLiteral:
75     case kRegexpAnyChar:
76     case kRegexpAnyByte:
77     case kRegexpBeginLine:
78     case kRegexpEndLine:
79     case kRegexpBeginText:
80     case kRegexpEndText:
81     case kRegexpWordBoundary:
82     case kRegexpNoWordBoundary:
83     case kRegexpCharClass:
84     case kRegexpHaveMatch:
85       nprec = PrecAtom;
86       break;
87 
88     case kRegexpConcat:
89     case kRegexpLiteralString:
90       if (prec < PrecConcat)
91         t_->append("(?:");
92       nprec = PrecConcat;
93       break;
94 
95     case kRegexpAlternate:
96       if (prec < PrecAlternate)
97         t_->append("(?:");
98       nprec = PrecAlternate;
99       break;
100 
101     case kRegexpCapture:
102       t_->append("(");
103       if (re->cap() == 0)
104         LOG(DFATAL) << "kRegexpCapture cap() == 0";
105       if (re->name()) {
106         t_->append("?P<");
107         t_->append(*re->name());
108         t_->append(">");
109       }
110       nprec = PrecParen;
111       break;
112 
113     case kRegexpStar:
114     case kRegexpPlus:
115     case kRegexpQuest:
116     case kRegexpRepeat:
117       if (prec < PrecUnary)
118         t_->append("(?:");
119       // The subprecedence here is PrecAtom instead of PrecUnary
120       // because PCRE treats two unary ops in a row as a parse error.
121       nprec = PrecAtom;
122       break;
123   }
124 
125   return nprec;
126 }
127 
AppendLiteral(std::string * t,Rune r,bool foldcase)128 static void AppendLiteral(std::string *t, Rune r, bool foldcase) {
129   if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
130     t->append(1, '\\');
131     t->append(1, static_cast<char>(r));
132   } else if (foldcase && 'a' <= r && r <= 'z') {
133     r -= 'a' - 'A';
134     t->append(1, '[');
135     t->append(1, static_cast<char>(r));
136     t->append(1, static_cast<char>(r) + 'a' - 'A');
137     t->append(1, ']');
138   } else {
139     AppendCCRange(t, r, r);
140   }
141 }
142 
143 // Visits re after children are processed.
144 // For childless regexps, all the work is done here.
145 // For regexps with children, append any unary suffixes or ).
PostVisit(Regexp * re,int parent_arg,int pre_arg,int * child_args,int nchild_args)146 int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
147                               int* child_args, int nchild_args) {
148   int prec = parent_arg;
149   switch (re->op()) {
150     case kRegexpNoMatch:
151       // There's no simple symbol for "no match", but
152       // [^0-Runemax] excludes everything.
153       t_->append("[^\\x00-\\x{10ffff}]");
154       break;
155 
156     case kRegexpEmptyMatch:
157       // Append (?:) to make empty string visible,
158       // unless this is already being parenthesized.
159       if (prec < PrecEmpty)
160         t_->append("(?:)");
161       break;
162 
163     case kRegexpLiteral:
164       AppendLiteral(t_, re->rune(),
165                     (re->parse_flags() & Regexp::FoldCase) != 0);
166       break;
167 
168     case kRegexpLiteralString:
169       for (int i = 0; i < re->nrunes(); i++)
170         AppendLiteral(t_, re->runes()[i],
171                       (re->parse_flags() & Regexp::FoldCase) != 0);
172       if (prec < PrecConcat)
173         t_->append(")");
174       break;
175 
176     case kRegexpConcat:
177       if (prec < PrecConcat)
178         t_->append(")");
179       break;
180 
181     case kRegexpAlternate:
182       // Clumsy but workable: the children all appended |
183       // at the end of their strings, so just remove the last one.
184       if ((*t_)[t_->size()-1] == '|')
185         t_->erase(t_->size()-1);
186       else
187         LOG(DFATAL) << "Bad final char: " << t_;
188       if (prec < PrecAlternate)
189         t_->append(")");
190       break;
191 
192     case kRegexpStar:
193       t_->append("*");
194       if (re->parse_flags() & Regexp::NonGreedy)
195         t_->append("?");
196       if (prec < PrecUnary)
197         t_->append(")");
198       break;
199 
200     case kRegexpPlus:
201       t_->append("+");
202       if (re->parse_flags() & Regexp::NonGreedy)
203         t_->append("?");
204       if (prec < PrecUnary)
205         t_->append(")");
206       break;
207 
208     case kRegexpQuest:
209       t_->append("?");
210       if (re->parse_flags() & Regexp::NonGreedy)
211         t_->append("?");
212       if (prec < PrecUnary)
213         t_->append(")");
214       break;
215 
216     case kRegexpRepeat:
217       if (re->max() == -1)
218         t_->append(absl::StrFormat("{%d,}", re->min()));
219       else if (re->min() == re->max())
220         t_->append(absl::StrFormat("{%d}", re->min()));
221       else
222         t_->append(absl::StrFormat("{%d,%d}", re->min(), re->max()));
223       if (re->parse_flags() & Regexp::NonGreedy)
224         t_->append("?");
225       if (prec < PrecUnary)
226         t_->append(")");
227       break;
228 
229     case kRegexpAnyChar:
230       t_->append(".");
231       break;
232 
233     case kRegexpAnyByte:
234       t_->append("\\C");
235       break;
236 
237     case kRegexpBeginLine:
238       t_->append("^");
239       break;
240 
241     case kRegexpEndLine:
242       t_->append("$");
243       break;
244 
245     case kRegexpBeginText:
246       t_->append("(?-m:^)");
247       break;
248 
249     case kRegexpEndText:
250       if (re->parse_flags() & Regexp::WasDollar)
251         t_->append("(?-m:$)");
252       else
253         t_->append("\\z");
254       break;
255 
256     case kRegexpWordBoundary:
257       t_->append("\\b");
258       break;
259 
260     case kRegexpNoWordBoundary:
261       t_->append("\\B");
262       break;
263 
264     case kRegexpCharClass: {
265       if (re->cc()->size() == 0) {
266         t_->append("[^\\x00-\\x{10ffff}]");
267         break;
268       }
269       t_->append("[");
270       // Heuristic: show class as negated if it contains the
271       // non-character 0xFFFE and yet somehow isn't full.
272       CharClass* cc = re->cc();
273       if (cc->Contains(0xFFFE) && !cc->full()) {
274         cc = cc->Negate();
275         t_->append("^");
276       }
277       for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
278         AppendCCRange(t_, i->lo, i->hi);
279       if (cc != re->cc())
280         cc->Delete();
281       t_->append("]");
282       break;
283     }
284 
285     case kRegexpCapture:
286       t_->append(")");
287       break;
288 
289     case kRegexpHaveMatch:
290       // There's no syntax accepted by the parser to generate
291       // this node (it is generated by RE2::Set) so make something
292       // up that is readable but won't compile.
293       t_->append(absl::StrFormat("(?HaveMatch:%d)", re->match_id()));
294       break;
295   }
296 
297   // If the parent is an alternation, append the | for it.
298   if (prec == PrecAlternate)
299     t_->append("|");
300 
301   return 0;
302 }
303 
304 // Appends a rune for use in a character class to the string t.
AppendCCChar(std::string * t,Rune r)305 static void AppendCCChar(std::string* t, Rune r) {
306   if (0x20 <= r && r <= 0x7E) {
307     if (strchr("[]^-\\", r))
308       t->append("\\");
309     t->append(1, static_cast<char>(r));
310     return;
311   }
312   switch (r) {
313     default:
314       break;
315 
316     case '\r':
317       t->append("\\r");
318       return;
319 
320     case '\t':
321       t->append("\\t");
322       return;
323 
324     case '\n':
325       t->append("\\n");
326       return;
327 
328     case '\f':
329       t->append("\\f");
330       return;
331   }
332 
333   if (r < 0x100) {
334     *t += absl::StrFormat("\\x%02x", static_cast<int>(r));
335     return;
336   }
337   *t += absl::StrFormat("\\x{%x}", static_cast<int>(r));
338 }
339 
AppendCCRange(std::string * t,Rune lo,Rune hi)340 static void AppendCCRange(std::string* t, Rune lo, Rune hi) {
341   if (lo > hi)
342     return;
343   AppendCCChar(t, lo);
344   if (lo < hi) {
345     t->append("-");
346     AppendCCChar(t, hi);
347   }
348 }
349 
350 }  // namespace re2
351