xref: /aosp_15_r20/external/regex-re2/re2/testing/dump.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi // Dump the regexp into a string showing structure.
6*ccdc9c3eSSadaf Ebrahimi // Tested by parse_unittest.cc
7*ccdc9c3eSSadaf Ebrahimi 
8*ccdc9c3eSSadaf Ebrahimi // This function traverses the regexp recursively,
9*ccdc9c3eSSadaf Ebrahimi // meaning that on inputs like Regexp::Simplify of
10*ccdc9c3eSSadaf Ebrahimi // a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100},
11*ccdc9c3eSSadaf Ebrahimi // it takes time and space exponential in the size of the
12*ccdc9c3eSSadaf Ebrahimi // original regular expression.  It can also use stack space
13*ccdc9c3eSSadaf Ebrahimi // linear in the size of the regular expression for inputs
14*ccdc9c3eSSadaf Ebrahimi // like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*.
15*ccdc9c3eSSadaf Ebrahimi // IT IS NOT SAFE TO CALL FROM PRODUCTION CODE.
16*ccdc9c3eSSadaf Ebrahimi // As a result, Dump is provided only in the testing
17*ccdc9c3eSSadaf Ebrahimi // library (see BUILD).
18*ccdc9c3eSSadaf Ebrahimi 
19*ccdc9c3eSSadaf Ebrahimi #include <string>
20*ccdc9c3eSSadaf Ebrahimi 
21*ccdc9c3eSSadaf Ebrahimi #include "util/test.h"
22*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
23*ccdc9c3eSSadaf Ebrahimi #include "util/strutil.h"
24*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h"
25*ccdc9c3eSSadaf Ebrahimi #include "re2/stringpiece.h"
26*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
27*ccdc9c3eSSadaf Ebrahimi 
28*ccdc9c3eSSadaf Ebrahimi // Cause a link error if this file is used outside of testing.
29*ccdc9c3eSSadaf Ebrahimi DECLARE_string(test_tmpdir);
30*ccdc9c3eSSadaf Ebrahimi 
31*ccdc9c3eSSadaf Ebrahimi namespace re2 {
32*ccdc9c3eSSadaf Ebrahimi 
33*ccdc9c3eSSadaf Ebrahimi static const char* kOpcodeNames[] = {
34*ccdc9c3eSSadaf Ebrahimi   "bad",
35*ccdc9c3eSSadaf Ebrahimi   "no",
36*ccdc9c3eSSadaf Ebrahimi   "emp",
37*ccdc9c3eSSadaf Ebrahimi   "lit",
38*ccdc9c3eSSadaf Ebrahimi   "str",
39*ccdc9c3eSSadaf Ebrahimi   "cat",
40*ccdc9c3eSSadaf Ebrahimi   "alt",
41*ccdc9c3eSSadaf Ebrahimi   "star",
42*ccdc9c3eSSadaf Ebrahimi   "plus",
43*ccdc9c3eSSadaf Ebrahimi   "que",
44*ccdc9c3eSSadaf Ebrahimi   "rep",
45*ccdc9c3eSSadaf Ebrahimi   "cap",
46*ccdc9c3eSSadaf Ebrahimi   "dot",
47*ccdc9c3eSSadaf Ebrahimi   "byte",
48*ccdc9c3eSSadaf Ebrahimi   "bol",
49*ccdc9c3eSSadaf Ebrahimi   "eol",
50*ccdc9c3eSSadaf Ebrahimi   "wb",   // kRegexpWordBoundary
51*ccdc9c3eSSadaf Ebrahimi   "nwb",  // kRegexpNoWordBoundary
52*ccdc9c3eSSadaf Ebrahimi   "bot",
53*ccdc9c3eSSadaf Ebrahimi   "eot",
54*ccdc9c3eSSadaf Ebrahimi   "cc",
55*ccdc9c3eSSadaf Ebrahimi   "match",
56*ccdc9c3eSSadaf Ebrahimi };
57*ccdc9c3eSSadaf Ebrahimi 
58*ccdc9c3eSSadaf Ebrahimi // Create string representation of regexp with explicit structure.
59*ccdc9c3eSSadaf Ebrahimi // Nothing pretty, just for testing.
DumpRegexpAppending(Regexp * re,string * s)60*ccdc9c3eSSadaf Ebrahimi static void DumpRegexpAppending(Regexp* re, string* s) {
61*ccdc9c3eSSadaf Ebrahimi   if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) {
62*ccdc9c3eSSadaf Ebrahimi     StringAppendF(s, "op%d", re->op());
63*ccdc9c3eSSadaf Ebrahimi   } else {
64*ccdc9c3eSSadaf Ebrahimi     switch (re->op()) {
65*ccdc9c3eSSadaf Ebrahimi       default:
66*ccdc9c3eSSadaf Ebrahimi         break;
67*ccdc9c3eSSadaf Ebrahimi       case kRegexpStar:
68*ccdc9c3eSSadaf Ebrahimi       case kRegexpPlus:
69*ccdc9c3eSSadaf Ebrahimi       case kRegexpQuest:
70*ccdc9c3eSSadaf Ebrahimi       case kRegexpRepeat:
71*ccdc9c3eSSadaf Ebrahimi         if (re->parse_flags() & Regexp::NonGreedy)
72*ccdc9c3eSSadaf Ebrahimi           s->append("n");
73*ccdc9c3eSSadaf Ebrahimi         break;
74*ccdc9c3eSSadaf Ebrahimi     }
75*ccdc9c3eSSadaf Ebrahimi     s->append(kOpcodeNames[re->op()]);
76*ccdc9c3eSSadaf Ebrahimi     if (re->op() == kRegexpLiteral && (re->parse_flags() & Regexp::FoldCase)) {
77*ccdc9c3eSSadaf Ebrahimi       Rune r = re->rune();
78*ccdc9c3eSSadaf Ebrahimi       if ('a' <= r && r <= 'z')
79*ccdc9c3eSSadaf Ebrahimi         s->append("fold");
80*ccdc9c3eSSadaf Ebrahimi     }
81*ccdc9c3eSSadaf Ebrahimi     if (re->op() == kRegexpLiteralString && (re->parse_flags() & Regexp::FoldCase)) {
82*ccdc9c3eSSadaf Ebrahimi       for (int i = 0; i < re->nrunes(); i++) {
83*ccdc9c3eSSadaf Ebrahimi         Rune r = re->runes()[i];
84*ccdc9c3eSSadaf Ebrahimi         if ('a' <= r && r <= 'z') {
85*ccdc9c3eSSadaf Ebrahimi           s->append("fold");
86*ccdc9c3eSSadaf Ebrahimi           break;
87*ccdc9c3eSSadaf Ebrahimi         }
88*ccdc9c3eSSadaf Ebrahimi       }
89*ccdc9c3eSSadaf Ebrahimi     }
90*ccdc9c3eSSadaf Ebrahimi   }
91*ccdc9c3eSSadaf Ebrahimi   s->append("{");
92*ccdc9c3eSSadaf Ebrahimi   switch (re->op()) {
93*ccdc9c3eSSadaf Ebrahimi     default:
94*ccdc9c3eSSadaf Ebrahimi       break;
95*ccdc9c3eSSadaf Ebrahimi     case kRegexpEndText:
96*ccdc9c3eSSadaf Ebrahimi       if (!(re->parse_flags() & Regexp::WasDollar)) {
97*ccdc9c3eSSadaf Ebrahimi         s->append("\\z");
98*ccdc9c3eSSadaf Ebrahimi       }
99*ccdc9c3eSSadaf Ebrahimi       break;
100*ccdc9c3eSSadaf Ebrahimi     case kRegexpLiteral: {
101*ccdc9c3eSSadaf Ebrahimi       Rune r = re->rune();
102*ccdc9c3eSSadaf Ebrahimi       char buf[UTFmax+1];
103*ccdc9c3eSSadaf Ebrahimi       buf[runetochar(buf, &r)] = 0;
104*ccdc9c3eSSadaf Ebrahimi       s->append(buf);
105*ccdc9c3eSSadaf Ebrahimi       break;
106*ccdc9c3eSSadaf Ebrahimi     }
107*ccdc9c3eSSadaf Ebrahimi     case kRegexpLiteralString:
108*ccdc9c3eSSadaf Ebrahimi       for (int i = 0; i < re->nrunes(); i++) {
109*ccdc9c3eSSadaf Ebrahimi         Rune r = re->runes()[i];
110*ccdc9c3eSSadaf Ebrahimi         char buf[UTFmax+1];
111*ccdc9c3eSSadaf Ebrahimi         buf[runetochar(buf, &r)] = 0;
112*ccdc9c3eSSadaf Ebrahimi         s->append(buf);
113*ccdc9c3eSSadaf Ebrahimi       }
114*ccdc9c3eSSadaf Ebrahimi       break;
115*ccdc9c3eSSadaf Ebrahimi     case kRegexpConcat:
116*ccdc9c3eSSadaf Ebrahimi     case kRegexpAlternate:
117*ccdc9c3eSSadaf Ebrahimi       for (int i = 0; i < re->nsub(); i++)
118*ccdc9c3eSSadaf Ebrahimi         DumpRegexpAppending(re->sub()[i], s);
119*ccdc9c3eSSadaf Ebrahimi       break;
120*ccdc9c3eSSadaf Ebrahimi     case kRegexpStar:
121*ccdc9c3eSSadaf Ebrahimi     case kRegexpPlus:
122*ccdc9c3eSSadaf Ebrahimi     case kRegexpQuest:
123*ccdc9c3eSSadaf Ebrahimi       DumpRegexpAppending(re->sub()[0], s);
124*ccdc9c3eSSadaf Ebrahimi       break;
125*ccdc9c3eSSadaf Ebrahimi     case kRegexpCapture:
126*ccdc9c3eSSadaf Ebrahimi       if (re->cap() == 0)
127*ccdc9c3eSSadaf Ebrahimi         LOG(DFATAL) << "kRegexpCapture cap() == 0";
128*ccdc9c3eSSadaf Ebrahimi       if (re->name()) {
129*ccdc9c3eSSadaf Ebrahimi         s->append(*re->name());
130*ccdc9c3eSSadaf Ebrahimi         s->append(":");
131*ccdc9c3eSSadaf Ebrahimi       }
132*ccdc9c3eSSadaf Ebrahimi       DumpRegexpAppending(re->sub()[0], s);
133*ccdc9c3eSSadaf Ebrahimi       break;
134*ccdc9c3eSSadaf Ebrahimi     case kRegexpRepeat:
135*ccdc9c3eSSadaf Ebrahimi       s->append(StringPrintf("%d,%d ", re->min(), re->max()));
136*ccdc9c3eSSadaf Ebrahimi       DumpRegexpAppending(re->sub()[0], s);
137*ccdc9c3eSSadaf Ebrahimi       break;
138*ccdc9c3eSSadaf Ebrahimi     case kRegexpCharClass: {
139*ccdc9c3eSSadaf Ebrahimi       string sep;
140*ccdc9c3eSSadaf Ebrahimi       for (CharClass::iterator it = re->cc()->begin();
141*ccdc9c3eSSadaf Ebrahimi            it != re->cc()->end(); ++it) {
142*ccdc9c3eSSadaf Ebrahimi         RuneRange rr = *it;
143*ccdc9c3eSSadaf Ebrahimi         s->append(sep);
144*ccdc9c3eSSadaf Ebrahimi         if (rr.lo == rr.hi)
145*ccdc9c3eSSadaf Ebrahimi           s->append(StringPrintf("%#x", rr.lo));
146*ccdc9c3eSSadaf Ebrahimi         else
147*ccdc9c3eSSadaf Ebrahimi           s->append(StringPrintf("%#x-%#x", rr.lo, rr.hi));
148*ccdc9c3eSSadaf Ebrahimi         sep = " ";
149*ccdc9c3eSSadaf Ebrahimi       }
150*ccdc9c3eSSadaf Ebrahimi       break;
151*ccdc9c3eSSadaf Ebrahimi     }
152*ccdc9c3eSSadaf Ebrahimi   }
153*ccdc9c3eSSadaf Ebrahimi   s->append("}");
154*ccdc9c3eSSadaf Ebrahimi }
155*ccdc9c3eSSadaf Ebrahimi 
Dump()156*ccdc9c3eSSadaf Ebrahimi string Regexp::Dump() {
157*ccdc9c3eSSadaf Ebrahimi   string s;
158*ccdc9c3eSSadaf Ebrahimi 
159*ccdc9c3eSSadaf Ebrahimi   // Make sure being called from a unit test.
160*ccdc9c3eSSadaf Ebrahimi   if (FLAGS_test_tmpdir.empty()) {
161*ccdc9c3eSSadaf Ebrahimi     LOG(ERROR) << "Cannot use except for testing.";
162*ccdc9c3eSSadaf Ebrahimi     return s;
163*ccdc9c3eSSadaf Ebrahimi   }
164*ccdc9c3eSSadaf Ebrahimi 
165*ccdc9c3eSSadaf Ebrahimi   DumpRegexpAppending(this, &s);
166*ccdc9c3eSSadaf Ebrahimi   return s;
167*ccdc9c3eSSadaf Ebrahimi }
168*ccdc9c3eSSadaf Ebrahimi 
169*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
170