1*ccdc9c3eSSadaf Ebrahimi // Copyright 2006 The RE2 Authors. All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi
5*ccdc9c3eSSadaf Ebrahimi // Dump the regexp into a string showing structure.
6*ccdc9c3eSSadaf Ebrahimi // Tested by parse_unittest.cc
7*ccdc9c3eSSadaf Ebrahimi
8*ccdc9c3eSSadaf Ebrahimi // This function traverses the regexp recursively,
9*ccdc9c3eSSadaf Ebrahimi // meaning that on inputs like Regexp::Simplify of
10*ccdc9c3eSSadaf Ebrahimi // a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100},
11*ccdc9c3eSSadaf Ebrahimi // it takes time and space exponential in the size of the
12*ccdc9c3eSSadaf Ebrahimi // original regular expression. It can also use stack space
13*ccdc9c3eSSadaf Ebrahimi // linear in the size of the regular expression for inputs
14*ccdc9c3eSSadaf Ebrahimi // like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*.
15*ccdc9c3eSSadaf Ebrahimi // IT IS NOT SAFE TO CALL FROM PRODUCTION CODE.
16*ccdc9c3eSSadaf Ebrahimi // As a result, Dump is provided only in the testing
17*ccdc9c3eSSadaf Ebrahimi // library (see BUILD).
18*ccdc9c3eSSadaf Ebrahimi
19*ccdc9c3eSSadaf Ebrahimi #include <string>
20*ccdc9c3eSSadaf Ebrahimi
21*ccdc9c3eSSadaf Ebrahimi #include "util/test.h"
22*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
23*ccdc9c3eSSadaf Ebrahimi #include "util/strutil.h"
24*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h"
25*ccdc9c3eSSadaf Ebrahimi #include "re2/stringpiece.h"
26*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
27*ccdc9c3eSSadaf Ebrahimi
28*ccdc9c3eSSadaf Ebrahimi // Cause a link error if this file is used outside of testing.
29*ccdc9c3eSSadaf Ebrahimi DECLARE_string(test_tmpdir);
30*ccdc9c3eSSadaf Ebrahimi
31*ccdc9c3eSSadaf Ebrahimi namespace re2 {
32*ccdc9c3eSSadaf Ebrahimi
33*ccdc9c3eSSadaf Ebrahimi static const char* kOpcodeNames[] = {
34*ccdc9c3eSSadaf Ebrahimi "bad",
35*ccdc9c3eSSadaf Ebrahimi "no",
36*ccdc9c3eSSadaf Ebrahimi "emp",
37*ccdc9c3eSSadaf Ebrahimi "lit",
38*ccdc9c3eSSadaf Ebrahimi "str",
39*ccdc9c3eSSadaf Ebrahimi "cat",
40*ccdc9c3eSSadaf Ebrahimi "alt",
41*ccdc9c3eSSadaf Ebrahimi "star",
42*ccdc9c3eSSadaf Ebrahimi "plus",
43*ccdc9c3eSSadaf Ebrahimi "que",
44*ccdc9c3eSSadaf Ebrahimi "rep",
45*ccdc9c3eSSadaf Ebrahimi "cap",
46*ccdc9c3eSSadaf Ebrahimi "dot",
47*ccdc9c3eSSadaf Ebrahimi "byte",
48*ccdc9c3eSSadaf Ebrahimi "bol",
49*ccdc9c3eSSadaf Ebrahimi "eol",
50*ccdc9c3eSSadaf Ebrahimi "wb", // kRegexpWordBoundary
51*ccdc9c3eSSadaf Ebrahimi "nwb", // kRegexpNoWordBoundary
52*ccdc9c3eSSadaf Ebrahimi "bot",
53*ccdc9c3eSSadaf Ebrahimi "eot",
54*ccdc9c3eSSadaf Ebrahimi "cc",
55*ccdc9c3eSSadaf Ebrahimi "match",
56*ccdc9c3eSSadaf Ebrahimi };
57*ccdc9c3eSSadaf Ebrahimi
58*ccdc9c3eSSadaf Ebrahimi // Create string representation of regexp with explicit structure.
59*ccdc9c3eSSadaf Ebrahimi // Nothing pretty, just for testing.
DumpRegexpAppending(Regexp * re,string * s)60*ccdc9c3eSSadaf Ebrahimi static void DumpRegexpAppending(Regexp* re, string* s) {
61*ccdc9c3eSSadaf Ebrahimi if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) {
62*ccdc9c3eSSadaf Ebrahimi StringAppendF(s, "op%d", re->op());
63*ccdc9c3eSSadaf Ebrahimi } else {
64*ccdc9c3eSSadaf Ebrahimi switch (re->op()) {
65*ccdc9c3eSSadaf Ebrahimi default:
66*ccdc9c3eSSadaf Ebrahimi break;
67*ccdc9c3eSSadaf Ebrahimi case kRegexpStar:
68*ccdc9c3eSSadaf Ebrahimi case kRegexpPlus:
69*ccdc9c3eSSadaf Ebrahimi case kRegexpQuest:
70*ccdc9c3eSSadaf Ebrahimi case kRegexpRepeat:
71*ccdc9c3eSSadaf Ebrahimi if (re->parse_flags() & Regexp::NonGreedy)
72*ccdc9c3eSSadaf Ebrahimi s->append("n");
73*ccdc9c3eSSadaf Ebrahimi break;
74*ccdc9c3eSSadaf Ebrahimi }
75*ccdc9c3eSSadaf Ebrahimi s->append(kOpcodeNames[re->op()]);
76*ccdc9c3eSSadaf Ebrahimi if (re->op() == kRegexpLiteral && (re->parse_flags() & Regexp::FoldCase)) {
77*ccdc9c3eSSadaf Ebrahimi Rune r = re->rune();
78*ccdc9c3eSSadaf Ebrahimi if ('a' <= r && r <= 'z')
79*ccdc9c3eSSadaf Ebrahimi s->append("fold");
80*ccdc9c3eSSadaf Ebrahimi }
81*ccdc9c3eSSadaf Ebrahimi if (re->op() == kRegexpLiteralString && (re->parse_flags() & Regexp::FoldCase)) {
82*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < re->nrunes(); i++) {
83*ccdc9c3eSSadaf Ebrahimi Rune r = re->runes()[i];
84*ccdc9c3eSSadaf Ebrahimi if ('a' <= r && r <= 'z') {
85*ccdc9c3eSSadaf Ebrahimi s->append("fold");
86*ccdc9c3eSSadaf Ebrahimi break;
87*ccdc9c3eSSadaf Ebrahimi }
88*ccdc9c3eSSadaf Ebrahimi }
89*ccdc9c3eSSadaf Ebrahimi }
90*ccdc9c3eSSadaf Ebrahimi }
91*ccdc9c3eSSadaf Ebrahimi s->append("{");
92*ccdc9c3eSSadaf Ebrahimi switch (re->op()) {
93*ccdc9c3eSSadaf Ebrahimi default:
94*ccdc9c3eSSadaf Ebrahimi break;
95*ccdc9c3eSSadaf Ebrahimi case kRegexpEndText:
96*ccdc9c3eSSadaf Ebrahimi if (!(re->parse_flags() & Regexp::WasDollar)) {
97*ccdc9c3eSSadaf Ebrahimi s->append("\\z");
98*ccdc9c3eSSadaf Ebrahimi }
99*ccdc9c3eSSadaf Ebrahimi break;
100*ccdc9c3eSSadaf Ebrahimi case kRegexpLiteral: {
101*ccdc9c3eSSadaf Ebrahimi Rune r = re->rune();
102*ccdc9c3eSSadaf Ebrahimi char buf[UTFmax+1];
103*ccdc9c3eSSadaf Ebrahimi buf[runetochar(buf, &r)] = 0;
104*ccdc9c3eSSadaf Ebrahimi s->append(buf);
105*ccdc9c3eSSadaf Ebrahimi break;
106*ccdc9c3eSSadaf Ebrahimi }
107*ccdc9c3eSSadaf Ebrahimi case kRegexpLiteralString:
108*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < re->nrunes(); i++) {
109*ccdc9c3eSSadaf Ebrahimi Rune r = re->runes()[i];
110*ccdc9c3eSSadaf Ebrahimi char buf[UTFmax+1];
111*ccdc9c3eSSadaf Ebrahimi buf[runetochar(buf, &r)] = 0;
112*ccdc9c3eSSadaf Ebrahimi s->append(buf);
113*ccdc9c3eSSadaf Ebrahimi }
114*ccdc9c3eSSadaf Ebrahimi break;
115*ccdc9c3eSSadaf Ebrahimi case kRegexpConcat:
116*ccdc9c3eSSadaf Ebrahimi case kRegexpAlternate:
117*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < re->nsub(); i++)
118*ccdc9c3eSSadaf Ebrahimi DumpRegexpAppending(re->sub()[i], s);
119*ccdc9c3eSSadaf Ebrahimi break;
120*ccdc9c3eSSadaf Ebrahimi case kRegexpStar:
121*ccdc9c3eSSadaf Ebrahimi case kRegexpPlus:
122*ccdc9c3eSSadaf Ebrahimi case kRegexpQuest:
123*ccdc9c3eSSadaf Ebrahimi DumpRegexpAppending(re->sub()[0], s);
124*ccdc9c3eSSadaf Ebrahimi break;
125*ccdc9c3eSSadaf Ebrahimi case kRegexpCapture:
126*ccdc9c3eSSadaf Ebrahimi if (re->cap() == 0)
127*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "kRegexpCapture cap() == 0";
128*ccdc9c3eSSadaf Ebrahimi if (re->name()) {
129*ccdc9c3eSSadaf Ebrahimi s->append(*re->name());
130*ccdc9c3eSSadaf Ebrahimi s->append(":");
131*ccdc9c3eSSadaf Ebrahimi }
132*ccdc9c3eSSadaf Ebrahimi DumpRegexpAppending(re->sub()[0], s);
133*ccdc9c3eSSadaf Ebrahimi break;
134*ccdc9c3eSSadaf Ebrahimi case kRegexpRepeat:
135*ccdc9c3eSSadaf Ebrahimi s->append(StringPrintf("%d,%d ", re->min(), re->max()));
136*ccdc9c3eSSadaf Ebrahimi DumpRegexpAppending(re->sub()[0], s);
137*ccdc9c3eSSadaf Ebrahimi break;
138*ccdc9c3eSSadaf Ebrahimi case kRegexpCharClass: {
139*ccdc9c3eSSadaf Ebrahimi string sep;
140*ccdc9c3eSSadaf Ebrahimi for (CharClass::iterator it = re->cc()->begin();
141*ccdc9c3eSSadaf Ebrahimi it != re->cc()->end(); ++it) {
142*ccdc9c3eSSadaf Ebrahimi RuneRange rr = *it;
143*ccdc9c3eSSadaf Ebrahimi s->append(sep);
144*ccdc9c3eSSadaf Ebrahimi if (rr.lo == rr.hi)
145*ccdc9c3eSSadaf Ebrahimi s->append(StringPrintf("%#x", rr.lo));
146*ccdc9c3eSSadaf Ebrahimi else
147*ccdc9c3eSSadaf Ebrahimi s->append(StringPrintf("%#x-%#x", rr.lo, rr.hi));
148*ccdc9c3eSSadaf Ebrahimi sep = " ";
149*ccdc9c3eSSadaf Ebrahimi }
150*ccdc9c3eSSadaf Ebrahimi break;
151*ccdc9c3eSSadaf Ebrahimi }
152*ccdc9c3eSSadaf Ebrahimi }
153*ccdc9c3eSSadaf Ebrahimi s->append("}");
154*ccdc9c3eSSadaf Ebrahimi }
155*ccdc9c3eSSadaf Ebrahimi
Dump()156*ccdc9c3eSSadaf Ebrahimi string Regexp::Dump() {
157*ccdc9c3eSSadaf Ebrahimi string s;
158*ccdc9c3eSSadaf Ebrahimi
159*ccdc9c3eSSadaf Ebrahimi // Make sure being called from a unit test.
160*ccdc9c3eSSadaf Ebrahimi if (FLAGS_test_tmpdir.empty()) {
161*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "Cannot use except for testing.";
162*ccdc9c3eSSadaf Ebrahimi return s;
163*ccdc9c3eSSadaf Ebrahimi }
164*ccdc9c3eSSadaf Ebrahimi
165*ccdc9c3eSSadaf Ebrahimi DumpRegexpAppending(this, &s);
166*ccdc9c3eSSadaf Ebrahimi return s;
167*ccdc9c3eSSadaf Ebrahimi }
168*ccdc9c3eSSadaf Ebrahimi
169*ccdc9c3eSSadaf Ebrahimi } // namespace re2
170