xref: /aosp_15_r20/external/regex-re2/re2/testing/parse_test.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Test parse.cc, dump.cc, and tostring.cc.
6 
7 #include <string>
8 
9 #include "util/test.h"
10 #include "util/logging.h"
11 #include "re2/regexp.h"
12 
13 namespace re2 {
14 
// Marker for table entries that must be parsed with no flags at all.
// A zero `flags` field in a Test means "use the caller's flags", so such
// entries carry this bit instead and the test code masks it off before
// calling Regexp::Parse.
//
15 // In the past, we used 1<<30 here and zeroed the bit later, but that
16 // has undefined behaviour, so now we use an internal-only flag because
17 // otherwise we would have to introduce a new flag value just for this.
18 static const Regexp::ParseFlags TestZeroFlags = Regexp::WasDollar;
19 
// One table entry: a regexp and the parse dump it is expected to produce.
20 struct Test {
21   const char* regexp;  // regexp source text handed to Regexp::Parse
22   const char* parse;  // expected Regexp::Dump() output for the parsed regexp
23   Regexp::ParseFlags flags;  // zero: use the caller's flags; otherwise parse with these (TestZeroFlags masked off)
24 };
25 
// Parse flags used for entries of `tests` that leave their own `flags`
// field zero.
26 static Regexp::ParseFlags kTestFlags = Regexp::MatchNL |
27                                        Regexp::PerlX |
28                                        Regexp::PerlClasses |
29                                        Regexp::UnicodeGroups;
30 
// Regexps paired with the expected Regexp::Dump() output of their parse.
// Entries that omit the third field leave `flags` zero and are parsed with
// the flags supplied by the code that walks the table (kTestFlags for this
// table); entries that set it are parsed with exactly those flags, minus
// the TestZeroFlags marker bit.
31 static Test tests[] = {
32   // Base cases
33   { "a", "lit{a}" },
34   { "a.", "cat{lit{a}dot{}}" },
35   { "a.b", "cat{lit{a}dot{}lit{b}}" },
36   { "ab", "str{ab}" },
37   { "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" },
38   { "abc", "str{abc}" },
39   { "a|^", "alt{lit{a}bol{}}" },
40   { "a|b", "cc{0x61-0x62}" },
41   { "(a)", "cap{lit{a}}" },
42   { "(a)|b", "alt{cap{lit{a}}lit{b}}" },
43   { "a*", "star{lit{a}}" },
44   { "a+", "plus{lit{a}}" },
45   { "a?", "que{lit{a}}" },
46   { "a{2}", "rep{2,2 lit{a}}" },
47   { "a{2,3}", "rep{2,3 lit{a}}" },
48   { "a{2,}", "rep{2,-1 lit{a}}" },
49   { "a*?", "nstar{lit{a}}" },
50   { "a+?", "nplus{lit{a}}" },
51   { "a??", "nque{lit{a}}" },
52   { "a{2}?", "nrep{2,2 lit{a}}" },
53   { "a{2,3}?", "nrep{2,3 lit{a}}" },
54   { "a{2,}?", "nrep{2,-1 lit{a}}" },
55   { "", "emp{}" },
56   { "|", "alt{emp{}emp{}}" },
57   { "|x|", "alt{emp{}lit{x}emp{}}" },
58   { ".", "dot{}" },
59   { "^", "bol{}" },
60   { "$", "eol{}" },
61   { "\\|", "lit{|}" },
62   { "\\(", "lit{(}" },
63   { "\\)", "lit{)}" },
64   { "\\*", "lit{*}" },
65   { "\\+", "lit{+}" },
66   { "\\?", "lit{?}" },
67   { "{", "lit{{}" },
68   { "}", "lit{}}" },
69   { "\\.", "lit{.}" },
70   { "\\^", "lit{^}" },
71   { "\\$", "lit{$}" },
72   { "\\\\", "lit{\\}" },
73   { "[ace]", "cc{0x61 0x63 0x65}" },
74   { "[abc]", "cc{0x61-0x63}" },
75   { "[a-z]", "cc{0x61-0x7a}" },
76   { "[a]", "lit{a}" },
77   { "\\-", "lit{-}" },
78   { "-", "lit{-}" },
79   { "\\_", "lit{_}" },
80 
81   // Posix and Perl extensions
82   { "[[:lower:]]", "cc{0x61-0x7a}" },
83   { "[a-z]", "cc{0x61-0x7a}" },
84   { "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
85   { "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
86   { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
87   { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
88   { "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
89   { "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
90   { "\\d", "cc{0x30-0x39}" },
91   { "\\D", "cc{0-0x2f 0x3a-0x10ffff}" },
92   { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
93   { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
94   { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
95   { "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
96   { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
97   { "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
98   { "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" },
99   { "\\C", "byte{}" },
100 
101   // Unicode, negatives, and a double negative.
102   { "\\p{Braille}", "cc{0x2800-0x28ff}" },
103   { "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
104   { "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
105   { "\\P{^Braille}", "cc{0x2800-0x28ff}" },
106 
107   // More interesting regular expressions.
108   { "a{,2}", "str{a{,2}}" },
109   { "\\.\\^\\$\\\\", "str{.^$\\}" },
110   { "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" },
111   { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
112   { "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" },  // utf-8
113   { "a*{", "cat{star{lit{a}}lit{{}}" },
114 
115   // Test precedences
116   { "(?:ab)*", "star{str{ab}}" },
117   { "(ab)*", "star{cap{str{ab}}}" },
118   { "ab|cd", "alt{str{ab}str{cd}}" },
119   { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
120 
121   // Test squashing of **, ++, ?? et cetera.
122   { "(?:(?:a)*)*", "star{lit{a}}" },
123   { "(?:(?:a)+)+", "plus{lit{a}}" },
124   { "(?:(?:a)?)?", "que{lit{a}}" },
125   { "(?:(?:a)*)+", "star{lit{a}}" },
126   { "(?:(?:a)*)?", "star{lit{a}}" },
127   { "(?:(?:a)+)*", "star{lit{a}}" },
128   { "(?:(?:a)+)?", "star{lit{a}}" },
129   { "(?:(?:a)?)*", "star{lit{a}}" },
130   { "(?:(?:a)?)+", "star{lit{a}}" },
131 
132   // Test flattening.
133   { "(?:a)", "lit{a}" },
134   { "(?:ab)(?:cd)", "str{abcd}" },
135   { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
136   { "a|c", "cc{0x61 0x63}" },
137   { "a|[cd]", "cc{0x61 0x63-0x64}" },
138   { "a|.", "dot{}" },
139   { "[ab]|c", "cc{0x61-0x63}" },
140   { "[ab]|[cd]", "cc{0x61-0x64}" },
141   { "[ab]|.", "dot{}" },
142   { ".|c", "dot{}" },
143   { ".|[cd]", "dot{}" },
144   { ".|.", "dot{}" },
145 
146   // Test Perl quoted literals
147   { "\\Q+|*?{[\\E", "str{+|*?{[}" },
148   { "\\Q+\\E+", "plus{lit{+}}" },
149   { "\\Q\\\\E", "lit{\\}" },
150   { "\\Q\\\\\\E", "str{\\\\}" },
151   { "\\Qa\\E*", "star{lit{a}}" },
152   { "\\Qab\\E*", "cat{lit{a}star{lit{b}}}" },
153   { "\\Qabc\\E*", "cat{str{ab}star{lit{c}}}" },
154 
155   // Test Perl \A and \z, plus ^ and $ with and without (?m)
156   { "(?m)^", "bol{}" },
157   { "(?m)$", "eol{}" },
158   { "(?-m)^", "bot{}" },
159   { "(?-m)$", "eot{}" },
160   { "(?m)\\A", "bot{}" },
161   { "(?m)\\z", "eot{\\z}" },
162   { "(?-m)\\A", "bot{}" },
163   { "(?-m)\\z", "eot{\\z}" },
164 
165   // Test named captures
166   { "(?P<name>a)", "cap{name:lit{a}}" },
167 
168   // Case-folded literals
169   { "[Aa]", "litfold{a}" },
170 
171   // Strings
172   { "abcde", "str{abcde}" },
173   { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
174 
175   // Reported bug involving \n leaking in despite use of NeverNL.
176   { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
177   { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
178   { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
179   { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
180   { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags },
181   { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
182   { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
183   { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
184   { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags },
185   { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
186   { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
187   { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
188   { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags },
189   { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
190   { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
191   { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
192   { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
193   { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
194   { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
195   { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
196   { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
197   { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
198   { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
199   { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
200   { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
201   { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
202   { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
203   { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
204   { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
205     Regexp::PerlClasses },
206   { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
207     Regexp::PerlClasses | Regexp::FoldCase },
208   { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
209     Regexp::PerlClasses | Regexp::NeverNL },
210   { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
211     Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
212   { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
213     Regexp::PerlClasses },
214   { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
215     Regexp::PerlClasses | Regexp::FoldCase },
216   { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
217     Regexp::PerlClasses | Regexp::NeverNL },
218   { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
219     Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
220 };
221 
RegexpEqualTestingOnly(Regexp * a,Regexp * b)222 bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
223   return Regexp::Equal(a, b);
224 }
225 
TestParse(const Test * tests,int ntests,Regexp::ParseFlags flags,const string & title)226 void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
227                const string& title) {
228   Regexp** re = new Regexp*[ntests];
229   for (int i = 0; i < ntests; i++) {
230     RegexpStatus status;
231     Regexp::ParseFlags f = flags;
232     if (tests[i].flags != 0) {
233       f = tests[i].flags & ~TestZeroFlags;
234     }
235     re[i] = Regexp::Parse(tests[i].regexp, f, &status);
236     ASSERT_TRUE(re[i] != NULL)
237       << " " << tests[i].regexp << " " << status.Text();
238     string s = re[i]->Dump();
239     EXPECT_EQ(string(tests[i].parse), s) << "Regexp: " << tests[i].regexp
240       << "\nparse: " << string(tests[i].parse) << " s: " << s << " flag=" << f;
241   }
242 
243   for (int i = 0; i < ntests; i++) {
244     for (int j = 0; j < ntests; j++) {
245       EXPECT_EQ(string(tests[i].parse) == string(tests[j].parse),
246                 RegexpEqualTestingOnly(re[i], re[j]))
247         << "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
248     }
249   }
250 
251   for (int i = 0; i < ntests; i++)
252     re[i]->Decref();
253   delete[] re;
254 }
255 
256 // Test that regexps parse to expected structures.
TEST(TestParse,SimpleRegexps)257 TEST(TestParse, SimpleRegexps) {
258   TestParse(tests, arraysize(tests), kTestFlags, "simple");
259 }
260 
// Cases parsed with Regexp::FoldCase by the FoldCase test; the expected
// dumps use the case-folded node forms (litfold, strfold) and character
// classes widened to cover both cases.
261 Test foldcase_tests[] = {
262   { "AbCdE", "strfold{abcde}" },
263   { "[Aa]", "litfold{a}" },
264   { "a", "litfold{a}" },
265 
266   // 0x17F is an old English long s (looks like an f) and folds to s.
267   // 0x212A is the Kelvin symbol and folds to k.
268   { "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" },  // [Aa][A-z...]
269   { "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
270   { "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
271 };
272 
273 // Test that parsing with FoldCase works.
TEST(TestParse,FoldCase)274 TEST(TestParse, FoldCase) {
275   TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase");
276 }
277 
// Parsed with Regexp::Literal by the Literal test: the whole input,
// metacharacters included, is expected to come back as one literal string.
278 Test literal_tests[] = {
279   { "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" },
280 };
281 
282 // Test that parsing with Literal works.
TEST(TestParse,Literal)283 TEST(TestParse, Literal) {
284   TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal");
285 }
286 
// Parsed with Regexp::MatchNL by the MatchNL test: '.' is expected to dump
// as dot{} and the negated class [^a] keeps 0xa (newline) in its ranges.
287 Test matchnl_tests[] = {
288   { ".", "dot{}" },
289   { "\n", "lit{\n}" },
290   { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
291   { "[a\\n]", "cc{0xa 0x61}" },
292 };
293 
294 // Test that parsing with MatchNL works.
295 // (Also tested above during simple cases.)
TEST(TestParse,MatchNL)296 TEST(TestParse, MatchNL) {
297   TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL");
298 }
299 
// Parsed with Regexp::NoParseFlags by the NoMatchNL test: '.' and [^a] are
// expected to exclude 0xa (newline), while a newline written explicitly, as
// a literal or inside a class, is still kept.
300 Test nomatchnl_tests[] = {
301   { ".", "cc{0-0x9 0xb-0x10ffff}" },
302   { "\n", "lit{\n}" },
303   { "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" },
304   { "[a\\n]", "cc{0xa 0x61}" },
305 };
306 
307 // Test that parsing without MatchNL works.
TEST(TestParse,NoMatchNL)308 TEST(TestParse, NoMatchNL) {
309   TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL");
310 }
311 
// Alternations whose branches share leading text, parsed with Regexp::PerlX
// by the Prefix test. The expected dumps record where a shared prefix is
// pulled out in front of the remaining alternation.
312 Test prefix_tests[] = {
313   { "abc|abd", "cat{str{ab}cc{0x63-0x64}}" },
314   { "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" },
315   { "abc|abd|aef|bcx|bcy",
316     "alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}"
317       "cat{str{bc}cc{0x78-0x79}}}" },
318   { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" },
319   { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
320   { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
321   { ".c|.d", "cat{cc{0-0x9 0xb-0x10ffff}cc{0x63-0x64}}" },
322   { "\\Cc|\\Cd", "cat{byte{}cc{0x63-0x64}}" },
323   { "x{2}|x{2}[0-9]",
324     "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
325   { "x{2}y|x{2}[0-9]y",
326     "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
327   { "n|r|rs",
328     "alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" },
329   { "n|rs|r",
330     "alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" },
331   { "r|rs|n",
332     "alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" },
333   { "rs|r|n",
334     "alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" },
335   { "a\\C*?c|a\\C*?b",
336     "cat{lit{a}alt{cat{nstar{byte{}}lit{c}}cat{nstar{byte{}}lit{b}}}}" },
337   { "^/a/bc|^/a/de",
338     "cat{bol{}cat{str{/a/}alt{str{bc}str{de}}}}" },
339   // In the past, factoring was limited to kFactorAlternationMaxDepth (8).
340   { "a|aa|aaa|aaaa|aaaaa|aaaaaa|aaaaaaa|aaaaaaaa|aaaaaaaaa|aaaaaaaaaa",
341     "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
342     "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
343     "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
344     "lit{a}}}}}}}}}}}}}}}}}}}" },
345   { "a|aardvark|aardvarks|abaci|aback|abacus|abacuses|abaft|abalone|abalones",
346     "cat{lit{a}alt{emp{}cat{str{ardvark}alt{emp{}lit{s}}}"
347     "cat{str{ba}alt{cat{lit{c}alt{cc{0x69 0x6b}cat{str{us}alt{emp{}str{es}}}}}"
348     "str{ft}cat{str{lone}alt{emp{}lit{s}}}}}}}" },
349 };
350 
351 // Test that prefix factoring works.
TEST(TestParse,Prefix)352 TEST(TestParse, Prefix) {
353   TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix");
354 }
355 
// Deeply nested counted repetitions that are expected to parse successfully
// under Regexp::PerlX (run by the Nested test). Compare the nested-repeat
// entries in badtests, which are expected to be rejected.
356 Test nested_tests[] = {
357   { "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))",
358     "cap{cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}}}}}}}}" },
359   { "((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
360     "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{1,1 lit{x}}}}}}}}}}}}}}}}}}}}}" },
361   { "((((((((((x{0}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
362     "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{0,0 lit{x}}}}}}}}}}}}}}}}}}}}}" },
363   { "((((((x{2}){2}){2}){5}){5}){5})",
364     "cap{rep{5,5 cap{rep{5,5 cap{rep{5,5 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}" },
365 };
366 
367 // Test that nested repetition works.
TEST(TestParse,Nested)368 TEST(TestParse, Nested) {
369   TestParse(nested_tests, arraysize(nested_tests), Regexp::PerlX, "nested");
370 }
371 
372 // Invalid regular expressions
// The InvalidRegexps test expects each of these to be rejected both with
// Regexp::PerlX and with Regexp::NoParseFlags.
373 const char* badtests[] = {
374   "(",
375   ")",
376   "(a",
377   "(a|b|",
378   "(a|b",
379   "[a-z",
380   "([a-z)",
381   "x{1001}",
382   "\xff",      // Invalid UTF-8
383   "[\xff]",
384   "[\\\xff]",
385   "\\\xff",
386   "(?P<name>a",
387   "(?P<name>",
388   "(?P<name",
389   "(?P<x y>a)",
390   "(?P<>a)",
391   "[a-Z]",
392   "(?i)[a-Z]",
393   "a{100000}",
394   "a{100000,}",
395   "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
396   "(((x{7}){11}){13})",
397   "\\Q\\E*",
398 };
399 
400 // Valid in Perl, bad in POSIX
// The InvalidRegexps test expects these to parse with Regexp::PerlX and to
// be rejected with Regexp::NoParseFlags.
401 const char* only_perl[] = {
402  "[a-b-c]",
403  "\\Qabc\\E",
404  "\\Q*+?{[\\E",
405  "\\Q\\\\E",
406  "\\Q\\\\\\E",
407  "\\Q\\\\\\\\E",
408  "\\Q\\\\\\\\\\E",
409  "(?:a)",
410  "(?P<name>a)",
411 };
412 
413 // Valid in POSIX, bad in Perl.
// The InvalidRegexps test expects these to parse with Regexp::NoParseFlags
// and to be rejected with Regexp::PerlX.
414 const char* only_posix[] = {
415   "a++",
416   "a**",
417   "a?*",
418   "a+*",
419   "a{1}*",
420 };
421 
422 // Test that parser rejects bad regexps.
TEST(TestParse,InvalidRegexps)423 TEST(TestParse, InvalidRegexps) {
424   for (int i = 0; i < arraysize(badtests); i++) {
425     ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL)
426       << " " << badtests[i];
427     ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL)
428       << " " << badtests[i];
429   }
430   for (int i = 0; i < arraysize(only_posix); i++) {
431     ASSERT_TRUE(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL)
432       << " " << only_posix[i];
433     Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL);
434     ASSERT_TRUE(re != NULL) << " " << only_posix[i];
435     re->Decref();
436   }
437   for (int i = 0; i < arraysize(only_perl); i++) {
438     ASSERT_TRUE(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL)
439       << " " << only_perl[i];
440     Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL);
441     ASSERT_TRUE(re != NULL) << " " << only_perl[i];
442     re->Decref();
443   }
444 }
445 
446 // Test that ToString produces original regexp or equivalent one.
TEST(TestToString,EquivalentParse)447 TEST(TestToString, EquivalentParse) {
448   for (int i = 0; i < arraysize(tests); i++) {
449     RegexpStatus status;
450     Regexp::ParseFlags f = kTestFlags;
451     if (tests[i].flags != 0) {
452       f = tests[i].flags & ~TestZeroFlags;
453     }
454     Regexp* re = Regexp::Parse(tests[i].regexp, f, &status);
455     ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text();
456     string s = re->Dump();
457     EXPECT_EQ(string(tests[i].parse), s) << " " << tests[i].regexp << " " << string(tests[i].parse) << " " << s;
458     string t = re->ToString();
459     if (t != tests[i].regexp) {
460       // If ToString didn't return the original regexp,
461       // it must have found one with fewer parens.
462       // Unfortunately we can't check the length here, because
463       // ToString produces "\\{" for a literal brace,
464       // but "{" is a shorter equivalent.
465       // ASSERT_LT(t.size(), strlen(tests[i].regexp))
466       //     << " t=" << t << " regexp=" << tests[i].regexp;
467 
468       // Test that if we parse the new regexp we get the same structure.
469       Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
470       ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text();
471       string ss = nre->Dump();
472       string tt = nre->ToString();
473       if (s != ss || t != tt)
474         LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t;
475       EXPECT_EQ(s, ss);
476       EXPECT_EQ(t, tt);
477       nre->Decref();
478     }
479     re->Decref();
480   }
481 }
482 
483 // Test that capture error args are correct.
TEST(NamedCaptures,ErrorArgs)484 TEST(NamedCaptures, ErrorArgs) {
485   RegexpStatus status;
486   Regexp* re;
487 
488   re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status);
489   EXPECT_TRUE(re == NULL);
490   EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
491   EXPECT_EQ(status.error_arg(), "(?P<name");
492 
493   re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status);
494   EXPECT_TRUE(re == NULL);
495   EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
496   EXPECT_EQ(status.error_arg(), "(?P<space bar>");
497 }
498 
499 }  // namespace re2
500