xref: /aosp_15_r20/external/regex-re2/re2/testing/re2_test.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1 // -*- coding: utf-8 -*-
2 // Copyright 2002-2009 The RE2 Authors.  All Rights Reserved.
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5 
6 // TODO: Test extractions for PartialMatch/Consume
7 
8 #include <errno.h>
9 #include <stddef.h>
10 #include <stdint.h>
11 #include <string.h>
12 #include <map>
13 #include <string>
14 #include <utility>
15 #if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__)
16 #include <sys/mman.h>
17 #include <unistd.h>  /* for sysconf */
18 #endif
19 
20 #include "util/test.h"
21 #include "util/logging.h"
22 #include "util/strutil.h"
23 #include "re2/re2.h"
24 #include "re2/regexp.h"
25 
26 namespace re2 {
27 
// Parses hexadecimal literals into every built-in integer width, once via
// RE2::Hex (bare digits) and once via RE2::CRadix ("0x"-prefixed digits), and
// compares each result with the same literal as evaluated by the compiler.
TEST(RE2, HexTests) {
// `digits` is the literal without its "0x" prefix.  It may end in a u/l
// suffix, which the patterns match but keep outside the capture group.
#define VERIFY_HEX(int_type, digits)                                     \
  do {                                                                   \
    int_type parsed;                                                     \
    /* Bare digits, converted through RE2::Hex. */                       \
    ASSERT_TRUE(RE2::FullMatch(#digits, "([0-9a-fA-F]+)[uUlL]*",         \
                               RE2::Hex(&parsed)));                      \
    ASSERT_EQ(parsed, 0x##digits);                                       \
    /* Same digits behind a "0x" prefix, converted through CRadix. */    \
    ASSERT_TRUE(RE2::FullMatch("0x" #digits, "([0-9a-fA-FxX]+)[uUlL]*",  \
                               RE2::CRadix(&parsed)));                   \
    ASSERT_EQ(parsed, 0x##digits);                                       \
  } while (0)

  VERIFY_HEX(short,              2bad);
  VERIFY_HEX(unsigned short,     2badU);
  VERIFY_HEX(int,                dead);
  VERIFY_HEX(unsigned int,       deadU);
  VERIFY_HEX(long,               7eadbeefL);
  VERIFY_HEX(unsigned long,      deadbeefUL);
  VERIFY_HEX(long long,          12345678deadbeefLL);
  VERIFY_HEX(unsigned long long, cafebabedeadbeefULL);

#undef VERIFY_HEX
}
51 
// Parses octal literals into every built-in integer width, once via
// RE2::Octal (bare digits) and once via RE2::CRadix (digits behind a leading
// "0"), and compares each result with the compiler's value for the literal.
TEST(RE2, OctalTests) {
// `digits` is the literal without its leading "0"; a trailing u/l suffix is
// matched by the patterns but stays outside the capture group.
#define VERIFY_OCTAL(int_type, digits)                                   \
  do {                                                                   \
    int_type parsed;                                                     \
    /* Bare digits, converted through RE2::Octal. */                     \
    ASSERT_TRUE(                                                         \
        RE2::FullMatch(#digits, "([0-7]+)[uUlL]*", RE2::Octal(&parsed))); \
    ASSERT_EQ(parsed, 0##digits);                                        \
    /* Same digits behind a "0" prefix, converted through CRadix. */     \
    ASSERT_TRUE(RE2::FullMatch("0" #digits, "([0-9a-fA-FxX]+)[uUlL]*",   \
                               RE2::CRadix(&parsed)));                   \
    ASSERT_EQ(parsed, 0##digits);                                        \
  } while (0)

  VERIFY_OCTAL(short,              77777);
  VERIFY_OCTAL(unsigned short,     177777U);
  VERIFY_OCTAL(int,                17777777777);
  VERIFY_OCTAL(unsigned int,       37777777777U);
  VERIFY_OCTAL(long,               17777777777L);
  VERIFY_OCTAL(unsigned long,      37777777777UL);
  VERIFY_OCTAL(long long,          777777777777777777777LL);
  VERIFY_OCTAL(unsigned long long, 1777777777777777777777ULL);

#undef VERIFY_OCTAL
}
74 
// Parses decimal literals into every built-in integer width, once through the
// plain pointer argument and once through RE2::CRadix, and compares each
// result with the compiler's value for the same literal.
TEST(RE2, DecimalTests) {
// `literal` is stringified to form the input text and also used verbatim as
// the expected value.
#define VERIFY_DECIMAL(int_type, literal)                                  \
  do {                                                                     \
    int_type parsed;                                                       \
    /* Default conversion for the destination type. */                     \
    ASSERT_TRUE(RE2::FullMatch(#literal, "(-?[0-9]+)[uUlL]*", &parsed));   \
    ASSERT_EQ(parsed, literal);                                            \
    /* CRadix conversion of the same text. */                              \
    ASSERT_TRUE(RE2::FullMatch(#literal, "(-?[0-9a-fA-FxX]+)[uUlL]*",      \
                               RE2::CRadix(&parsed)));                     \
    ASSERT_EQ(parsed, literal);                                            \
  } while (0)

  VERIFY_DECIMAL(short,              -1);
  VERIFY_DECIMAL(unsigned short,     9999);
  VERIFY_DECIMAL(int,                -1000);
  VERIFY_DECIMAL(unsigned int,       12345U);
  VERIFY_DECIMAL(long,               -10000000L);
  VERIFY_DECIMAL(unsigned long,      3083324652U);
  VERIFY_DECIMAL(long long,          -100000000000000LL);
  VERIFY_DECIMAL(unsigned long long, 1234567890987654321ULL);

#undef VERIFY_DECIMAL
}
97 
TEST(RE2,Replace)98 TEST(RE2, Replace) {
99   struct ReplaceTest {
100     const char *regexp;
101     const char *rewrite;
102     const char *original;
103     const char *single;
104     const char *global;
105     int        greplace_count;
106   };
107   static const ReplaceTest tests[] = {
108     { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)",
109       "\\2\\1ay",
110       "the quick brown fox jumps over the lazy dogs.",
111       "ethay quick brown fox jumps over the lazy dogs.",
112       "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.",
113       9 },
114     { "\\w+",
115       "\\0-NOSPAM",
116       "[email protected]",
117       "[email protected]",
118       "[email protected]",
119       4 },
120     { "^",
121       "(START)",
122       "foo",
123       "(START)foo",
124       "(START)foo",
125       1 },
126     { "^",
127       "(START)",
128       "",
129       "(START)",
130       "(START)",
131       1 },
132     { "$",
133       "(END)",
134       "",
135       "(END)",
136       "(END)",
137       1 },
138     { "b",
139       "bb",
140       "ababababab",
141       "abbabababab",
142       "abbabbabbabbabb",
143       5 },
144     { "b",
145       "bb",
146       "bbbbbb",
147       "bbbbbbb",
148       "bbbbbbbbbbbb",
149       6 },
150     { "b+",
151       "bb",
152       "bbbbbb",
153       "bb",
154       "bb",
155       1 },
156     { "b*",
157       "bb",
158       "bbbbbb",
159       "bb",
160       "bb",
161       1 },
162     { "b*",
163       "bb",
164       "aaaaa",
165       "bbaaaaa",
166       "bbabbabbabbabbabb",
167       6 },
168     // Check newline handling
169     { "a.*a",
170       "(\\0)",
171       "aba\naba",
172       "(aba)\naba",
173       "(aba)\n(aba)",
174       2 },
175     { "", NULL, NULL, NULL, NULL, 0 }
176   };
177 
178   for (const ReplaceTest* t = tests; t->original != NULL; t++) {
179     string one(t->original);
180     ASSERT_TRUE(RE2::Replace(&one, t->regexp, t->rewrite));
181     ASSERT_EQ(one, t->single);
182     string all(t->original);
183     ASSERT_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count)
184       << "Got: " << all;
185     ASSERT_EQ(all, t->global);
186   }
187 }
188 
TestCheckRewriteString(const char * regexp,const char * rewrite,bool expect_ok)189 static void TestCheckRewriteString(const char* regexp, const char* rewrite,
190                               bool expect_ok) {
191   string error;
192   RE2 exp(regexp);
193   bool actual_ok = exp.CheckRewriteString(rewrite, &error);
194   EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error;
195 }
196 
// Feeds a table of (pattern, rewrite, expected validity) rows through
// TestCheckRewriteString.
TEST(CheckRewriteString, all) {
  struct RewriteCase {
    const char* regexp;
    const char* rewrite;
    bool expect_ok;
  };
  static const RewriteCase kCases[] = {
    // Pattern without capture groups.
    { "abc", "foo", true },
    { "abc", "foo\\", false },
    { "abc", "foo\\0bar", true },

    // Pattern with one capture group.
    { "a(b)c", "foo", true },
    { "a(b)c", "foo\\0bar", true },
    { "a(b)c", "foo\\1bar", true },
    { "a(b)c", "foo\\2bar", false },
    { "a(b)c", "f\\\\2o\\1o", true },

    // Pattern with two capture groups.
    { "a(b)(c)", "foo\\12", true },
    { "a(b)(c)", "f\\2o\\1o", true },
    { "a(b)(c)", "f\\oo\\1", false },
  };

  for (const RewriteCase& c : kCases)
    TestCheckRewriteString(c.regexp, c.rewrite, c.expect_ok);
}
212 
// Checks RE2::Extract: on a match the rewrite is written to the output
// string, and on a failed match the output string is left untouched.
TEST(RE2, Extract) {
  string s;

  // Swap the user and host parts of an address; the expected output pins the
  // input to the form "boris@kremvax.<tld>".
  ASSERT_TRUE(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &s));
  ASSERT_EQ(s, "kremvax!boris");

  ASSERT_TRUE(RE2::Extract("foo", ".*", "'\\0'", &s));
  ASSERT_EQ(s, "'foo'");
  // check that false match doesn't overwrite
  ASSERT_FALSE(RE2::Extract("baz", "bar", "'\\0'", &s));
  ASSERT_EQ(s, "'foo'");
}
225 
// Repeatedly applies RE2::Consume at the front of the input: two words are
// consumed in turn, then the call fails once the remaining input no longer
// starts with a word.
TEST(RE2, Consume) {
  string text("   aaa b!@#$@#$cccc");
  StringPiece remaining(text);

  RE2 word_re("\\s*(\\w+)");  // a word, possibly preceded by whitespace
  string token;

  ASSERT_TRUE(RE2::Consume(&remaining, word_re, &token));
  ASSERT_EQ(token, "aaa") << " input: " << remaining;

  ASSERT_TRUE(RE2::Consume(&remaining, word_re, &token));
  ASSERT_EQ(token, "b") << " input: " << remaining;

  ASSERT_FALSE(RE2::Consume(&remaining, word_re, &token))
      << " input: " << remaining;
}
239 
// Exercises RE2::ConsumeN with zero, one and two output arguments, advancing
// through the same input on every call.
TEST(RE2, ConsumeN) {
  const string text(" one two three 4");
  StringPiece remaining(text);

  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No outputs requested: the call just skips "one".
  EXPECT_TRUE(RE2::ConsumeN(&remaining, "\\s*(\\w+)", slot_ptrs, 0));

  // One output: the next word.
  string token;
  slots[0] = &token;
  EXPECT_TRUE(RE2::ConsumeN(&remaining, "\\s*(\\w+)", slot_ptrs, 1));
  EXPECT_EQ("two", token);

  // Two outputs: a word followed by a number.
  int number;
  slots[1] = &number;
  EXPECT_TRUE(
      RE2::ConsumeN(&remaining, "\\s*(\\w+)\\s*(\\d+)", slot_ptrs, 2));
  EXPECT_EQ("three", token);
  EXPECT_EQ(4, number);
}
263 
// Pulls successive words out of the input with RE2::FindAndConsume, which may
// skip non-matching text before each match, until no word is left.
TEST(RE2, FindAndConsume) {
  string text("   aaa b!@#$@#$cccc");
  StringPiece remaining(text);

  RE2 word_re("(\\w+)");  // a single word
  string token;

  ASSERT_TRUE(RE2::FindAndConsume(&remaining, word_re, &token));
  ASSERT_EQ(token, "aaa");

  ASSERT_TRUE(RE2::FindAndConsume(&remaining, word_re, &token));
  ASSERT_EQ(token, "b");

  ASSERT_TRUE(RE2::FindAndConsume(&remaining, word_re, &token));
  ASSERT_EQ(token, "cccc");

  ASSERT_FALSE(RE2::FindAndConsume(&remaining, word_re, &token));

  // Regression check: FindAndConsume must also work when the pattern has no
  // submatches.  An earlier version used uninitialized data for the length
  // to consume.
  remaining = "aaa";
  ASSERT_TRUE(RE2::FindAndConsume(&remaining, "aaa"));
  ASSERT_EQ(remaining, "");
}
286 
// Exercises RE2::FindAndConsumeN with zero, one and two output arguments,
// advancing through the same input on every call.
TEST(RE2, FindAndConsumeN) {
  const string text(" one two three 4");
  StringPiece remaining(text);

  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No outputs requested: the call just skips "one".
  EXPECT_TRUE(RE2::FindAndConsumeN(&remaining, "(\\w+)", slot_ptrs, 0));

  // One output: the next word.
  string token;
  slots[0] = &token;
  EXPECT_TRUE(RE2::FindAndConsumeN(&remaining, "(\\w+)", slot_ptrs, 1));
  EXPECT_EQ("two", token);

  // Two outputs: a word followed by a number.
  int number;
  slots[1] = &number;
  EXPECT_TRUE(
      RE2::FindAndConsumeN(&remaining, "(\\w+)\\s*(\\d+)", slot_ptrs, 2));
  EXPECT_EQ("three", token);
  EXPECT_EQ(4, number);
}
310 
// With an alternation of capture groups, only the group in the branch that
// matched receives text; the outputs for the other groups end up empty.
TEST(RE2, MatchNumberPeculiarity) {
  RE2 alternation("(foo)|(bar)|(baz)");
  string first;
  string second;
  string third;

  // First branch matches.
  ASSERT_TRUE(RE2::PartialMatch("foo", alternation, &first, &second, &third));
  ASSERT_EQ(first, "foo");
  ASSERT_EQ(second, "");
  ASSERT_EQ(third, "");

  // Second branch matches.
  ASSERT_TRUE(RE2::PartialMatch("bar", alternation, &first, &second, &third));
  ASSERT_EQ(first, "");
  ASSERT_EQ(second, "bar");
  ASSERT_EQ(third, "");

  // Third branch matches.
  ASSERT_TRUE(RE2::PartialMatch("baz", alternation, &first, &second, &third));
  ASSERT_EQ(first, "");
  ASSERT_EQ(second, "");
  ASSERT_EQ(third, "baz");

  // No branch matches.
  ASSERT_FALSE(RE2::PartialMatch("f", alternation, &first, &second, &third));

  // A group in the branch that did not match yields an empty string.
  string unused_group;
  ASSERT_TRUE(RE2::FullMatch("hello", "(foo)|hello", &unused_group));
  ASSERT_EQ(unused_group, "");
}
335 
// Exercises the low-level RE2::Match interface with an explicit submatch
// array, then checks that PartialMatch extracts the same pieces into typed
// outputs.
TEST(RE2, Match) {
  RE2 host_port("((\\w+):([0-9]+))");  // captures host:port, host and port
  StringPiece submatch[4];

  // Text without a host:port pair does not match.
  StringPiece text = "zyzzyva";
  ASSERT_FALSE(host_port.Match(text, 0, text.size(), RE2::UNANCHORED,
                               submatch, arraysize(submatch)));

  // Text containing a host:port pair matches; submatch[0] is the whole match
  // and the remaining entries are the capture groups.
  text = "a chrisr:9000 here";
  ASSERT_TRUE(host_port.Match(text, 0, text.size(), RE2::UNANCHORED,
                              submatch, arraysize(submatch)));
  ASSERT_EQ(submatch[0], "chrisr:9000");
  ASSERT_EQ(submatch[1], "chrisr:9000");
  ASSERT_EQ(submatch[2], "chrisr");
  ASSERT_EQ(submatch[3], "9000");

  // Same extraction through PartialMatch, with the port parsed as an int.
  string whole, hostname;
  int port_number;
  ASSERT_TRUE(RE2::PartialMatch("a chrisr:9000 here", host_port, &whole,
                                &hostname, &port_number));
  ASSERT_EQ(whole, "chrisr:9000");
  ASSERT_EQ(hostname, "chrisr");
  ASSERT_EQ(port_number, 9000);
}
361 
TestRecursion(int size,const char * pattern)362 static void TestRecursion(int size, const char* pattern) {
363   // Fill up a string repeating the pattern given
364   string domain;
365   domain.resize(size);
366   size_t patlen = strlen(pattern);
367   for (int i = 0; i < size; i++) {
368     domain[i] = pattern[i % patlen];
369   }
370   // Just make sure it doesn't crash due to too much recursion.
371   RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet);
372   RE2::FullMatch(domain, re);
373 }
374 
375 // A meta-quoted string, interpreted as a pattern, should always match
376 // the original unquoted string.
TestQuoteMeta(const string & unquoted,const RE2::Options & options=RE2::DefaultOptions)377 static void TestQuoteMeta(const string& unquoted,
378                           const RE2::Options& options = RE2::DefaultOptions) {
379   string quoted = RE2::QuoteMeta(unquoted);
380   RE2 re(quoted, options);
381   EXPECT_TRUE(RE2::FullMatch(unquoted, re))
382       << "Unquoted='" << unquoted << "', quoted='" << quoted << "'.";
383 }
384 
385 // A meta-quoted string, interpreted as a pattern, should always match
386 // the original unquoted string.
NegativeTestQuoteMeta(const string & unquoted,const string & should_not_match,const RE2::Options & options=RE2::DefaultOptions)387 static void NegativeTestQuoteMeta(
388     const string& unquoted, const string& should_not_match,
389     const RE2::Options& options = RE2::DefaultOptions) {
390   string quoted = RE2::QuoteMeta(unquoted);
391   RE2 re(quoted, options);
392   EXPECT_FALSE(RE2::FullMatch(should_not_match, re))
393       << "Unquoted='" << unquoted << "', quoted='" << quoted << "'.";
394 }
395 
396 // Tests that quoted meta characters match their original strings,
397 // and that a few things that shouldn't match indeed do not.
TEST(QuoteMeta,Simple)398 TEST(QuoteMeta, Simple) {
399   TestQuoteMeta("foo");
400   TestQuoteMeta("foo.bar");
401   TestQuoteMeta("foo\\.bar");
402   TestQuoteMeta("[1-9]");
403   TestQuoteMeta("1.5-2.0?");
404   TestQuoteMeta("\\d");
405   TestQuoteMeta("Who doesn't like ice cream?");
406   TestQuoteMeta("((a|b)c?d*e+[f-h]i)");
407   TestQuoteMeta("((?!)xxx).*yyy");
408   TestQuoteMeta("([");
409 }
// For each row, the meta-quoted form of `unquoted` must not match `other`:
// once quoted, regexp operators in the source string are plain characters.
TEST(QuoteMeta, SimpleNegative) {
  struct NegativeCase {
    const char* unquoted;
    const char* other;
  };
  static const NegativeCase kCases[] = {
    { "foo", "bar" },
    { "...", "bar" },
    { "\\.", "." },
    { "\\.", ".." },
    { "(a)", "a" },
    { "(a|b)", "a" },
    { "(a|b)", "(a)" },
    { "(a|b)", "a|b" },
    { "[0-9]", "0" },
    { "[0-9]", "0-9" },
    { "[0-9]", "[9]" },
    { "((?!)xxx)", "xxx" },
  };
  for (const NegativeCase& c : kCases)
    NegativeTestQuoteMeta(c.unquoted, c.other);
}
424 
// A string containing the non-ASCII byte 0xb2 must survive quoting when the
// pattern is compiled with the Latin-1 options.
TEST(QuoteMeta, Latin1) {
  const string sample("3\xb2 = 9");
  TestQuoteMeta(sample, RE2::Latin1);
}
428 
// Strings containing multi-byte UTF-8 sequences must match themselves after
// quoting, and must not match a version of themselves in which each byte of
// the sequence is backslash-escaped separately.
TEST(QuoteMeta, UTF8) {
  static const char* const kSamples[] = {
    "Plácido Domingo",
    "xyz",                 // No fancy utf8.
    "\xc2\xb0",            // 2-byte utf8 -- a degree symbol.
    "27\xc2\xb0 degrees",  // As a middle character.
    "\xe2\x80\xb3",        // 3-byte utf8 -- a double prime.
    "\xf0\x9d\x85\x9f",    // 4-byte utf8 -- a music note.
    "27\xc2\xb0",          // Interpreted as Latin-1, this should
                           // still work.
  };
  for (const char* sample : kSamples)
    TestQuoteMeta(sample);

  // 2-byte utf8 -- a degree symbol -- versus its bytes escaped one by one.
  NegativeTestQuoteMeta("27\xc2\xb0",
                        "27\\\xc2\\\xb0");
}
441 
// Strings with an embedded NUL byte must round-trip through QuoteMeta.
TEST(QuoteMeta, HasNull) {
  // Exactly one NUL character: matches itself, not the empty string.
  string subject(1, '\0');
  TestQuoteMeta(subject);
  NegativeTestQuoteMeta(subject, "");

  // NUL followed by '1' must not end up being interpreted as '\01'.
  subject.push_back('1');
  TestQuoteMeta(subject);
  NegativeTestQuoteMeta(subject, "\1");
}
455 
// Three patterns of increasing complexity must report strictly increasing
// program sizes, for both the forward and the reverse program.
TEST(ProgramSize, BigProgram) {
  RE2 literal_re("simple regexp");
  RE2 star_re("medium.*regexp");
  RE2 repeat_re("complex.{1,128}regexp");

  // Forward programs.
  const int fwd_literal = literal_re.ProgramSize();
  const int fwd_star = star_re.ProgramSize();
  const int fwd_repeat = repeat_re.ProgramSize();
  ASSERT_GT(fwd_literal, 0);
  ASSERT_GT(fwd_star, fwd_literal);
  ASSERT_GT(fwd_repeat, fwd_star);

  // Reverse programs.
  const int rev_literal = literal_re.ReverseProgramSize();
  const int rev_star = star_re.ReverseProgramSize();
  const int rev_repeat = repeat_re.ReverseProgramSize();
  ASSERT_GT(rev_literal, 0);
  ASSERT_GT(rev_star, rev_literal);
  ASSERT_GT(rev_repeat, rev_star);
}
469 
// Checks the fanout histogram of four nested-repetition patterns that differ
// only in their repeat count.  Each call is expected to return the index of
// the largest non-empty bucket; the test then checks that bucket's size.
TEST(ProgramFanout, BigProgram) {
  RE2 rep1("(?:(?:(?:(?:(?:.)?){1})*)+)");
  RE2 rep10("(?:(?:(?:(?:(?:.)?){10})*)+)");
  RE2 rep100("(?:(?:(?:(?:(?:.)?){100})*)+)");
  RE2 rep1000("(?:(?:(?:(?:(?:.)?){1000})*)+)");

  std::map<int, int> buckets;

  // ---- Forward programs ----

  // Repeat count 1: top bucket is 3 and holds 1 element.
  ASSERT_EQ(3, rep1.ProgramFanout(&buckets));
  ASSERT_EQ(1, buckets[3]);

  // Repeat count 10: top bucket is 7 and holds 10 elements.
  ASSERT_EQ(7, rep10.ProgramFanout(&buckets));
  ASSERT_EQ(10, buckets[7]);

  // Repeat count 100: top bucket is 10 and holds 100 elements.
  ASSERT_EQ(10, rep100.ProgramFanout(&buckets));
  ASSERT_EQ(100, buckets[10]);

  // Repeat count 1000: top bucket is 13 and holds 1000 elements.
  ASSERT_EQ(13, rep1000.ProgramFanout(&buckets));
  ASSERT_EQ(1000, buckets[13]);

  // ---- Reverse programs ----

  // Repeat count 1: top bucket is 2 and holds 3 elements.
  // This differs from the others due to how reverse `.' works.
  ASSERT_EQ(2, rep1.ReverseProgramFanout(&buckets));
  ASSERT_EQ(3, buckets[2]);

  // Repeat count 10: top bucket is 5 and holds 10 elements.
  ASSERT_EQ(5, rep10.ReverseProgramFanout(&buckets));
  ASSERT_EQ(10, buckets[5]);

  // Repeat count 100: top bucket is 9 and holds 100 elements.
  ASSERT_EQ(9, rep100.ReverseProgramFanout(&buckets));
  ASSERT_EQ(100, buckets[9]);

  // Repeat count 1000: top bucket is 12 and holds 1000 elements.
  ASSERT_EQ(12, rep1000.ReverseProgramFanout(&buckets));
  ASSERT_EQ(1000, buckets[12]);
}
511 
512 // Issue 956519: handling empty character sets was
513 // causing NULL dereference.  This tests a few empty character sets.
514 // (The way to get an empty character set is to negate a full one.)
TEST(EmptyCharset,Fuzz)515 TEST(EmptyCharset, Fuzz) {
516   static const char *empties[] = {
517     "[^\\S\\s]",
518     "[^\\S[:space:]]",
519     "[^\\D\\d]",
520     "[^\\D[:digit:]]"
521   };
522   for (int i = 0; i < arraysize(empties); i++)
523     ASSERT_FALSE(RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0));
524 }
525 
526 // Bitstate assumes that kInstFail instructions in
527 // alternations or capture groups have been "compiled away".
TEST(EmptyCharset,BitstateAssumptions)528 TEST(EmptyCharset, BitstateAssumptions) {
529   // Captures trigger use of Bitstate.
530   static const char *nop_empties[] = {
531     "((((()))))" "[^\\S\\s]?",
532     "((((()))))" "([^\\S\\s])?",
533     "((((()))))" "([^\\S\\s]|[^\\S\\s])?",
534     "((((()))))" "(([^\\S\\s]|[^\\S\\s])|)"
535   };
536   StringPiece group[6];
537   for (int i = 0; i < arraysize(nop_empties); i++)
538     ASSERT_TRUE(RE2(nop_empties[i]).Match("", 0, 0, RE2::UNANCHORED, group, 6));
539 }
540 
541 // Test that named groups work correctly.
TEST(Capture,NamedGroups)542 TEST(Capture, NamedGroups) {
543   {
544     RE2 re("(hello world)");
545     ASSERT_EQ(re.NumberOfCapturingGroups(), 1);
546     const std::map<string, int>& m = re.NamedCapturingGroups();
547     ASSERT_EQ(m.size(), 0);
548   }
549 
550   {
551     RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))");
552     ASSERT_EQ(re.NumberOfCapturingGroups(), 6);
553     const std::map<string, int>& m = re.NamedCapturingGroups();
554     ASSERT_EQ(m.size(), 4);
555     ASSERT_EQ(m.find("A")->second, 1);
556     ASSERT_EQ(m.find("B")->second, 2);
557     ASSERT_EQ(m.find("C")->second, 3);
558     ASSERT_EQ(m.find("D")->second, 6);  // $4 and $5 are anonymous
559   }
560 }
561 
// Matches a pattern with two named groups through FullMatchN and then uses
// NamedCapturingGroups() to locate each group's captured text in the output
// array.
TEST(RE2, CapturedGroupTest) {
  RE2 re("directions from (?P<S>.*) to (?P<D>.*)");
  const int num_groups = re.NumberOfCapturingGroups();
  // Fatal check: num_groups is passed to FullMatchN as the argument count and
  // must not exceed the size of `matches` below.
  ASSERT_EQ(2, num_groups);

  string args[4];
  RE2::Arg arg0(&args[0]);
  RE2::Arg arg1(&args[1]);
  RE2::Arg arg2(&args[2]);
  RE2::Arg arg3(&args[3]);

  const RE2::Arg* const matches[4] = {&arg0, &arg1, &arg2, &arg3};
  EXPECT_TRUE(RE2::FullMatchN("directions from mountain view to san jose",
                              re, matches, num_groups));

  // Look each name up once and make sure it was found before dereferencing.
  const std::map<string, int>& named_groups = re.NamedCapturingGroups();
  const std::map<string, int>::const_iterator source =
      named_groups.find("S");
  const std::map<string, int>::const_iterator destination =
      named_groups.find("D");
  ASSERT_TRUE(source != named_groups.end());
  ASSERT_TRUE(destination != named_groups.end());

  // The named group index is 1-based.  These checks are fatal because the
  // indices are used to subscript `args` below.
  const int source_group_index = source->second;
  const int destination_group_index = destination->second;
  ASSERT_EQ(1, source_group_index);
  ASSERT_EQ(2, destination_group_index);

  // The args is zero-based.
  EXPECT_EQ("mountain view", args[source_group_index - 1]);
  EXPECT_EQ("san jose", args[destination_group_index - 1]);
}
589 
// FullMatch without output arguments: the pattern has to cover the entire
// text, from the first character to the last.
TEST(RE2, FullMatchWithNoArgs) {
  static const char kHThroughO[] = "h.*o";

  // Texts that the pattern covers completely.
  ASSERT_TRUE(RE2::FullMatch("h", "h"));
  ASSERT_TRUE(RE2::FullMatch("hello", "hello"));
  ASSERT_TRUE(RE2::FullMatch("hello", kHThroughO));

  // Extra text before the match: fails, the match is anchored at the front.
  ASSERT_FALSE(RE2::FullMatch("othello", kHThroughO));

  // Extra text after the match: fails, the match is anchored at the end.
  ASSERT_FALSE(RE2::FullMatch("hello!", kHThroughO));
}
597 
// PartialMatch without output arguments: the pattern may match anywhere
// inside the text.
TEST(RE2, PartialMatch) {
  static const char kHThroughO[] = "h.*o";

  ASSERT_TRUE(RE2::PartialMatch("x", "x"));

  // Match covering the whole text, preceded by text, and followed by text.
  ASSERT_TRUE(RE2::PartialMatch("hello", kHThroughO));
  ASSERT_TRUE(RE2::PartialMatch("othello", kHThroughO));
  ASSERT_TRUE(RE2::PartialMatch("hello!", kHThroughO));

  // Twenty nested capture groups and no output arguments.
  ASSERT_TRUE(
      RE2::PartialMatch("x", "((((((((((((((((((((x))))))))))))))))))))"));
}
605 
// Exercises RE2::PartialMatchN with zero, one and two output arguments,
// including a failing case at each arity.
TEST(RE2, PartialMatchN) {
  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No outputs.
  EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", slot_ptrs, 0));
  EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", slot_ptrs, 0));

  // One output, parsed as an int.
  int number;
  slots[0] = &number;
  EXPECT_TRUE(RE2::PartialMatchN("1001 nights", "(\\d+)", slot_ptrs, 1));
  EXPECT_EQ(1001, number);
  EXPECT_FALSE(RE2::PartialMatchN("three", "(\\d+)", slot_ptrs, 1));

  // Two outputs: an int followed by a string.
  string word;
  slots[1] = &word;
  EXPECT_TRUE(
      RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", slot_ptrs, 2));
  EXPECT_EQ(42, number);
  EXPECT_EQ("life", word);
  EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", slot_ptrs, 2));
}
629 
// FullMatch with a digits-only pattern and no output arguments.
TEST(RE2, FullMatchZeroArg) {
  static const char kDigits[] = "\\d+";
  ASSERT_TRUE(RE2::FullMatch("1001", kDigits));
}
634 
// FullMatch with a single int output: successful conversions plus two cases
// where the captured text cannot be converted to an int.
TEST(RE2, FullMatchOneArg) {
  int value;

  // Plain and negative numbers convert.
  ASSERT_TRUE(RE2::FullMatch("1001", "(\\d+)",   &value));
  ASSERT_EQ(value, 1001);
  ASSERT_TRUE(RE2::FullMatch("-123", "(-?\\d+)", &value));
  ASSERT_EQ(value, -123);

  // An empty capture is not an integer.
  ASSERT_FALSE(RE2::FullMatch("10", "()\\d+", &value));

  // Forty digits do not fit in an int.
  ASSERT_FALSE(RE2::FullMatch("1234567890123456789012345678901234567890",
                              "(\\d+)", &value));
}
647 
// The int output must be parsed from the captured substring only, even when
// the capture is surrounded by further digits in the text.
TEST(RE2, FullMatchIntegerArg) {
  int value;

  // Capture in the middle of a digit string.
  ASSERT_TRUE(RE2::FullMatch("1234", "1(\\d*)4", &value));
  ASSERT_EQ(value, 23);

  // Capture at the front of a digit string, with and without a sign.
  ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)\\d+", &value));
  ASSERT_EQ(value, 1);
  ASSERT_TRUE(RE2::FullMatch("-1234", "(-\\d)\\d+", &value));
  ASSERT_EQ(value, -1);

  // Same captures through PartialMatch, leaving trailing digits unmatched.
  ASSERT_TRUE(RE2::PartialMatch("1234", "(\\d)", &value));
  ASSERT_EQ(value, 1);
  ASSERT_TRUE(RE2::PartialMatch("-1234", "(-\\d)", &value));
  ASSERT_EQ(value, -1);
}
663 
// FullMatch with a std::string output receives the captured text.
TEST(RE2, FullMatchStringArg) {
  string captured;
  ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", &captured));
  ASSERT_EQ(captured, string("ell"));
}
670 
// FullMatch with a StringPiece output (plus an int output for the second
// group): the piece must have the size and bytes of the first capture.
TEST(RE2, FullMatchStringPieceArg) {
  StringPiece name;
  int number;

  ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &name, &number));

  ASSERT_EQ(name.size(), 4);
  ASSERT_TRUE(memcmp(name.data(), "ruby", 4) == 0);
  ASSERT_EQ(number, 1234);
}
680 
// FullMatch with two outputs of different types: a string and an int.
TEST(RE2, FullMatchMultiArg) {
  string name;
  int number;

  ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &name, &number));
  ASSERT_EQ(name, string("ruby"));
  ASSERT_EQ(number, 1234);
}
689 
// Exercises RE2::FullMatchN with zero, one and two output arguments,
// including a failing case at each arity.
TEST(RE2, FullMatchN) {
  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No outputs.
  EXPECT_TRUE(RE2::FullMatchN("hello", "h.*o", slot_ptrs, 0));
  EXPECT_FALSE(RE2::FullMatchN("othello", "h.*o", slot_ptrs, 0));

  // One output, parsed as an int.
  int number;
  slots[0] = &number;
  EXPECT_TRUE(RE2::FullMatchN("1001", "(\\d+)", slot_ptrs, 1));
  EXPECT_EQ(1001, number);
  EXPECT_FALSE(RE2::FullMatchN("three", "(\\d+)", slot_ptrs, 1));

  // Two outputs: an int followed by a string.
  string word;
  slots[1] = &word;
  EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", slot_ptrs, 2));
  EXPECT_EQ(42, number);
  EXPECT_EQ("life", word);
  EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", slot_ptrs, 2));
}
713 
// A null output pointer in the argument list means "do not store this
// group"; the surrounding outputs must still be filled in.
TEST(RE2, FullMatchIgnoredArg) {
  string name;
  int number;

  // Old-school NULL (as a void*) should be ignored.
  ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+)(:)(\\d+)", &name,
                             static_cast<void*>(NULL), &number));
  ASSERT_EQ(name, string("ruby"));
  ASSERT_EQ(number, 1234);

  // C++11 nullptr should also be ignored.
  ASSERT_TRUE(
      RE2::FullMatch("rubz:1235", "(\\w+)(:)(\\d+)", &name, nullptr, &number));
  ASSERT_EQ(name, string("rubz"));
  ASSERT_EQ(number, 1235);
}
729 
// Null output pointers that carry a concrete type: nothing is stored, but the
// captured text is still expected to be parseable as that type.
TEST(RE2, FullMatchTypedNullArg) {
  string sink;

  // Ignore non-void* NULL arg
  ASSERT_TRUE(
      RE2::FullMatch("hello", "he(.*)lo", static_cast<char*>(NULL)));
  ASSERT_TRUE(
      RE2::FullMatch("hello", "h(.*)o", static_cast<string*>(NULL)));
  ASSERT_TRUE(
      RE2::FullMatch("hello", "h(.*)o", static_cast<StringPiece*>(NULL)));
  ASSERT_TRUE(
      RE2::FullMatch("1234", "(.*)", static_cast<int*>(NULL)));
  ASSERT_TRUE(RE2::FullMatch("1234567890123456", "(.*)",
                             static_cast<long long*>(NULL)));
  ASSERT_TRUE(RE2::FullMatch("123.4567890123456", "(.*)",
                             static_cast<double*>(NULL)));
  ASSERT_TRUE(RE2::FullMatch("123.4567890123456", "(.*)",
                             static_cast<float*>(NULL)));

  // Fail on non-void* NULL arg if the match doesn't parse for the given type.
  ASSERT_FALSE(
      RE2::FullMatch("hello", "h(.*)lo", &sink, static_cast<char*>(NULL)));
  ASSERT_FALSE(
      RE2::FullMatch("hello", "(.*)", static_cast<int*>(NULL)));
  ASSERT_FALSE(
      RE2::FullMatch("1234567890123456", "(.*)", static_cast<int*>(NULL)));
  ASSERT_FALSE(
      RE2::FullMatch("hello", "(.*)", static_cast<double*>(NULL)));
  ASSERT_FALSE(
      RE2::FullMatch("hello", "(.*)", static_cast<float*>(NULL)));
}
749 
750 // Check that numeric parsing code does not read past the end of
751 // the number being parsed.
752 // This implementation requires mmap(2) et al. and thus cannot
753 // be used unless they are available.
TEST(RE2,NULTerminated)754 TEST(RE2, NULTerminated) {
755 #if defined(_POSIX_MAPPED_FILES) && _POSIX_MAPPED_FILES > 0
756   char *v;
757   int x;
758   long pagesize = sysconf(_SC_PAGE_SIZE);
759 
760 #ifndef MAP_ANONYMOUS
761 #define MAP_ANONYMOUS MAP_ANON
762 #endif
763   v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE,
764                               MAP_ANONYMOUS|MAP_PRIVATE, -1, 0));
765   ASSERT_TRUE(v != reinterpret_cast<char*>(-1));
766   LOG(INFO) << "Memory at " << (void*)v;
767   ASSERT_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno;
768   v[pagesize - 1] = '1';
769 
770   x = 0;
771   ASSERT_TRUE(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x));
772   ASSERT_EQ(x, 1);
773 #endif
774 }
775 
TEST(RE2,FullMatchTypeTests)776 TEST(RE2, FullMatchTypeTests) {
777   // Type tests
778   string zeros(1000, '0');
779   {
780     char c;
781     ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &c));
782     ASSERT_EQ(c, 'H');
783   }
784   {
785     unsigned char c;
786     ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &c));
787     ASSERT_EQ(c, static_cast<unsigned char>('H'));
788   }
789   {
790     int16_t v;
791     ASSERT_TRUE(RE2::FullMatch("100",     "(-?\\d+)", &v)); ASSERT_EQ(v, 100);
792     ASSERT_TRUE(RE2::FullMatch("-100",    "(-?\\d+)", &v)); ASSERT_EQ(v, -100);
793     ASSERT_TRUE(RE2::FullMatch("32767",   "(-?\\d+)", &v)); ASSERT_EQ(v, 32767);
794     ASSERT_TRUE(RE2::FullMatch("-32768",  "(-?\\d+)", &v)); ASSERT_EQ(v, -32768);
795     ASSERT_FALSE(RE2::FullMatch("-32769", "(-?\\d+)", &v));
796     ASSERT_FALSE(RE2::FullMatch("32768",  "(-?\\d+)", &v));
797   }
798   {
799     uint16_t v;
800     ASSERT_TRUE(RE2::FullMatch("100",    "(\\d+)", &v)); ASSERT_EQ(v, 100);
801     ASSERT_TRUE(RE2::FullMatch("32767",  "(\\d+)", &v)); ASSERT_EQ(v, 32767);
802     ASSERT_TRUE(RE2::FullMatch("65535",  "(\\d+)", &v)); ASSERT_EQ(v, 65535);
803     ASSERT_FALSE(RE2::FullMatch("65536", "(\\d+)", &v));
804   }
805   {
806     int32_t v;
807     static const int32_t max = INT32_C(0x7fffffff);
808     static const int32_t min = -max - 1;
809     ASSERT_TRUE(RE2::FullMatch("100",          "(-?\\d+)", &v)); ASSERT_EQ(v, 100);
810     ASSERT_TRUE(RE2::FullMatch("-100",         "(-?\\d+)", &v)); ASSERT_EQ(v, -100);
811     ASSERT_TRUE(RE2::FullMatch("2147483647",   "(-?\\d+)", &v)); ASSERT_EQ(v, max);
812     ASSERT_TRUE(RE2::FullMatch("-2147483648",  "(-?\\d+)", &v)); ASSERT_EQ(v, min);
813     ASSERT_FALSE(RE2::FullMatch("-2147483649", "(-?\\d+)", &v));
814     ASSERT_FALSE(RE2::FullMatch("2147483648",  "(-?\\d+)", &v));
815 
816     ASSERT_TRUE(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v));
817     ASSERT_EQ(v, max);
818     ASSERT_TRUE(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v));
819     ASSERT_EQ(v, min);
820 
821     ASSERT_FALSE(RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v));
822     ASSERT_TRUE(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v)));
823     ASSERT_EQ(v, max);
824     ASSERT_FALSE(RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v)));
825   }
826   {
827     uint32_t v;
828     static const uint32_t max = UINT32_C(0xffffffff);
829     ASSERT_TRUE(RE2::FullMatch("100",         "(\\d+)", &v)); ASSERT_EQ(v, 100);
830     ASSERT_TRUE(RE2::FullMatch("4294967295",  "(\\d+)", &v)); ASSERT_EQ(v, max);
831     ASSERT_FALSE(RE2::FullMatch("4294967296", "(\\d+)", &v));
832     ASSERT_FALSE(RE2::FullMatch("-1",         "(\\d+)", &v));
833 
834     ASSERT_TRUE(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v)); ASSERT_EQ(v, max);
835   }
836   {
837     int64_t v;
838     static const int64_t max = INT64_C(0x7fffffffffffffff);
839     static const int64_t min = -max - 1;
840     string str;
841 
842     ASSERT_TRUE(RE2::FullMatch("100",  "(-?\\d+)", &v)); ASSERT_EQ(v, 100);
843     ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v)); ASSERT_EQ(v, -100);
844 
845     str = std::to_string(max);
846     ASSERT_TRUE(RE2::FullMatch(str,    "(-?\\d+)", &v)); ASSERT_EQ(v, max);
847 
848     str = std::to_string(min);
849     ASSERT_TRUE(RE2::FullMatch(str,    "(-?\\d+)", &v)); ASSERT_EQ(v, min);
850 
851     str = std::to_string(max);
852     ASSERT_NE(str.back(), '9');
853     str.back()++;
854     ASSERT_FALSE(RE2::FullMatch(str,   "(-?\\d+)", &v));
855 
856     str = std::to_string(min);
857     ASSERT_NE(str.back(), '9');
858     str.back()++;
859     ASSERT_FALSE(RE2::FullMatch(str,   "(-?\\d+)", &v));
860   }
861   {
862     uint64_t v;
863     int64_t v2;
864     static const uint64_t max = UINT64_C(0xffffffffffffffff);
865     string str;
866 
867     ASSERT_TRUE(RE2::FullMatch("100",  "(-?\\d+)", &v));  ASSERT_EQ(v, 100);
868     ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v2)); ASSERT_EQ(v2, -100);
869 
870     str = std::to_string(max);
871     ASSERT_TRUE(RE2::FullMatch(str,    "(-?\\d+)", &v)); ASSERT_EQ(v, max);
872 
873     ASSERT_NE(str.back(), '9');
874     str.back()++;
875     ASSERT_FALSE(RE2::FullMatch(str,   "(-?\\d+)", &v));
876   }
877 }
878 
// Exercises FullMatch extraction into float and double arguments, including
// inputs padded with a long run of leading zeros and inputs whose parsed
// value depends on how the text is rounded.
TEST(RE2, FloatingPointFullMatchTypes) {
  const string padding(1000, '0');
  const char* const kCaptureAll = "(.*)";

  {
    float f;
    ASSERT_TRUE(RE2::FullMatch("100", kCaptureAll, &f));
    ASSERT_EQ(f, 100);
    ASSERT_TRUE(RE2::FullMatch("-100.", kCaptureAll, &f));
    ASSERT_EQ(f, -100);
    ASSERT_TRUE(RE2::FullMatch("1e23", kCaptureAll, &f));
    ASSERT_EQ(f, static_cast<float>(1e23));
    // Text with a leading space is expected to parse as well.
    ASSERT_TRUE(RE2::FullMatch(" 100", kCaptureAll, &f));
    ASSERT_EQ(f, 100);

    ASSERT_TRUE(RE2::FullMatch(padding + "1e23", kCaptureAll, &f));
    ASSERT_EQ(f, static_cast<float>(1e23));

    // Edge case: 6700000000081920.1.
    // 6700000000081920 sits exactly halfway between two float32 values,
    // so the trailing .1 ought to push the result upward.  A float64
    // cannot represent that .1, though: the closest float64 is
    // 6700000000081920 itself.  A parser that goes through strtod and
    // then narrows to float32 would therefore round to even, i.e. down
    // rather than up.  Only calling strtof directly passes this check.
    // The input is deliberately limited to 17 digits because C does not
    // promise correctly rounded results from strtod and strtof for
    // longer inputs.
    //
    // Known to fail on Cygwin and MinGW because their strtof(3) is
    // broken, and apparently on MSVC as well.
#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__)
    ASSERT_TRUE(RE2::FullMatch("0.1", kCaptureAll, &f));
    ASSERT_EQ(f, 0.1f) << StringPrintf("%.8g != %.8g", f, 0.1f);
    ASSERT_TRUE(RE2::FullMatch("6700000000081920.1", kCaptureAll, &f));
    ASSERT_EQ(f, 6700000000081920.1f)
      << StringPrintf("%.8g != %.8g", f, 6700000000081920.1f);
#endif
  }

  {
    double d;
    ASSERT_TRUE(RE2::FullMatch("100", kCaptureAll, &d));
    ASSERT_EQ(d, 100);
    ASSERT_TRUE(RE2::FullMatch("-100.", kCaptureAll, &d));
    ASSERT_EQ(d, -100);
    ASSERT_TRUE(RE2::FullMatch("1e23", kCaptureAll, &d));
    ASSERT_EQ(d, 1e23);
    ASSERT_TRUE(RE2::FullMatch(padding + "1e23", kCaptureAll, &d));
    ASSERT_EQ(d, static_cast<double>(1e23));

    ASSERT_TRUE(RE2::FullMatch("0.1", kCaptureAll, &d));
    ASSERT_EQ(d, 0.1) << StringPrintf("%.17g != %.17g", d, 0.1);
    ASSERT_TRUE(RE2::FullMatch("1.00000005960464485", kCaptureAll, &d));
    ASSERT_EQ(d, 1.0000000596046448)
      << StringPrintf("%.17g != %.17g", d, 1.0000000596046448);
  }
}
929 
// FullMatch must account for the whole text, not just a substring of it.
TEST(RE2, FullMatchAnchored) {
  int value;
  const char* const kDigits = "(\\d+)";

  // A stray character before or after the digits makes the match fail.
  ASSERT_FALSE(RE2::FullMatch("x1001", kDigits, &value));
  ASSERT_FALSE(RE2::FullMatch("1001x", kDigits, &value));

  // When the pattern covers the extra character, the number is extracted.
  ASSERT_TRUE(RE2::FullMatch("x1001", "x(\\d+)", &value));
  ASSERT_EQ(value, 1001);
  ASSERT_TRUE(RE2::FullMatch("1001x", "(\\d+)x", &value));
  ASSERT_EQ(value, 1001);
}
938 
// Counted repetition with an open upper bound: {5,} needs at least five
// characters from the class.
TEST(RE2, FullMatchBraces) {
  const char* const kFiveOrMore = "[0-9a-f+.-]{5,}";
  ASSERT_TRUE(RE2::FullMatch("0abcd", kFiveOrMore));    // exactly five
  ASSERT_TRUE(RE2::FullMatch("0abcde", kFiveOrMore));   // six
  ASSERT_FALSE(RE2::FullMatch("0abc", kFiveOrMore));    // only four
}
945 
// Alternation mixing literals with a character class.
TEST(RE2, Complicated) {
  const char* const kAlternation = "foo|bar|[A-Z]";
  ASSERT_TRUE(RE2::FullMatch("foo", kAlternation));
  ASSERT_TRUE(RE2::FullMatch("bar", kAlternation));
  ASSERT_TRUE(RE2::FullMatch("X", kAlternation));
  // Two capital letters do not fit any single alternative.
  ASSERT_FALSE(RE2::FullMatch("XY", kAlternation));
}
953 
// Full-match handling at the end of the text (a '$' has to be tacked on
// internally).
TEST(RE2, FullMatchEnd) {
  const char* const kPlain = "fo|foo";
  const char* const kWithDollar = "fo|foo$";

  ASSERT_TRUE(RE2::FullMatch("fo", kPlain));
  ASSERT_TRUE(RE2::FullMatch("foo", kPlain));
  ASSERT_TRUE(RE2::FullMatch("fo", kWithDollar));
  ASSERT_TRUE(RE2::FullMatch("foo", kWithDollar));
  ASSERT_TRUE(RE2::FullMatch("foo", "foo$"));
  ASSERT_FALSE(RE2::FullMatch("foo$bar", "foo\\$"));
  ASSERT_FALSE(RE2::FullMatch("fox", "fo|bar"));

  // Disabled on purpose.  Enable it if the handling of '$' is changed so
  // that it no longer matches a trailing newline.
  if (false) {
    // Guards against pcre's special treatment of a '\n' at the end of
    // the string when matching '$'.
    ASSERT_FALSE(RE2::PartialMatch("foo\n", "foo$"));
  }
}
972 
TEST(RE2,FullMatchArgCount)973 TEST(RE2, FullMatchArgCount) {
974   // Number of args
975   int a[16];
976   ASSERT_TRUE(RE2::FullMatch("", ""));
977 
978   memset(a, 0, sizeof(0));
979   ASSERT_TRUE(RE2::FullMatch("1", "(\\d){1}", &a[0]));
980   ASSERT_EQ(a[0], 1);
981 
982   memset(a, 0, sizeof(0));
983   ASSERT_TRUE(RE2::FullMatch("12", "(\\d)(\\d)", &a[0], &a[1]));
984   ASSERT_EQ(a[0], 1);
985   ASSERT_EQ(a[1], 2);
986 
987   memset(a, 0, sizeof(0));
988   ASSERT_TRUE(RE2::FullMatch("123", "(\\d)(\\d)(\\d)", &a[0], &a[1], &a[2]));
989   ASSERT_EQ(a[0], 1);
990   ASSERT_EQ(a[1], 2);
991   ASSERT_EQ(a[2], 3);
992 
993   memset(a, 0, sizeof(0));
994   ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1],
995                              &a[2], &a[3]));
996   ASSERT_EQ(a[0], 1);
997   ASSERT_EQ(a[1], 2);
998   ASSERT_EQ(a[2], 3);
999   ASSERT_EQ(a[3], 4);
1000 
1001   memset(a, 0, sizeof(0));
1002   ASSERT_TRUE(RE2::FullMatch("12345", "(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1],
1003                              &a[2], &a[3], &a[4]));
1004   ASSERT_EQ(a[0], 1);
1005   ASSERT_EQ(a[1], 2);
1006   ASSERT_EQ(a[2], 3);
1007   ASSERT_EQ(a[3], 4);
1008   ASSERT_EQ(a[4], 5);
1009 
1010   memset(a, 0, sizeof(0));
1011   ASSERT_TRUE(RE2::FullMatch("123456", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0],
1012                              &a[1], &a[2], &a[3], &a[4], &a[5]));
1013   ASSERT_EQ(a[0], 1);
1014   ASSERT_EQ(a[1], 2);
1015   ASSERT_EQ(a[2], 3);
1016   ASSERT_EQ(a[3], 4);
1017   ASSERT_EQ(a[4], 5);
1018   ASSERT_EQ(a[5], 6);
1019 
1020   memset(a, 0, sizeof(0));
1021   ASSERT_TRUE(RE2::FullMatch("1234567", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
1022                              &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6]));
1023   ASSERT_EQ(a[0], 1);
1024   ASSERT_EQ(a[1], 2);
1025   ASSERT_EQ(a[2], 3);
1026   ASSERT_EQ(a[3], 4);
1027   ASSERT_EQ(a[4], 5);
1028   ASSERT_EQ(a[5], 6);
1029   ASSERT_EQ(a[6], 7);
1030 
1031   memset(a, 0, sizeof(0));
1032   ASSERT_TRUE(RE2::FullMatch("1234567890123456",
1033                              "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)"
1034                              "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
1035                              &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
1036                              &a[7], &a[8], &a[9], &a[10], &a[11], &a[12],
1037                              &a[13], &a[14], &a[15]));
1038   ASSERT_EQ(a[0], 1);
1039   ASSERT_EQ(a[1], 2);
1040   ASSERT_EQ(a[2], 3);
1041   ASSERT_EQ(a[3], 4);
1042   ASSERT_EQ(a[4], 5);
1043   ASSERT_EQ(a[5], 6);
1044   ASSERT_EQ(a[6], 7);
1045   ASSERT_EQ(a[7], 8);
1046   ASSERT_EQ(a[8], 9);
1047   ASSERT_EQ(a[9], 0);
1048   ASSERT_EQ(a[10], 1);
1049   ASSERT_EQ(a[11], 2);
1050   ASSERT_EQ(a[12], 3);
1051   ASSERT_EQ(a[13], 4);
1052   ASSERT_EQ(a[14], 5);
1053   ASSERT_EQ(a[15], 6);
1054 }
1055 
// Simple accessor checks on a compiled RE2.
TEST(RE2, Accessors) {
  // pattern() hands back the text the object was built from.
  {
    const string source = "http://([^/]+)/.*";
    const RE2 compiled(source);
    ASSERT_EQ(source, compiled.pattern());
  }

  // A valid pattern reports no error through any of the error accessors.
  {
    RE2 valid("foo");
    ASSERT_TRUE(valid.error().empty());
    ASSERT_TRUE(valid.ok());
    ASSERT_EQ(valid.error_code(), RE2::NoError);
  }
}
1072 
// UTF-8 handling, contrasted with Latin-1 (byte) mode.
TEST(RE2, UTF8) {
  // Three Japanese characters (nihongo), spelled out as UTF-8 bytes.
  const char utf8_string[] =
      "\xe6\x97\xa5"   // U+65E5
      "\xe6\x9c\xac"   // U+672C
      "\xe8\xaa\x9e";  // U+8A9E
  // '.', then the middle character above, then '.'.
  const char utf8_pattern[] =
      "."
      "\xe6\x9c\xac"   // U+672C
      ".";

  // Nine dots match byte-wise; three dots match character-wise.
  RE2 nine_bytes(".........", RE2::Latin1);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, nine_bytes));
  RE2 three_chars("...");
  ASSERT_TRUE(RE2::FullMatch(utf8_string, three_chars));

  // '.' captures a single byte in Latin-1 mode and a whole UTF-8
  // character otherwise.
  string captured;
  RE2 one_byte("(.)", RE2::Latin1);
  ASSERT_TRUE(RE2::PartialMatch(utf8_string, one_byte, &captured));
  ASSERT_EQ(captured, string("\xe6"));
  RE2 one_char("(.)");
  ASSERT_TRUE(RE2::PartialMatch(utf8_string, one_char, &captured));
  ASSERT_EQ(captured, string("\xe6\x97\xa5"));

  // The string used as its own pattern matches in both modes.
  RE2 self_latin1(utf8_string, RE2::Latin1);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, self_latin1));
  RE2 self_utf8(utf8_string);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, self_utf8));

  // The dotted pattern matches the string in UTF-8 mode only.
  RE2 dotted_latin1(utf8_pattern, RE2::Latin1);
  ASSERT_FALSE(RE2::FullMatch(utf8_string, dotted_latin1));
  RE2 dotted_utf8(utf8_pattern);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, dotted_utf8));
}
1117 
// Ungreedy UTF-8 regular expressions must not match where they should
// not -- see bug 82246.
TEST(RE2, UngreedyUTF8) {
  const string text = "a aX";

  // Greedy form: this variant always worked.
  {
    const char* greedy = "\\w+X";
    RE2 latin1_re(greedy, RE2::Latin1);
    RE2 utf8_re(greedy);

    ASSERT_FALSE(RE2::FullMatch(text, latin1_re));
    ASSERT_FALSE(RE2::FullMatch(text, utf8_re));
  }

  // Ungreedy form, enabled with the (?U) flag.
  {
    const char* ungreedy = "(?U)\\w+X";
    RE2 latin1_re(ungreedy, RE2::Latin1);
    ASSERT_EQ(latin1_re.error(), "");
    RE2 utf8_re(ungreedy);

    ASSERT_FALSE(RE2::FullMatch(text, latin1_re));
    ASSERT_FALSE(RE2::FullMatch(text, utf8_re));
  }
}
1142 
// Each of these malformed patterns must leave the RE2 object in a
// not-ok state.
TEST(RE2, Rejects) {
  static const char* const kMalformed[] = {
    "a\\1",
    "a[x",
    "a[z-a]",
    "a[[:foobar:]]",
    "a(b",
    "a\\",
  };
  for (const char* pattern : kMalformed) {
    RE2 re(pattern, RE2::Quiet);
    ASSERT_FALSE(re.ok());
  }
}
1168 
// Matching with broken, oversized or unusual patterns must not crash.
TEST(RE2, NoCrash) {
  // A pattern that fails to compile can still be handed to PartialMatch.
  {
    RE2 broken("a\\", RE2::Quiet);
    ASSERT_FALSE(broken.ok());
    ASSERT_FALSE(RE2::PartialMatch("a\\b", broken));
  }

  // An enormous pattern is rejected, and using it afterwards is harmless.
  {
    RE2 enormous("(((.{100}){100}){100}){100}", RE2::Quiet);
    ASSERT_FALSE(enormous.ok());
    ASSERT_FALSE(RE2::PartialMatch("aaa", enormous));
  }

  // A crazy pattern still compiles and runs.
  {
    RE2 crazy(".{512}x", RE2::Quiet);
    ASSERT_TRUE(crazy.ok());
    string text(515, 'c');
    text += "x";
    ASSERT_TRUE(RE2::PartialMatch(text, crazy));
  }
}
1194 
// Checks that recursion is stopped.
// PCRE-legacy test -- RE2 itself has no recursion.
TEST(RE2, Recursion) {
  const int bytes = 15 * 1024;  // enough to crash PCRE
  static const char* const kPatterns[] = {".", "a", "a.", "ab.", "abc."};
  for (const char* pattern : kPatterns)
    TestRecursion(bytes, pattern);
}
1205 
// Counted repetition must work when the memory budget is generous.
TEST(RE2, BigCountedRepetition) {
  RE2::Options roomy;
  roomy.set_max_mem(256<<20);

  RE2 counted(".{512}x", roomy);
  ASSERT_TRUE(counted.ok());

  string text(515, 'c');
  text += "x";
  ASSERT_TRUE(RE2::PartialMatch(text, counted));
}
1218 
// Deep stack recursion check.  Before pcre was patched this died with a
// segmentation violation caused by stack overflow.
// Another PCRE legacy test; RE2 doesn't recurse.
TEST(RE2, DeepRecursion) {
  // "x*", then 131072 'a' characters, then "*x".
  const string comment = "x*" + string(131072, 'a') + "*x";
  RE2 re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)");
  ASSERT_TRUE(RE2::FullMatch(comment, re));
}
1231 
1232 // Suggested by Josh Hyman.  Failed when SearchOnePass was
1233 // not implementing case-folding.
TEST(CaseInsensitive,MatchAndConsume)1234 TEST(CaseInsensitive, MatchAndConsume) {
1235   string result;
1236   string text = "A fish named *Wanda*";
1237   StringPiece sp(text);
1238 
1239   EXPECT_TRUE(RE2::PartialMatch(sp, "(?i)([wand]{5})", &result));
1240   EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result));
1241 }
1242 
1243 // RE2 should permit implicit conversions from string, StringPiece, const char*,
1244 // and C string literals.
TEST(RE2,ImplicitConversions)1245 TEST(RE2, ImplicitConversions) {
1246   string re_string(".");
1247   StringPiece re_stringpiece(".");
1248   const char* re_cstring = ".";
1249   EXPECT_TRUE(RE2::PartialMatch("e", re_string));
1250   EXPECT_TRUE(RE2::PartialMatch("e", re_stringpiece));
1251   EXPECT_TRUE(RE2::PartialMatch("e", re_cstring));
1252   EXPECT_TRUE(RE2::PartialMatch("e", "."));
1253 }
1254 
1255 // Bugs introduced by 8622304
TEST(RE2,CL8622304)1256 TEST(RE2, CL8622304) {
1257   // reported by ingow
1258   string dir;
1259   EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])"));  // ok
1260   EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &dir));  // fails
1261 
1262   // reported by jacobsa
1263   string key, val;
1264   EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true",
1265               "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?",
1266               &key,
1267               &val));
1268   EXPECT_EQ(key, "bar");
1269   EXPECT_EQ(val, "1,0x2F,030,4,5");
1270 }
1271 
1272 
// RE2 must report the correct piece of the regexp when parsing fails:
// whole runes only, and invalid UTF-8 always reported.  The piece
// reported for a Perl flag error must also be big enough.
//
// Each entry pairs a bad pattern with the expected error_arg() text.
static struct ErrorTest {
  const char *regexp;
  const char *error;
} error_tests[] = {
  { "ab\\αcd", "\\α" },
  { "ef\\x☺01", "\\x☺0" },
  { "gh\\x1☺01", "\\x1☺" },
  { "ij\\x1", "\\x1" },
  { "kl\\x", "\\x" },
  { "uv\\x{0000☺}", "\\x{0000☺" },
  { "wx\\p{ABC", "\\p{ABC" },
  { "yz(?smiUX:abc)", "(?smiUX" },   // used to return (?s but the error is X
  { "aa(?sm☺i", "(?sm☺" },
  { "bb[abc", "[abc" },

  // Invalid UTF-8: no argument string is returned.
  { "mn\\x1\377", "" },
  { "op\377qr", "" },
  { "st\\x{00000\377", "" },
  { "zz\\p{\377}", "" },
  { "zz\\x{00\377}", "" },
  { "zz(?P<name\377>abc)", "" },
};
TEST(RE2, ErrorArgs) {
  for (const ErrorTest& t : error_tests) {
    RE2 re(t.regexp, RE2::Quiet);
    EXPECT_FALSE(re.ok());
    EXPECT_EQ(re.error_arg(), t.error) << re.error();
  }
}
1306 
1307 // Check that "never match \n" mode never matches \n.
1308 static struct NeverTest {
1309   const char* regexp;
1310   const char* text;
1311   const char* match;
1312 } never_tests[] = {
1313   { "(.*)", "abc\ndef\nghi\n", "abc" },
1314   { "(?s)(abc.*def)", "abc\ndef\n", NULL },
1315   { "(abc(.|\n)*def)", "abc\ndef\n", NULL },
1316   { "(abc[^x]*def)", "abc\ndef\n", NULL },
1317   { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" },
1318 };
TEST(RE2,NeverNewline)1319 TEST(RE2, NeverNewline) {
1320   RE2::Options opt;
1321   opt.set_never_nl(true);
1322   for (int i = 0; i < arraysize(never_tests); i++) {
1323     const NeverTest& t = never_tests[i];
1324     RE2 re(t.regexp, opt);
1325     if (t.match == NULL) {
1326       EXPECT_FALSE(re.PartialMatch(t.text, re));
1327     } else {
1328       StringPiece m;
1329       EXPECT_TRUE(re.PartialMatch(t.text, re, &m));
1330       EXPECT_EQ(m, t.match);
1331     }
1332   }
1333 }
1334 
1335 // Check that dot_nl option works.
TEST(RE2,DotNL)1336 TEST(RE2, DotNL) {
1337   RE2::Options opt;
1338   opt.set_dot_nl(true);
1339   EXPECT_TRUE(RE2::PartialMatch("\n", RE2(".", opt)));
1340   EXPECT_FALSE(RE2::PartialMatch("\n", RE2("(?-s).", opt)));
1341   opt.set_never_nl(true);
1342   EXPECT_FALSE(RE2::PartialMatch("\n", RE2(".", opt)));
1343 }
1344 
1345 // Check that there are no capturing groups in "never capture" mode.
TEST(RE2,NeverCapture)1346 TEST(RE2, NeverCapture) {
1347   RE2::Options opt;
1348   opt.set_never_capture(true);
1349   RE2 re("(r)(e)", opt);
1350   EXPECT_EQ(0, re.NumberOfCapturingGroups());
1351 }
1352 
1353 // Bitstate bug was looking at submatch[0] even if nsubmatch == 0.
1354 // Triggered by a failed DFA search falling back to Bitstate when
1355 // using Match with a NULL submatch set.  Bitstate tried to read
1356 // the submatch[0] entry even if nsubmatch was 0.
TEST(RE2,BitstateCaptureBug)1357 TEST(RE2, BitstateCaptureBug) {
1358   RE2::Options opt;
1359   opt.set_max_mem(20000);
1360   RE2 re("(_________$)", opt);
1361   StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x";
1362   EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0));
1363 }
1364 
1365 // C++ version of bug 609710.
TEST(RE2,UnicodeClasses)1366 TEST(RE2, UnicodeClasses) {
1367   const string str = "ABCDEFGHI譚永鋒";
1368   string a, b, c;
1369 
1370   EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}"));
1371   EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}"));
1372   EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}"));
1373   EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}"));
1374   EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}"));
1375   EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}"));
1376 
1377   EXPECT_TRUE(RE2::FullMatch("譚", "\\p{L}"));
1378   EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Lu}"));
1379   EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Ll}"));
1380   EXPECT_FALSE(RE2::FullMatch("譚", "\\P{L}"));
1381   EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Lu}"));
1382   EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Ll}"));
1383 
1384   EXPECT_TRUE(RE2::FullMatch("永", "\\p{L}"));
1385   EXPECT_FALSE(RE2::FullMatch("永", "\\p{Lu}"));
1386   EXPECT_FALSE(RE2::FullMatch("永", "\\p{Ll}"));
1387   EXPECT_FALSE(RE2::FullMatch("永", "\\P{L}"));
1388   EXPECT_TRUE(RE2::FullMatch("永", "\\P{Lu}"));
1389   EXPECT_TRUE(RE2::FullMatch("永", "\\P{Ll}"));
1390 
1391   EXPECT_TRUE(RE2::FullMatch("鋒", "\\p{L}"));
1392   EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Lu}"));
1393   EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Ll}"));
1394   EXPECT_FALSE(RE2::FullMatch("鋒", "\\P{L}"));
1395   EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Lu}"));
1396   EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Ll}"));
1397 
1398   EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)", &a, &b, &c));
1399   EXPECT_EQ("A", a);
1400   EXPECT_EQ("B", b);
1401   EXPECT_EQ("C", c);
1402 
1403   EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)", &a, &b, &c));
1404   EXPECT_EQ("A", a);
1405   EXPECT_EQ("B", b);
1406   EXPECT_EQ("C", c);
1407 
1408   EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}"));
1409 
1410   EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)", &a, &b, &c));
1411   EXPECT_EQ("A", a);
1412   EXPECT_EQ("B", b);
1413   EXPECT_EQ("C", c);
1414 
1415   EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]"));
1416 
1417   EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &a, &b, &c));
1418   EXPECT_EQ("譚", a);
1419   EXPECT_EQ("永", b);
1420   EXPECT_EQ("鋒", c);
1421 }
1422 
// LazyRE2 must work both with and without explicit options.
TEST(RE2, LazyRE2) {
  static LazyRE2 plain = {"a"};
  static LazyRE2 latin1 = {"b", RE2::Latin1};

  // No options given: the encoding reported is UTF-8.
  EXPECT_EQ("a", plain->pattern());
  EXPECT_EQ(RE2::Options::EncodingUTF8, plain->options().encoding());

  // Latin1 requested: the encoding reported is Latin-1.
  EXPECT_EQ("b", latin1->pattern());
  EXPECT_EQ(RE2::Options::EncodingLatin1, latin1->options().encoding());
}
1434 
1435 // Bug reported by saito. 2009/02/17
TEST(RE2,NullVsEmptyString)1436 TEST(RE2, NullVsEmptyString) {
1437   RE2 re(".*");
1438   EXPECT_TRUE(re.ok());
1439 
1440   StringPiece null;
1441   EXPECT_TRUE(RE2::FullMatch(null, re));
1442 
1443   StringPiece empty("");
1444   EXPECT_TRUE(RE2::FullMatch(empty, re));
1445 }
1446 
1447 // Similar to the previous test, check that the null string and the empty
1448 // string both match, but also that the null string can only provide null
1449 // submatches whereas the empty string can also provide empty submatches.
TEST(RE2,NullVsEmptyStringSubmatches)1450 TEST(RE2, NullVsEmptyStringSubmatches) {
1451   RE2 re("()|(foo)");
1452   EXPECT_TRUE(re.ok());
1453 
1454   // matches[0] is overall match, [1] is (), [2] is (foo), [3] is nonexistent.
1455   StringPiece matches[4];
1456 
1457   for (int i = 0; i < arraysize(matches); i++)
1458     matches[i] = "bar";
1459 
1460   StringPiece null;
1461   EXPECT_TRUE(re.Match(null, 0, null.size(), RE2::UNANCHORED,
1462                        matches, arraysize(matches)));
1463   for (int i = 0; i < arraysize(matches); i++) {
1464     EXPECT_TRUE(matches[i] == StringPiece());
1465     EXPECT_TRUE(matches[i].data() == NULL);  // always null
1466     EXPECT_TRUE(matches[i] == "");
1467   }
1468 
1469   for (int i = 0; i < arraysize(matches); i++)
1470     matches[i] = "bar";
1471 
1472   StringPiece empty("");
1473   EXPECT_TRUE(re.Match(empty, 0, empty.size(), RE2::UNANCHORED,
1474                        matches, arraysize(matches)));
1475   EXPECT_TRUE(matches[0] == StringPiece());
1476   EXPECT_TRUE(matches[0].data() != NULL);  // empty, not null
1477   EXPECT_TRUE(matches[0] == "");
1478   EXPECT_TRUE(matches[1] == StringPiece());
1479   EXPECT_TRUE(matches[1].data() != NULL);  // empty, not null
1480   EXPECT_TRUE(matches[1] == "");
1481   EXPECT_TRUE(matches[2] == StringPiece());
1482   EXPECT_TRUE(matches[2].data() == NULL);
1483   EXPECT_TRUE(matches[2] == "");
1484   EXPECT_TRUE(matches[3] == StringPiece());
1485   EXPECT_TRUE(matches[3].data() == NULL);
1486   EXPECT_TRUE(matches[3] == "");
1487 }
1488 
1489 // Issue 1816809
TEST(RE2,Bug1816809)1490 TEST(RE2, Bug1816809) {
1491   RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))");
1492   StringPiece piece("llx-3;llx4");
1493   string x;
1494   EXPECT_TRUE(RE2::Consume(&piece, re, &x));
1495 }
1496 
1497 // Issue 3061120
TEST(RE2,Bug3061120)1498 TEST(RE2, Bug3061120) {
1499   RE2 re("(?i)\\W");
1500   EXPECT_FALSE(RE2::PartialMatch("x", re));  // always worked
1501   EXPECT_FALSE(RE2::PartialMatch("k", re));  // broke because of kelvin
1502   EXPECT_FALSE(RE2::PartialMatch("s", re));  // broke because of latin long s
1503 }
1504 
// CapturingGroupNames() maps group index to name for named groups only.
TEST(RE2, CapturingGroupNames) {
  // Group IDs of the opening parentheses:
  //      12    3        45   6         7
  RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))");
  EXPECT_TRUE(re.ok());

  const std::map<int, string> expected = {
    {3, "G2"},
    {6, "G2"},
    {7, "G1"},
  };
  const std::map<int, string>& actual = re.CapturingGroupNames();
  EXPECT_EQ(expected, actual);
}
1517 
// Anchors must survive the round trip through Regexp()->ToString(), both
// with POSIX options and with the default options.
TEST(RE2, RegexpToStringLossOfAnchor) {
  // Leading '^'.
  RE2 begin_posix("^[a-c]at", RE2::POSIX);
  EXPECT_EQ(begin_posix.Regexp()->ToString(), "^[a-c]at");
  RE2 begin_default("^[a-c]at");
  EXPECT_EQ(begin_default.Regexp()->ToString(), "(?-m:^)[a-c]at");

  // Trailing '$'.
  RE2 end_posix("ca[t-z]$", RE2::POSIX);
  EXPECT_EQ(end_posix.Regexp()->ToString(), "ca[t-z]$");
  RE2 end_default("ca[t-z]$");
  EXPECT_EQ(end_default.Regexp()->ToString(), "ca[t-z](?-m:$)");
}
1524 
1525 // Issue 10131674
TEST(RE2,Bug10131674)1526 TEST(RE2, Bug10131674) {
1527   // Some of these escapes describe values that do not fit in a byte.
1528   RE2 re("\\140\\440\\174\\271\\150\\656\\106\\201\\004\\332", RE2::Latin1);
1529   EXPECT_FALSE(re.ok());
1530   EXPECT_FALSE(RE2::FullMatch("hello world", re));
1531 }
1532 
// Stray write past end of match_ in nfa.cc, caught by fuzzing + address
// sanitizer.
TEST(RE2, Bug18391750) {
  // Fuzzer input, used both as the pattern and as the text.  The string
  // literal supplies the terminating NUL byte.
  const char t[] =
      "\x28\x28\xfc\xfc\x08\x08"
      "\x26\x26\x28\xc2\x9b\xc5"
      "\xc5\xd4\x8f\x8f\x69\x69"
      "\xe7\x29\x7b\x37\x31\x31"
      "\x7d\xae\x7c\x7c\xf3\x29"
      "\xae\xae\x2e\x2a\x29";

  RE2::Options options;
  options.set_encoding(RE2::Options::EncodingLatin1);
  options.set_longest_match(true);
  options.set_dot_nl(true);
  options.set_case_sensitive(false);

  RE2 re(t, options);
  ASSERT_TRUE(re.ok());
  // The result is deliberately ignored; the match only has to run.
  RE2::PartialMatch(t, re);
}
1552 
// The parser used to accept an invalid (too large) rune, which made the
// compiler fail a DCHECK in the UTF-8 character class code.
TEST(RE2, Bug18458852) {
  // Pattern bytes; the string literal supplies the terminating NUL byte.
  const char b[] =
      "\x28\x05\x05\x41\x41\x28"
      "\x24\x5b\x5e\xf5\x87\x87"
      "\x90\x29\x5d\x29\x29";
  RE2 re(b);
  ASSERT_FALSE(re.ok());
}
1565 
// Bug in BitState: case kFailInst failed the match entirely.
TEST(RE2, Bug18523943) {
  // Text and pattern bytes; each string literal supplies the terminating
  // NUL byte.
  const char text[] = "\x29\x29\x24";
  const char pattern[] = "\x28\x0a\x2a\x2a\x29";

  RE2::Options options;
  options.set_log_errors(false);
  options.set_encoding(RE2::Options::EncodingLatin1);
  options.set_posix_syntax(true);
  options.set_longest_match(true);
  options.set_literal(false);
  options.set_never_nl(true);

  RE2 re(pattern, options);
  ASSERT_TRUE(re.ok());
  string captured;
  ASSERT_TRUE(RE2::PartialMatch(text, re, &captured));
}
1588 
// Bug in parser accepting Unicode groups in Latin-1 mode, causing the
// compiler to fail in DCHECK in prog.cc.
TEST(RE2, Bug21371806) {
  RE2::Options latin1;
  latin1.set_encoding(RE2::Options::EncodingLatin1);

  // The pattern is expected to compile successfully.
  RE2 re("g\\p{Zl}]", latin1);
  ASSERT_TRUE(re.ok());
}
1599 
// Bug in parser caused by factoring of common prefixes in alternations.
TEST(RE2, Bug26356109) {
  // This pattern used to be factored to "a\\C*?[bc]".  The automaton
  // would then consume "ab" and stop (when unanchored), whereas
  // first-match semantics require it to consume all of "abc".
  RE2 re("a\\C*?c|a\\C*?b");
  ASSERT_TRUE(re.ok());

  string s = "abc";
  StringPiece m;
  const size_t end = s.size();

  ASSERT_TRUE(re.Match(s, 0, end, RE2::UNANCHORED, &m, 1));
  ASSERT_EQ(m, s) << " (UNANCHORED) got m='" << m << "', want '" << s << "'";

  ASSERT_TRUE(re.Match(s, 0, end, RE2::ANCHOR_BOTH, &m, 1));
  ASSERT_EQ(m, s) << " (ANCHOR_BOTH) got m='" << m << "', want '" << s << "'";
}
1618 
// RE2::GlobalReplace always advanced by one byte after matching the
// empty string, which would clobber any rune longer than one byte.
TEST(RE2, Issue104) {
  // ASCII text.
  string ascii = "bc";
  ASSERT_EQ(3, RE2::GlobalReplace(&ascii, "a*", "d"));
  ASSERT_EQ("dbdcd", ascii);

  // Polish letters.
  string polish = "ąć";
  ASSERT_EQ(3, RE2::GlobalReplace(&polish, "Ć*", "Ĉ"));
  ASSERT_EQ("ĈąĈćĈ", polish);

  // Chinese characters.
  string chinese = "人类";
  ASSERT_EQ(3, RE2::GlobalReplace(&chinese, "大*", "小"));
  ASSERT_EQ("小人小类小", chinese);
}
1635 
1636 }  // namespace re2
1637