xref: /aosp_15_r20/external/cronet/third_party/re2/src/re2/testing/re2_test.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // -*- coding: utf-8 -*-
2 // Copyright 2002-2009 The RE2 Authors.  All Rights Reserved.
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5 
6 // TODO: Test extractions for PartialMatch/Consume
7 
8 #include <errno.h>
9 #include <stddef.h>
10 #include <stdint.h>
11 #include <string.h>
12 #include <map>
13 #include <string>
14 #include <utility>
15 #include <vector>
16 #if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__)
17 #include <sys/mman.h>
18 #include <unistd.h>  /* for sysconf */
19 #endif
20 
21 #include "absl/base/macros.h"
22 #include "absl/strings/str_format.h"
23 #include "gtest/gtest.h"
24 #include "util/logging.h"
25 #include "re2/re2.h"
26 #include "re2/regexp.h"
27 
28 namespace re2 {
29 
// Parses hexadecimal text into each built-in integer type, once through
// RE2::Hex on the bare digits and once through RE2::CRadix on the same digits
// carrying a "0x" prefix, and compares against the equivalent C++ literal.
TEST(RE2, HexTests) {
#define CHECK_HEX_PARSE(type, value)                                   \
  do {                                                                 \
    type parsed;                                                       \
    /* Bare digits; the [uUlL]* tail keeps the suffix out of group 1. */ \
    ASSERT_TRUE(RE2::FullMatch(#value, "([0-9a-fA-F]+)[uUlL]*",        \
                               RE2::Hex(&parsed)));                    \
    ASSERT_EQ(parsed, 0x##value);                                      \
    /* Same digits behind a "0x" prefix, parsed through CRadix. */     \
    ASSERT_TRUE(RE2::FullMatch("0x" #value, "([0-9a-fA-FxX]+)[uUlL]*", \
                               RE2::CRadix(&parsed)));                 \
    ASSERT_EQ(parsed, 0x##value);                                      \
  } while (0)

  CHECK_HEX_PARSE(short, 2bad);
  CHECK_HEX_PARSE(unsigned short, 2badU);
  CHECK_HEX_PARSE(int, dead);
  CHECK_HEX_PARSE(unsigned int, deadU);
  CHECK_HEX_PARSE(long, 7eadbeefL);
  CHECK_HEX_PARSE(unsigned long, deadbeefUL);
  CHECK_HEX_PARSE(long long, 12345678deadbeefLL);
  CHECK_HEX_PARSE(unsigned long long, cafebabedeadbeefULL);

#undef CHECK_HEX_PARSE
}
53 
// Parses octal text into each built-in integer type, once through RE2::Octal
// on the bare digits and once through RE2::CRadix on the same digits carrying
// a leading "0", and compares against the equivalent C++ literal.
TEST(RE2, OctalTests) {
#define CHECK_OCTAL_PARSE(type, value)                                \
  do {                                                                \
    type parsed;                                                      \
    /* Bare digits; the [uUlL]* tail keeps the suffix out of group 1. */ \
    ASSERT_TRUE(                                                      \
        RE2::FullMatch(#value, "([0-7]+)[uUlL]*", RE2::Octal(&parsed))); \
    ASSERT_EQ(parsed, 0##value);                                      \
    /* Same digits behind a leading "0", parsed through CRadix. */    \
    ASSERT_TRUE(RE2::FullMatch("0" #value, "([0-9a-fA-FxX]+)[uUlL]*", \
                               RE2::CRadix(&parsed)));                \
    ASSERT_EQ(parsed, 0##value);                                      \
  } while (0)

  CHECK_OCTAL_PARSE(short, 77777);
  CHECK_OCTAL_PARSE(unsigned short, 177777U);
  CHECK_OCTAL_PARSE(int, 17777777777);
  CHECK_OCTAL_PARSE(unsigned int, 37777777777U);
  CHECK_OCTAL_PARSE(long, 17777777777L);
  CHECK_OCTAL_PARSE(unsigned long, 37777777777UL);
  CHECK_OCTAL_PARSE(long long, 777777777777777777777LL);
  CHECK_OCTAL_PARSE(unsigned long long, 1777777777777777777777ULL);

#undef CHECK_OCTAL_PARSE
}
76 
// Parses decimal text into each built-in integer type, once through the
// default (plain pointer) conversion and once through RE2::CRadix, and
// compares against the equivalent C++ literal.
TEST(RE2, DecimalTests) {
#define CHECK_DECIMAL_PARSE(type, value)                                 \
  do {                                                                   \
    type parsed;                                                         \
    /* Default conversion for the destination type. */                   \
    ASSERT_TRUE(RE2::FullMatch(#value, "(-?[0-9]+)[uUlL]*", &parsed));   \
    ASSERT_EQ(parsed, value);                                            \
    /* Same text parsed through CRadix. */                               \
    ASSERT_TRUE(RE2::FullMatch(#value, "(-?[0-9a-fA-FxX]+)[uUlL]*",      \
                               RE2::CRadix(&parsed)));                   \
    ASSERT_EQ(parsed, value);                                            \
  } while (0)

  CHECK_DECIMAL_PARSE(short, -1);
  CHECK_DECIMAL_PARSE(unsigned short, 9999);
  CHECK_DECIMAL_PARSE(int, -1000);
  CHECK_DECIMAL_PARSE(unsigned int, 12345U);
  CHECK_DECIMAL_PARSE(long, -10000000L);
  CHECK_DECIMAL_PARSE(unsigned long, 3083324652U);
  CHECK_DECIMAL_PARSE(long long, -100000000000000LL);
  CHECK_DECIMAL_PARSE(unsigned long long, 1234567890987654321ULL);

#undef CHECK_DECIMAL_PARSE
}
99 
TEST(RE2,Replace)100 TEST(RE2, Replace) {
101   struct ReplaceTest {
102     const char *regexp;
103     const char *rewrite;
104     const char *original;
105     const char *single;
106     const char *global;
107     int        greplace_count;
108   };
109   static const ReplaceTest tests[] = {
110     { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)",
111       "\\2\\1ay",
112       "the quick brown fox jumps over the lazy dogs.",
113       "ethay quick brown fox jumps over the lazy dogs.",
114       "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.",
115       9 },
116     { "\\w+",
117       "\\0-NOSPAM",
118       "[email protected]",
119       "[email protected]",
120       "[email protected]",
121       4 },
122     { "^",
123       "(START)",
124       "foo",
125       "(START)foo",
126       "(START)foo",
127       1 },
128     { "^",
129       "(START)",
130       "",
131       "(START)",
132       "(START)",
133       1 },
134     { "$",
135       "(END)",
136       "",
137       "(END)",
138       "(END)",
139       1 },
140     { "b",
141       "bb",
142       "ababababab",
143       "abbabababab",
144       "abbabbabbabbabb",
145       5 },
146     { "b",
147       "bb",
148       "bbbbbb",
149       "bbbbbbb",
150       "bbbbbbbbbbbb",
151       6 },
152     { "b+",
153       "bb",
154       "bbbbbb",
155       "bb",
156       "bb",
157       1 },
158     { "b*",
159       "bb",
160       "bbbbbb",
161       "bb",
162       "bb",
163       1 },
164     { "b*",
165       "bb",
166       "aaaaa",
167       "bbaaaaa",
168       "bbabbabbabbabbabb",
169       6 },
170     // Check newline handling
171     { "a.*a",
172       "(\\0)",
173       "aba\naba",
174       "(aba)\naba",
175       "(aba)\n(aba)",
176       2 },
177     { "", NULL, NULL, NULL, NULL, 0 }
178   };
179 
180   for (const ReplaceTest* t = tests; t->original != NULL; t++) {
181     std::string one(t->original);
182     ASSERT_TRUE(RE2::Replace(&one, t->regexp, t->rewrite));
183     ASSERT_EQ(one, t->single);
184     std::string all(t->original);
185     ASSERT_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count)
186       << "Got: " << all;
187     ASSERT_EQ(all, t->global);
188   }
189 }
190 
TestCheckRewriteString(const char * regexp,const char * rewrite,bool expect_ok)191 static void TestCheckRewriteString(const char* regexp, const char* rewrite,
192                               bool expect_ok) {
193   std::string error;
194   RE2 exp(regexp);
195   bool actual_ok = exp.CheckRewriteString(rewrite, &error);
196   EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error;
197 }
198 
// Runs TestCheckRewriteString over rewrite strings that are valid and invalid
// for patterns with zero, one and two capturing groups.
TEST(CheckRewriteString, all) {
  struct Case {
    const char* regexp;
    const char* rewrite;
    bool expect_ok;
  };
  static const Case kCases[] = {
    // No capturing groups.
    { "abc", "foo", true },
    { "abc", "foo\\", false },
    { "abc", "foo\\0bar", true },

    // One capturing group.
    { "a(b)c", "foo", true },
    { "a(b)c", "foo\\0bar", true },
    { "a(b)c", "foo\\1bar", true },
    { "a(b)c", "foo\\2bar", false },
    { "a(b)c", "f\\\\2o\\1o", true },

    // Two capturing groups.
    { "a(b)(c)", "foo\\12", true },
    { "a(b)(c)", "f\\2o\\1o", true },
    { "a(b)(c)", "f\\oo\\1", false },
  };

  for (const Case& c : kCases) {
    TestCheckRewriteString(c.regexp, c.rewrite, c.expect_ok);
  }
}
214 
// Checks RE2::Extract: a successful match writes the rewritten text to the
// output string, and a failed match leaves the output untouched.
TEST(RE2, Extract) {
  std::string s;

  // The input had been replaced by an e-mail obfuscation placeholder that
  // contains no '@', so the pattern could never match.  Restored to an address
  // whose user and host parts produce the expected "kremvax!boris".
  ASSERT_TRUE(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &s));
  ASSERT_EQ(s, "kremvax!boris");

  ASSERT_TRUE(RE2::Extract("foo", ".*", "'\\0'", &s));
  ASSERT_EQ(s, "'foo'");
  // check that false match doesn't overwrite
  ASSERT_FALSE(RE2::Extract("baz", "bar", "'\\0'", &s));
  ASSERT_EQ(s, "'foo'");
}
227 
// The rewrite string refers to \2, but the pattern has only one capturing
// group; Extract, Replace and GlobalReplace are all expected to report failure.
TEST(RE2, MaxSubmatchTooLarge) {
  std::string extracted;
  ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &extracted));

  std::string replaced_once = "foo";
  ASSERT_FALSE(RE2::Replace(&replaced_once, "f(o+)", "\\1\\2"));

  std::string replaced_all = "foo";
  ASSERT_FALSE(RE2::GlobalReplace(&replaced_all, "f(o+)", "\\1\\2"));
}
236 
// Repeatedly applies RE2::Consume to the same input view: two words are
// extracted in turn, after which the call is expected to fail.
TEST(RE2, Consume) {
  // Matches a word, possibly preceded by whitespace.
  RE2 word_re("\\s*(\\w+)");

  std::string text("   aaa b!@#$@#$cccc");
  absl::string_view remaining(text);
  std::string captured;

  ASSERT_TRUE(RE2::Consume(&remaining, word_re, &captured));
  ASSERT_EQ(captured, "aaa") << " input: " << remaining;

  ASSERT_TRUE(RE2::Consume(&remaining, word_re, &captured));
  ASSERT_EQ(captured, "b") << " input: " << remaining;

  ASSERT_FALSE(RE2::Consume(&remaining, word_re, &captured))
      << " input: " << remaining;
}
250 
// Exercises RE2::ConsumeN with zero, one and two output arguments while
// advancing through a single input view.
TEST(RE2, ConsumeN) {
  const std::string text(" one two three 4");
  absl::string_view remaining(text);

  // Argument slots are filled in as each stage needs them.
  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No arguments: the match is consumed but nothing is stored.
  EXPECT_TRUE(
      RE2::ConsumeN(&remaining, "\\s*(\\w+)", slot_ptrs, 0));  // Skips "one".

  // One argument.
  std::string token;
  slots[0] = &token;
  EXPECT_TRUE(RE2::ConsumeN(&remaining, "\\s*(\\w+)", slot_ptrs, 1));
  EXPECT_EQ("two", token);

  // Two arguments of different types.
  int number;
  slots[1] = &number;
  EXPECT_TRUE(
      RE2::ConsumeN(&remaining, "\\s*(\\w+)\\s*(\\d+)", slot_ptrs, 2));
  EXPECT_EQ("three", token);
  EXPECT_EQ(4, number);
}
274 
// Repeatedly applies RE2::FindAndConsume to the same input view, pulling out
// each word in turn, then checks the no-submatch form of the call.
TEST(RE2, FindAndConsume) {
  RE2 word_re("(\\w+)");  // matches a word

  std::string text("   aaa b!@#$@#$cccc");
  absl::string_view remaining(text);
  std::string captured;

  ASSERT_TRUE(RE2::FindAndConsume(&remaining, word_re, &captured));
  ASSERT_EQ(captured, "aaa");

  ASSERT_TRUE(RE2::FindAndConsume(&remaining, word_re, &captured));
  ASSERT_EQ(captured, "b");

  ASSERT_TRUE(RE2::FindAndConsume(&remaining, word_re, &captured));
  ASSERT_EQ(captured, "cccc");

  // Nothing word-like is left.
  ASSERT_FALSE(RE2::FindAndConsume(&remaining, word_re, &captured));

  // FindAndConsume must also work without any submatches; an earlier version
  // used uninitialized data for the length to consume.
  remaining = "aaa";
  ASSERT_TRUE(RE2::FindAndConsume(&remaining, "aaa"));
  ASSERT_EQ(remaining, "");
}
297 
// Exercises RE2::FindAndConsumeN with zero, one and two output arguments
// while advancing through a single input view.
TEST(RE2, FindAndConsumeN) {
  const std::string text(" one two three 4");
  absl::string_view remaining(text);

  // Argument slots are filled in as each stage needs them.
  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No arguments: the match is consumed but nothing is stored.
  EXPECT_TRUE(
      RE2::FindAndConsumeN(&remaining, "(\\w+)", slot_ptrs, 0));  // Skips "one".

  // One argument.
  std::string token;
  slots[0] = &token;
  EXPECT_TRUE(RE2::FindAndConsumeN(&remaining, "(\\w+)", slot_ptrs, 1));
  EXPECT_EQ("two", token);

  // Two arguments of different types.
  int number;
  slots[1] = &number;
  EXPECT_TRUE(
      RE2::FindAndConsumeN(&remaining, "(\\w+)\\s*(\\d+)", slot_ptrs, 2));
  EXPECT_EQ("three", token);
  EXPECT_EQ(4, number);
}
321 
// With an alternation of capturing groups, only the group belonging to the
// alternative that matched is filled in; the other outputs are expected to
// come back empty.
TEST(RE2, MatchNumberPeculiarity) {
  RE2 alternatives("(foo)|(bar)|(baz)");
  std::string first;
  std::string second;
  std::string third;

  ASSERT_TRUE(RE2::PartialMatch("foo", alternatives, &first, &second, &third));
  ASSERT_EQ(first, "foo");
  ASSERT_EQ(second, "");
  ASSERT_EQ(third, "");

  ASSERT_TRUE(RE2::PartialMatch("bar", alternatives, &first, &second, &third));
  ASSERT_EQ(first, "");
  ASSERT_EQ(second, "bar");
  ASSERT_EQ(third, "");

  ASSERT_TRUE(RE2::PartialMatch("baz", alternatives, &first, &second, &third));
  ASSERT_EQ(first, "");
  ASSERT_EQ(second, "");
  ASSERT_EQ(third, "baz");

  ASSERT_FALSE(RE2::PartialMatch("f", alternatives, &first, &second, &third));

  // The overall match succeeds through the non-capturing alternative, so the
  // single group yields an empty string.
  std::string unused_group;
  ASSERT_TRUE(RE2::FullMatch("hello", "(foo)|hello", &unused_group));
  ASSERT_EQ(unused_group, "");
}
346 
// Exercises the low-level RE2::Match interface with an array of submatch
// views, then checks that PartialMatch extracts the same groups.
TEST(RE2, Match) {
  RE2 host_port("((\\w+):([0-9]+))");  // extracts host and port
  absl::string_view submatch[4];

  // No match.
  absl::string_view text = "zyzzyva";
  ASSERT_FALSE(host_port.Match(text, 0, text.size(), RE2::UNANCHORED, submatch,
                               ABSL_ARRAYSIZE(submatch)));

  // Matches and extracts: [0] is the whole match, [1..3] the groups.
  text = "a chrisr:9000 here";
  ASSERT_TRUE(host_port.Match(text, 0, text.size(), RE2::UNANCHORED, submatch,
                              ABSL_ARRAYSIZE(submatch)));
  ASSERT_EQ(submatch[0], "chrisr:9000");
  ASSERT_EQ(submatch[1], "chrisr:9000");
  ASSERT_EQ(submatch[2], "chrisr");
  ASSERT_EQ(submatch[3], "9000");

  // Same pattern through PartialMatch with typed destinations.
  std::string whole;
  std::string host;
  int port;
  ASSERT_TRUE(
      RE2::PartialMatch("a chrisr:9000 here", host_port, &whole, &host, &port));
  ASSERT_EQ(whole, "chrisr:9000");
  ASSERT_EQ(host, "chrisr");
  ASSERT_EQ(port, 9000);
}
372 
TestRecursion(int size,const char * pattern)373 static void TestRecursion(int size, const char* pattern) {
374   // Fill up a string repeating the pattern given
375   std::string domain;
376   domain.resize(size);
377   size_t patlen = strlen(pattern);
378   for (int i = 0; i < size; i++) {
379     domain[i] = pattern[i % patlen];
380   }
381   // Just make sure it doesn't crash due to too much recursion.
382   RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet);
383   RE2::FullMatch(domain, re);
384 }
385 
386 // A meta-quoted string, interpreted as a pattern, should always match
387 // the original unquoted string.
TestQuoteMeta(const std::string & unquoted,const RE2::Options & options=RE2::DefaultOptions)388 static void TestQuoteMeta(const std::string& unquoted,
389                           const RE2::Options& options = RE2::DefaultOptions) {
390   std::string quoted = RE2::QuoteMeta(unquoted);
391   RE2 re(quoted, options);
392   EXPECT_TRUE(RE2::FullMatch(unquoted, re))
393       << "Unquoted='" << unquoted << "', quoted='" << quoted << "'.";
394 }
395 
396 // A meta-quoted string, interpreted as a pattern, should always match
397 // the original unquoted string.
NegativeTestQuoteMeta(const std::string & unquoted,const std::string & should_not_match,const RE2::Options & options=RE2::DefaultOptions)398 static void NegativeTestQuoteMeta(
399     const std::string& unquoted, const std::string& should_not_match,
400     const RE2::Options& options = RE2::DefaultOptions) {
401   std::string quoted = RE2::QuoteMeta(unquoted);
402   RE2 re(quoted, options);
403   EXPECT_FALSE(RE2::FullMatch(should_not_match, re))
404       << "Unquoted='" << unquoted << "', quoted='" << quoted << "'.";
405 }
406 
407 // Tests that quoted meta characters match their original strings,
408 // and that a few things that shouldn't match indeed do not.
TEST(QuoteMeta, Simple) {
  // Each sample, once quoted, must match itself literally.
  static const char* const kSamples[] = {
    "foo",
    "foo.bar",
    "foo\\.bar",
    "[1-9]",
    "1.5-2.0?",
    "\\d",
    "Who doesn't like ice cream?",
    "((a|b)c?d*e+[f-h]i)",
    "((?!)xxx).*yyy",
    "([",
  };
  for (const char* sample : kSamples) {
    TestQuoteMeta(sample);
  }
}
// Quoted patterns must not match strings that the unquoted text would have
// matched had it been interpreted as a regular expression.
TEST(QuoteMeta, SimpleNegative) {
  struct Case {
    const char* unquoted;          // text passed to QuoteMeta
    const char* should_not_match;  // text the quoted pattern must reject
  };
  static const Case kCases[] = {
    { "foo", "bar" },
    { "...", "bar" },
    { "\\.", "." },
    { "\\.", ".." },
    { "(a)", "a" },
    { "(a|b)", "a" },
    { "(a|b)", "(a)" },
    { "(a|b)", "a|b" },
    { "[0-9]", "0" },
    { "[0-9]", "0-9" },
    { "[0-9]", "[9]" },
    { "((?!)xxx)", "xxx" },
  };
  for (const Case& c : kCases) {
    NegativeTestQuoteMeta(c.unquoted, c.should_not_match);
  }
}
435 
// A string containing the single byte 0xb2 must survive quoting when the
// pattern is compiled with the RE2::Latin1 option.
TEST(QuoteMeta, Latin1) {
  const std::string text_with_high_byte("3\xb2 = 9");
  TestQuoteMeta(text_with_high_byte, RE2::Latin1);
}
439 
// Quoting must leave multi-byte UTF-8 sequences usable: each sample has to
// match itself, and a hand-escaped byte sequence must not match.
TEST(QuoteMeta, UTF8) {
  static const char* const kSamples[] = {
    "Plácido Domingo",
    "xyz",                 // No fancy utf8.
    "\xc2\xb0",            // 2-byte utf8 -- a degree symbol.
    "27\xc2\xb0 degrees",  // As a middle character.
    "\xe2\x80\xb3",        // 3-byte utf8 -- a double prime.
    "\xf0\x9d\x85\x9f",    // 4-byte utf8 -- a music note.
    "27\xc2\xb0",          // Interpreted as Latin-1, this should still work.
  };
  for (const char* sample : kSamples) {
    TestQuoteMeta(sample);
  }

  // 2-byte utf8 -- a degree symbol, with a backslash before each byte.
  NegativeTestQuoteMeta("27\xc2\xb0", "27\\\xc2\\\xb0");
}
452 
// Strings containing an embedded NUL byte must be quoted so that they still
// match themselves and nothing else.
TEST(QuoteMeta, HasNull) {
  // A string consisting of exactly one null character.
  const std::string just_nul(1, '\0');
  TestQuoteMeta(just_nul);
  NegativeTestQuoteMeta(just_nul, "");

  // Don't want null-followed-by-'1' to be interpreted as '\01'.
  const std::string nul_then_one = just_nul + '1';
  TestQuoteMeta(nul_then_one);
  NegativeTestQuoteMeta(nul_then_one, "\1");
}
466 
// Program sizes (forward and reverse) are expected to be positive and to grow
// from the simple pattern to the medium one to the complex one.
TEST(ProgramSize, BigProgram) {
  RE2 re_simple("simple regexp");
  RE2 re_medium("medium.*regexp");
  RE2 re_complex("complex.{1,128}regexp");

  // Forward programs.
  const int forward_simple = re_simple.ProgramSize();
  const int forward_medium = re_medium.ProgramSize();
  const int forward_complex = re_complex.ProgramSize();
  ASSERT_GT(forward_simple, 0);
  ASSERT_GT(forward_medium, forward_simple);
  ASSERT_GT(forward_complex, forward_medium);

  // Reverse programs.
  const int reverse_simple = re_simple.ReverseProgramSize();
  const int reverse_medium = re_medium.ReverseProgramSize();
  const int reverse_complex = re_complex.ReverseProgramSize();
  ASSERT_GT(reverse_simple, 0);
  ASSERT_GT(reverse_medium, reverse_simple);
  ASSERT_GT(reverse_complex, reverse_medium);
}
480 
// Checks the fanout histograms of four patterns that differ only in their
// repetition count.  For each program the return value of
// (Reverse)ProgramFanout is compared with the expected largest non-empty
// bucket, and that bucket's element count is checked.
TEST(ProgramFanout, BigProgram) {
  RE2 rep1("(?:(?:(?:(?:(?:.)?){1})*)+)");
  RE2 rep10("(?:(?:(?:(?:(?:.)?){10})*)+)");
  RE2 rep100("(?:(?:(?:(?:(?:.)?){100})*)+)");
  RE2 rep1000("(?:(?:(?:(?:(?:.)?){1000})*)+)");

  // Reused as the output of every call below.
  std::vector<int> buckets;

  // Forward programs.

  // 3 is the largest non-empty bucket and has 2 elements.
  ASSERT_EQ(3, rep1.ProgramFanout(&buckets));
  ASSERT_EQ(2, buckets[3]);

  // 6 is the largest non-empty bucket and has 11 elements.
  ASSERT_EQ(6, rep10.ProgramFanout(&buckets));
  ASSERT_EQ(11, buckets[6]);

  // 9 is the largest non-empty bucket and has 101 elements.
  ASSERT_EQ(9, rep100.ProgramFanout(&buckets));
  ASSERT_EQ(101, buckets[9]);

  // 13 is the largest non-empty bucket and has 1001 elements.
  ASSERT_EQ(13, rep1000.ProgramFanout(&buckets));
  ASSERT_EQ(1001, buckets[13]);

  // Reverse programs.

  // 2 is the largest non-empty bucket and has 2 elements.
  ASSERT_EQ(2, rep1.ReverseProgramFanout(&buckets));
  ASSERT_EQ(2, buckets[2]);

  // 5 is the largest non-empty bucket and has 11 elements.
  ASSERT_EQ(5, rep10.ReverseProgramFanout(&buckets));
  ASSERT_EQ(11, buckets[5]);

  // 9 is the largest non-empty bucket and has 101 elements.
  ASSERT_EQ(9, rep100.ReverseProgramFanout(&buckets));
  ASSERT_EQ(101, buckets[9]);

  // 12 is the largest non-empty bucket and has 1001 elements.
  ASSERT_EQ(12, rep1000.ReverseProgramFanout(&buckets));
  ASSERT_EQ(1001, buckets[12]);
}
521 
522 // Issue 956519: handling empty character sets was
523 // causing NULL dereference.  This tests a few empty character sets.
524 // (The way to get an empty character set is to negate a full one.)
TEST(EmptyCharset,Fuzz)525 TEST(EmptyCharset, Fuzz) {
526   static const char *empties[] = {
527     "[^\\S\\s]",
528     "[^\\S[:space:]]",
529     "[^\\D\\d]",
530     "[^\\D[:digit:]]"
531   };
532   for (size_t i = 0; i < ABSL_ARRAYSIZE(empties); i++)
533     ASSERT_FALSE(RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0));
534 }
535 
536 // Bitstate assumes that kInstFail instructions in
537 // alternations or capture groups have been "compiled away".
TEST(EmptyCharset,BitstateAssumptions)538 TEST(EmptyCharset, BitstateAssumptions) {
539   // Captures trigger use of Bitstate.
540   static const char *nop_empties[] = {
541     "((((()))))" "[^\\S\\s]?",
542     "((((()))))" "([^\\S\\s])?",
543     "((((()))))" "([^\\S\\s]|[^\\S\\s])?",
544     "((((()))))" "(([^\\S\\s]|[^\\S\\s])|)"
545   };
546   absl::string_view group[6];
547   for (size_t i = 0; i < ABSL_ARRAYSIZE(nop_empties); i++)
548     ASSERT_TRUE(RE2(nop_empties[i]).Match("", 0, 0, RE2::UNANCHORED, group, 6));
549 }
550 
551 // Test that named groups work correctly.
TEST(Capture,NamedGroups)552 TEST(Capture, NamedGroups) {
553   {
554     RE2 re("(hello world)");
555     ASSERT_EQ(re.NumberOfCapturingGroups(), 1);
556     const std::map<std::string, int>& m = re.NamedCapturingGroups();
557     ASSERT_EQ(m.size(), 0);
558   }
559 
560   {
561     RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))");
562     ASSERT_EQ(re.NumberOfCapturingGroups(), 6);
563     const std::map<std::string, int>& m = re.NamedCapturingGroups();
564     ASSERT_EQ(m.size(), 4);
565     ASSERT_EQ(m.find("A")->second, 1);
566     ASSERT_EQ(m.find("B")->second, 2);
567     ASSERT_EQ(m.find("C")->second, 3);
568     ASSERT_EQ(m.find("D")->second, 6);  // $4 and $5 are anonymous
569   }
570 }
571 
// Matches a pattern with two named groups through FullMatchN and checks that
// the indices reported by NamedCapturingGroups() line up with the extracted
// arguments.
//
// Conditions that later code relies on for memory safety (the group count
// that bounds reads from matches[], the find() results that get dereferenced,
// and the indices used to subscript args[]) are checked with ASSERT_* so a
// failure stops the test instead of running into undefined behaviour.
TEST(RE2, CapturedGroupTest) {
  RE2 re("directions from (?P<S>.*) to (?P<D>.*)");
  int num_groups = re.NumberOfCapturingGroups();
  ASSERT_EQ(2, num_groups);
  std::string args[4];
  RE2::Arg arg0(&args[0]);
  RE2::Arg arg1(&args[1]);
  RE2::Arg arg2(&args[2]);
  RE2::Arg arg3(&args[3]);

  const RE2::Arg* const matches[4] = {&arg0, &arg1, &arg2, &arg3};
  EXPECT_TRUE(RE2::FullMatchN("directions from mountain view to san jose",
                              re, matches, num_groups));
  const std::map<std::string, int>& named_groups = re.NamedCapturingGroups();
  const std::map<std::string, int>::const_iterator source =
      named_groups.find("S");
  const std::map<std::string, int>::const_iterator destination =
      named_groups.find("D");
  ASSERT_TRUE(source != named_groups.end());
  ASSERT_TRUE(destination != named_groups.end());

  // The named group index is 1-based.
  int source_group_index = source->second;
  int destination_group_index = destination->second;
  ASSERT_EQ(1, source_group_index);
  ASSERT_EQ(2, destination_group_index);

  // The args is zero-based.
  EXPECT_EQ("mountain view", args[source_group_index - 1]);
  EXPECT_EQ("san jose", args[destination_group_index - 1]);
}
599 
// RE2::FullMatch without output arguments: the pattern has to cover the
// entire text, from its first character to its last.
TEST(RE2, FullMatchWithNoArgs) {
  struct Case {
    const char* text;
    const char* pattern;
    bool expect_match;
  };
  static const Case kCases[] = {
    { "h", "h", true },
    { "hello", "hello", true },
    { "hello", "h.*o", true },
    { "othello", "h.*o", false },  // Must be anchored at front
    { "hello!", "h.*o", false },   // Must be anchored at end
  };
  for (const Case& c : kCases) {
    ASSERT_EQ(c.expect_match, RE2::FullMatch(c.text, c.pattern))
        << "text='" << c.text << "', pattern='" << c.pattern << "'";
  }
}
607 
// RE2::PartialMatch without output arguments: the pattern may match anywhere
// inside the text, including under twenty levels of nested groups.
TEST(RE2, PartialMatch) {
  struct Case {
    const char* text;
    const char* pattern;
  };
  static const Case kMatching[] = {
    { "x", "x" },
    { "hello", "h.*o" },
    { "othello", "h.*o" },
    { "hello!", "h.*o" },
    { "x", "((((((((((((((((((((x))))))))))))))))))))" },
  };
  for (const Case& c : kMatching) {
    ASSERT_TRUE(RE2::PartialMatch(c.text, c.pattern))
        << "text='" << c.text << "', pattern='" << c.pattern << "'";
  }
}
615 
// Exercises RE2::PartialMatchN with zero, one and two output arguments,
// including a failing case for each argument count.
TEST(RE2, PartialMatchN) {
  // Argument slots are filled in as each stage needs them.
  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No arguments.
  EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", slot_ptrs, 0));
  EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", slot_ptrs, 0));

  // One integer argument.
  int number;
  slots[0] = &number;
  EXPECT_TRUE(RE2::PartialMatchN("1001 nights", "(\\d+)", slot_ptrs, 1));
  EXPECT_EQ(1001, number);
  EXPECT_FALSE(RE2::PartialMatchN("three", "(\\d+)", slot_ptrs, 1));

  // An integer followed by a string.
  std::string word;
  slots[1] = &word;
  EXPECT_TRUE(
      RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", slot_ptrs, 2));
  EXPECT_EQ(42, number);
  EXPECT_EQ("life", word);
  EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", slot_ptrs, 2));
}
639 
// RE2::FullMatch with no output arguments on an all-digit string.
TEST(RE2, FullMatchZeroArg) {
  const bool all_digits = RE2::FullMatch("1001", "\\d+");
  ASSERT_TRUE(all_digits);
}
644 
// RE2::FullMatch with a single int destination: valid numbers are stored,
// while an empty capture or a value too large for int makes the call fail.
TEST(RE2, FullMatchOneArg) {
  int number;

  ASSERT_TRUE(RE2::FullMatch("1001", "(\\d+)", &number));
  ASSERT_EQ(number, 1001);

  ASSERT_TRUE(RE2::FullMatch("-123", "(-?\\d+)", &number));
  ASSERT_EQ(number, -123);

  // The group captures an empty string.
  ASSERT_FALSE(RE2::FullMatch("10", "()\\d+", &number));

  // Forty digits.
  ASSERT_FALSE(RE2::FullMatch("1234567890123456789012345678901234567890",
                              "(\\d+)", &number));
}
657 
// Only the captured digits are converted to the integer, even when the
// capture is immediately surrounded by further digits in the text.
TEST(RE2, FullMatchIntegerArg) {
  int number;

  // Digits on both sides of the capture.
  ASSERT_TRUE(RE2::FullMatch("1234", "1(\\d*)4", &number));
  ASSERT_EQ(number, 23);

  // Digits after the capture.
  ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)\\d+", &number));
  ASSERT_EQ(number, 1);
  ASSERT_TRUE(RE2::FullMatch("-1234", "(-\\d)\\d+", &number));
  ASSERT_EQ(number, -1);

  // Same captures through PartialMatch.
  ASSERT_TRUE(RE2::PartialMatch("1234", "(\\d)", &number));
  ASSERT_EQ(number, 1);
  ASSERT_TRUE(RE2::PartialMatch("-1234", "(-\\d)", &number));
  ASSERT_EQ(number, -1);
}
673 
// RE2::FullMatch storing a capture into a std::string destination.
TEST(RE2, FullMatchStringArg) {
  std::string middle;
  ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", &middle));
  ASSERT_EQ(middle, std::string("ell"));
}
680 
// RE2::FullMatch storing one capture into an absl::string_view and another
// into an int.
TEST(RE2, FullMatchStringViewArg) {
  absl::string_view name;
  int number;
  ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &name, &number));

  // The view's bytes are compared directly.
  ASSERT_EQ(name.size(), 4);
  ASSERT_TRUE(memcmp(name.data(), "ruby", 4) == 0);
  ASSERT_EQ(number, 1234);
}
690 
// RE2::FullMatch with two destinations of different types.
TEST(RE2, FullMatchMultiArg) {
  std::string name;
  int number;
  ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &name, &number));
  ASSERT_EQ(name, std::string("ruby"));
  ASSERT_EQ(number, 1234);
}
699 
// Exercises RE2::FullMatchN with zero, one and two output arguments,
// including a failing case for each argument count.
TEST(RE2, FullMatchN) {
  // Argument slots are filled in as each stage needs them.
  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No arguments.
  EXPECT_TRUE(RE2::FullMatchN("hello", "h.*o", slot_ptrs, 0));
  EXPECT_FALSE(RE2::FullMatchN("othello", "h.*o", slot_ptrs, 0));

  // One integer argument.
  int number;
  slots[0] = &number;
  EXPECT_TRUE(RE2::FullMatchN("1001", "(\\d+)", slot_ptrs, 1));
  EXPECT_EQ(1001, number);
  EXPECT_FALSE(RE2::FullMatchN("three", "(\\d+)", slot_ptrs, 1));

  // An integer followed by a string.
  std::string word;
  slots[1] = &word;
  EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", slot_ptrs, 2));
  EXPECT_EQ(42, number);
  EXPECT_EQ("life", word);
  EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", slot_ptrs, 2));
}
723 
// A null destination in the middle of the argument list skips that capture;
// the captures on either side are still stored.
TEST(RE2, FullMatchIgnoredArg) {
  std::string name;
  int number;

  // Old-school NULL should be ignored.
  ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+)(:)(\\d+)", &name,
                             (void*)NULL, &number));
  ASSERT_EQ(name, std::string("ruby"));
  ASSERT_EQ(number, 1234);

  // C++11 nullptr should also be ignored.
  ASSERT_TRUE(RE2::FullMatch("rubz:1235", "(\\w+)(:)(\\d+)", &name, nullptr,
                             &number));
  ASSERT_EQ(name, std::string("rubz"));
  ASSERT_EQ(number, 1235);
}
739 
// A null destination that has a concrete pointer type stores nothing, but the
// capture is still expected to be parseable as that type.
TEST(RE2, FullMatchTypedNullArg) {
  std::string prefix_capture;

  // Ignore non-void* NULL arg
  ASSERT_TRUE(RE2::FullMatch("hello", "he(.*)lo", (char*)NULL));
  ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (std::string*)NULL));
  ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (absl::string_view*)NULL));
  ASSERT_TRUE(RE2::FullMatch("1234", "(.*)", (int*)NULL));
  ASSERT_TRUE(RE2::FullMatch("1234567890123456", "(.*)", (long long*)NULL));
  ASSERT_TRUE(RE2::FullMatch("123.4567890123456", "(.*)", (double*)NULL));
  ASSERT_TRUE(RE2::FullMatch("123.4567890123456", "(.*)", (float*)NULL));

  // Fail on non-void* NULL arg if the match doesn't parse for the given type.
  ASSERT_FALSE(
      RE2::FullMatch("hello", "h(.*)lo", &prefix_capture, (char*)NULL));
  ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (int*)NULL));
  ASSERT_FALSE(RE2::FullMatch("1234567890123456", "(.*)", (int*)NULL));
  ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (double*)NULL));
  ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (float*)NULL));
}
759 
760 // Check that numeric parsing code does not read past the end of
761 // the number being parsed.
762 // This implementation requires mmap(2) et al. and thus cannot
763 // be used unless they are available.
TEST(RE2,NULTerminated)764 TEST(RE2, NULTerminated) {
765 #if defined(_POSIX_MAPPED_FILES) && _POSIX_MAPPED_FILES > 0
766   char *v;
767   int x;
768   long pagesize = sysconf(_SC_PAGE_SIZE);
769 
770 #ifndef MAP_ANONYMOUS
771 #define MAP_ANONYMOUS MAP_ANON
772 #endif
773   v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE,
774                               MAP_ANONYMOUS|MAP_PRIVATE, -1, 0));
775   ASSERT_TRUE(v != reinterpret_cast<char*>(-1));
776   LOG(INFO) << "Memory at " << (void*)v;
777   ASSERT_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno;
778   v[pagesize - 1] = '1';
779 
780   x = 0;
781   ASSERT_TRUE(
782       RE2::FullMatch(absl::string_view(v + pagesize - 1, 1), "(.*)", &x));
783   ASSERT_EQ(x, 1);
784 #endif
785 }
786 
TEST(RE2,FullMatchTypeTests)787 TEST(RE2, FullMatchTypeTests) {
788   // Type tests
789   std::string zeros(1000, '0');
790   {
791     char c;
792     ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &c));
793     ASSERT_EQ(c, 'H');
794   }
795   {
796     unsigned char c;
797     ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &c));
798     ASSERT_EQ(c, static_cast<unsigned char>('H'));
799   }
800   {
801     int16_t v;
802     ASSERT_TRUE(RE2::FullMatch("100",     "(-?\\d+)", &v)); ASSERT_EQ(v, 100);
803     ASSERT_TRUE(RE2::FullMatch("-100",    "(-?\\d+)", &v)); ASSERT_EQ(v, -100);
804     ASSERT_TRUE(RE2::FullMatch("32767",   "(-?\\d+)", &v)); ASSERT_EQ(v, 32767);
805     ASSERT_TRUE(RE2::FullMatch("-32768",  "(-?\\d+)", &v)); ASSERT_EQ(v, -32768);
806     ASSERT_FALSE(RE2::FullMatch("-32769", "(-?\\d+)", &v));
807     ASSERT_FALSE(RE2::FullMatch("32768",  "(-?\\d+)", &v));
808   }
809   {
810     uint16_t v;
811     ASSERT_TRUE(RE2::FullMatch("100",    "(\\d+)", &v)); ASSERT_EQ(v, 100);
812     ASSERT_TRUE(RE2::FullMatch("32767",  "(\\d+)", &v)); ASSERT_EQ(v, 32767);
813     ASSERT_TRUE(RE2::FullMatch("65535",  "(\\d+)", &v)); ASSERT_EQ(v, 65535);
814     ASSERT_FALSE(RE2::FullMatch("65536", "(\\d+)", &v));
815   }
816   {
817     int32_t v;
818     static const int32_t max = INT32_C(0x7fffffff);
819     static const int32_t min = -max - 1;
820     ASSERT_TRUE(RE2::FullMatch("100",          "(-?\\d+)", &v)); ASSERT_EQ(v, 100);
821     ASSERT_TRUE(RE2::FullMatch("-100",         "(-?\\d+)", &v)); ASSERT_EQ(v, -100);
822     ASSERT_TRUE(RE2::FullMatch("2147483647",   "(-?\\d+)", &v)); ASSERT_EQ(v, max);
823     ASSERT_TRUE(RE2::FullMatch("-2147483648",  "(-?\\d+)", &v)); ASSERT_EQ(v, min);
824     ASSERT_FALSE(RE2::FullMatch("-2147483649", "(-?\\d+)", &v));
825     ASSERT_FALSE(RE2::FullMatch("2147483648",  "(-?\\d+)", &v));
826 
827     ASSERT_TRUE(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v));
828     ASSERT_EQ(v, max);
829     ASSERT_TRUE(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v));
830     ASSERT_EQ(v, min);
831 
832     ASSERT_FALSE(RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v));
833     ASSERT_TRUE(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v)));
834     ASSERT_EQ(v, max);
835     ASSERT_FALSE(RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v)));
836   }
837   {
838     uint32_t v;
839     static const uint32_t max = UINT32_C(0xffffffff);
840     ASSERT_TRUE(RE2::FullMatch("100",         "(\\d+)", &v)); ASSERT_EQ(v, 100);
841     ASSERT_TRUE(RE2::FullMatch("4294967295",  "(\\d+)", &v)); ASSERT_EQ(v, max);
842     ASSERT_FALSE(RE2::FullMatch("4294967296", "(\\d+)", &v));
843     ASSERT_FALSE(RE2::FullMatch("-1",         "(\\d+)", &v));
844 
845     ASSERT_TRUE(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v)); ASSERT_EQ(v, max);
846   }
847   {
848     int64_t v;
849     static const int64_t max = INT64_C(0x7fffffffffffffff);
850     static const int64_t min = -max - 1;
851     std::string str;
852 
853     ASSERT_TRUE(RE2::FullMatch("100",  "(-?\\d+)", &v)); ASSERT_EQ(v, 100);
854     ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v)); ASSERT_EQ(v, -100);
855 
856     str = std::to_string(max);
857     ASSERT_TRUE(RE2::FullMatch(str,    "(-?\\d+)", &v)); ASSERT_EQ(v, max);
858 
859     str = std::to_string(min);
860     ASSERT_TRUE(RE2::FullMatch(str,    "(-?\\d+)", &v)); ASSERT_EQ(v, min);
861 
862     str = std::to_string(max);
863     ASSERT_NE(str.back(), '9');
864     str.back()++;
865     ASSERT_FALSE(RE2::FullMatch(str,   "(-?\\d+)", &v));
866 
867     str = std::to_string(min);
868     ASSERT_NE(str.back(), '9');
869     str.back()++;
870     ASSERT_FALSE(RE2::FullMatch(str,   "(-?\\d+)", &v));
871   }
872   {
873     uint64_t v;
874     int64_t v2;
875     static const uint64_t max = UINT64_C(0xffffffffffffffff);
876     std::string str;
877 
878     ASSERT_TRUE(RE2::FullMatch("100",  "(-?\\d+)", &v));  ASSERT_EQ(v, 100);
879     ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v2)); ASSERT_EQ(v2, -100);
880 
881     str = std::to_string(max);
882     ASSERT_TRUE(RE2::FullMatch(str,    "(-?\\d+)", &v)); ASSERT_EQ(v, max);
883 
884     ASSERT_NE(str.back(), '9');
885     str.back()++;
886     ASSERT_FALSE(RE2::FullMatch(str,   "(-?\\d+)", &v));
887   }
888 }
889 
// Verifies that FullMatch parses captured text into float and double
// arguments, including inputs that carry a long run of leading zeros.
TEST(RE2, FloatingPointFullMatchTypes) {
  const std::string leading_zeros(1000, '0');
  const char* const kAnything = "(.*)";

  // Single precision.
  {
    float f;

    ASSERT_TRUE(RE2::FullMatch("100", kAnything, &f));
    ASSERT_EQ(f, 100);

    ASSERT_TRUE(RE2::FullMatch("-100.", kAnything, &f));
    ASSERT_EQ(f, -100);

    ASSERT_TRUE(RE2::FullMatch("1e23", kAnything, &f));
    ASSERT_EQ(f, static_cast<float>(1e23));

    ASSERT_TRUE(RE2::FullMatch(" 100", kAnything, &f));
    ASSERT_EQ(f, 100);

    ASSERT_TRUE(RE2::FullMatch(leading_zeros + "1e23", kAnything, &f));
    ASSERT_EQ(f, static_cast<float>(1e23));

    // Edge case: 6700000000081920.1.
    //
    // 6700000000081920 lies exactly halfway between two float32 values, so
    // the trailing .1 ought to push the result up.  A float64 cannot hold
    // that .1, though: the closest float64 is 6700000000081920 itself.  A
    // parser that goes through strtod and then narrows to float32 therefore
    // rounds to even, i.e. down rather than up.  Passing this test requires
    // calling strtof directly.
    //
    // The number is deliberately only 17 digits long, because C promises
    // correctly rounded strtod/strtof results only for short inputs.
    //
    // Known to fail on Cygwin and MinGW because of a broken strtof(3), and
    // apparently on MSVC as well.
#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__)
    ASSERT_TRUE(RE2::FullMatch("0.1", kAnything, &f));
    ASSERT_EQ(f, 0.1f) << absl::StrFormat("%.8g != %.8g", f, 0.1f);

    ASSERT_TRUE(RE2::FullMatch("6700000000081920.1", kAnything, &f));
    ASSERT_EQ(f, 6700000000081920.1f)
        << absl::StrFormat("%.8g != %.8g", f, 6700000000081920.1f);
#endif
  }

  // Double precision.
  {
    double d;

    ASSERT_TRUE(RE2::FullMatch("100", kAnything, &d));
    ASSERT_EQ(d, 100);

    ASSERT_TRUE(RE2::FullMatch("-100.", kAnything, &d));
    ASSERT_EQ(d, -100);

    ASSERT_TRUE(RE2::FullMatch("1e23", kAnything, &d));
    ASSERT_EQ(d, 1e23);

    ASSERT_TRUE(RE2::FullMatch(leading_zeros + "1e23", kAnything, &d));
    ASSERT_EQ(d, 1e23);

    ASSERT_TRUE(RE2::FullMatch("0.1", kAnything, &d));
    ASSERT_EQ(d, 0.1) << absl::StrFormat("%.17g != %.17g", d, 0.1);

    ASSERT_TRUE(RE2::FullMatch("1.00000005960464485", kAnything, &d));
    ASSERT_EQ(d, 1.0000000596046448)
        << absl::StrFormat("%.17g != %.17g", d, 1.0000000596046448);
  }
}
940 
// FullMatch must be anchored at both ends of the text.
TEST(RE2, FullMatchAnchored) {
  int number;

  // Extra text before or after the digits defeats a digits-only pattern.
  ASSERT_FALSE(RE2::FullMatch("x1001", "(\\d+)", &number));
  ASSERT_FALSE(RE2::FullMatch("1001x", "(\\d+)", &number));

  // Once the pattern accounts for the extra character, the match succeeds.
  ASSERT_TRUE(RE2::FullMatch("x1001", "x(\\d+)", &number));
  ASSERT_EQ(number, 1001);

  ASSERT_TRUE(RE2::FullMatch("1001x", "(\\d+)x", &number));
  ASSERT_EQ(number, 1001);
}
949 
// Counted repetition with braces: the class must repeat at least five times.
TEST(RE2, FullMatchBraces) {
  const char* const kFiveOrMore = "[0-9a-f+.-]{5,}";

  ASSERT_TRUE(RE2::FullMatch("0abcd", kFiveOrMore));    // exactly five
  ASSERT_TRUE(RE2::FullMatch("0abcde", kFiveOrMore));   // six
  ASSERT_FALSE(RE2::FullMatch("0abc", kFiveOrMore));    // only four
}
956 
// A pattern made of several alternatives, one of them a character class.
TEST(RE2, Complicated) {
  const char* const kAlternation = "foo|bar|[A-Z]";

  static const char* const kAccepted[] = {"foo", "bar", "X"};
  for (const char* text : kAccepted) {
    ASSERT_TRUE(RE2::FullMatch(text, kAlternation));
  }

  // Two capitals are one character too many for the [A-Z] branch.
  ASSERT_FALSE(RE2::FullMatch("XY", kAlternation));
}
964 
// Full-match handling at the end of the text (a '$' has to be tacked on
// internally).
TEST(RE2, FullMatchEnd) {
  static const char* const kPatterns[] = {"fo|foo", "fo|foo$"};
  static const char* const kTexts[] = {"fo", "foo"};
  for (const char* pattern : kPatterns) {
    for (const char* text : kTexts) {
      ASSERT_TRUE(RE2::FullMatch(text, pattern));
    }
  }

  ASSERT_TRUE(RE2::FullMatch("foo", "foo$"));
  ASSERT_FALSE(RE2::FullMatch("foo$bar", "foo\\$"));
  ASSERT_FALSE(RE2::FullMatch("fox", "fo|bar"));

  // Flip this on if the handling of '$' is ever changed so that it no
  // longer matches a trailing newline.
  const bool kDollarRejectsTrailingNewline = false;
  if (kDollarRejectsTrailingNewline) {
    // Guards against pcre's special treatment of a '\n' at the very end of
    // the string when matching '$'.
    ASSERT_FALSE(RE2::PartialMatch("foo\n", "foo$"));
  }
}
983 
TEST(RE2,FullMatchArgCount)984 TEST(RE2, FullMatchArgCount) {
985   // Number of args
986   int a[16];
987   ASSERT_TRUE(RE2::FullMatch("", ""));
988 
989   memset(a, 0, sizeof(0));
990   ASSERT_TRUE(RE2::FullMatch("1", "(\\d){1}", &a[0]));
991   ASSERT_EQ(a[0], 1);
992 
993   memset(a, 0, sizeof(0));
994   ASSERT_TRUE(RE2::FullMatch("12", "(\\d)(\\d)", &a[0], &a[1]));
995   ASSERT_EQ(a[0], 1);
996   ASSERT_EQ(a[1], 2);
997 
998   memset(a, 0, sizeof(0));
999   ASSERT_TRUE(RE2::FullMatch("123", "(\\d)(\\d)(\\d)", &a[0], &a[1], &a[2]));
1000   ASSERT_EQ(a[0], 1);
1001   ASSERT_EQ(a[1], 2);
1002   ASSERT_EQ(a[2], 3);
1003 
1004   memset(a, 0, sizeof(0));
1005   ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1],
1006                              &a[2], &a[3]));
1007   ASSERT_EQ(a[0], 1);
1008   ASSERT_EQ(a[1], 2);
1009   ASSERT_EQ(a[2], 3);
1010   ASSERT_EQ(a[3], 4);
1011 
1012   memset(a, 0, sizeof(0));
1013   ASSERT_TRUE(RE2::FullMatch("12345", "(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1],
1014                              &a[2], &a[3], &a[4]));
1015   ASSERT_EQ(a[0], 1);
1016   ASSERT_EQ(a[1], 2);
1017   ASSERT_EQ(a[2], 3);
1018   ASSERT_EQ(a[3], 4);
1019   ASSERT_EQ(a[4], 5);
1020 
1021   memset(a, 0, sizeof(0));
1022   ASSERT_TRUE(RE2::FullMatch("123456", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0],
1023                              &a[1], &a[2], &a[3], &a[4], &a[5]));
1024   ASSERT_EQ(a[0], 1);
1025   ASSERT_EQ(a[1], 2);
1026   ASSERT_EQ(a[2], 3);
1027   ASSERT_EQ(a[3], 4);
1028   ASSERT_EQ(a[4], 5);
1029   ASSERT_EQ(a[5], 6);
1030 
1031   memset(a, 0, sizeof(0));
1032   ASSERT_TRUE(RE2::FullMatch("1234567", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
1033                              &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6]));
1034   ASSERT_EQ(a[0], 1);
1035   ASSERT_EQ(a[1], 2);
1036   ASSERT_EQ(a[2], 3);
1037   ASSERT_EQ(a[3], 4);
1038   ASSERT_EQ(a[4], 5);
1039   ASSERT_EQ(a[5], 6);
1040   ASSERT_EQ(a[6], 7);
1041 
1042   memset(a, 0, sizeof(0));
1043   ASSERT_TRUE(RE2::FullMatch("1234567890123456",
1044                              "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)"
1045                              "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
1046                              &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
1047                              &a[7], &a[8], &a[9], &a[10], &a[11], &a[12],
1048                              &a[13], &a[14], &a[15]));
1049   ASSERT_EQ(a[0], 1);
1050   ASSERT_EQ(a[1], 2);
1051   ASSERT_EQ(a[2], 3);
1052   ASSERT_EQ(a[3], 4);
1053   ASSERT_EQ(a[4], 5);
1054   ASSERT_EQ(a[5], 6);
1055   ASSERT_EQ(a[6], 7);
1056   ASSERT_EQ(a[7], 8);
1057   ASSERT_EQ(a[8], 9);
1058   ASSERT_EQ(a[9], 0);
1059   ASSERT_EQ(a[10], 1);
1060   ASSERT_EQ(a[11], 2);
1061   ASSERT_EQ(a[12], 3);
1062   ASSERT_EQ(a[13], 4);
1063   ASSERT_EQ(a[14], 5);
1064   ASSERT_EQ(a[15], 6);
1065 }
1066 
// Exercises the simple read-only accessors of RE2.
TEST(RE2, Accessors) {
  // pattern() hands back the string the object was built from.
  {
    const std::string source = "http://([^/]+)/.*";
    const RE2 re(source);
    ASSERT_EQ(source, re.pattern());
  }

  // A well-formed pattern reports no error through any of the error
  // accessors.
  {
    RE2 re("foo");
    ASSERT_TRUE(re.error().empty());
    ASSERT_TRUE(re.ok());
    ASSERT_EQ(re.error_code(), RE2::NoError);
  }
}
1083 
// Checks how UTF-8 encoded text is matched in UTF-8 mode (the default) and
// in Latin-1 mode, where every byte counts as one character.
TEST(RE2, UTF8) {
  // Three Japanese characters (nihongo), three bytes each.
  const char utf8_string[] =
      "\xe6\x97\xa5"   // U+65E5
      "\xe6\x9c\xac"   // U+672C
      "\xe8\xaa\x9e";  // U+8A9E
  // The middle character with one arbitrary character on either side.
  const char utf8_pattern[] =
      "."
      "\xe6\x9c\xac"  // U+672C
      ".";

  // The text is nine bytes in Latin-1 mode and three characters in UTF-8
  // mode; both views must match.
  RE2 nine_bytes(".........", RE2::Latin1);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, nine_bytes));
  RE2 three_chars("...");
  ASSERT_TRUE(RE2::FullMatch(utf8_string, three_chars));

  // '.' consumes a single byte or a whole UTF-8 character depending on the
  // mode.
  std::string captured;
  RE2 one_byte("(.)", RE2::Latin1);
  ASSERT_TRUE(RE2::PartialMatch(utf8_string, one_byte, &captured));
  ASSERT_EQ(captured, std::string("\xe6"));
  RE2 one_char("(.)");
  ASSERT_TRUE(RE2::PartialMatch(utf8_string, one_char, &captured));
  ASSERT_EQ(captured, std::string("\xe6\x97\xa5"));

  // Used as a pattern, the text matches itself in either mode.
  RE2 self_latin1(utf8_string, RE2::Latin1);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, self_latin1));
  RE2 self_utf8(utf8_string);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, self_utf8));

  // The dotted pattern covers the whole text only in UTF-8 mode.
  RE2 dotted_latin1(utf8_pattern, RE2::Latin1);
  ASSERT_FALSE(RE2::FullMatch(utf8_string, dotted_latin1));
  RE2 dotted_utf8(utf8_pattern);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, dotted_utf8));
}
1128 
// Ungreedy UTF-8 regular expressions must not match when they ought not to;
// see bug 82246.
TEST(RE2, UngreedyUTF8) {
  const std::string text = "a aX";

  // Greedy form.  This case always worked.
  {
    const char* const kGreedy = "\\w+X";
    RE2 latin1_re(kGreedy, RE2::Latin1);
    RE2 utf8_re(kGreedy);

    ASSERT_FALSE(RE2::FullMatch(text, latin1_re));
    ASSERT_FALSE(RE2::FullMatch(text, utf8_re));
  }

  // The same expression with the (?U) flag.
  {
    const char* const kUngreedy = "(?U)\\w+X";
    RE2 latin1_re(kUngreedy, RE2::Latin1);
    ASSERT_EQ(latin1_re.error(), "");
    RE2 utf8_re(kUngreedy);

    ASSERT_FALSE(RE2::FullMatch(text, latin1_re));
    ASSERT_FALSE(RE2::FullMatch(text, utf8_re));
  }
}
1153 
// Each of these patterns must fail to compile.  RE2::Quiet is passed so the
// expected failures are not logged.
TEST(RE2, Rejects) {
  static const char* const kBadPatterns[] = {
      "a\\1",
      "a[x",
      "a[z-a]",
      "a[[:foobar:]]",
      "a(b",
      "a\\",
  };

  for (const char* pattern : kBadPatterns) {
    RE2 re(pattern, RE2::Quiet);
    ASSERT_FALSE(re.ok());
  }
}
1179 
// Matching with a regexp that failed to compile, or with an extreme one,
// must not crash.
TEST(RE2, NoCrash) {
  // A bad regexp: using it simply fails to match.
  {
    RE2 bad("a\\", RE2::Quiet);
    ASSERT_FALSE(bad.ok());
    ASSERT_FALSE(RE2::PartialMatch("a\\b", bad));
  }

  // An enormous regexp: rejected, and using it fails to match.
  {
    RE2 enormous("(((.{100}){100}){100}){100}", RE2::Quiet);
    ASSERT_FALSE(enormous.ok());
    ASSERT_FALSE(RE2::PartialMatch("aaa", enormous));
  }

  // A crazy regexp that nevertheless compiles and runs.
  {
    RE2 crazy(".{512}x", RE2::Quiet);
    ASSERT_TRUE(crazy.ok());

    std::string text(515, 'c');
    text.push_back('x');
    ASSERT_TRUE(RE2::PartialMatch(text, crazy));
  }
}
1205 
// Checks that recursion is stopped.  This is a legacy test from PCRE; RE2
// itself does not recurse.
TEST(RE2, Recursion) {
  const int bytes = 15 * 1024;  // enough to crash PCRE

  static const char* const kPatterns[] = {".", "a", "a.", "ab.", "abc."};
  for (const char* pattern : kPatterns) {
    TestRecursion(bytes, pattern);
  }
}
1216 
// Counted repetition works when given tons of memory.
TEST(RE2, BigCountedRepetition) {
  RE2::Options roomy;
  roomy.set_max_mem(256 << 20);

  RE2 re(".{512}x", roomy);
  ASSERT_TRUE(re.ok());

  // 515 'c' characters followed by a single 'x'.
  std::string text(515, 'c');
  text.push_back('x');
  ASSERT_TRUE(RE2::PartialMatch(text, re));
}
1229 
// Deep stack recursion.  Before pcre was patched this died with a
// segmentation violation caused by stack overflow.  Another PCRE legacy
// test: RE2 does not recurse.
TEST(RE2, DeepRecursion) {
  // "x*", then 131072 'a' characters, then "*x".
  const std::string comment = "x*" + std::string(131072, 'a') + "*x";

  RE2 re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)");
  ASSERT_TRUE(RE2::FullMatch(comment, re));
}
1242 
1243 // Suggested by Josh Hyman.  Failed when SearchOnePass was
1244 // not implementing case-folding.
TEST(CaseInsensitive,MatchAndConsume)1245 TEST(CaseInsensitive, MatchAndConsume) {
1246   std::string text = "A fish named *Wanda*";
1247   absl::string_view sp(text);
1248   absl::string_view result;
1249   EXPECT_TRUE(RE2::PartialMatch(text, "(?i)([wand]{5})", &result));
1250   EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result));
1251 }
1252 
1253 // RE2 should permit implicit conversions from string, string_view, const char*,
1254 // and C string literals.
TEST(RE2,ImplicitConversions)1255 TEST(RE2, ImplicitConversions) {
1256   std::string re_string(".");
1257   absl::string_view re_string_view(".");
1258   const char* re_c_string = ".";
1259   EXPECT_TRUE(RE2::PartialMatch("e", re_string));
1260   EXPECT_TRUE(RE2::PartialMatch("e", re_string_view));
1261   EXPECT_TRUE(RE2::PartialMatch("e", re_c_string));
1262   EXPECT_TRUE(RE2::PartialMatch("e", "."));
1263 }
1264 
1265 // Bugs introduced by 8622304
TEST(RE2,CL8622304)1266 TEST(RE2, CL8622304) {
1267   // reported by ingow
1268   std::string dir;
1269   EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])"));  // ok
1270   EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &dir));  // fails
1271 
1272   // reported by jacobsa
1273   std::string key, val;
1274   EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true",
1275               "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?",
1276               &key,
1277               &val));
1278   EXPECT_EQ(key, "bar");
1279   EXPECT_EQ(val, "1,0x2F,030,4,5");
1280 }
1281 
1282 // Check that RE2 returns correct regexp pieces on error.
1283 // In particular, make sure it returns whole runes
1284 // and that it always reports invalid UTF-8.
1285 // Also check that Perl error flag piece is big enough.
1286 static struct ErrorTest {
1287   const char *regexp;
1288   RE2::ErrorCode error_code;
1289   const char *error_arg;
1290 } error_tests[] = {
1291   { "ab\\αcd", RE2::ErrorBadEscape, "\\α" },
1292   { "ef\\x☺01", RE2::ErrorBadEscape, "\\x☺0" },
1293   { "gh\\x1☺01", RE2::ErrorBadEscape, "\\x1☺" },
1294   { "ij\\x1", RE2::ErrorBadEscape, "\\x1" },
1295   { "kl\\x", RE2::ErrorBadEscape, "\\x" },
1296   { "uv\\x{0000☺}", RE2::ErrorBadEscape, "\\x{0000☺" },
1297   { "wx\\p{ABC", RE2::ErrorBadCharRange, "\\p{ABC" },
1298   // used to return (?s but the error is X
1299   { "yz(?smiUX:abc)", RE2::ErrorBadPerlOp, "(?smiUX" },
1300   { "aa(?sm☺i", RE2::ErrorBadPerlOp, "(?sm☺" },
1301   { "bb[abc", RE2::ErrorMissingBracket, "[abc" },
1302   { "abc(def", RE2::ErrorMissingParen, "abc(def" },
1303   { "abc)def", RE2::ErrorUnexpectedParen, "abc)def" },
1304 
1305   // no argument string returned for invalid UTF-8
1306   { "mn\\x1\377", RE2::ErrorBadUTF8, "" },
1307   { "op\377qr", RE2::ErrorBadUTF8, "" },
1308   { "st\\x{00000\377", RE2::ErrorBadUTF8, "" },
1309   { "zz\\p{\377}", RE2::ErrorBadUTF8, "" },
1310   { "zz\\x{00\377}", RE2::ErrorBadUTF8, "" },
1311   { "zz(?P<name\377>abc)", RE2::ErrorBadUTF8, "" },
1312 };
TEST(RE2,ErrorCodeAndArg)1313 TEST(RE2, ErrorCodeAndArg) {
1314   for (size_t i = 0; i < ABSL_ARRAYSIZE(error_tests); i++) {
1315     RE2 re(error_tests[i].regexp, RE2::Quiet);
1316     EXPECT_FALSE(re.ok());
1317     EXPECT_EQ(re.error_code(), error_tests[i].error_code) << re.error();
1318     EXPECT_EQ(re.error_arg(), error_tests[i].error_arg) << re.error();
1319   }
1320 }
1321 
// With the never_nl option set, a match must never include '\n'.
struct NeverTest {
  const char* regexp;  // pattern to compile with never_nl
  const char* text;    // text to search
  const char* match;   // expected first capture, or null if nothing matches
};

static const NeverTest never_tests[] = {
  { "(.*)", "abc\ndef\nghi\n", "abc" },
  { "(?s)(abc.*def)", "abc\ndef\n", nullptr },
  { "(abc(.|\n)*def)", "abc\ndef\n", nullptr },
  { "(abc[^x]*def)", "abc\ndef\n", nullptr },
  { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" },
};

// Runs every table entry as a partial match with never_nl enabled.
TEST(RE2, NeverNewline) {
  RE2::Options opt;
  opt.set_never_nl(true);

  for (const NeverTest& t : never_tests) {
    RE2 re(t.regexp, opt);

    if (t.match != nullptr) {
      absl::string_view captured;
      EXPECT_TRUE(RE2::PartialMatch(t.text, re, &captured));
      EXPECT_EQ(captured, t.match);
    } else {
      EXPECT_FALSE(RE2::PartialMatch(t.text, re));
    }
  }
}
1349 
1350 // Check that dot_nl option works.
TEST(RE2,DotNL)1351 TEST(RE2, DotNL) {
1352   RE2::Options opt;
1353   opt.set_dot_nl(true);
1354   EXPECT_TRUE(RE2::PartialMatch("\n", RE2(".", opt)));
1355   EXPECT_FALSE(RE2::PartialMatch("\n", RE2("(?-s).", opt)));
1356   opt.set_never_nl(true);
1357   EXPECT_FALSE(RE2::PartialMatch("\n", RE2(".", opt)));
1358 }
1359 
1360 // Check that there are no capturing groups in "never capture" mode.
TEST(RE2,NeverCapture)1361 TEST(RE2, NeverCapture) {
1362   RE2::Options opt;
1363   opt.set_never_capture(true);
1364   RE2 re("(r)(e)", opt);
1365   EXPECT_EQ(0, re.NumberOfCapturingGroups());
1366 }
1367 
1368 // Bitstate bug was looking at submatch[0] even if nsubmatch == 0.
1369 // Triggered by a failed DFA search falling back to Bitstate when
1370 // using Match with a NULL submatch set.  Bitstate tried to read
1371 // the submatch[0] entry even if nsubmatch was 0.
TEST(RE2,BitstateCaptureBug)1372 TEST(RE2, BitstateCaptureBug) {
1373   RE2::Options opt;
1374   opt.set_max_mem(20000);
1375   RE2 re("(_________$)", opt);
1376   absl::string_view s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x";
1377   EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0));
1378 }
1379 
1380 // C++ version of bug 609710.
TEST(RE2,UnicodeClasses)1381 TEST(RE2, UnicodeClasses) {
1382   const std::string str = "ABCDEFGHI譚永鋒";
1383   std::string a, b, c;
1384 
1385   EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}"));
1386   EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}"));
1387   EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}"));
1388   EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}"));
1389   EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}"));
1390   EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}"));
1391 
1392   EXPECT_TRUE(RE2::FullMatch("譚", "\\p{L}"));
1393   EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Lu}"));
1394   EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Ll}"));
1395   EXPECT_FALSE(RE2::FullMatch("譚", "\\P{L}"));
1396   EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Lu}"));
1397   EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Ll}"));
1398 
1399   EXPECT_TRUE(RE2::FullMatch("永", "\\p{L}"));
1400   EXPECT_FALSE(RE2::FullMatch("永", "\\p{Lu}"));
1401   EXPECT_FALSE(RE2::FullMatch("永", "\\p{Ll}"));
1402   EXPECT_FALSE(RE2::FullMatch("永", "\\P{L}"));
1403   EXPECT_TRUE(RE2::FullMatch("永", "\\P{Lu}"));
1404   EXPECT_TRUE(RE2::FullMatch("永", "\\P{Ll}"));
1405 
1406   EXPECT_TRUE(RE2::FullMatch("鋒", "\\p{L}"));
1407   EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Lu}"));
1408   EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Ll}"));
1409   EXPECT_FALSE(RE2::FullMatch("鋒", "\\P{L}"));
1410   EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Lu}"));
1411   EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Ll}"));
1412 
1413   EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)", &a, &b, &c));
1414   EXPECT_EQ("A", a);
1415   EXPECT_EQ("B", b);
1416   EXPECT_EQ("C", c);
1417 
1418   EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)", &a, &b, &c));
1419   EXPECT_EQ("A", a);
1420   EXPECT_EQ("B", b);
1421   EXPECT_EQ("C", c);
1422 
1423   EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}"));
1424 
1425   EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)", &a, &b, &c));
1426   EXPECT_EQ("A", a);
1427   EXPECT_EQ("B", b);
1428   EXPECT_EQ("C", c);
1429 
1430   EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]"));
1431 
1432   EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &a, &b, &c));
1433   EXPECT_EQ("譚", a);
1434   EXPECT_EQ("永", b);
1435   EXPECT_EQ("鋒", c);
1436 }
1437 
// LazyRE2 can be initialized with a pattern alone or with a pattern plus
// options.
TEST(RE2, LazyRE2) {
  static LazyRE2 plain = {"a"};
  static LazyRE2 latin1 = {"b", RE2::Latin1};

  // Without options the encoding is UTF-8.
  EXPECT_EQ("a", plain->pattern());
  EXPECT_EQ(RE2::Options::EncodingUTF8, plain->options().encoding());

  // With RE2::Latin1 the encoding is Latin-1.
  EXPECT_EQ("b", latin1->pattern());
  EXPECT_EQ(RE2::Options::EncodingLatin1, latin1->options().encoding());
}
1449 
1450 // Bug reported by saito. 2009/02/17
TEST(RE2,NullVsEmptyString)1451 TEST(RE2, NullVsEmptyString) {
1452   RE2 re(".*");
1453   EXPECT_TRUE(re.ok());
1454 
1455   absl::string_view null;
1456   EXPECT_TRUE(RE2::FullMatch(null, re));
1457 
1458   absl::string_view empty("");
1459   EXPECT_TRUE(RE2::FullMatch(empty, re));
1460 }
1461 
1462 // Similar to the previous test, check that the null string and the empty
1463 // string both match, but also that the null string can only provide null
1464 // submatches whereas the empty string can also provide empty submatches.
TEST(RE2,NullVsEmptyStringSubmatches)1465 TEST(RE2, NullVsEmptyStringSubmatches) {
1466   RE2 re("()|(foo)");
1467   EXPECT_TRUE(re.ok());
1468 
1469   // matches[0] is overall match, [1] is (), [2] is (foo), [3] is nonexistent.
1470   absl::string_view matches[4];
1471 
1472   for (size_t i = 0; i < ABSL_ARRAYSIZE(matches); i++)
1473     matches[i] = "bar";
1474 
1475   absl::string_view null;
1476   EXPECT_TRUE(re.Match(null, 0, null.size(), RE2::UNANCHORED,
1477                        matches, ABSL_ARRAYSIZE(matches)));
1478   for (size_t i = 0; i < ABSL_ARRAYSIZE(matches); i++) {
1479     EXPECT_TRUE(matches[i].data() == NULL);  // always null
1480     EXPECT_TRUE(matches[i].empty());
1481   }
1482 
1483   for (size_t i = 0; i < ABSL_ARRAYSIZE(matches); i++)
1484     matches[i] = "bar";
1485 
1486   absl::string_view empty("");
1487   EXPECT_TRUE(re.Match(empty, 0, empty.size(), RE2::UNANCHORED,
1488                        matches, ABSL_ARRAYSIZE(matches)));
1489   EXPECT_TRUE(matches[0].data() != NULL);  // empty, not null
1490   EXPECT_TRUE(matches[0].empty());
1491   EXPECT_TRUE(matches[1].data() != NULL);  // empty, not null
1492   EXPECT_TRUE(matches[1].empty());
1493   EXPECT_TRUE(matches[2].data() == NULL);
1494   EXPECT_TRUE(matches[2].empty());
1495   EXPECT_TRUE(matches[3].data() == NULL);
1496   EXPECT_TRUE(matches[3].empty());
1497 }
1498 
1499 // Issue 1816809
TEST(RE2,Bug1816809)1500 TEST(RE2, Bug1816809) {
1501   RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))");
1502   absl::string_view piece("llx-3;llx4");
1503   std::string x;
1504   EXPECT_TRUE(RE2::Consume(&piece, re, &x));
1505 }
1506 
1507 // Issue 3061120
TEST(RE2,Bug3061120)1508 TEST(RE2, Bug3061120) {
1509   RE2 re("(?i)\\W");
1510   EXPECT_FALSE(RE2::PartialMatch("x", re));  // always worked
1511   EXPECT_FALSE(RE2::PartialMatch("k", re));  // broke because of kelvin
1512   EXPECT_FALSE(RE2::PartialMatch("s", re));  // broke because of latin long s
1513 }
1514 
// CapturingGroupNames() maps the index of every named group to its name.
TEST(RE2, CapturingGroupNames) {
  // Opening parentheses annotated with group IDs:
  //      12    3        45   6         7
  RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))");
  EXPECT_TRUE(re.ok());

  // Only groups 3, 6 and 7 carry a name.
  const std::map<int, std::string> want = {
      {3, "G2"},
      {6, "G2"},
      {7, "G1"},
  };
  const std::map<int, std::string>& have = re.CapturingGroupNames();
  EXPECT_EQ(want, have);
}
1527 
// Converting the parsed regexp back to a string must keep the '^' and '$'
// anchors, both with POSIX options and with the default options.
TEST(RE2, RegexpToStringLossOfAnchor) {
  RE2 begin_posix("^[a-c]at", RE2::POSIX);
  EXPECT_EQ(begin_posix.Regexp()->ToString(), "^[a-c]at");

  RE2 begin_default("^[a-c]at");
  EXPECT_EQ(begin_default.Regexp()->ToString(), "(?-m:^)[a-c]at");

  RE2 end_posix("ca[t-z]$", RE2::POSIX);
  EXPECT_EQ(end_posix.Regexp()->ToString(), "ca[t-z]$");

  RE2 end_default("ca[t-z]$");
  EXPECT_EQ(end_default.Regexp()->ToString(), "ca[t-z](?-m:$)");
}
1534 
1535 // Issue 10131674
TEST(RE2,Bug10131674)1536 TEST(RE2, Bug10131674) {
1537   // Some of these escapes describe values that do not fit in a byte.
1538   RE2 re("\\140\\440\\174\\271\\150\\656\\106\\201\\004\\332", RE2::Latin1);
1539   EXPECT_FALSE(re.ok());
1540   EXPECT_FALSE(RE2::FullMatch("hello world", re));
1541 }
1542 
// Stray write past the end of match_ in nfa.cc, caught by fuzzing together
// with address sanitizer.
TEST(RE2, Bug18391750) {
  // The fuzzer input, used as both pattern and text.  The literal's
  // terminating NUL supplies the final zero byte.
  const char t[] =
      "\x28\x28\xfc\xfc\x08\x08"
      "\x26\x26\x28\xc2\x9b\xc5"
      "\xc5\xd4\x8f\x8f\x69\x69"
      "\xe7\x29\x7b\x37\x31\x31"
      "\x7d\xae\x7c\x7c\xf3\x29"
      "\xae\xae\x2e\x2a\x29";

  RE2::Options opt;
  opt.set_encoding(RE2::Options::EncodingLatin1);
  opt.set_longest_match(true);
  opt.set_dot_nl(true);
  opt.set_case_sensitive(false);

  RE2 re(t, opt);
  ASSERT_TRUE(re.ok());

  // The result is deliberately not checked; the call is made only so that
  // the match code runs.
  RE2::PartialMatch(t, re);
}
1562 
// The parser used to accept an invalid (too large) rune, which made the
// compiler fail a DCHECK in the UTF-8 character class code.
TEST(RE2, Bug18458852) {
  // The offending pattern; the literal's terminating NUL supplies the final
  // zero byte.
  const char b[] =
      "\x28\x05\x05\x41\x41\x28"
      "\x24\x5b\x5e\xf5\x87\x87"
      "\x90\x29\x5d\x29\x29";

  RE2 re(b);
  ASSERT_FALSE(re.ok());
}
1575 
// Bug in BitState: case kFailInst failed the match entirely.
TEST(RE2, Bug18523943) {
  // Text and pattern bytes; each literal's terminating NUL supplies the
  // final zero byte.
  const char a[] = "\x29\x29\x24";
  const char b[] = "\x28\x0a\x2a\x2a\x29";

  RE2::Options opt;
  opt.set_log_errors(false);
  opt.set_encoding(RE2::Options::EncodingLatin1);
  opt.set_posix_syntax(true);
  opt.set_longest_match(true);
  opt.set_literal(false);
  opt.set_never_nl(true);

  RE2 re(b, opt);
  ASSERT_TRUE(re.ok());

  std::string captured;
  ASSERT_TRUE(RE2::PartialMatch(a, re, &captured));
}
1598 
// The parser used to accept Unicode groups in Latin-1 mode in a way that
// made the compiler fail a DCHECK in prog.cc.
TEST(RE2, Bug21371806) {
  RE2::Options latin1;
  latin1.set_encoding(RE2::Options::EncodingLatin1);

  RE2 re("g\\p{Zl}]", latin1);
  ASSERT_TRUE(re.ok());
}
1609 
// Parser bug caused by factoring common prefixes out of alternations.
//
// The pattern below used to be factored to "a\\C*?[bc]".  The automaton
// would then consume "ab" and stop (when unanchored), whereas first-match
// semantics require it to consume all of "abc".
TEST(RE2, Bug26356109) {
  RE2 re("a\\C*?c|a\\C*?b");
  ASSERT_TRUE(re.ok());

  const std::string s = "abc";
  absl::string_view m;

  const bool unanchored_ok =
      re.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1);
  ASSERT_TRUE(unanchored_ok);
  ASSERT_EQ(m, s) << " (UNANCHORED) got m='" << m << "', want '" << s << "'";

  const bool anchored_ok =
      re.Match(s, 0, s.size(), RE2::ANCHOR_BOTH, &m, 1);
  ASSERT_TRUE(anchored_ok);
  ASSERT_EQ(m, s) << " (ANCHOR_BOTH) got m='" << m << "', want '" << s << "'";
}
1628 
// RE2::GlobalReplace always advanced by one byte after matching the empty
// string, which would clobber any rune longer than one byte.
TEST(RE2, Issue104) {
  static const struct {
    const char* text;     // string to rewrite in place
    const char* pattern;  // matches only the empty string in this text
    const char* rewrite;  // replacement inserted at every empty match
    const char* want;     // expected result
  } kCases[] = {
      {"bc", "a*", "d", "dbdcd"},
      {"ąć", "Ć*", "Ĉ", "ĈąĈćĈ"},
      {"人类", "大*", "小", "小人小类小"},
  };

  for (const auto& c : kCases) {
    std::string s = c.text;
    ASSERT_EQ(3, RE2::GlobalReplace(&s, c.pattern, c.rewrite));
    ASSERT_EQ(c.want, s);
  }
}
1645 
// (?:|a)* used to match more text than (?:|a)+ did.  Both must match the
// empty string here.
TEST(RE2, Issue310) {
  const std::string text = "aaa";
  absl::string_view m;

  RE2 star("(?:|a)*");
  const bool star_ok = star.Match(text, 0, text.size(), RE2::UNANCHORED, &m, 1);
  ASSERT_TRUE(star_ok);
  ASSERT_EQ(m, "") << " got m='" << m << "', want ''";

  RE2 plus("(?:|a)+");
  const bool plus_ok = plus.Match(text, 0, text.size(), RE2::UNANCHORED, &m, 1);
  ASSERT_TRUE(plus_ok);
  ASSERT_EQ(m, "") << " got m='" << m << "', want ''";
}
1660 
// Regexp::LeadingString did not output Latin1 into flags.
//
// In the pattern below 0xA5 should be factored out without losing its
// Latin1-ness.  Because it did lose it, the prefix used for acceleration
// was 0xC2 0xA5 rather than 0xA5.  The former never occurs in the input, so
// no replacements were made.
TEST(RE2, Issue477) {
  // Eight bytes; the explicit length keeps the literal's NUL out of the
  // string.
  std::string s("\xa5\xd1\xa5\xd1\x61\x63\xa5\x64", 8);

  RE2 re("\xa5\xd1|\xa5\x64", RE2::Latin1);
  const int replaced = RE2::GlobalReplace(&s, re, "");
  ASSERT_EQ(replaced, 3);
  ASSERT_EQ(s, "\x61\x63");
}
1679 
1680 }  // namespace re2
1681