1 // -*- coding: utf-8 -*-
2 // Copyright 2002-2009 The RE2 Authors. All Rights Reserved.
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5
6 // TODO: Test extractions for PartialMatch/Consume
7
8 #include <errno.h>
9 #include <stddef.h>
10 #include <stdint.h>
11 #include <string.h>
12 #include <map>
13 #include <string>
14 #include <utility>
15 #include <vector>
16 #if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__)
17 #include <sys/mman.h>
18 #include <unistd.h> /* for sysconf */
19 #endif
20
21 #include "absl/base/macros.h"
22 #include "absl/strings/str_format.h"
23 #include "gtest/gtest.h"
24 #include "util/logging.h"
25 #include "re2/re2.h"
26 #include "re2/regexp.h"
27
28 namespace re2 {
29
// Checks that hexadecimal text parses into each built-in integer type, both
// through RE2::Hex (bare digits) and through RE2::CRadix ("0x" prefix).
TEST(RE2, HexTests) {
// CHECK_HEX_PARSE(T, digits): `digits` is the body of a hex literal, with an
// optional C integer suffix.  The suffix is matched by [uUlL]* outside the
// capture group, and the expected value is 0x<digits> as read by the compiler.
#define CHECK_HEX_PARSE(T, digits)                                \
  do {                                                            \
    T parsed;                                                     \
    ASSERT_TRUE(RE2::FullMatch(#digits, "([0-9a-fA-F]+)[uUlL]*",  \
                               RE2::Hex(&parsed)));               \
    ASSERT_EQ(parsed, 0x##digits);                                \
    ASSERT_TRUE(RE2::FullMatch("0x" #digits,                      \
                               "([0-9a-fA-FxX]+)[uUlL]*",         \
                               RE2::CRadix(&parsed)));            \
    ASSERT_EQ(parsed, 0x##digits);                                \
  } while (0)

  CHECK_HEX_PARSE(short, 2bad);
  CHECK_HEX_PARSE(unsigned short, 2badU);
  CHECK_HEX_PARSE(int, dead);
  CHECK_HEX_PARSE(unsigned int, deadU);
  CHECK_HEX_PARSE(long, 7eadbeefL);
  CHECK_HEX_PARSE(unsigned long, deadbeefUL);
  CHECK_HEX_PARSE(long long, 12345678deadbeefLL);
  CHECK_HEX_PARSE(unsigned long long, cafebabedeadbeefULL);

#undef CHECK_HEX_PARSE
}
53
// Checks that octal text parses into each built-in integer type, both through
// RE2::Octal (bare digits) and through RE2::CRadix (leading "0").
TEST(RE2, OctalTests) {
// CHECK_OCTAL_PARSE(T, digits): `digits` is the body of an octal literal, with
// an optional C integer suffix matched by [uUlL]* outside the capture group.
// The expected value is 0<digits> as read by the compiler.
#define CHECK_OCTAL_PARSE(T, digits)                              \
  do {                                                            \
    T parsed;                                                     \
    ASSERT_TRUE(RE2::FullMatch(#digits, "([0-7]+)[uUlL]*",        \
                               RE2::Octal(&parsed)));             \
    ASSERT_EQ(parsed, 0##digits);                                 \
    ASSERT_TRUE(RE2::FullMatch("0" #digits,                       \
                               "([0-9a-fA-FxX]+)[uUlL]*",         \
                               RE2::CRadix(&parsed)));            \
    ASSERT_EQ(parsed, 0##digits);                                 \
  } while (0)

  CHECK_OCTAL_PARSE(short, 77777);
  CHECK_OCTAL_PARSE(unsigned short, 177777U);
  CHECK_OCTAL_PARSE(int, 17777777777);
  CHECK_OCTAL_PARSE(unsigned int, 37777777777U);
  CHECK_OCTAL_PARSE(long, 17777777777L);
  CHECK_OCTAL_PARSE(unsigned long, 37777777777UL);
  CHECK_OCTAL_PARSE(long long, 777777777777777777777LL);
  CHECK_OCTAL_PARSE(unsigned long long, 1777777777777777777777ULL);

#undef CHECK_OCTAL_PARSE
}
76
// Checks that decimal text parses into each built-in integer type, both with
// the default pointer argument and through RE2::CRadix.
TEST(RE2, DecimalTests) {
// CHECK_DECIMAL_PARSE(T, literal): the stringified literal is the input text
// and the literal itself is the expected value.  Any C integer suffix is
// matched by [uUlL]* outside the capture group.
#define CHECK_DECIMAL_PARSE(T, literal)                                  \
  do {                                                                   \
    T parsed;                                                            \
    ASSERT_TRUE(RE2::FullMatch(#literal, "(-?[0-9]+)[uUlL]*", &parsed)); \
    ASSERT_EQ(parsed, literal);                                          \
    ASSERT_TRUE(RE2::FullMatch(#literal, "(-?[0-9a-fA-FxX]+)[uUlL]*",    \
                               RE2::CRadix(&parsed)));                   \
    ASSERT_EQ(parsed, literal);                                          \
  } while (0)

  CHECK_DECIMAL_PARSE(short, -1);
  CHECK_DECIMAL_PARSE(unsigned short, 9999);
  CHECK_DECIMAL_PARSE(int, -1000);
  CHECK_DECIMAL_PARSE(unsigned int, 12345U);
  CHECK_DECIMAL_PARSE(long, -10000000L);
  CHECK_DECIMAL_PARSE(unsigned long, 3083324652U);
  CHECK_DECIMAL_PARSE(long long, -100000000000000LL);
  CHECK_DECIMAL_PARSE(unsigned long long, 1234567890987654321ULL);

#undef CHECK_DECIMAL_PARSE
}
99
// Table-driven check of RE2::Replace (first match only) and
// RE2::GlobalReplace (every match, returning the replacement count).
TEST(RE2, Replace) {
  struct ReplaceTest {
    const char* regexp;    // pattern to search for
    const char* rewrite;   // rewrite string; \N refers to capture group N
    const char* original;  // input text
    const char* single;    // expected text after RE2::Replace
    const char* global;    // expected text after RE2::GlobalReplace
    int greplace_count;    // expected return value of RE2::GlobalReplace
  };
  static const ReplaceTest tests[] = {
    { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)",
      "\\2\\1ay",
      "the quick brown fox jumps over the lazy dogs.",
      "ethay quick brown fox jumps over the lazy dogs.",
      "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.",
      9 },
    // Four \w+ runs in the address, each of which gets the suffix appended.
    { "\\w+",
      "\\0-NOSPAM",
      "abcd.efghi@google.com",
      "abcd-NOSPAM.efghi@google.com",
      "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM",
      4 },
    { "^",
      "(START)",
      "foo",
      "(START)foo",
      "(START)foo",
      1 },
    { "^",
      "(START)",
      "",
      "(START)",
      "(START)",
      1 },
    { "$",
      "(END)",
      "",
      "(END)",
      "(END)",
      1 },
    { "b",
      "bb",
      "ababababab",
      "abbabababab",
      "abbabbabbabbabb",
      5 },
    { "b",
      "bb",
      "bbbbbb",
      "bbbbbbb",
      "bbbbbbbbbbbb",
      6 },
    { "b+",
      "bb",
      "bbbbbb",
      "bb",
      "bb",
      1 },
    { "b*",
      "bb",
      "bbbbbb",
      "bb",
      "bb",
      1 },
    { "b*",
      "bb",
      "aaaaa",
      "bbaaaaa",
      "bbabbabbabbabbabb",
      6 },
    // Check newline handling
    { "a.*a",
      "(\\0)",
      "aba\naba",
      "(aba)\naba",
      "(aba)\n(aba)",
      2 },
  };

  for (const ReplaceTest& t : tests) {
    std::string one(t.original);
    ASSERT_TRUE(RE2::Replace(&one, t.regexp, t.rewrite));
    ASSERT_EQ(one, t.single);
    std::string all(t.original);
    ASSERT_EQ(RE2::GlobalReplace(&all, t.regexp, t.rewrite), t.greplace_count)
        << "Got: " << all;
    ASSERT_EQ(all, t.global);
  }
}
190
TestCheckRewriteString(const char * regexp,const char * rewrite,bool expect_ok)191 static void TestCheckRewriteString(const char* regexp, const char* rewrite,
192 bool expect_ok) {
193 std::string error;
194 RE2 exp(regexp);
195 bool actual_ok = exp.CheckRewriteString(rewrite, &error);
196 EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error;
197 }
198
// Runs TestCheckRewriteString over rewrite strings paired with patterns that
// have zero, one and two capturing groups.
TEST(CheckRewriteString, all) {
  struct Case {
    const char* regexp;
    const char* rewrite;
    bool expect_ok;
  };
  static const Case kCases[] = {
      // Pattern without capturing groups.
      {"abc", "foo", true},
      {"abc", "foo\\", false},
      {"abc", "foo\\0bar", true},

      // Pattern with one capturing group.
      {"a(b)c", "foo", true},
      {"a(b)c", "foo\\0bar", true},
      {"a(b)c", "foo\\1bar", true},
      {"a(b)c", "foo\\2bar", false},
      {"a(b)c", "f\\\\2o\\1o", true},

      // Pattern with two capturing groups.
      {"a(b)(c)", "foo\\12", true},
      {"a(b)(c)", "f\\2o\\1o", true},
      {"a(b)(c)", "f\\oo\\1", false},
  };

  for (const Case& c : kCases) {
    TestCheckRewriteString(c.regexp, c.rewrite, c.expect_ok);
  }
}
214
// RE2::Extract writes the rewritten first match into the output string, and
// leaves the output untouched when the pattern does not match.
TEST(RE2, Extract) {
  std::string s;

  // user@host.tld is rewritten to host!user: group 1 is everything before
  // the '@', group 2 is the run of non-dot characters after it.
  ASSERT_TRUE(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &s));
  ASSERT_EQ(s, "kremvax!boris");

  ASSERT_TRUE(RE2::Extract("foo", ".*", "'\\0'", &s));
  ASSERT_EQ(s, "'foo'");
  // check that false match doesn't overwrite
  ASSERT_FALSE(RE2::Extract("baz", "bar", "'\\0'", &s));
  ASSERT_EQ(s, "'foo'");
}
227
// A rewrite string that refers to \2 must be rejected by Extract, Replace and
// GlobalReplace when the pattern has only one capturing group.
TEST(RE2, MaxSubmatchTooLarge) {
  const char* const kOneGroup = "f(o+)";
  const char* const kNeedsTwoGroups = "\\1\\2";

  std::string extracted;
  ASSERT_FALSE(RE2::Extract("foo", kOneGroup, kNeedsTwoGroups, &extracted));

  std::string replaced_once = "foo";
  ASSERT_FALSE(RE2::Replace(&replaced_once, kOneGroup, kNeedsTwoGroups));

  std::string replaced_all = "foo";
  ASSERT_FALSE(RE2::GlobalReplace(&replaced_all, kOneGroup, kNeedsTwoGroups));
}
236
// Successive RE2::Consume calls take "aaa" and then "b" off the front of the
// input; the third call fails because the remaining text starts with '!'.
TEST(RE2, Consume) {
  RE2 word_re("\\s*(\\w+)");  // matches a word, possibly preceded by whitespace
  std::string captured;

  std::string text(" aaa b!@#$@#$cccc");
  absl::string_view remaining(text);

  ASSERT_TRUE(RE2::Consume(&remaining, word_re, &captured));
  ASSERT_EQ(captured, "aaa") << " input: " << remaining;

  ASSERT_TRUE(RE2::Consume(&remaining, word_re, &captured));
  ASSERT_EQ(captured, "b") << " input: " << remaining;

  ASSERT_FALSE(RE2::Consume(&remaining, word_re, &captured))
      << " input: " << remaining;
}
250
// Exercises RE2::ConsumeN with zero, one and two output arguments against a
// single input that is consumed progressively.
TEST(RE2, ConsumeN) {
  const std::string text(" one two three 4");
  absl::string_view remaining(text);

  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };
  const char* const kLeadingWord = "\\s*(\\w+)";

  // No output arguments: the match is consumed but nothing is stored.
  EXPECT_TRUE(RE2::ConsumeN(&remaining, kLeadingWord, slot_ptrs, 0));  // Skips "one".

  // One output argument.
  std::string word;
  slots[0] = &word;
  EXPECT_TRUE(RE2::ConsumeN(&remaining, kLeadingWord, slot_ptrs, 1));
  EXPECT_EQ("two", word);

  // Two output arguments of different types.
  int n;
  slots[1] = &n;
  EXPECT_TRUE(
      RE2::ConsumeN(&remaining, "\\s*(\\w+)\\s*(\\d+)", slot_ptrs, 2));
  EXPECT_EQ("three", word);
  EXPECT_EQ(4, n);
}
274
// RE2::FindAndConsume may skip ahead to find a match, so it pulls every word
// out of the input in turn and fails only once no word is left.
TEST(RE2, FindAndConsume) {
  RE2 word_re("(\\w+)");  // matches a word
  std::string captured;

  std::string text(" aaa b!@#$@#$cccc");
  absl::string_view remaining(text);

  static const char* const kExpectedWords[] = { "aaa", "b", "cccc" };
  for (const char* expected : kExpectedWords) {
    ASSERT_TRUE(RE2::FindAndConsume(&remaining, word_re, &captured));
    ASSERT_EQ(captured, expected);
  }
  ASSERT_FALSE(RE2::FindAndConsume(&remaining, word_re, &captured));

  // FindAndConsume must also work when there are no submatches at all.
  // An earlier version used uninitialized data for the length to consume.
  remaining = "aaa";
  ASSERT_TRUE(RE2::FindAndConsume(&remaining, "aaa"));
  ASSERT_EQ(remaining, "");
}
297
// Exercises RE2::FindAndConsumeN with zero, one and two output arguments
// against a single input that is consumed progressively.
TEST(RE2, FindAndConsumeN) {
  const std::string text(" one two three 4");
  absl::string_view remaining(text);

  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };
  const char* const kWord = "(\\w+)";

  // No output arguments: the match is consumed but nothing is stored.
  EXPECT_TRUE(RE2::FindAndConsumeN(&remaining, kWord, slot_ptrs, 0));  // Skips "one".

  // One output argument.
  std::string word;
  slots[0] = &word;
  EXPECT_TRUE(RE2::FindAndConsumeN(&remaining, kWord, slot_ptrs, 1));
  EXPECT_EQ("two", word);

  // Two output arguments of different types.
  int n;
  slots[1] = &n;
  EXPECT_TRUE(
      RE2::FindAndConsumeN(&remaining, "(\\w+)\\s*(\\d+)", slot_ptrs, 2));
  EXPECT_EQ("three", word);
  EXPECT_EQ(4, n);
}
321
// In an alternation of capturing groups, only the group of the branch that
// matched receives text; the other output strings are set to empty.
TEST(RE2, MatchNumberPeculiarity) {
  RE2 alternation("(foo)|(bar)|(baz)");

  struct Case {
    const char* text;
    const char* expected[3];  // expected contents of groups 1..3
  };
  static const Case kCases[] = {
      { "foo", { "foo", "", "" } },
      { "bar", { "", "bar", "" } },
      { "baz", { "", "", "baz" } },
  };

  // The same three strings are reused across cases, so each match has to
  // overwrite whatever the previous one left behind.
  std::string first;
  std::string second;
  std::string third;
  for (const Case& c : kCases) {
    ASSERT_TRUE(RE2::PartialMatch(c.text, alternation, &first, &second, &third));
    ASSERT_EQ(first, c.expected[0]);
    ASSERT_EQ(second, c.expected[1]);
    ASSERT_EQ(third, c.expected[2]);
  }
  ASSERT_FALSE(RE2::PartialMatch("f", alternation, &first, &second, &third));

  // A group on the branch that did not match yields an empty string.
  std::string unmatched_group;
  ASSERT_TRUE(RE2::FullMatch("hello", "(foo)|hello", &unmatched_group));
  ASSERT_EQ(unmatched_group, "");
}
346
// Exercises the low-level RE2::Match call with a submatch array, then checks
// that PartialMatch extracts the same pieces into typed arguments.
TEST(RE2, Match) {
  RE2 host_port("((\\w+):([0-9]+))");  // extracts host and port
  absl::string_view submatch[4];

  // Text without a host:port pair does not match.
  absl::string_view text = "zyzzyva";
  ASSERT_FALSE(host_port.Match(text, 0, text.size(), RE2::UNANCHORED, submatch,
                               ABSL_ARRAYSIZE(submatch)));

  // Text with a pair matches; submatch[0] is the whole match and
  // submatch[1..3] are the capturing groups.
  text = "a chrisr:9000 here";
  ASSERT_TRUE(host_port.Match(text, 0, text.size(), RE2::UNANCHORED, submatch,
                              ABSL_ARRAYSIZE(submatch)));
  ASSERT_EQ(submatch[0], "chrisr:9000");
  ASSERT_EQ(submatch[1], "chrisr:9000");
  ASSERT_EQ(submatch[2], "chrisr");
  ASSERT_EQ(submatch[3], "9000");

  std::string whole;
  std::string host;
  int port;
  ASSERT_TRUE(RE2::PartialMatch("a chrisr:9000 here", host_port, &whole, &host,
                                &port));
  ASSERT_EQ(whole, "chrisr:9000");
  ASSERT_EQ(host, "chrisr");
  ASSERT_EQ(port, 9000);
}
372
TestRecursion(int size,const char * pattern)373 static void TestRecursion(int size, const char* pattern) {
374 // Fill up a string repeating the pattern given
375 std::string domain;
376 domain.resize(size);
377 size_t patlen = strlen(pattern);
378 for (int i = 0; i < size; i++) {
379 domain[i] = pattern[i % patlen];
380 }
381 // Just make sure it doesn't crash due to too much recursion.
382 RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet);
383 RE2::FullMatch(domain, re);
384 }
385
386 // A meta-quoted string, interpreted as a pattern, should always match
387 // the original unquoted string.
TestQuoteMeta(const std::string & unquoted,const RE2::Options & options=RE2::DefaultOptions)388 static void TestQuoteMeta(const std::string& unquoted,
389 const RE2::Options& options = RE2::DefaultOptions) {
390 std::string quoted = RE2::QuoteMeta(unquoted);
391 RE2 re(quoted, options);
392 EXPECT_TRUE(RE2::FullMatch(unquoted, re))
393 << "Unquoted='" << unquoted << "', quoted='" << quoted << "'.";
394 }
395
396 // A meta-quoted string, interpreted as a pattern, should always match
397 // the original unquoted string.
NegativeTestQuoteMeta(const std::string & unquoted,const std::string & should_not_match,const RE2::Options & options=RE2::DefaultOptions)398 static void NegativeTestQuoteMeta(
399 const std::string& unquoted, const std::string& should_not_match,
400 const RE2::Options& options = RE2::DefaultOptions) {
401 std::string quoted = RE2::QuoteMeta(unquoted);
402 RE2 re(quoted, options);
403 EXPECT_FALSE(RE2::FullMatch(should_not_match, re))
404 << "Unquoted='" << unquoted << "', quoted='" << quoted << "'.";
405 }
406
407 // Tests that quoted meta characters match their original strings,
408 // and that a few things that shouldn't match indeed do not.
TEST(QuoteMeta,Simple)409 TEST(QuoteMeta, Simple) {
410 TestQuoteMeta("foo");
411 TestQuoteMeta("foo.bar");
412 TestQuoteMeta("foo\\.bar");
413 TestQuoteMeta("[1-9]");
414 TestQuoteMeta("1.5-2.0?");
415 TestQuoteMeta("\\d");
416 TestQuoteMeta("Who doesn't like ice cream?");
417 TestQuoteMeta("((a|b)c?d*e+[f-h]i)");
418 TestQuoteMeta("((?!)xxx).*yyy");
419 TestQuoteMeta("([");
420 }
// A quoted pattern must behave as literal text: it must not match strings
// that the unquoted text, read as a regexp, would have matched.
TEST(QuoteMeta, SimpleNegative) {
  struct Case {
    const char* unquoted;
    const char* should_not_match;
  };
  static const Case kCases[] = {
      { "foo", "bar" },
      { "...", "bar" },
      { "\\.", "." },
      { "\\.", ".." },
      { "(a)", "a" },
      { "(a|b)", "a" },
      { "(a|b)", "(a)" },
      { "(a|b)", "a|b" },
      { "[0-9]", "0" },
      { "[0-9]", "0-9" },
      { "[0-9]", "[9]" },
      { "((?!)xxx)", "xxx" },
  };
  for (const Case& c : kCases) {
    NegativeTestQuoteMeta(c.unquoted, c.should_not_match);
  }
}
435
// QuoteMeta round-trips a string containing the non-ASCII byte 0xb2 when the
// pattern is compiled with the Latin-1 options.
TEST(QuoteMeta, Latin1) {
  const std::string text = "3\xb2 = 9";
  TestQuoteMeta(text, RE2::Latin1);
}
439
// QuoteMeta must round-trip UTF-8 text containing 2-, 3- and 4-byte sequences.
TEST(QuoteMeta, UTF8) {
  static const char* const kInputs[] = {
      "Plácido Domingo",
      "xyz",                  // No fancy utf8.
      "\xc2\xb0",             // 2-byte utf8 -- a degree symbol.
      "27\xc2\xb0 degrees",   // As a middle character.
      "\xe2\x80\xb3",         // 3-byte utf8 -- a double prime.
      "\xf0\x9d\x85\x9f",     // 4-byte utf8 -- a music note.
      "27\xc2\xb0",           // Interpreted as Latin-1, this should
                              // still work.
  };
  for (const char* input : kInputs) {
    TestQuoteMeta(input);
  }

  // The quoted form of "27" followed by a degree symbol (2-byte utf8) must
  // not match text that has a literal backslash in front of each byte.
  NegativeTestQuoteMeta("27\xc2\xb0",
                        "27\\\xc2\\\xb0");
}
452
// QuoteMeta must handle strings that contain an embedded NUL byte.
TEST(QuoteMeta, HasNull) {
  // A string holding exactly one NUL character.
  std::string with_nul(1, '\0');
  TestQuoteMeta(with_nul);
  NegativeTestQuoteMeta(with_nul, "");

  // NUL followed by '1' must not end up being read as the escape '\01'.
  with_nul.push_back('1');
  TestQuoteMeta(with_nul);
  NegativeTestQuoteMeta(with_nul, "\1");
}
466
// The forward and the reverse program sizes are both expected to be positive
// and to grow strictly from the simple pattern to the medium one to the
// complex one.
TEST(ProgramSize, BigProgram) {
  RE2 re_simple("simple regexp");
  RE2 re_medium("medium.*regexp");
  RE2 re_complex("complex.{1,128}regexp");

  // Forward programs.
  ASSERT_LT(0, re_simple.ProgramSize());
  ASSERT_LT(re_simple.ProgramSize(), re_medium.ProgramSize());
  ASSERT_LT(re_medium.ProgramSize(), re_complex.ProgramSize());

  // Reverse programs.
  ASSERT_LT(0, re_simple.ReverseProgramSize());
  ASSERT_LT(re_simple.ReverseProgramSize(), re_medium.ReverseProgramSize());
  ASSERT_LT(re_medium.ReverseProgramSize(), re_complex.ReverseProgramSize());
}
480
// For nested-repetition patterns with repeat counts 1, 10, 100 and 1000,
// checks the value returned by ProgramFanout / ReverseProgramFanout (the
// largest non-empty histogram bucket) and how many elements that bucket holds.
TEST(ProgramFanout, BigProgram) {
  RE2 re1("(?:(?:(?:(?:(?:.)?){1})*)+)");
  RE2 re10("(?:(?:(?:(?:(?:.)?){10})*)+)");
  RE2 re100("(?:(?:(?:(?:(?:.)?){100})*)+)");
  RE2 re1000("(?:(?:(?:(?:(?:.)?){1000})*)+)");

  struct Expectation {
    const RE2* re;
    int largest_bucket;  // expected return value
    int bucket_count;    // expected number of elements in that bucket
  };

  std::vector<int> histogram;

  // Forward programs.
  const Expectation forward[] = {
      { &re1, 3, 2 },
      { &re10, 6, 11 },
      { &re100, 9, 101 },
      { &re1000, 13, 1001 },
  };
  for (const Expectation& want : forward) {
    ASSERT_EQ(want.largest_bucket, want.re->ProgramFanout(&histogram));
    ASSERT_EQ(want.bucket_count, histogram[want.largest_bucket]);
  }

  // Reverse programs.
  const Expectation reverse[] = {
      { &re1, 2, 2 },
      { &re10, 5, 11 },
      { &re100, 9, 101 },
      { &re1000, 12, 1001 },
  };
  for (const Expectation& want : reverse) {
    ASSERT_EQ(want.largest_bucket, want.re->ReverseProgramFanout(&histogram));
    ASSERT_EQ(want.bucket_count, histogram[want.largest_bucket]);
  }
}
521
522 // Issue 956519: handling empty character sets was
523 // causing NULL dereference. This tests a few empty character sets.
524 // (The way to get an empty character set is to negate a full one.)
TEST(EmptyCharset,Fuzz)525 TEST(EmptyCharset, Fuzz) {
526 static const char *empties[] = {
527 "[^\\S\\s]",
528 "[^\\S[:space:]]",
529 "[^\\D\\d]",
530 "[^\\D[:digit:]]"
531 };
532 for (size_t i = 0; i < ABSL_ARRAYSIZE(empties); i++)
533 ASSERT_FALSE(RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0));
534 }
535
536 // Bitstate assumes that kInstFail instructions in
537 // alternations or capture groups have been "compiled away".
TEST(EmptyCharset,BitstateAssumptions)538 TEST(EmptyCharset, BitstateAssumptions) {
539 // Captures trigger use of Bitstate.
540 static const char *nop_empties[] = {
541 "((((()))))" "[^\\S\\s]?",
542 "((((()))))" "([^\\S\\s])?",
543 "((((()))))" "([^\\S\\s]|[^\\S\\s])?",
544 "((((()))))" "(([^\\S\\s]|[^\\S\\s])|)"
545 };
546 absl::string_view group[6];
547 for (size_t i = 0; i < ABSL_ARRAYSIZE(nop_empties); i++)
548 ASSERT_TRUE(RE2(nop_empties[i]).Match("", 0, 0, RE2::UNANCHORED, group, 6));
549 }
550
551 // Test that named groups work correctly.
TEST(Capture,NamedGroups)552 TEST(Capture, NamedGroups) {
553 {
554 RE2 re("(hello world)");
555 ASSERT_EQ(re.NumberOfCapturingGroups(), 1);
556 const std::map<std::string, int>& m = re.NamedCapturingGroups();
557 ASSERT_EQ(m.size(), 0);
558 }
559
560 {
561 RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))");
562 ASSERT_EQ(re.NumberOfCapturingGroups(), 6);
563 const std::map<std::string, int>& m = re.NamedCapturingGroups();
564 ASSERT_EQ(m.size(), 4);
565 ASSERT_EQ(m.find("A")->second, 1);
566 ASSERT_EQ(m.find("B")->second, 2);
567 ASSERT_EQ(m.find("C")->second, 3);
568 ASSERT_EQ(m.find("D")->second, 6); // $4 and $5 are anonymous
569 }
570 }
571
// Matches a pattern with two named groups through FullMatchN, then uses the
// name-to-index map to locate each group's captured text in the output array.
TEST(RE2, CapturedGroupTest) {
  RE2 re("directions from (?P<S>.*) to (?P<D>.*)");
  int num_groups = re.NumberOfCapturingGroups();
  // Fatal: num_groups is passed to FullMatchN as the argument count for
  // matches[], so an unexpected value must stop the test here.
  ASSERT_EQ(2, num_groups);
  std::string args[4];
  RE2::Arg arg0(&args[0]);
  RE2::Arg arg1(&args[1]);
  RE2::Arg arg2(&args[2]);
  RE2::Arg arg3(&args[3]);

  const RE2::Arg* const matches[4] = {&arg0, &arg1, &arg2, &arg3};
  EXPECT_TRUE(RE2::FullMatchN("directions from mountain view to san jose",
                              re, matches, num_groups));
  const std::map<std::string, int>& named_groups = re.NamedCapturingGroups();
  // Fatal: both iterators are dereferenced below.
  const std::map<std::string, int>::const_iterator source =
      named_groups.find("S");
  const std::map<std::string, int>::const_iterator destination =
      named_groups.find("D");
  ASSERT_TRUE(source != named_groups.end());
  ASSERT_TRUE(destination != named_groups.end());

  // The named group index is 1-based.
  // Fatal: the indices are used to subscript args[] below.
  int source_group_index = source->second;
  int destination_group_index = destination->second;
  ASSERT_EQ(1, source_group_index);
  ASSERT_EQ(2, destination_group_index);

  // The args is zero-based.
  EXPECT_EQ("mountain view", args[source_group_index - 1]);
  EXPECT_EQ("san jose", args[destination_group_index - 1]);
}
599
// FullMatch without output arguments: the pattern has to cover the text from
// its first character to its last.
TEST(RE2, FullMatchWithNoArgs) {
  struct Case {
    const char* text;
    const char* pattern;
  };
  static const Case kMatching[] = {
      { "h", "h" },
      { "hello", "hello" },
      { "hello", "h.*o" },
  };
  static const Case kNotMatching[] = {
      { "othello", "h.*o" },  // Must be anchored at front
      { "hello!", "h.*o" },   // Must be anchored at end
  };

  for (const Case& c : kMatching) {
    ASSERT_TRUE(RE2::FullMatch(c.text, c.pattern));
  }
  for (const Case& c : kNotMatching) {
    ASSERT_FALSE(RE2::FullMatch(c.text, c.pattern));
  }
}
607
// PartialMatch without output arguments: the pattern may match anywhere in
// the text, including inside twenty levels of nested groups.
TEST(RE2, PartialMatch) {
  struct Case {
    const char* text;
    const char* pattern;
  };
  static const Case kMatching[] = {
      { "x", "x" },
      { "hello", "h.*o" },
      { "othello", "h.*o" },
      { "hello!", "h.*o" },
      { "x", "((((((((((((((((((((x))))))))))))))))))))" },
  };
  for (const Case& c : kMatching) {
    ASSERT_TRUE(RE2::PartialMatch(c.text, c.pattern));
  }
}
615
// Exercises RE2::PartialMatchN with zero, one and two output arguments,
// covering both a successful and a failing match at each count.
TEST(RE2, PartialMatchN) {
  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No output arguments.
  EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", slot_ptrs, 0));
  EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", slot_ptrs, 0));

  // One integer output argument.
  int number;
  slots[0] = &number;
  EXPECT_TRUE(RE2::PartialMatchN("1001 nights", "(\\d+)", slot_ptrs, 1));
  EXPECT_EQ(1001, number);
  EXPECT_FALSE(RE2::PartialMatchN("three", "(\\d+)", slot_ptrs, 1));

  // Integer plus string output arguments.
  std::string word;
  slots[1] = &word;
  EXPECT_TRUE(
      RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", slot_ptrs, 2));
  EXPECT_EQ(42, number);
  EXPECT_EQ("life", word);
  EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", slot_ptrs, 2));
}
639
// FullMatch with no output arguments on an all-digit string.
TEST(RE2, FullMatchZeroArg) {
  const bool matched = RE2::FullMatch("1001", "\\d+");
  ASSERT_TRUE(matched);
}
644
// FullMatch with one int output argument: valid numbers are parsed, while an
// empty capture or a 40-digit number makes the whole call fail.
TEST(RE2, FullMatchOneArg) {
  int parsed;

  ASSERT_TRUE(RE2::FullMatch("1001", "(\\d+)", &parsed));
  ASSERT_EQ(parsed, 1001);

  ASSERT_TRUE(RE2::FullMatch("-123", "(-?\\d+)", &parsed));
  ASSERT_EQ(parsed, -123);

  // The capture group is empty here.
  ASSERT_FALSE(RE2::FullMatch("10", "()\\d+", &parsed));

  // Forty digits.
  const char* const kHugeNumber = "1234567890123456789012345678901234567890";
  ASSERT_FALSE(RE2::FullMatch(kHugeNumber, "(\\d+)", &parsed));
}
657
// Only the digits inside the capture group are parsed into the int argument,
// even when more digits surround the group.
TEST(RE2, FullMatchIntegerArg) {
  struct Case {
    const char* text;
    const char* pattern;
    int expected;
  };
  static const Case kFullMatches[] = {
      { "1234", "1(\\d*)4", 23 },
      { "1234", "(\\d)\\d+", 1 },
      { "-1234", "(-\\d)\\d+", -1 },
  };
  static const Case kPartialMatches[] = {
      { "1234", "(\\d)", 1 },
      { "-1234", "(-\\d)", -1 },
  };

  int parsed;
  for (const Case& c : kFullMatches) {
    ASSERT_TRUE(RE2::FullMatch(c.text, c.pattern, &parsed));
    ASSERT_EQ(parsed, c.expected);
  }
  for (const Case& c : kPartialMatches) {
    ASSERT_TRUE(RE2::PartialMatch(c.text, c.pattern, &parsed));
    ASSERT_EQ(parsed, c.expected);
  }
}
673
// FullMatch stores the captured text in a std::string argument.
TEST(RE2, FullMatchStringArg) {
  std::string captured;
  const bool matched = RE2::FullMatch("hello", "h(.*)o", &captured);
  ASSERT_TRUE(matched);
  ASSERT_EQ(captured, std::string("ell"));
}
680
// FullMatch stores a capture in an absl::string_view argument, alongside an
// int argument for the second group.
TEST(RE2, FullMatchStringViewArg) {
  absl::string_view name;
  int number;
  ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &name, &number));

  // Compare size and bytes separately.
  ASSERT_EQ(name.size(), 4);
  ASSERT_TRUE(memcmp(name.data(), "ruby", 4) == 0);
  ASSERT_EQ(number, 1234);
}
690
// FullMatch with two output arguments of different types (string and int).
TEST(RE2, FullMatchMultiArg) {
  std::string name;
  int number;
  ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &name, &number));
  ASSERT_EQ(name, std::string("ruby"));
  ASSERT_EQ(number, 1234);
}
699
// Exercises RE2::FullMatchN with zero, one and two output arguments,
// covering both a successful and a failing match at each count.
TEST(RE2, FullMatchN) {
  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No output arguments.
  EXPECT_TRUE(RE2::FullMatchN("hello", "h.*o", slot_ptrs, 0));
  EXPECT_FALSE(RE2::FullMatchN("othello", "h.*o", slot_ptrs, 0));

  // One integer output argument.
  int number;
  slots[0] = &number;
  EXPECT_TRUE(RE2::FullMatchN("1001", "(\\d+)", slot_ptrs, 1));
  EXPECT_EQ(1001, number);
  EXPECT_FALSE(RE2::FullMatchN("three", "(\\d+)", slot_ptrs, 1));

  // Integer plus string output arguments.
  std::string word;
  slots[1] = &word;
  EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", slot_ptrs, 2));
  EXPECT_EQ(42, number);
  EXPECT_EQ("life", word);
  EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", slot_ptrs, 2));
}
723
// A null void* or a nullptr in an argument position means "ignore this
// capture group"; the other arguments are still filled in.
TEST(RE2, FullMatchIgnoredArg) {
  const char* const kThreeGroups = "(\\w+)(:)(\\d+)";
  std::string name;
  int number;

  // Old-school NULL, typed as void*, should be ignored.
  void* const ignored = NULL;
  ASSERT_TRUE(
      RE2::FullMatch("ruby:1234", kThreeGroups, &name, ignored, &number));
  ASSERT_EQ(name, std::string("ruby"));
  ASSERT_EQ(number, 1234);

  // C++11 nullptr should also be ignored.
  ASSERT_TRUE(
      RE2::FullMatch("rubz:1235", kThreeGroups, &name, nullptr, &number));
  ASSERT_EQ(name, std::string("rubz"));
  ASSERT_EQ(number, 1235);
}
739
// A null pointer of a concrete (non-void) type stores nothing, but the
// captured text must still be parseable as that type for the match to succeed.
TEST(RE2, FullMatchTypedNullArg) {
  std::string s;

  // Null pointers of each pointee type used below.
  char* const null_char = nullptr;
  std::string* const null_string = nullptr;
  absl::string_view* const null_view = nullptr;
  int* const null_int = nullptr;
  long long* const null_long_long = nullptr;
  double* const null_double = nullptr;
  float* const null_float = nullptr;

  const char* const kAnything = "(.*)";

  // Ignore non-void* NULL arg
  ASSERT_TRUE(RE2::FullMatch("hello", "he(.*)lo", null_char));
  ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", null_string));
  ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", null_view));
  ASSERT_TRUE(RE2::FullMatch("1234", kAnything, null_int));
  ASSERT_TRUE(RE2::FullMatch("1234567890123456", kAnything, null_long_long));
  ASSERT_TRUE(RE2::FullMatch("123.4567890123456", kAnything, null_double));
  ASSERT_TRUE(RE2::FullMatch("123.4567890123456", kAnything, null_float));

  // Fail on non-void* NULL arg if the match doesn't parse for the given type.
  ASSERT_FALSE(RE2::FullMatch("hello", "h(.*)lo", &s, null_char));
  ASSERT_FALSE(RE2::FullMatch("hello", kAnything, null_int));
  ASSERT_FALSE(RE2::FullMatch("1234567890123456", kAnything, null_int));
  ASSERT_FALSE(RE2::FullMatch("hello", kAnything, null_double));
  ASSERT_FALSE(RE2::FullMatch("hello", kAnything, null_float));
}
759
760 // Check that numeric parsing code does not read past the end of
761 // the number being parsed.
762 // This implementation requires mmap(2) et al. and thus cannot
763 // be used unless they are available.
TEST(RE2,NULTerminated)764 TEST(RE2, NULTerminated) {
765 #if defined(_POSIX_MAPPED_FILES) && _POSIX_MAPPED_FILES > 0
766 char *v;
767 int x;
768 long pagesize = sysconf(_SC_PAGE_SIZE);
769
770 #ifndef MAP_ANONYMOUS
771 #define MAP_ANONYMOUS MAP_ANON
772 #endif
773 v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE,
774 MAP_ANONYMOUS|MAP_PRIVATE, -1, 0));
775 ASSERT_TRUE(v != reinterpret_cast<char*>(-1));
776 LOG(INFO) << "Memory at " << (void*)v;
777 ASSERT_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno;
778 v[pagesize - 1] = '1';
779
780 x = 0;
781 ASSERT_TRUE(
782 RE2::FullMatch(absl::string_view(v + pagesize - 1, 1), "(.*)", &x));
783 ASSERT_EQ(x, 1);
784 #endif
785 }
786
// Parses captures into character types and into each fixed-width integer
// type, checking the extreme representable values, the first values outside
// the range, and inputs padded with a thousand leading zeros.
TEST(RE2, FullMatchTypeTests) {
  const std::string zeros(1000, '0');
  const char* const kSigned = "(-?\\d+)";
  const char* const kUnsigned = "(\\d+)";

  // char
  {
    char ch;
    ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &ch));
    ASSERT_EQ(ch, 'H');
  }

  // unsigned char
  {
    unsigned char ch;
    ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &ch));
    ASSERT_EQ(ch, static_cast<unsigned char>('H'));
  }

  // int16_t
  {
    int16_t parsed;
    ASSERT_TRUE(RE2::FullMatch("100", kSigned, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("-100", kSigned, &parsed));
    ASSERT_EQ(parsed, -100);
    ASSERT_TRUE(RE2::FullMatch("32767", kSigned, &parsed));
    ASSERT_EQ(parsed, 32767);
    ASSERT_TRUE(RE2::FullMatch("-32768", kSigned, &parsed));
    ASSERT_EQ(parsed, -32768);
    ASSERT_FALSE(RE2::FullMatch("-32769", kSigned, &parsed));
    ASSERT_FALSE(RE2::FullMatch("32768", kSigned, &parsed));
  }

  // uint16_t
  {
    uint16_t parsed;
    ASSERT_TRUE(RE2::FullMatch("100", kUnsigned, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("32767", kUnsigned, &parsed));
    ASSERT_EQ(parsed, 32767);
    ASSERT_TRUE(RE2::FullMatch("65535", kUnsigned, &parsed));
    ASSERT_EQ(parsed, 65535);
    ASSERT_FALSE(RE2::FullMatch("65536", kUnsigned, &parsed));
  }

  // int32_t
  {
    int32_t parsed;
    static const int32_t max = INT32_C(0x7fffffff);
    static const int32_t min = -max - 1;
    ASSERT_TRUE(RE2::FullMatch("100", kSigned, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("-100", kSigned, &parsed));
    ASSERT_EQ(parsed, -100);
    ASSERT_TRUE(RE2::FullMatch("2147483647", kSigned, &parsed));
    ASSERT_EQ(parsed, max);
    ASSERT_TRUE(RE2::FullMatch("-2147483648", kSigned, &parsed));
    ASSERT_EQ(parsed, min);
    ASSERT_FALSE(RE2::FullMatch("-2147483649", kSigned, &parsed));
    ASSERT_FALSE(RE2::FullMatch("2147483648", kSigned, &parsed));

    // The same limits with a thousand leading zeros.
    ASSERT_TRUE(RE2::FullMatch(zeros + "2147483647", kSigned, &parsed));
    ASSERT_EQ(parsed, max);
    ASSERT_TRUE(RE2::FullMatch("-" + zeros + "2147483648", kSigned, &parsed));
    ASSERT_EQ(parsed, min);
    ASSERT_FALSE(RE2::FullMatch("-" + zeros + "2147483649", kSigned, &parsed));

    // CRadix takes a single "0x" prefix but not "000x".
    ASSERT_TRUE(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&parsed)));
    ASSERT_EQ(parsed, max);
    ASSERT_FALSE(RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&parsed)));
  }

  // uint32_t
  {
    uint32_t parsed;
    static const uint32_t max = UINT32_C(0xffffffff);
    ASSERT_TRUE(RE2::FullMatch("100", kUnsigned, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("4294967295", kUnsigned, &parsed));
    ASSERT_EQ(parsed, max);
    ASSERT_FALSE(RE2::FullMatch("4294967296", kUnsigned, &parsed));
    ASSERT_FALSE(RE2::FullMatch("-1", kUnsigned, &parsed));

    ASSERT_TRUE(RE2::FullMatch(zeros + "4294967295", kUnsigned, &parsed));
    ASSERT_EQ(parsed, max);
  }

  // int64_t
  {
    int64_t parsed;
    static const int64_t max = INT64_C(0x7fffffffffffffff);
    static const int64_t min = -max - 1;
    std::string text;

    ASSERT_TRUE(RE2::FullMatch("100", kSigned, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("-100", kSigned, &parsed));
    ASSERT_EQ(parsed, -100);

    text = std::to_string(max);
    ASSERT_TRUE(RE2::FullMatch(text, kSigned, &parsed));
    ASSERT_EQ(parsed, max);

    text = std::to_string(min);
    ASSERT_TRUE(RE2::FullMatch(text, kSigned, &parsed));
    ASSERT_EQ(parsed, min);

    // Bumping the last digit of each limit moves it just out of range.
    text = std::to_string(max);
    ASSERT_NE(text.back(), '9');
    ++text.back();
    ASSERT_FALSE(RE2::FullMatch(text, kSigned, &parsed));

    text = std::to_string(min);
    ASSERT_NE(text.back(), '9');
    ++text.back();
    ASSERT_FALSE(RE2::FullMatch(text, kSigned, &parsed));
  }

  // uint64_t
  {
    uint64_t parsed;
    int64_t parsed_signed;
    static const uint64_t max = UINT64_C(0xffffffffffffffff);
    std::string text;

    ASSERT_TRUE(RE2::FullMatch("100", kSigned, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("-100", kSigned, &parsed_signed));
    ASSERT_EQ(parsed_signed, -100);

    text = std::to_string(max);
    ASSERT_TRUE(RE2::FullMatch(text, kSigned, &parsed));
    ASSERT_EQ(parsed, max);

    // Bumping the last digit of the limit moves it just out of range.
    ASSERT_NE(text.back(), '9');
    ++text.back();
    ASSERT_FALSE(RE2::FullMatch(text, kSigned, &parsed));
  }
}
889
// Parses captures into float and double, including exponent notation, leading
// whitespace, a thousand leading zeros, and rounding edge cases.
TEST(RE2, FloatingPointFullMatchTypes) {
  const std::string zeros(1000, '0');
  const char* const kAnything = "(.*)";

  // float
  {
    float parsed;
    ASSERT_TRUE(RE2::FullMatch("100", kAnything, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("-100.", kAnything, &parsed));
    ASSERT_EQ(parsed, -100);
    ASSERT_TRUE(RE2::FullMatch("1e23", kAnything, &parsed));
    ASSERT_EQ(parsed, static_cast<float>(1e23));
    ASSERT_TRUE(RE2::FullMatch(" 100", kAnything, &parsed));
    ASSERT_EQ(parsed, 100);

    ASSERT_TRUE(RE2::FullMatch(zeros + "1e23", kAnything, &parsed));
    ASSERT_EQ(parsed, static_cast<float>(1e23));

    // 6700000000081920.1 is an edge case.
    // 6700000000081920 lies exactly halfway between two float32 values,
    // so the trailing .1 should make the result round up.
    // A float64 cannot represent that .1, though: the nearest float64
    // is 6700000000081920 itself.  A parser that calls strtod and then
    // narrows to float32 therefore rounds to even, i.e. down rather
    // than up.  To pass, the parser has to call strtof directly.
    // The input is deliberately only 17 digits long, because C does not
    // guarantee correctly rounded results from strtod and strtof unless
    // the input is short.
    //
    // This is known to fail on Cygwin and MinGW due to a broken
    // implementation of strtof(3). And apparently MSVC too. Sigh.
#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__)
    ASSERT_TRUE(RE2::FullMatch("0.1", kAnything, &parsed));
    ASSERT_EQ(parsed, 0.1f) << absl::StrFormat("%.8g != %.8g", parsed, 0.1f);
    ASSERT_TRUE(RE2::FullMatch("6700000000081920.1", kAnything, &parsed));
    ASSERT_EQ(parsed, 6700000000081920.1f)
        << absl::StrFormat("%.8g != %.8g", parsed, 6700000000081920.1f);
#endif
  }

  // double
  {
    double parsed;
    ASSERT_TRUE(RE2::FullMatch("100", kAnything, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("-100.", kAnything, &parsed));
    ASSERT_EQ(parsed, -100);
    ASSERT_TRUE(RE2::FullMatch("1e23", kAnything, &parsed));
    ASSERT_EQ(parsed, 1e23);
    ASSERT_TRUE(RE2::FullMatch(zeros + "1e23", kAnything, &parsed));
    ASSERT_EQ(parsed, static_cast<double>(1e23));

    ASSERT_TRUE(RE2::FullMatch("0.1", kAnything, &parsed));
    ASSERT_EQ(parsed, 0.1) << absl::StrFormat("%.17g != %.17g", parsed, 0.1);
    ASSERT_TRUE(RE2::FullMatch("1.00000005960464485", kAnything, &parsed));
    ASSERT_EQ(parsed, 1.0000000596046448)
        << absl::StrFormat("%.17g != %.17g", parsed, 1.0000000596046448);
  }
}
940
TEST(RE2, FullMatchAnchored) {
  // FullMatch is anchored at both ends: a pattern that covers only part of
  // the input must fail, whichever side the extra character is on.
  int number;
  ASSERT_FALSE(RE2::FullMatch("x1001", "(\\d+)", &number));
  ASSERT_FALSE(RE2::FullMatch("1001x", "(\\d+)", &number));

  // Once the pattern accounts for the extra character, the match succeeds
  // and the digits are extracted.
  ASSERT_TRUE(RE2::FullMatch("x1001", "x(\\d+)", &number));
  ASSERT_EQ(number, 1001);
  ASSERT_TRUE(RE2::FullMatch("1001x", "(\\d+)x", &number));
  ASSERT_EQ(number, 1001);
}
949
TEST(RE2, FullMatchBraces) {
  // Counted repetition with an open upper bound: five or more characters
  // from the class are required.
  static const char kAtLeastFive[] = "[0-9a-f+.-]{5,}";
  ASSERT_TRUE(RE2::FullMatch("0abcd", kAtLeastFive));    // exactly five
  ASSERT_TRUE(RE2::FullMatch("0abcde", kAtLeastFive));   // six
  ASSERT_FALSE(RE2::FullMatch("0abc", kAtLeastFive));    // only four
}
956
TEST(RE2, Complicated) {
  // Alternation mixing literals with a character class.
  static const char kAlternation[] = "foo|bar|[A-Z]";
  ASSERT_TRUE(RE2::FullMatch("foo", kAlternation));
  ASSERT_TRUE(RE2::FullMatch("bar", kAlternation));
  ASSERT_TRUE(RE2::FullMatch("X", kAlternation));
  // Two uppercase letters are not covered by any single alternative.
  ASSERT_FALSE(RE2::FullMatch("XY", kAlternation));
}
964
TEST(RE2, FullMatchEnd) {
  // Full-match handling needs a '$' tacked on internally, so either branch
  // of an alternation must be able to match the entire input.
  static const char kPlain[] = "fo|foo";
  static const char kWithDollar[] = "fo|foo$";
  ASSERT_TRUE(RE2::FullMatch("fo", kPlain));
  ASSERT_TRUE(RE2::FullMatch("foo", kPlain));
  ASSERT_TRUE(RE2::FullMatch("fo", kWithDollar));
  ASSERT_TRUE(RE2::FullMatch("foo", kWithDollar));
  ASSERT_TRUE(RE2::FullMatch("foo", "foo$"));
  ASSERT_FALSE(RE2::FullMatch("foo$bar", "foo\\$"));
  ASSERT_FALSE(RE2::FullMatch("fox", "fo|bar"));

  // Deliberately disabled. Enable this if the handling of '$' is ever
  // changed so that it no longer matches before a trailing newline.
  if (false) {
    // Guards against pcre's special case in which '$' matches before a
    // '\n' that ends the string.
    ASSERT_FALSE(RE2::PartialMatch("foo\n", "foo$"));
  }
}
983
TEST(RE2, FullMatchArgCount) {
  // Exercises FullMatch with zero, one through seven, and sixteen output
  // arguments. Every capture group is a single digit, so after a successful
  // match a[i] must equal the i-th digit of the input text.
  int a[16];
  ASSERT_TRUE(RE2::FullMatch("", ""));

  // The whole array is cleared before each case so that a value left over
  // from an earlier case can never satisfy a later check. Note that this
  // must be sizeof(a): sizeof(0) is merely sizeof(int) and would clear
  // only a[0].
  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("1", "(\\d){1}", &a[0]));
  ASSERT_EQ(a[0], 1);

  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("12", "(\\d)(\\d)", &a[0], &a[1]));
  ASSERT_EQ(a[0], 1);
  ASSERT_EQ(a[1], 2);

  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("123", "(\\d)(\\d)(\\d)", &a[0], &a[1], &a[2]));
  ASSERT_EQ(a[0], 1);
  ASSERT_EQ(a[1], 2);
  ASSERT_EQ(a[2], 3);

  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1],
                             &a[2], &a[3]));
  ASSERT_EQ(a[0], 1);
  ASSERT_EQ(a[1], 2);
  ASSERT_EQ(a[2], 3);
  ASSERT_EQ(a[3], 4);

  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("12345", "(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1],
                             &a[2], &a[3], &a[4]));
  ASSERT_EQ(a[0], 1);
  ASSERT_EQ(a[1], 2);
  ASSERT_EQ(a[2], 3);
  ASSERT_EQ(a[3], 4);
  ASSERT_EQ(a[4], 5);

  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("123456", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0],
                             &a[1], &a[2], &a[3], &a[4], &a[5]));
  ASSERT_EQ(a[0], 1);
  ASSERT_EQ(a[1], 2);
  ASSERT_EQ(a[2], 3);
  ASSERT_EQ(a[3], 4);
  ASSERT_EQ(a[4], 5);
  ASSERT_EQ(a[5], 6);

  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("1234567", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
                             &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6]));
  ASSERT_EQ(a[0], 1);
  ASSERT_EQ(a[1], 2);
  ASSERT_EQ(a[2], 3);
  ASSERT_EQ(a[3], 4);
  ASSERT_EQ(a[4], 5);
  ASSERT_EQ(a[5], 6);
  ASSERT_EQ(a[6], 7);

  // Sixteen arguments: one per element of a.
  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("1234567890123456",
                             "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)"
                             "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
                             &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
                             &a[7], &a[8], &a[9], &a[10], &a[11], &a[12],
                             &a[13], &a[14], &a[15]));
  ASSERT_EQ(a[0], 1);
  ASSERT_EQ(a[1], 2);
  ASSERT_EQ(a[2], 3);
  ASSERT_EQ(a[3], 4);
  ASSERT_EQ(a[4], 5);
  ASSERT_EQ(a[5], 6);
  ASSERT_EQ(a[6], 7);
  ASSERT_EQ(a[7], 8);
  ASSERT_EQ(a[8], 9);
  ASSERT_EQ(a[9], 0);
  ASSERT_EQ(a[10], 1);
  ASSERT_EQ(a[11], 2);
  ASSERT_EQ(a[12], 3);
  ASSERT_EQ(a[13], 4);
  ASSERT_EQ(a[14], 5);
  ASSERT_EQ(a[15], 6);
}
1066
TEST(RE2, Accessors) {
  // pattern() must hand back exactly the string the RE2 was built from.
  {
    const std::string source = "http://([^/]+)/.*";
    const RE2 compiled(source);
    ASSERT_EQ(source, compiled.pattern());
  }

  // A valid pattern reports "no error" through every error accessor.
  {
    RE2 valid("foo");
    ASSERT_TRUE(valid.error().empty());  // error string must be empty
    ASSERT_TRUE(valid.ok());
    ASSERT_EQ(valid.error_code(), RE2::NoError);
  }
}
1083
TEST(RE2, UTF8) {
  // Checks how multi-byte UTF-8 text is handled when the pattern is compiled
  // without options versus with RE2::Latin1 (byte-at-a-time matching).

  // Three Japanese characters (nihongo), nine bytes of UTF-8, NUL-terminated.
  const char utf8_string[] = {
    static_cast<char>(0xe6), static_cast<char>(0x97),
    static_cast<char>(0xa5),  // U+65E5
    static_cast<char>(0xe6), static_cast<char>(0x9c),
    static_cast<char>(0xac),  // U+672C
    static_cast<char>(0xe8), static_cast<char>(0xaa),
    static_cast<char>(0x9e),  // U+8A9E
    0
  };
  // '.', then the middle character of utf8_string, then '.'.
  const char utf8_pattern[] = {
    '.',
    static_cast<char>(0xe6), static_cast<char>(0x9c),
    static_cast<char>(0xac),  // U+672C
    '.',
    0
  };

  // Both should match in either mode, bytes or UTF-8: nine dots cover the
  // nine bytes, three dots cover the three characters.
  RE2 re_test1(".........", RE2::Latin1);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test1));
  RE2 re_test2("...");
  ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test2));

  // Check that '.' matches one byte or one UTF-8 character
  // according to the mode.
  std::string s;
  RE2 re_test3("(.)", RE2::Latin1);
  ASSERT_TRUE(RE2::PartialMatch(utf8_string, re_test3, &s));
  ASSERT_EQ(s, std::string("\xe6"));
  RE2 re_test4("(.)");
  ASSERT_TRUE(RE2::PartialMatch(utf8_string, re_test4, &s));
  ASSERT_EQ(s, std::string("\xe6\x97\xa5"));

  // Check that the string matches itself in either mode.
  RE2 re_test5(utf8_string, RE2::Latin1);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test5));
  RE2 re_test6(utf8_string);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test6));

  // Check that the pattern matches the string only in UTF-8 mode: in Latin-1
  // mode each '.' consumes a single byte, so the pattern covers five bytes
  // of the nine-byte input.
  RE2 re_test7(utf8_pattern, RE2::Latin1);
  ASSERT_FALSE(RE2::FullMatch(utf8_string, re_test7));
  RE2 re_test8(utf8_pattern);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test8));
}
1128
TEST(RE2, UngreedyUTF8) {
  // Ungreedy UTF-8 regular expressions must not match when they
  // oughtn't -- see bug 82246.
  const std::string subject = "a aX";

  // Greedy form: this always worked.
  {
    const char* greedy = "\\w+X";
    RE2 latin1_re(greedy, RE2::Latin1);
    RE2 utf8_re(greedy);

    ASSERT_FALSE(RE2::FullMatch(subject, latin1_re));
    ASSERT_FALSE(RE2::FullMatch(subject, utf8_re));
  }

  // Ungreedy form, selected with the (?U) flag.
  {
    const char* ungreedy = "(?U)\\w+X";
    RE2 latin1_re(ungreedy, RE2::Latin1);
    ASSERT_EQ(latin1_re.error(), "");
    RE2 utf8_re(ungreedy);

    ASSERT_FALSE(RE2::FullMatch(subject, latin1_re));
    ASSERT_FALSE(RE2::FullMatch(subject, utf8_re));
  }
}
1153
TEST(RE2, Rejects) {
  // Each of these patterns must fail to compile. RE2::Quiet is passed so
  // that the expected failures are not logged.
  static const char* const kRejected[] = {
    "a\\1",
    "a[x",
    "a[z-a]",
    "a[[:foobar:]]",
    "a(b",
    "a\\",
  };
  for (const char* pattern : kRejected) {
    RE2 re(pattern, RE2::Quiet);
    ASSERT_FALSE(re.ok());
  }
}
1179
TEST(RE2, NoCrash) {
  // Matching against a regexp that failed to compile must not crash; it
  // simply reports no match.
  {
    RE2 broken("a\\", RE2::Quiet);
    ASSERT_FALSE(broken.ok());
    ASSERT_FALSE(RE2::PartialMatch("a\\b", broken));
  }

  // An enormous regexp is rejected, and using it afterwards must not crash.
  {
    RE2 enormous("(((.{100}){100}){100}){100}", RE2::Quiet);
    ASSERT_FALSE(enormous.ok());
    ASSERT_FALSE(RE2::PartialMatch("aaa", enormous));
  }

  // A crazy (but acceptable) regexp still compiles and runs.
  {
    RE2 large(".{512}x", RE2::Quiet);
    ASSERT_TRUE(large.ok());
    std::string subject(515, 'c');
    subject += "x";
    ASSERT_TRUE(RE2::PartialMatch(subject, large));
  }
}
1205
TEST(RE2, Recursion) {
  // Checks that recursion is stopped. This is a PCRE-legacy test:
  // there is no recursion in RE2.
  int bytes = 15 * 1024;  // enough to crash PCRE
  static const char* const kPatterns[] = {".", "a", "a.", "ab.", "abc."};
  for (const char* pattern : kPatterns) {
    TestRecursion(bytes, pattern);
  }
}
1216
TEST(RE2, BigCountedRepetition) {
  // Counted repetition must work when given tons of memory.
  RE2::Options roomy;
  roomy.set_max_mem(256<<20);

  RE2 re(".{512}x", roomy);
  ASSERT_TRUE(re.ok());

  // 515 filler characters followed by the 'x' the pattern ends with.
  std::string subject(515, 'c');
  subject += "x";
  ASSERT_TRUE(RE2::PartialMatch(subject, re));
}
1229
TEST(RE2, DeepRecursion) {
  // Deep stack recursion check. Before pcre was patched this input caused a
  // segmentation violation from stack overflow. Like the Recursion test it
  // is a PCRE legacy: RE2 doesn't recurse.

  // "x*", then 131072 'a' characters, then "*x".
  const std::string body(131072, 'a');
  std::string comment = "x*";
  comment.append(body);
  comment.append("*x");

  RE2 re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)");
  ASSERT_TRUE(RE2::FullMatch(comment, re));
}
1242
1243 // Suggested by Josh Hyman. Failed when SearchOnePass was
1244 // not implementing case-folding.
TEST(CaseInsensitive,MatchAndConsume)1245 TEST(CaseInsensitive, MatchAndConsume) {
1246 std::string text = "A fish named *Wanda*";
1247 absl::string_view sp(text);
1248 absl::string_view result;
1249 EXPECT_TRUE(RE2::PartialMatch(text, "(?i)([wand]{5})", &result));
1250 EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result));
1251 }
1252
1253 // RE2 should permit implicit conversions from string, string_view, const char*,
1254 // and C string literals.
TEST(RE2,ImplicitConversions)1255 TEST(RE2, ImplicitConversions) {
1256 std::string re_string(".");
1257 absl::string_view re_string_view(".");
1258 const char* re_c_string = ".";
1259 EXPECT_TRUE(RE2::PartialMatch("e", re_string));
1260 EXPECT_TRUE(RE2::PartialMatch("e", re_string_view));
1261 EXPECT_TRUE(RE2::PartialMatch("e", re_c_string));
1262 EXPECT_TRUE(RE2::PartialMatch("e", "."));
1263 }
1264
1265 // Bugs introduced by 8622304
TEST(RE2,CL8622304)1266 TEST(RE2, CL8622304) {
1267 // reported by ingow
1268 std::string dir;
1269 EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])")); // ok
1270 EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &dir)); // fails
1271
1272 // reported by jacobsa
1273 std::string key, val;
1274 EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true",
1275 "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?",
1276 &key,
1277 &val));
1278 EXPECT_EQ(key, "bar");
1279 EXPECT_EQ(val, "1,0x2F,030,4,5");
1280 }
1281
1282 // Check that RE2 returns correct regexp pieces on error.
1283 // In particular, make sure it returns whole runes
1284 // and that it always reports invalid UTF-8.
1285 // Also check that Perl error flag piece is big enough.
1286 static struct ErrorTest {
1287 const char *regexp;
1288 RE2::ErrorCode error_code;
1289 const char *error_arg;
1290 } error_tests[] = {
1291 { "ab\\αcd", RE2::ErrorBadEscape, "\\α" },
1292 { "ef\\x☺01", RE2::ErrorBadEscape, "\\x☺0" },
1293 { "gh\\x1☺01", RE2::ErrorBadEscape, "\\x1☺" },
1294 { "ij\\x1", RE2::ErrorBadEscape, "\\x1" },
1295 { "kl\\x", RE2::ErrorBadEscape, "\\x" },
1296 { "uv\\x{0000☺}", RE2::ErrorBadEscape, "\\x{0000☺" },
1297 { "wx\\p{ABC", RE2::ErrorBadCharRange, "\\p{ABC" },
1298 // used to return (?s but the error is X
1299 { "yz(?smiUX:abc)", RE2::ErrorBadPerlOp, "(?smiUX" },
1300 { "aa(?sm☺i", RE2::ErrorBadPerlOp, "(?sm☺" },
1301 { "bb[abc", RE2::ErrorMissingBracket, "[abc" },
1302 { "abc(def", RE2::ErrorMissingParen, "abc(def" },
1303 { "abc)def", RE2::ErrorUnexpectedParen, "abc)def" },
1304
1305 // no argument string returned for invalid UTF-8
1306 { "mn\\x1\377", RE2::ErrorBadUTF8, "" },
1307 { "op\377qr", RE2::ErrorBadUTF8, "" },
1308 { "st\\x{00000\377", RE2::ErrorBadUTF8, "" },
1309 { "zz\\p{\377}", RE2::ErrorBadUTF8, "" },
1310 { "zz\\x{00\377}", RE2::ErrorBadUTF8, "" },
1311 { "zz(?P<name\377>abc)", RE2::ErrorBadUTF8, "" },
1312 };
TEST(RE2,ErrorCodeAndArg)1313 TEST(RE2, ErrorCodeAndArg) {
1314 for (size_t i = 0; i < ABSL_ARRAYSIZE(error_tests); i++) {
1315 RE2 re(error_tests[i].regexp, RE2::Quiet);
1316 EXPECT_FALSE(re.ok());
1317 EXPECT_EQ(re.error_code(), error_tests[i].error_code) << re.error();
1318 EXPECT_EQ(re.error_arg(), error_tests[i].error_arg) << re.error();
1319 }
1320 }
1321
// Check that "never match \n" mode never matches \n.
// Each entry gives a pattern, the text to search, and the expected contents
// of the first capture group; a null `match` means no match is expected.
static struct NeverTest {
  const char* regexp;
  const char* text;
  const char* match;
} never_tests[] = {
  { "(.*)", "abc\ndef\nghi\n", "abc" },
  { "(?s)(abc.*def)", "abc\ndef\n", nullptr },
  { "(abc(.|\n)*def)", "abc\ndef\n", nullptr },
  { "(abc[^x]*def)", "abc\ndef\n", nullptr },
  { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" },
};
TEST(RE2, NeverNewline) {
  RE2::Options opt;
  opt.set_never_nl(true);
  for (const NeverTest& t : never_tests) {
    RE2 re(t.regexp, opt);
    // PartialMatch is a static member, so it is called through the class
    // rather than through the `re` instance that is also its argument.
    if (t.match == nullptr) {
      EXPECT_FALSE(RE2::PartialMatch(t.text, re));
    } else {
      absl::string_view m;
      EXPECT_TRUE(RE2::PartialMatch(t.text, re, &m));
      EXPECT_EQ(m, t.match);
    }
  }
}
1349
1350 // Check that dot_nl option works.
TEST(RE2,DotNL)1351 TEST(RE2, DotNL) {
1352 RE2::Options opt;
1353 opt.set_dot_nl(true);
1354 EXPECT_TRUE(RE2::PartialMatch("\n", RE2(".", opt)));
1355 EXPECT_FALSE(RE2::PartialMatch("\n", RE2("(?-s).", opt)));
1356 opt.set_never_nl(true);
1357 EXPECT_FALSE(RE2::PartialMatch("\n", RE2(".", opt)));
1358 }
1359
1360 // Check that there are no capturing groups in "never capture" mode.
TEST(RE2,NeverCapture)1361 TEST(RE2, NeverCapture) {
1362 RE2::Options opt;
1363 opt.set_never_capture(true);
1364 RE2 re("(r)(e)", opt);
1365 EXPECT_EQ(0, re.NumberOfCapturingGroups());
1366 }
1367
1368 // Bitstate bug was looking at submatch[0] even if nsubmatch == 0.
1369 // Triggered by a failed DFA search falling back to Bitstate when
1370 // using Match with a NULL submatch set. Bitstate tried to read
1371 // the submatch[0] entry even if nsubmatch was 0.
TEST(RE2,BitstateCaptureBug)1372 TEST(RE2, BitstateCaptureBug) {
1373 RE2::Options opt;
1374 opt.set_max_mem(20000);
1375 RE2 re("(_________$)", opt);
1376 absl::string_view s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x";
1377 EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0));
1378 }
1379
1380 // C++ version of bug 609710.
TEST(RE2,UnicodeClasses)1381 TEST(RE2, UnicodeClasses) {
1382 const std::string str = "ABCDEFGHI譚永鋒";
1383 std::string a, b, c;
1384
1385 EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}"));
1386 EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}"));
1387 EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}"));
1388 EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}"));
1389 EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}"));
1390 EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}"));
1391
1392 EXPECT_TRUE(RE2::FullMatch("譚", "\\p{L}"));
1393 EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Lu}"));
1394 EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Ll}"));
1395 EXPECT_FALSE(RE2::FullMatch("譚", "\\P{L}"));
1396 EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Lu}"));
1397 EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Ll}"));
1398
1399 EXPECT_TRUE(RE2::FullMatch("永", "\\p{L}"));
1400 EXPECT_FALSE(RE2::FullMatch("永", "\\p{Lu}"));
1401 EXPECT_FALSE(RE2::FullMatch("永", "\\p{Ll}"));
1402 EXPECT_FALSE(RE2::FullMatch("永", "\\P{L}"));
1403 EXPECT_TRUE(RE2::FullMatch("永", "\\P{Lu}"));
1404 EXPECT_TRUE(RE2::FullMatch("永", "\\P{Ll}"));
1405
1406 EXPECT_TRUE(RE2::FullMatch("鋒", "\\p{L}"));
1407 EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Lu}"));
1408 EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Ll}"));
1409 EXPECT_FALSE(RE2::FullMatch("鋒", "\\P{L}"));
1410 EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Lu}"));
1411 EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Ll}"));
1412
1413 EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)", &a, &b, &c));
1414 EXPECT_EQ("A", a);
1415 EXPECT_EQ("B", b);
1416 EXPECT_EQ("C", c);
1417
1418 EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)", &a, &b, &c));
1419 EXPECT_EQ("A", a);
1420 EXPECT_EQ("B", b);
1421 EXPECT_EQ("C", c);
1422
1423 EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}"));
1424
1425 EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)", &a, &b, &c));
1426 EXPECT_EQ("A", a);
1427 EXPECT_EQ("B", b);
1428 EXPECT_EQ("C", c);
1429
1430 EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]"));
1431
1432 EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &a, &b, &c));
1433 EXPECT_EQ("譚", a);
1434 EXPECT_EQ("永", b);
1435 EXPECT_EQ("鋒", c);
1436 }
1437
TEST(RE2, LazyRE2) {
  // One LazyRE2 is built without options and one with them.
  static LazyRE2 lazy_default = {"a"};
  static LazyRE2 lazy_latin1 = {"b", RE2::Latin1};

  // Without options the encoding is expected to be UTF-8.
  EXPECT_EQ("a", lazy_default->pattern());
  EXPECT_EQ(RE2::Options::EncodingUTF8, lazy_default->options().encoding());

  // The Latin1 option must be carried through to the underlying RE2.
  EXPECT_EQ("b", lazy_latin1->pattern());
  EXPECT_EQ(RE2::Options::EncodingLatin1, lazy_latin1->options().encoding());
}
1449
1450 // Bug reported by saito. 2009/02/17
TEST(RE2,NullVsEmptyString)1451 TEST(RE2, NullVsEmptyString) {
1452 RE2 re(".*");
1453 EXPECT_TRUE(re.ok());
1454
1455 absl::string_view null;
1456 EXPECT_TRUE(RE2::FullMatch(null, re));
1457
1458 absl::string_view empty("");
1459 EXPECT_TRUE(RE2::FullMatch(empty, re));
1460 }
1461
1462 // Similar to the previous test, check that the null string and the empty
1463 // string both match, but also that the null string can only provide null
1464 // submatches whereas the empty string can also provide empty submatches.
TEST(RE2,NullVsEmptyStringSubmatches)1465 TEST(RE2, NullVsEmptyStringSubmatches) {
1466 RE2 re("()|(foo)");
1467 EXPECT_TRUE(re.ok());
1468
1469 // matches[0] is overall match, [1] is (), [2] is (foo), [3] is nonexistent.
1470 absl::string_view matches[4];
1471
1472 for (size_t i = 0; i < ABSL_ARRAYSIZE(matches); i++)
1473 matches[i] = "bar";
1474
1475 absl::string_view null;
1476 EXPECT_TRUE(re.Match(null, 0, null.size(), RE2::UNANCHORED,
1477 matches, ABSL_ARRAYSIZE(matches)));
1478 for (size_t i = 0; i < ABSL_ARRAYSIZE(matches); i++) {
1479 EXPECT_TRUE(matches[i].data() == NULL); // always null
1480 EXPECT_TRUE(matches[i].empty());
1481 }
1482
1483 for (size_t i = 0; i < ABSL_ARRAYSIZE(matches); i++)
1484 matches[i] = "bar";
1485
1486 absl::string_view empty("");
1487 EXPECT_TRUE(re.Match(empty, 0, empty.size(), RE2::UNANCHORED,
1488 matches, ABSL_ARRAYSIZE(matches)));
1489 EXPECT_TRUE(matches[0].data() != NULL); // empty, not null
1490 EXPECT_TRUE(matches[0].empty());
1491 EXPECT_TRUE(matches[1].data() != NULL); // empty, not null
1492 EXPECT_TRUE(matches[1].empty());
1493 EXPECT_TRUE(matches[2].data() == NULL);
1494 EXPECT_TRUE(matches[2].empty());
1495 EXPECT_TRUE(matches[3].data() == NULL);
1496 EXPECT_TRUE(matches[3].empty());
1497 }
1498
1499 // Issue 1816809
TEST(RE2,Bug1816809)1500 TEST(RE2, Bug1816809) {
1501 RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))");
1502 absl::string_view piece("llx-3;llx4");
1503 std::string x;
1504 EXPECT_TRUE(RE2::Consume(&piece, re, &x));
1505 }
1506
1507 // Issue 3061120
TEST(RE2,Bug3061120)1508 TEST(RE2, Bug3061120) {
1509 RE2 re("(?i)\\W");
1510 EXPECT_FALSE(RE2::PartialMatch("x", re)); // always worked
1511 EXPECT_FALSE(RE2::PartialMatch("k", re)); // broke because of kelvin
1512 EXPECT_FALSE(RE2::PartialMatch("s", re)); // broke because of latin long s
1513 }
1514
TEST(RE2, CapturingGroupNames) {
  // Opening parentheses annotated with group IDs:
  //      12    3        45   6         7
  RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))");
  EXPECT_TRUE(re.ok());

  // Only the named groups (3, 6 and 7) are expected in the map.
  const std::map<int, std::string> want = {
    {3, "G2"},
    {6, "G2"},
    {7, "G1"},
  };
  const std::map<int, std::string>& have = re.CapturingGroupNames();
  EXPECT_EQ(want, have);
}
1527
TEST(RE2, RegexpToStringLossOfAnchor) {
  // Converting the parsed regexp back to a string must keep its anchors.
  // With RE2::POSIX the anchors print as written; with no options they
  // print wrapped in (?-m:...).
  {
    const RE2 posix_begin("^[a-c]at", RE2::POSIX);
    EXPECT_EQ(posix_begin.Regexp()->ToString(), "^[a-c]at");
    const RE2 default_begin("^[a-c]at");
    EXPECT_EQ(default_begin.Regexp()->ToString(), "(?-m:^)[a-c]at");
  }
  {
    const RE2 posix_end("ca[t-z]$", RE2::POSIX);
    EXPECT_EQ(posix_end.Regexp()->ToString(), "ca[t-z]$");
    const RE2 default_end("ca[t-z]$");
    EXPECT_EQ(default_end.Regexp()->ToString(), "ca[t-z](?-m:$)");
  }
}
1534
1535 // Issue 10131674
TEST(RE2,Bug10131674)1536 TEST(RE2, Bug10131674) {
1537 // Some of these escapes describe values that do not fit in a byte.
1538 RE2 re("\\140\\440\\174\\271\\150\\656\\106\\201\\004\\332", RE2::Latin1);
1539 EXPECT_FALSE(re.ok());
1540 EXPECT_FALSE(RE2::FullMatch("hello world", re));
1541 }
1542
TEST(RE2, Bug18391750) {
  // Stray write past the end of match_ in nfa.cc, caught by fuzzing plus
  // address sanitizer. The same 35 bytes (followed by the terminating NUL)
  // serve as both the pattern and the text.
  const char fuzz_input[] =
      "\x28\x28\xfc\xfc\x08\x08"
      "\x26\x26\x28\xc2\x9b\xc5"
      "\xc5\xd4\x8f\x8f\x69\x69"
      "\xe7\x29\x7b\x37\x31\x31"
      "\x7d\xae\x7c\x7c\xf3\x29"
      "\xae\xae\x2e\x2a\x29";

  RE2::Options options;
  options.set_encoding(RE2::Options::EncodingLatin1);
  options.set_longest_match(true);
  options.set_dot_nl(true);
  options.set_case_sensitive(false);

  RE2 re(fuzz_input, options);
  ASSERT_TRUE(re.ok());
  // The result is not checked; the call only has to run.
  RE2::PartialMatch(fuzz_input, re);
}
1562
TEST(RE2, Bug18458852) {
  // The parser used to accept an invalid (too large) rune, which made the
  // compiler fail a DCHECK in the UTF-8 character class code. The pattern
  // must now be rejected.
  const char bad_rune_pattern[] =
      "\x28\x05\x05\x41\x41\x28"
      "\x24\x5b\x5e\xf5\x87\x87"
      "\x90\x29\x5d\x29\x29";
  RE2 re(bad_rune_pattern);
  ASSERT_FALSE(re.ok());
}
1575
TEST(RE2, Bug18523943) {
  // Bug in BitState: case kFailInst failed the match entirely.

  // Text to search and the pattern, both given as raw bytes.
  const char text[] = "\x29\x29\x24";
  const char pattern[] = "\x28\x0a\x2a\x2a\x29";

  RE2::Options options;
  options.set_log_errors(false);
  options.set_encoding(RE2::Options::EncodingLatin1);
  options.set_posix_syntax(true);
  options.set_longest_match(true);
  options.set_literal(false);
  options.set_never_nl(true);

  RE2 re(pattern, options);
  ASSERT_TRUE(re.ok());
  std::string captured;
  ASSERT_TRUE(RE2::PartialMatch(text, re, &captured));
}
1598
TEST(RE2, Bug21371806) {
  // The parser used to accept Unicode groups in Latin-1 mode in a way that
  // made the compiler fail a DCHECK in prog.cc. The pattern must compile
  // cleanly.
  RE2::Options latin1;
  latin1.set_encoding(RE2::Options::EncodingLatin1);

  RE2 with_unicode_group("g\\p{Zl}]", latin1);
  ASSERT_TRUE(with_unicode_group.ok());
}
1609
TEST(RE2, Bug26356109) {
  // Parser bug caused by factoring common prefixes out of alternations.
  //
  // This pattern used to be factored to "a\\C*?[bc]", so the automaton
  // consumed "ab" and stopped (when unanchored), whereas first-match
  // semantics require it to consume all of "abc".
  RE2 re("a\\C*?c|a\\C*?b");
  ASSERT_TRUE(re.ok());

  const std::string text = "abc";
  absl::string_view match;

  ASSERT_TRUE(re.Match(text, 0, text.size(), RE2::UNANCHORED, &match, 1));
  ASSERT_EQ(match, text)
      << " (UNANCHORED) got m='" << match << "', want '" << text << "'";

  ASSERT_TRUE(re.Match(text, 0, text.size(), RE2::ANCHOR_BOTH, &match, 1));
  ASSERT_EQ(match, text)
      << " (ANCHOR_BOTH) got m='" << match << "', want '" << text << "'";
}
1628
TEST(RE2, Issue104) {
  // RE2::GlobalReplace always advanced by one byte after matching the empty
  // string, which clobbered any rune longer than one byte. Every case below
  // matches the empty string at three positions.
  std::string subject;

  // One-byte runes.
  subject = "bc";
  ASSERT_EQ(3, RE2::GlobalReplace(&subject, "a*", "d"));
  ASSERT_EQ("dbdcd", subject);

  // Multi-byte runes.
  subject = "ąć";
  ASSERT_EQ(3, RE2::GlobalReplace(&subject, "Ć*", "Ĉ"));
  ASSERT_EQ("ĈąĈćĈ", subject);

  subject = "人类";
  ASSERT_EQ(3, RE2::GlobalReplace(&subject, "大*", "小"));
  ASSERT_EQ("小人小类小", subject);
}
1645
TEST(RE2, Issue310) {
  // (?:|a)* used to match more text than (?:|a)+ did. Both are expected to
  // produce the empty match at the start of the text.
  const std::string text = "aaa";
  absl::string_view match;

  RE2 zero_or_more("(?:|a)*");
  ASSERT_TRUE(
      zero_or_more.Match(text, 0, text.size(), RE2::UNANCHORED, &match, 1));
  ASSERT_EQ(match, "") << " got m='" << match << "', want ''";

  RE2 one_or_more("(?:|a)+");
  ASSERT_TRUE(
      one_or_more.Match(text, 0, text.size(), RE2::UNANCHORED, &match, 1));
  ASSERT_EQ(match, "") << " got m='" << match << "', want ''";
}
1660
TEST(RE2, Issue477) {
  // Regexp::LeadingString didn't output Latin1 into flags. In this pattern
  // 0xA5 should be factored out of the alternation without losing its
  // Latin1-ness. Because it did lose it, the prefix used for acceleration
  // was 0xC2 0xA5 rather than 0xA5; the former never occurs in the input,
  // so no replacements were made.

  // Eight raw bytes, intentionally without a terminating NUL.
  const char raw[] = {
    static_cast<char>(0xa5), static_cast<char>(0xd1),
    static_cast<char>(0xa5), static_cast<char>(0xd1),
    static_cast<char>(0x61), static_cast<char>(0x63),
    static_cast<char>(0xa5), static_cast<char>(0x64),
  };
  std::string subject(raw, ABSL_ARRAYSIZE(raw));

  RE2 re("\xa5\xd1|\xa5\x64", RE2::Latin1);
  int replacements = RE2::GlobalReplace(&subject, re, "");
  ASSERT_EQ(replacements, 3);
  ASSERT_EQ(subject, "\x61\x63");
}
1679
1680 } // namespace re2
1681