// Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi
5*ccdc9c3eSSadaf Ebrahimi #include <string.h>
6*ccdc9c3eSSadaf Ebrahimi #include <string>
7*ccdc9c3eSSadaf Ebrahimi #include <vector>
8*ccdc9c3eSSadaf Ebrahimi
9*ccdc9c3eSSadaf Ebrahimi #include "util/test.h"
10*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
11*ccdc9c3eSSadaf Ebrahimi #include "util/strutil.h"
12*ccdc9c3eSSadaf Ebrahimi #include "re2/prog.h"
13*ccdc9c3eSSadaf Ebrahimi #include "re2/re2.h"
14*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
15*ccdc9c3eSSadaf Ebrahimi #include "re2/testing/exhaustive_tester.h"
16*ccdc9c3eSSadaf Ebrahimi #include "re2/testing/regexp_generator.h"
17*ccdc9c3eSSadaf Ebrahimi #include "re2/testing/string_generator.h"
18*ccdc9c3eSSadaf Ebrahimi
19*ccdc9c3eSSadaf Ebrahimi namespace re2 {
20*ccdc9c3eSSadaf Ebrahimi
21*ccdc9c3eSSadaf Ebrahimi // Test that C++ strings are compared as uint8s, not int8s.
22*ccdc9c3eSSadaf Ebrahimi // PossibleMatchRange doesn't depend on this, but callers probably will.
TEST(CplusplusStrings,EightBit)23*ccdc9c3eSSadaf Ebrahimi TEST(CplusplusStrings, EightBit) {
24*ccdc9c3eSSadaf Ebrahimi string s = "\x70";
25*ccdc9c3eSSadaf Ebrahimi string t = "\xA0";
26*ccdc9c3eSSadaf Ebrahimi EXPECT_LT(s, t);
27*ccdc9c3eSSadaf Ebrahimi }
28*ccdc9c3eSSadaf Ebrahimi
// One hand-written expectation for PossibleMatchRange: asking for the range
// of `regexp` with length limit `maxlen` should produce exactly `min` and
// `max`.  Kept as a plain aggregate so the table below can brace-initialize
// rows positionally; do not reorder the fields.
struct PrefixTest {
  const char* regexp;  // Pattern text handed to the parser.
  int maxlen;          // Length limit passed to PossibleMatchRange.
  const char* min;     // Expected lower bound.
  const char* max;     // Expected upper bound.
};
35*ccdc9c3eSSadaf Ebrahimi
36*ccdc9c3eSSadaf Ebrahimi static PrefixTest tests[] = {
37*ccdc9c3eSSadaf Ebrahimi { "", 10, "", "", },
38*ccdc9c3eSSadaf Ebrahimi { "Abcdef", 10, "Abcdef", "Abcdef" },
39*ccdc9c3eSSadaf Ebrahimi { "abc(def|ghi)", 10, "abcdef", "abcghi" },
40*ccdc9c3eSSadaf Ebrahimi { "a+hello", 10, "aa", "ahello" },
41*ccdc9c3eSSadaf Ebrahimi { "a*hello", 10, "a", "hello" },
42*ccdc9c3eSSadaf Ebrahimi { "def|abc", 10, "abc", "def" },
43*ccdc9c3eSSadaf Ebrahimi { "a(b)(c)[d]", 10, "abcd", "abcd" },
44*ccdc9c3eSSadaf Ebrahimi { "ab(cab|cat)", 10, "abcab", "abcat" },
45*ccdc9c3eSSadaf Ebrahimi { "ab(cab|ca)x", 10, "abcabx", "abcax" },
46*ccdc9c3eSSadaf Ebrahimi { "(ab|x)(c|de)", 10, "abc", "xde" },
47*ccdc9c3eSSadaf Ebrahimi { "(ab|x)?(c|z)?", 10, "", "z" },
48*ccdc9c3eSSadaf Ebrahimi { "[^\\s\\S]", 10, "", "" },
49*ccdc9c3eSSadaf Ebrahimi { "(abc)+", 5, "abc", "abcac" },
50*ccdc9c3eSSadaf Ebrahimi { "(abc)+", 2, "ab", "ac" },
51*ccdc9c3eSSadaf Ebrahimi { "(abc)+", 1, "a", "b" },
52*ccdc9c3eSSadaf Ebrahimi { "[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
53*ccdc9c3eSSadaf Ebrahimi { "a*", 10, "", "ab" },
54*ccdc9c3eSSadaf Ebrahimi
55*ccdc9c3eSSadaf Ebrahimi { "(?i)Abcdef", 10, "ABCDEF", "abcdef" },
56*ccdc9c3eSSadaf Ebrahimi { "(?i)abc(def|ghi)", 10, "ABCDEF", "abcghi" },
57*ccdc9c3eSSadaf Ebrahimi { "(?i)a+hello", 10, "AA", "ahello" },
58*ccdc9c3eSSadaf Ebrahimi { "(?i)a*hello", 10, "A", "hello" },
59*ccdc9c3eSSadaf Ebrahimi { "(?i)def|abc", 10, "ABC", "def" },
60*ccdc9c3eSSadaf Ebrahimi { "(?i)a(b)(c)[d]", 10, "ABCD", "abcd" },
61*ccdc9c3eSSadaf Ebrahimi { "(?i)ab(cab|cat)", 10, "ABCAB", "abcat" },
62*ccdc9c3eSSadaf Ebrahimi { "(?i)ab(cab|ca)x", 10, "ABCABX", "abcax" },
63*ccdc9c3eSSadaf Ebrahimi { "(?i)(ab|x)(c|de)", 10, "ABC", "xde" },
64*ccdc9c3eSSadaf Ebrahimi { "(?i)(ab|x)?(c|z)?", 10, "", "z" },
65*ccdc9c3eSSadaf Ebrahimi { "(?i)[^\\s\\S]", 10, "", "" },
66*ccdc9c3eSSadaf Ebrahimi { "(?i)(abc)+", 5, "ABC", "abcac" },
67*ccdc9c3eSSadaf Ebrahimi { "(?i)(abc)+", 2, "AB", "ac" },
68*ccdc9c3eSSadaf Ebrahimi { "(?i)(abc)+", 1, "A", "b" },
69*ccdc9c3eSSadaf Ebrahimi { "(?i)[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
70*ccdc9c3eSSadaf Ebrahimi { "(?i)a*", 10, "", "ab" },
71*ccdc9c3eSSadaf Ebrahimi { "(?i)A*", 10, "", "ab" },
72*ccdc9c3eSSadaf Ebrahimi
73*ccdc9c3eSSadaf Ebrahimi { "\\AAbcdef", 10, "Abcdef", "Abcdef" },
74*ccdc9c3eSSadaf Ebrahimi { "\\Aabc(def|ghi)", 10, "abcdef", "abcghi" },
75*ccdc9c3eSSadaf Ebrahimi { "\\Aa+hello", 10, "aa", "ahello" },
76*ccdc9c3eSSadaf Ebrahimi { "\\Aa*hello", 10, "a", "hello" },
77*ccdc9c3eSSadaf Ebrahimi { "\\Adef|abc", 10, "abc", "def" },
78*ccdc9c3eSSadaf Ebrahimi { "\\Aa(b)(c)[d]", 10, "abcd", "abcd" },
79*ccdc9c3eSSadaf Ebrahimi { "\\Aab(cab|cat)", 10, "abcab", "abcat" },
80*ccdc9c3eSSadaf Ebrahimi { "\\Aab(cab|ca)x", 10, "abcabx", "abcax" },
81*ccdc9c3eSSadaf Ebrahimi { "\\A(ab|x)(c|de)", 10, "abc", "xde" },
82*ccdc9c3eSSadaf Ebrahimi { "\\A(ab|x)?(c|z)?", 10, "", "z" },
83*ccdc9c3eSSadaf Ebrahimi { "\\A[^\\s\\S]", 10, "", "" },
84*ccdc9c3eSSadaf Ebrahimi { "\\A(abc)+", 5, "abc", "abcac" },
85*ccdc9c3eSSadaf Ebrahimi { "\\A(abc)+", 2, "ab", "ac" },
86*ccdc9c3eSSadaf Ebrahimi { "\\A(abc)+", 1, "a", "b" },
87*ccdc9c3eSSadaf Ebrahimi { "\\A[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
88*ccdc9c3eSSadaf Ebrahimi { "\\Aa*", 10, "", "ab" },
89*ccdc9c3eSSadaf Ebrahimi
90*ccdc9c3eSSadaf Ebrahimi { "(?i)\\AAbcdef", 10, "ABCDEF", "abcdef" },
91*ccdc9c3eSSadaf Ebrahimi { "(?i)\\Aabc(def|ghi)", 10, "ABCDEF", "abcghi" },
92*ccdc9c3eSSadaf Ebrahimi { "(?i)\\Aa+hello", 10, "AA", "ahello" },
93*ccdc9c3eSSadaf Ebrahimi { "(?i)\\Aa*hello", 10, "A", "hello" },
94*ccdc9c3eSSadaf Ebrahimi { "(?i)\\Adef|abc", 10, "ABC", "def" },
95*ccdc9c3eSSadaf Ebrahimi { "(?i)\\Aa(b)(c)[d]", 10, "ABCD", "abcd" },
96*ccdc9c3eSSadaf Ebrahimi { "(?i)\\Aab(cab|cat)", 10, "ABCAB", "abcat" },
97*ccdc9c3eSSadaf Ebrahimi { "(?i)\\Aab(cab|ca)x", 10, "ABCABX", "abcax" },
98*ccdc9c3eSSadaf Ebrahimi { "(?i)\\A(ab|x)(c|de)", 10, "ABC", "xde" },
99*ccdc9c3eSSadaf Ebrahimi { "(?i)\\A(ab|x)?(c|z)?", 10, "", "z" },
100*ccdc9c3eSSadaf Ebrahimi { "(?i)\\A[^\\s\\S]", 10, "", "" },
101*ccdc9c3eSSadaf Ebrahimi { "(?i)\\A(abc)+", 5, "ABC", "abcac" },
102*ccdc9c3eSSadaf Ebrahimi { "(?i)\\A(abc)+", 2, "AB", "ac" },
103*ccdc9c3eSSadaf Ebrahimi { "(?i)\\A(abc)+", 1, "A", "b" },
104*ccdc9c3eSSadaf Ebrahimi { "(?i)\\A[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
105*ccdc9c3eSSadaf Ebrahimi { "(?i)\\Aa*", 10, "", "ab" },
106*ccdc9c3eSSadaf Ebrahimi { "(?i)\\AA*", 10, "", "ab" },
107*ccdc9c3eSSadaf Ebrahimi };
108*ccdc9c3eSSadaf Ebrahimi
// Runs every row of tests[] through PossibleMatchRange twice -- once through
// the low-level Regexp/Prog API (path 0) and once through the RE2 wrapper
// (path 1) -- and checks that both report exactly the expected min and max.
// Every failure message names the regexp, and the comparison failures also
// name the path, so a broken row can be identified from the log alone.
TEST(PossibleMatchRange, HandWritten) {
  for (int i = 0; i < arraysize(tests); i++) {
    const PrefixTest& t = tests[i];
    for (int j = 0; j < 2; j++) {
      std::string min, max;
      if (j == 0) {
        // Path 0: parse and compile by hand, then query the Prog directly.
        LOG(INFO) << "Checking regexp=" << CEscape(t.regexp);
        Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, nullptr);
        ASSERT_TRUE(re != nullptr) << " " << t.regexp;
        Prog* prog = re->CompileToProg(0);
        ASSERT_TRUE(prog != nullptr) << " " << t.regexp;
        ASSERT_TRUE(prog->PossibleMatchRange(&min, &max, t.maxlen))
            << " " << t.regexp;
        // The Prog is owned by this test; the Regexp is reference-counted.
        delete prog;
        re->Decref();
      } else {
        // Path 1: same query through the public RE2 interface.
        ASSERT_TRUE(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen))
            << " " << t.regexp;
      }
      EXPECT_EQ(t.min, min) << t.regexp << " (path " << j << ")";
      EXPECT_EQ(t.max, max) << t.regexp << " (path " << j << ")";
    }
  }
}
132*ccdc9c3eSSadaf Ebrahimi
133*ccdc9c3eSSadaf Ebrahimi // Test cases where PossibleMatchRange should return false.
TEST(PossibleMatchRange,Failures)134*ccdc9c3eSSadaf Ebrahimi TEST(PossibleMatchRange, Failures) {
135*ccdc9c3eSSadaf Ebrahimi string min, max;
136*ccdc9c3eSSadaf Ebrahimi
137*ccdc9c3eSSadaf Ebrahimi // Fails because no room to write max.
138*ccdc9c3eSSadaf Ebrahimi EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));
139*ccdc9c3eSSadaf Ebrahimi
140*ccdc9c3eSSadaf Ebrahimi // Fails because there is no max -- any non-empty string matches
141*ccdc9c3eSSadaf Ebrahimi // or begins a match. Have to use Latin-1 input, because there
142*ccdc9c3eSSadaf Ebrahimi // are no valid UTF-8 strings beginning with byte 0xFF.
143*ccdc9c3eSSadaf Ebrahimi EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1).
144*ccdc9c3eSSadaf Ebrahimi PossibleMatchRange(&min, &max, 10))
145*ccdc9c3eSSadaf Ebrahimi << "min=" << CEscape(min) << ", max=" << CEscape(max);
146*ccdc9c3eSSadaf Ebrahimi EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1).
147*ccdc9c3eSSadaf Ebrahimi PossibleMatchRange(&min, &max, 10))
148*ccdc9c3eSSadaf Ebrahimi << "min=" << CEscape(min) << ", max=" << CEscape(max);
149*ccdc9c3eSSadaf Ebrahimi EXPECT_FALSE(RE2(".+hello", RE2::Latin1).
150*ccdc9c3eSSadaf Ebrahimi PossibleMatchRange(&min, &max, 10))
151*ccdc9c3eSSadaf Ebrahimi << "min=" << CEscape(min) << ", max=" << CEscape(max);
152*ccdc9c3eSSadaf Ebrahimi EXPECT_FALSE(RE2(".*hello", RE2::Latin1).
153*ccdc9c3eSSadaf Ebrahimi PossibleMatchRange(&min, &max, 10))
154*ccdc9c3eSSadaf Ebrahimi << "min=" << CEscape(min) << ", max=" << CEscape(max);
155*ccdc9c3eSSadaf Ebrahimi EXPECT_FALSE(RE2(".*", RE2::Latin1).
156*ccdc9c3eSSadaf Ebrahimi PossibleMatchRange(&min, &max, 10))
157*ccdc9c3eSSadaf Ebrahimi << "min=" << CEscape(min) << ", max=" << CEscape(max);
158*ccdc9c3eSSadaf Ebrahimi EXPECT_FALSE(RE2("\\C*").
159*ccdc9c3eSSadaf Ebrahimi PossibleMatchRange(&min, &max, 10))
160*ccdc9c3eSSadaf Ebrahimi << "min=" << CEscape(min) << ", max=" << CEscape(max);
161*ccdc9c3eSSadaf Ebrahimi
162*ccdc9c3eSSadaf Ebrahimi // Fails because it's a malformed regexp.
163*ccdc9c3eSSadaf Ebrahimi EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
164*ccdc9c3eSSadaf Ebrahimi << "min=" << CEscape(min) << ", max=" << CEscape(max);
165*ccdc9c3eSSadaf Ebrahimi }
166*ccdc9c3eSSadaf Ebrahimi
167*ccdc9c3eSSadaf Ebrahimi // Exhaustive test: generate all regexps within parameters,
168*ccdc9c3eSSadaf Ebrahimi // then generate all strings of a given length over a given alphabet,
169*ccdc9c3eSSadaf Ebrahimi // then check that the prefix information agrees with whether
170*ccdc9c3eSSadaf Ebrahimi // the regexp matches each of the strings.
171*ccdc9c3eSSadaf Ebrahimi class PossibleMatchTester : public RegexpGenerator {
172*ccdc9c3eSSadaf Ebrahimi public:
PossibleMatchTester(int maxatoms,int maxops,const std::vector<string> & alphabet,const std::vector<string> & ops,int maxstrlen,const std::vector<string> & stralphabet)173*ccdc9c3eSSadaf Ebrahimi PossibleMatchTester(int maxatoms,
174*ccdc9c3eSSadaf Ebrahimi int maxops,
175*ccdc9c3eSSadaf Ebrahimi const std::vector<string>& alphabet,
176*ccdc9c3eSSadaf Ebrahimi const std::vector<string>& ops,
177*ccdc9c3eSSadaf Ebrahimi int maxstrlen,
178*ccdc9c3eSSadaf Ebrahimi const std::vector<string>& stralphabet)
179*ccdc9c3eSSadaf Ebrahimi : RegexpGenerator(maxatoms, maxops, alphabet, ops),
180*ccdc9c3eSSadaf Ebrahimi strgen_(maxstrlen, stralphabet),
181*ccdc9c3eSSadaf Ebrahimi regexps_(0), tests_(0) { }
182*ccdc9c3eSSadaf Ebrahimi
regexps()183*ccdc9c3eSSadaf Ebrahimi int regexps() { return regexps_; }
tests()184*ccdc9c3eSSadaf Ebrahimi int tests() { return tests_; }
185*ccdc9c3eSSadaf Ebrahimi
186*ccdc9c3eSSadaf Ebrahimi // Needed for RegexpGenerator interface.
187*ccdc9c3eSSadaf Ebrahimi void HandleRegexp(const string& regexp);
188*ccdc9c3eSSadaf Ebrahimi
189*ccdc9c3eSSadaf Ebrahimi private:
190*ccdc9c3eSSadaf Ebrahimi StringGenerator strgen_;
191*ccdc9c3eSSadaf Ebrahimi
192*ccdc9c3eSSadaf Ebrahimi int regexps_; // Number of HandleRegexp calls
193*ccdc9c3eSSadaf Ebrahimi int tests_; // Number of regexp tests.
194*ccdc9c3eSSadaf Ebrahimi
195*ccdc9c3eSSadaf Ebrahimi PossibleMatchTester(const PossibleMatchTester&) = delete;
196*ccdc9c3eSSadaf Ebrahimi PossibleMatchTester& operator=(const PossibleMatchTester&) = delete;
197*ccdc9c3eSSadaf Ebrahimi };
198*ccdc9c3eSSadaf Ebrahimi
199*ccdc9c3eSSadaf Ebrahimi // Processes a single generated regexp.
200*ccdc9c3eSSadaf Ebrahimi // Checks that all accepted strings agree with the prefix range.
HandleRegexp(const string & regexp)201*ccdc9c3eSSadaf Ebrahimi void PossibleMatchTester::HandleRegexp(const string& regexp) {
202*ccdc9c3eSSadaf Ebrahimi regexps_++;
203*ccdc9c3eSSadaf Ebrahimi
204*ccdc9c3eSSadaf Ebrahimi VLOG(3) << CEscape(regexp);
205*ccdc9c3eSSadaf Ebrahimi
206*ccdc9c3eSSadaf Ebrahimi RE2 re(regexp, RE2::Latin1);
207*ccdc9c3eSSadaf Ebrahimi ASSERT_EQ(re.error(), "");
208*ccdc9c3eSSadaf Ebrahimi
209*ccdc9c3eSSadaf Ebrahimi string min, max;
210*ccdc9c3eSSadaf Ebrahimi if(!re.PossibleMatchRange(&min, &max, 10)) {
211*ccdc9c3eSSadaf Ebrahimi // There's no good max for "\\C*". Can't use strcmp
212*ccdc9c3eSSadaf Ebrahimi // because sometimes it gets embedded in more
213*ccdc9c3eSSadaf Ebrahimi // complicated expressions.
214*ccdc9c3eSSadaf Ebrahimi if(strstr(regexp.c_str(), "\\C*"))
215*ccdc9c3eSSadaf Ebrahimi return;
216*ccdc9c3eSSadaf Ebrahimi LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp);
217*ccdc9c3eSSadaf Ebrahimi }
218*ccdc9c3eSSadaf Ebrahimi
219*ccdc9c3eSSadaf Ebrahimi strgen_.Reset();
220*ccdc9c3eSSadaf Ebrahimi while (strgen_.HasNext()) {
221*ccdc9c3eSSadaf Ebrahimi const StringPiece& s = strgen_.Next();
222*ccdc9c3eSSadaf Ebrahimi tests_++;
223*ccdc9c3eSSadaf Ebrahimi if (!RE2::FullMatch(s, re))
224*ccdc9c3eSSadaf Ebrahimi continue;
225*ccdc9c3eSSadaf Ebrahimi ASSERT_GE(s, min) << " regexp: " << regexp << " max: " << max;
226*ccdc9c3eSSadaf Ebrahimi ASSERT_LE(s, max) << " regexp: " << regexp << " min: " << min;
227*ccdc9c3eSSadaf Ebrahimi }
228*ccdc9c3eSSadaf Ebrahimi }
229*ccdc9c3eSSadaf Ebrahimi
// Drives PossibleMatchTester over all regexps built from the atoms
// "a", "b" and "[0-9]" with the egrep operators, checked against all
// strings over the characters of "ab4".  Debug builds use smaller limits
// (2 atoms, strings up to length 3) than optimized builds (3 atoms,
// strings up to length 5).
TEST(PossibleMatchRange, Exhaustive) {
  const int natom = RE2_DEBUG_MODE ? 2 : 3;
  const int noperator = 3;  // Same in both build modes.
  const int stringlen = RE2_DEBUG_MODE ? 3 : 5;

  PossibleMatchTester tester(natom, noperator, Split(" ", "a b [0-9]"),
                             RegexpGenerator::EgrepOps(),
                             stringlen, Explode("ab4"));
  tester.Generate();
  LOG(INFO) << tester.regexps() << " regexps, "
            << tester.tests() << " tests";
}
246*ccdc9c3eSSadaf Ebrahimi
247*ccdc9c3eSSadaf Ebrahimi } // namespace re2
248