xref: /aosp_15_r20/external/regex-re2/re2/testing/regexp_generator.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi // Regular expression generator: generates all possible
6*ccdc9c3eSSadaf Ebrahimi // regular expressions within parameters (see regexp_generator.h for details).
7*ccdc9c3eSSadaf Ebrahimi 
8*ccdc9c3eSSadaf Ebrahimi // The regexp generator first generates a sequence of commands in a simple
9*ccdc9c3eSSadaf Ebrahimi // postfix language.  Each command in the language is a string,
10*ccdc9c3eSSadaf Ebrahimi // like "a" or "%s*" or "%s|%s".
11*ccdc9c3eSSadaf Ebrahimi //
12*ccdc9c3eSSadaf Ebrahimi // To evaluate a command, enough arguments are popped from the value stack to
13*ccdc9c3eSSadaf Ebrahimi // plug into the %s slots.  Then the result is pushed onto the stack.
14*ccdc9c3eSSadaf Ebrahimi // For example, the command sequence
15*ccdc9c3eSSadaf Ebrahimi //      a b %s%s c
16*ccdc9c3eSSadaf Ebrahimi // results in the stack
17*ccdc9c3eSSadaf Ebrahimi //      ab c
18*ccdc9c3eSSadaf Ebrahimi //
19*ccdc9c3eSSadaf Ebrahimi // GeneratePostfix generates all possible command sequences.
20*ccdc9c3eSSadaf Ebrahimi // Then RunPostfix turns each sequence into a regular expression
21*ccdc9c3eSSadaf Ebrahimi // and passes the regexp to HandleRegexp.
22*ccdc9c3eSSadaf Ebrahimi 
23*ccdc9c3eSSadaf Ebrahimi #include <stddef.h>
24*ccdc9c3eSSadaf Ebrahimi #include <stdint.h>
25*ccdc9c3eSSadaf Ebrahimi #include <stdio.h>
26*ccdc9c3eSSadaf Ebrahimi #include <string.h>
27*ccdc9c3eSSadaf Ebrahimi #include <memory>
28*ccdc9c3eSSadaf Ebrahimi #include <stack>
29*ccdc9c3eSSadaf Ebrahimi #include <string>
30*ccdc9c3eSSadaf Ebrahimi #include <vector>
31*ccdc9c3eSSadaf Ebrahimi 
32*ccdc9c3eSSadaf Ebrahimi #include "util/test.h"
33*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
34*ccdc9c3eSSadaf Ebrahimi #include "util/strutil.h"
35*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h"
36*ccdc9c3eSSadaf Ebrahimi #include "re2/testing/regexp_generator.h"
37*ccdc9c3eSSadaf Ebrahimi 
38*ccdc9c3eSSadaf Ebrahimi namespace re2 {
39*ccdc9c3eSSadaf Ebrahimi 
40*ccdc9c3eSSadaf Ebrahimi // Returns a vector of the egrep regexp operators.
EgrepOps()41*ccdc9c3eSSadaf Ebrahimi const std::vector<string>& RegexpGenerator::EgrepOps() {
42*ccdc9c3eSSadaf Ebrahimi   static const char *ops[] = {
43*ccdc9c3eSSadaf Ebrahimi     "%s%s",
44*ccdc9c3eSSadaf Ebrahimi     "%s|%s",
45*ccdc9c3eSSadaf Ebrahimi     "%s*",
46*ccdc9c3eSSadaf Ebrahimi     "%s+",
47*ccdc9c3eSSadaf Ebrahimi     "%s?",
48*ccdc9c3eSSadaf Ebrahimi     "%s\\C*",
49*ccdc9c3eSSadaf Ebrahimi   };
50*ccdc9c3eSSadaf Ebrahimi   static std::vector<string> v(ops, ops + arraysize(ops));
51*ccdc9c3eSSadaf Ebrahimi   return v;
52*ccdc9c3eSSadaf Ebrahimi }
53*ccdc9c3eSSadaf Ebrahimi 
RegexpGenerator(int maxatoms,int maxops,const std::vector<string> & atoms,const std::vector<string> & ops)54*ccdc9c3eSSadaf Ebrahimi RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
55*ccdc9c3eSSadaf Ebrahimi                                  const std::vector<string>& atoms,
56*ccdc9c3eSSadaf Ebrahimi                                  const std::vector<string>& ops)
57*ccdc9c3eSSadaf Ebrahimi     : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) {
58*ccdc9c3eSSadaf Ebrahimi   // Degenerate case.
59*ccdc9c3eSSadaf Ebrahimi   if (atoms_.empty())
60*ccdc9c3eSSadaf Ebrahimi     maxatoms_ = 0;
61*ccdc9c3eSSadaf Ebrahimi   if (ops_.empty())
62*ccdc9c3eSSadaf Ebrahimi     maxops_ = 0;
63*ccdc9c3eSSadaf Ebrahimi }
64*ccdc9c3eSSadaf Ebrahimi 
65*ccdc9c3eSSadaf Ebrahimi // Generates all possible regular expressions (within the parameters),
66*ccdc9c3eSSadaf Ebrahimi // calling HandleRegexp for each one.
Generate()67*ccdc9c3eSSadaf Ebrahimi void RegexpGenerator::Generate() {
68*ccdc9c3eSSadaf Ebrahimi   std::vector<string> postfix;
69*ccdc9c3eSSadaf Ebrahimi   GeneratePostfix(&postfix, 0, 0, 0);
70*ccdc9c3eSSadaf Ebrahimi }
71*ccdc9c3eSSadaf Ebrahimi 
72*ccdc9c3eSSadaf Ebrahimi // Generates random regular expressions, calling HandleRegexp for each one.
GenerateRandom(int32_t seed,int n)73*ccdc9c3eSSadaf Ebrahimi void RegexpGenerator::GenerateRandom(int32_t seed, int n) {
74*ccdc9c3eSSadaf Ebrahimi   rng_.seed(seed);
75*ccdc9c3eSSadaf Ebrahimi 
76*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < n; i++) {
77*ccdc9c3eSSadaf Ebrahimi     std::vector<string> postfix;
78*ccdc9c3eSSadaf Ebrahimi     GenerateRandomPostfix(&postfix, 0, 0, 0);
79*ccdc9c3eSSadaf Ebrahimi   }
80*ccdc9c3eSSadaf Ebrahimi }
81*ccdc9c3eSSadaf Ebrahimi 
82*ccdc9c3eSSadaf Ebrahimi // Counts and returns the number of occurrences of "%s" in s.
CountArgs(const string & s)83*ccdc9c3eSSadaf Ebrahimi static int CountArgs(const string& s) {
84*ccdc9c3eSSadaf Ebrahimi   const char *p = s.c_str();
85*ccdc9c3eSSadaf Ebrahimi   int n = 0;
86*ccdc9c3eSSadaf Ebrahimi   while ((p = strstr(p, "%s")) != NULL) {
87*ccdc9c3eSSadaf Ebrahimi     p += 2;
88*ccdc9c3eSSadaf Ebrahimi     n++;
89*ccdc9c3eSSadaf Ebrahimi   }
90*ccdc9c3eSSadaf Ebrahimi   return n;
91*ccdc9c3eSSadaf Ebrahimi }
92*ccdc9c3eSSadaf Ebrahimi 
93*ccdc9c3eSSadaf Ebrahimi // Generates all possible postfix command sequences.
94*ccdc9c3eSSadaf Ebrahimi // Each sequence is handed off to RunPostfix to generate a regular expression.
95*ccdc9c3eSSadaf Ebrahimi // The arguments are:
96*ccdc9c3eSSadaf Ebrahimi //   post:  the current postfix sequence
97*ccdc9c3eSSadaf Ebrahimi //   nstk:  the number of elements that would be on the stack after executing
98*ccdc9c3eSSadaf Ebrahimi //          the sequence
99*ccdc9c3eSSadaf Ebrahimi //   ops:   the number of operators used in the sequence
100*ccdc9c3eSSadaf Ebrahimi //   atoms: the number of atoms used in the sequence
101*ccdc9c3eSSadaf Ebrahimi // For example, if post were ["a", "b", "%s%s", "c"],
102*ccdc9c3eSSadaf Ebrahimi // then nstk = 2, ops = 1, atoms = 3.
103*ccdc9c3eSSadaf Ebrahimi //
104*ccdc9c3eSSadaf Ebrahimi // The initial call should be GeneratePostfix([empty vector], 0, 0, 0).
105*ccdc9c3eSSadaf Ebrahimi //
GeneratePostfix(std::vector<string> * post,int nstk,int ops,int atoms)106*ccdc9c3eSSadaf Ebrahimi void RegexpGenerator::GeneratePostfix(std::vector<string>* post, int nstk,
107*ccdc9c3eSSadaf Ebrahimi                                       int ops, int atoms) {
108*ccdc9c3eSSadaf Ebrahimi   if (nstk == 1)
109*ccdc9c3eSSadaf Ebrahimi     RunPostfix(*post);
110*ccdc9c3eSSadaf Ebrahimi 
111*ccdc9c3eSSadaf Ebrahimi   // Early out: if used too many operators or can't
112*ccdc9c3eSSadaf Ebrahimi   // get back down to a single expression on the stack
113*ccdc9c3eSSadaf Ebrahimi   // using binary operators, give up.
114*ccdc9c3eSSadaf Ebrahimi   if (ops + nstk - 1 > maxops_)
115*ccdc9c3eSSadaf Ebrahimi     return;
116*ccdc9c3eSSadaf Ebrahimi 
117*ccdc9c3eSSadaf Ebrahimi   // Add atoms if there is room.
118*ccdc9c3eSSadaf Ebrahimi   if (atoms < maxatoms_) {
119*ccdc9c3eSSadaf Ebrahimi     for (size_t i = 0; i < atoms_.size(); i++) {
120*ccdc9c3eSSadaf Ebrahimi       post->push_back(atoms_[i]);
121*ccdc9c3eSSadaf Ebrahimi       GeneratePostfix(post, nstk + 1, ops, atoms + 1);
122*ccdc9c3eSSadaf Ebrahimi       post->pop_back();
123*ccdc9c3eSSadaf Ebrahimi     }
124*ccdc9c3eSSadaf Ebrahimi   }
125*ccdc9c3eSSadaf Ebrahimi 
126*ccdc9c3eSSadaf Ebrahimi   // Add operators if there are enough arguments.
127*ccdc9c3eSSadaf Ebrahimi   if (ops < maxops_) {
128*ccdc9c3eSSadaf Ebrahimi     for (size_t i = 0; i < ops_.size(); i++) {
129*ccdc9c3eSSadaf Ebrahimi       const string& fmt = ops_[i];
130*ccdc9c3eSSadaf Ebrahimi       int nargs = CountArgs(fmt);
131*ccdc9c3eSSadaf Ebrahimi       if (nargs <= nstk) {
132*ccdc9c3eSSadaf Ebrahimi         post->push_back(fmt);
133*ccdc9c3eSSadaf Ebrahimi         GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
134*ccdc9c3eSSadaf Ebrahimi         post->pop_back();
135*ccdc9c3eSSadaf Ebrahimi       }
136*ccdc9c3eSSadaf Ebrahimi     }
137*ccdc9c3eSSadaf Ebrahimi   }
138*ccdc9c3eSSadaf Ebrahimi }
139*ccdc9c3eSSadaf Ebrahimi 
140*ccdc9c3eSSadaf Ebrahimi // Generates a random postfix command sequence.
141*ccdc9c3eSSadaf Ebrahimi // Stops and returns true once a single sequence has been generated.
GenerateRandomPostfix(std::vector<string> * post,int nstk,int ops,int atoms)142*ccdc9c3eSSadaf Ebrahimi bool RegexpGenerator::GenerateRandomPostfix(std::vector<string>* post, int nstk,
143*ccdc9c3eSSadaf Ebrahimi                                             int ops, int atoms) {
144*ccdc9c3eSSadaf Ebrahimi   std::uniform_int_distribution<int> random_stop(0, maxatoms_ - atoms);
145*ccdc9c3eSSadaf Ebrahimi   std::uniform_int_distribution<int> random_bit(0, 1);
146*ccdc9c3eSSadaf Ebrahimi   std::uniform_int_distribution<int> random_ops_index(
147*ccdc9c3eSSadaf Ebrahimi       0, static_cast<int>(ops_.size()) - 1);
148*ccdc9c3eSSadaf Ebrahimi   std::uniform_int_distribution<int> random_atoms_index(
149*ccdc9c3eSSadaf Ebrahimi       0, static_cast<int>(atoms_.size()) - 1);
150*ccdc9c3eSSadaf Ebrahimi 
151*ccdc9c3eSSadaf Ebrahimi   for (;;) {
152*ccdc9c3eSSadaf Ebrahimi     // Stop if we get to a single element, but only sometimes.
153*ccdc9c3eSSadaf Ebrahimi     if (nstk == 1 && random_stop(rng_) == 0) {
154*ccdc9c3eSSadaf Ebrahimi       RunPostfix(*post);
155*ccdc9c3eSSadaf Ebrahimi       return true;
156*ccdc9c3eSSadaf Ebrahimi     }
157*ccdc9c3eSSadaf Ebrahimi 
158*ccdc9c3eSSadaf Ebrahimi     // Early out: if used too many operators or can't
159*ccdc9c3eSSadaf Ebrahimi     // get back down to a single expression on the stack
160*ccdc9c3eSSadaf Ebrahimi     // using binary operators, give up.
161*ccdc9c3eSSadaf Ebrahimi     if (ops + nstk - 1 > maxops_)
162*ccdc9c3eSSadaf Ebrahimi       return false;
163*ccdc9c3eSSadaf Ebrahimi 
164*ccdc9c3eSSadaf Ebrahimi     // Add operators if there are enough arguments.
165*ccdc9c3eSSadaf Ebrahimi     if (ops < maxops_ && random_bit(rng_) == 0) {
166*ccdc9c3eSSadaf Ebrahimi       const string& fmt = ops_[random_ops_index(rng_)];
167*ccdc9c3eSSadaf Ebrahimi       int nargs = CountArgs(fmt);
168*ccdc9c3eSSadaf Ebrahimi       if (nargs <= nstk) {
169*ccdc9c3eSSadaf Ebrahimi         post->push_back(fmt);
170*ccdc9c3eSSadaf Ebrahimi         bool ret = GenerateRandomPostfix(post, nstk - nargs + 1,
171*ccdc9c3eSSadaf Ebrahimi                                          ops + 1, atoms);
172*ccdc9c3eSSadaf Ebrahimi         post->pop_back();
173*ccdc9c3eSSadaf Ebrahimi         if (ret)
174*ccdc9c3eSSadaf Ebrahimi           return true;
175*ccdc9c3eSSadaf Ebrahimi       }
176*ccdc9c3eSSadaf Ebrahimi     }
177*ccdc9c3eSSadaf Ebrahimi 
178*ccdc9c3eSSadaf Ebrahimi     // Add atoms if there is room.
179*ccdc9c3eSSadaf Ebrahimi     if (atoms < maxatoms_ && random_bit(rng_) == 0) {
180*ccdc9c3eSSadaf Ebrahimi       post->push_back(atoms_[random_atoms_index(rng_)]);
181*ccdc9c3eSSadaf Ebrahimi       bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
182*ccdc9c3eSSadaf Ebrahimi       post->pop_back();
183*ccdc9c3eSSadaf Ebrahimi       if (ret)
184*ccdc9c3eSSadaf Ebrahimi         return true;
185*ccdc9c3eSSadaf Ebrahimi     }
186*ccdc9c3eSSadaf Ebrahimi   }
187*ccdc9c3eSSadaf Ebrahimi }
188*ccdc9c3eSSadaf Ebrahimi 
189*ccdc9c3eSSadaf Ebrahimi // Interprets the postfix command sequence to create a regular expression
190*ccdc9c3eSSadaf Ebrahimi // passed to HandleRegexp.  The results of operators like %s|%s are wrapped
191*ccdc9c3eSSadaf Ebrahimi // in (?: ) to avoid needing to maintain a precedence table.
RunPostfix(const std::vector<string> & post)192*ccdc9c3eSSadaf Ebrahimi void RegexpGenerator::RunPostfix(const std::vector<string>& post) {
193*ccdc9c3eSSadaf Ebrahimi   std::stack<string> regexps;
194*ccdc9c3eSSadaf Ebrahimi   for (size_t i = 0; i < post.size(); i++) {
195*ccdc9c3eSSadaf Ebrahimi     switch (CountArgs(post[i])) {
196*ccdc9c3eSSadaf Ebrahimi       default:
197*ccdc9c3eSSadaf Ebrahimi         LOG(FATAL) << "Bad operator: " << post[i];
198*ccdc9c3eSSadaf Ebrahimi       case 0:
199*ccdc9c3eSSadaf Ebrahimi         regexps.push(post[i]);
200*ccdc9c3eSSadaf Ebrahimi         break;
201*ccdc9c3eSSadaf Ebrahimi       case 1: {
202*ccdc9c3eSSadaf Ebrahimi         string a = regexps.top();
203*ccdc9c3eSSadaf Ebrahimi         regexps.pop();
204*ccdc9c3eSSadaf Ebrahimi         regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")");
205*ccdc9c3eSSadaf Ebrahimi         break;
206*ccdc9c3eSSadaf Ebrahimi       }
207*ccdc9c3eSSadaf Ebrahimi       case 2: {
208*ccdc9c3eSSadaf Ebrahimi         string b = regexps.top();
209*ccdc9c3eSSadaf Ebrahimi         regexps.pop();
210*ccdc9c3eSSadaf Ebrahimi         string a = regexps.top();
211*ccdc9c3eSSadaf Ebrahimi         regexps.pop();
212*ccdc9c3eSSadaf Ebrahimi         regexps.push("(?:" +
213*ccdc9c3eSSadaf Ebrahimi                      StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) +
214*ccdc9c3eSSadaf Ebrahimi                      ")");
215*ccdc9c3eSSadaf Ebrahimi         break;
216*ccdc9c3eSSadaf Ebrahimi       }
217*ccdc9c3eSSadaf Ebrahimi     }
218*ccdc9c3eSSadaf Ebrahimi   }
219*ccdc9c3eSSadaf Ebrahimi 
220*ccdc9c3eSSadaf Ebrahimi   if (regexps.size() != 1) {
221*ccdc9c3eSSadaf Ebrahimi     // Internal error - should never happen.
222*ccdc9c3eSSadaf Ebrahimi     printf("Bad regexp program:\n");
223*ccdc9c3eSSadaf Ebrahimi     for (size_t i = 0; i < post.size(); i++) {
224*ccdc9c3eSSadaf Ebrahimi       printf("  %s\n", CEscape(post[i]).c_str());
225*ccdc9c3eSSadaf Ebrahimi     }
226*ccdc9c3eSSadaf Ebrahimi     printf("Stack after running program:\n");
227*ccdc9c3eSSadaf Ebrahimi     while (!regexps.empty()) {
228*ccdc9c3eSSadaf Ebrahimi       printf("  %s\n", CEscape(regexps.top()).c_str());
229*ccdc9c3eSSadaf Ebrahimi       regexps.pop();
230*ccdc9c3eSSadaf Ebrahimi     }
231*ccdc9c3eSSadaf Ebrahimi     LOG(FATAL) << "Bad regexp program.";
232*ccdc9c3eSSadaf Ebrahimi   }
233*ccdc9c3eSSadaf Ebrahimi 
234*ccdc9c3eSSadaf Ebrahimi   HandleRegexp(regexps.top());
235*ccdc9c3eSSadaf Ebrahimi   HandleRegexp("^(?:" + regexps.top() + ")$");
236*ccdc9c3eSSadaf Ebrahimi   HandleRegexp("^(?:" + regexps.top() + ")");
237*ccdc9c3eSSadaf Ebrahimi   HandleRegexp("(?:" + regexps.top() + ")$");
238*ccdc9c3eSSadaf Ebrahimi }
239*ccdc9c3eSSadaf Ebrahimi 
240*ccdc9c3eSSadaf Ebrahimi // Split s into an vector of strings, one for each UTF-8 character.
Explode(const StringPiece & s)241*ccdc9c3eSSadaf Ebrahimi std::vector<string> Explode(const StringPiece& s) {
242*ccdc9c3eSSadaf Ebrahimi   std::vector<string> v;
243*ccdc9c3eSSadaf Ebrahimi 
244*ccdc9c3eSSadaf Ebrahimi   for (const char *q = s.begin(); q < s.end(); ) {
245*ccdc9c3eSSadaf Ebrahimi     const char* p = q;
246*ccdc9c3eSSadaf Ebrahimi     Rune r;
247*ccdc9c3eSSadaf Ebrahimi     q += chartorune(&r, q);
248*ccdc9c3eSSadaf Ebrahimi     v.push_back(string(p, q - p));
249*ccdc9c3eSSadaf Ebrahimi   }
250*ccdc9c3eSSadaf Ebrahimi 
251*ccdc9c3eSSadaf Ebrahimi   return v;
252*ccdc9c3eSSadaf Ebrahimi }
253*ccdc9c3eSSadaf Ebrahimi 
254*ccdc9c3eSSadaf Ebrahimi // Split string everywhere a substring is found, returning
255*ccdc9c3eSSadaf Ebrahimi // vector of pieces.
Split(const StringPiece & sep,const StringPiece & s)256*ccdc9c3eSSadaf Ebrahimi std::vector<string> Split(const StringPiece& sep, const StringPiece& s) {
257*ccdc9c3eSSadaf Ebrahimi   std::vector<string> v;
258*ccdc9c3eSSadaf Ebrahimi 
259*ccdc9c3eSSadaf Ebrahimi   if (sep.size() == 0)
260*ccdc9c3eSSadaf Ebrahimi     return Explode(s);
261*ccdc9c3eSSadaf Ebrahimi 
262*ccdc9c3eSSadaf Ebrahimi   const char *p = s.begin();
263*ccdc9c3eSSadaf Ebrahimi   for (const char *q = s.begin(); q + sep.size() <= s.end(); q++) {
264*ccdc9c3eSSadaf Ebrahimi     if (StringPiece(q, sep.size()) == sep) {
265*ccdc9c3eSSadaf Ebrahimi       v.push_back(string(p, q - p));
266*ccdc9c3eSSadaf Ebrahimi       p = q + sep.size();
267*ccdc9c3eSSadaf Ebrahimi       q = p - 1;  // -1 for ++ in loop
268*ccdc9c3eSSadaf Ebrahimi       continue;
269*ccdc9c3eSSadaf Ebrahimi     }
270*ccdc9c3eSSadaf Ebrahimi   }
271*ccdc9c3eSSadaf Ebrahimi   if (p < s.end())
272*ccdc9c3eSSadaf Ebrahimi     v.push_back(string(p, s.end() - p));
273*ccdc9c3eSSadaf Ebrahimi   return v;
274*ccdc9c3eSSadaf Ebrahimi }
275*ccdc9c3eSSadaf Ebrahimi 
276*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
277