xref: /aosp_15_r20/external/cronet/third_party/re2/src/re2/fuzzing/re2_fuzzer.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2016 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include <fuzzer/FuzzedDataProvider.h>
6 #include <stddef.h>
7 #include <stdint.h>
8 #include <algorithm>
9 #include <string>
10 #include <vector>
11 
12 #include "re2/filtered_re2.h"
13 #include "re2/re2.h"
14 #include "re2/regexp.h"
15 #include "re2/set.h"
16 #include "re2/walker-inl.h"
17 
18 // NOT static, NOT signed.
19 uint8_t dummy = 0;
20 
21 // Walks kRegexpConcat and kRegexpAlternate subexpressions
22 // to determine their maximum length.
23 class SubexpressionWalker : public re2::Regexp::Walker<int> {
24  public:
25   SubexpressionWalker() = default;
26   ~SubexpressionWalker() override = default;
27 
PostVisit(re2::Regexp * re,int parent_arg,int pre_arg,int * child_args,int nchild_args)28   int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
29                 int* child_args, int nchild_args) override {
30     switch (re->op()) {
31       case re2::kRegexpConcat:
32       case re2::kRegexpAlternate: {
33         int max = nchild_args;
34         for (int i = 0; i < nchild_args; i++)
35           max = std::max(max, child_args[i]);
36         return max;
37       }
38 
39       default:
40         break;
41     }
42     return -1;
43   }
44 
45   // Should never be called: we use Walk(), not WalkExponential().
ShortVisit(re2::Regexp * re,int parent_arg)46   int ShortVisit(re2::Regexp* re, int parent_arg) override {
47     return parent_arg;
48   }
49 
50  private:
51   SubexpressionWalker(const SubexpressionWalker&) = delete;
52   SubexpressionWalker& operator=(const SubexpressionWalker&) = delete;
53 };
54 
55 // Walks substrings (i.e. kRegexpLiteralString subexpressions)
56 // to determine their maximum length... in runes, but avoiding
57 // overheads due to UTF-8 encoding is worthwhile when fuzzing.
58 class SubstringWalker : public re2::Regexp::Walker<int> {
59  public:
60   SubstringWalker() = default;
61   ~SubstringWalker() override = default;
62 
PostVisit(re2::Regexp * re,int parent_arg,int pre_arg,int * child_args,int nchild_args)63   int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
64                 int* child_args, int nchild_args) override {
65     switch (re->op()) {
66       case re2::kRegexpConcat:
67       case re2::kRegexpAlternate:
68       case re2::kRegexpStar:
69       case re2::kRegexpPlus:
70       case re2::kRegexpQuest:
71       case re2::kRegexpRepeat:
72       case re2::kRegexpCapture: {
73         int max = -1;
74         for (int i = 0; i < nchild_args; i++)
75           max = std::max(max, child_args[i]);
76         return max;
77       }
78 
79       case re2::kRegexpLiteralString:
80         return re->nrunes();
81 
82       default:
83         break;
84     }
85     return -1;
86   }
87 
88   // Should never be called: we use Walk(), not WalkExponential().
ShortVisit(re2::Regexp * re,int parent_arg)89   int ShortVisit(re2::Regexp* re, int parent_arg) override {
90     return parent_arg;
91   }
92 
93  private:
94   SubstringWalker(const SubstringWalker&) = delete;
95   SubstringWalker& operator=(const SubstringWalker&) = delete;
96 };
97 
TestOneInput(absl::string_view pattern,const RE2::Options & options,RE2::Anchor anchor,absl::string_view text)98 void TestOneInput(absl::string_view pattern, const RE2::Options& options,
99                   RE2::Anchor anchor, absl::string_view text) {
100   // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
101   // Otherwise, we will waste time on inputs that have long runs of various
102   // character classes. The fuzzer has shown itself to be easily capable of
103   // generating such patterns that fall within the other limits, but result
104   // in timeouts nonetheless. The marginal cost is high - even more so when
105   // counted repetition is involved - whereas the marginal benefit is zero.
106   // Crudely limit the use of 'k', 'K', 's' and 'S' too because they become
107   // three-element character classes when case-insensitive and using UTF-8.
108   // TODO(junyer): Handle [[:alnum:]] et al. when they start to cause pain.
109   int char_class = 0;
110   int backslash_p = 0;  // very expensive, so handle specially
111   for (size_t i = 0; i < pattern.size(); i++) {
112     if (pattern[i] == '.' ||
113         pattern[i] == 'k' || pattern[i] == 'K' ||
114         pattern[i] == 's' || pattern[i] == 'S')
115       char_class++;
116     if (pattern[i] != '\\')
117       continue;
118     i++;
119     if (i >= pattern.size())
120       break;
121     if (pattern[i] == 'p' || pattern[i] == 'P' ||
122         pattern[i] == 'd' || pattern[i] == 'D' ||
123         pattern[i] == 's' || pattern[i] == 'S' ||
124         pattern[i] == 'w' || pattern[i] == 'W')
125       char_class++;
126     if (pattern[i] == 'p' || pattern[i] == 'P')
127       backslash_p++;
128   }
129   if (char_class > 9)
130     return;
131   if (backslash_p > 1)
132     return;
133 
134   // Iterate just once when fuzzing. Otherwise, we easily get bogged down
135   // and coverage is unlikely to improve despite significant expense.
136   RE2::FUZZING_ONLY_set_maximum_global_replace_count(1);
137   // The default is 1000. Even 100 turned out to be too generous
138   // for fuzzing, empirically speaking, so let's try 10 instead.
139   re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10);
140 
141   RE2 re(pattern, options);
142   if (!re.ok())
143     return;
144 
145   // Don't waste time fuzzing programs with large subexpressions.
146   // They can cause bug reports due to fuzzer timeouts. And they
147   // aren't interesting for fuzzing purposes.
148   if (SubexpressionWalker().Walk(re.Regexp(), -1) > 9)
149     return;
150 
151   // Don't waste time fuzzing programs with large substrings.
152   // They can cause bug reports due to fuzzer timeouts when they
153   // are repetitions (e.g. hundreds of NUL bytes) and matching is
154   // unanchored. And they aren't interesting for fuzzing purposes.
155   if (SubstringWalker().Walk(re.Regexp(), -1) > 9)
156     return;
157 
158   // Don't waste time fuzzing high-size programs.
159   // They can cause bug reports due to fuzzer timeouts.
160   int size = re.ProgramSize();
161   if (size > 9999)
162     return;
163   int rsize = re.ReverseProgramSize();
164   if (rsize > 9999)
165     return;
166 
167   // Don't waste time fuzzing high-fanout programs.
168   // They can cause bug reports due to fuzzer timeouts.
169   std::vector<int> histogram;
170   int fanout = re.ProgramFanout(&histogram);
171   if (fanout > 9)
172     return;
173   int rfanout = re.ReverseProgramFanout(&histogram);
174   if (rfanout > 9)
175     return;
176 
177   if (re.NumberOfCapturingGroups() == 0) {
178     // Avoid early return due to too many arguments.
179     absl::string_view sp = text;
180     RE2::FullMatch(sp, re);
181     RE2::PartialMatch(sp, re);
182     RE2::Consume(&sp, re);
183     sp = text;  // Reset.
184     RE2::FindAndConsume(&sp, re);
185   } else {
186     // Okay, we have at least one capturing group...
187     // Try conversion for variously typed arguments.
188     absl::string_view sp = text;
189     short s;
190     RE2::FullMatch(sp, re, &s);
191     long l;
192     RE2::PartialMatch(sp, re, &l);
193     float f;
194     RE2::Consume(&sp, re, &f);
195     sp = text;  // Reset.
196     double d;
197     RE2::FindAndConsume(&sp, re, &d);
198   }
199 
200   std::string s = std::string(text);
201   RE2::Replace(&s, re, "");
202   s = std::string(text);  // Reset.
203   RE2::GlobalReplace(&s, re, "");
204 
205   std::string min, max;
206   re.PossibleMatchRange(&min, &max, /*maxlen=*/9);
207 
208   // Exercise some other API functionality.
209   dummy += re.NamedCapturingGroups().size();
210   dummy += re.CapturingGroupNames().size();
211   dummy += RE2::QuoteMeta(pattern).size();
212   dummy += re.Regexp()->ToString().size();
213 
214   RE2::Set set(options, anchor);
215   int index = set.Add(pattern, /*error=*/NULL);  // -1 on error
216   if (index != -1 && set.Compile()) {
217     std::vector<int> matches;
218     set.Match(text, &matches);
219   }
220 
221   re2::FilteredRE2 filter;
222   index = -1;  // not clobbered on error
223   filter.Add(pattern, options, &index);
224   if (index != -1) {
225     std::vector<std::string> atoms;
226     filter.Compile(&atoms);
227     // Pretend that all atoms match, which
228     // triggers the AND-OR tree maximally.
229     std::vector<int> matched_atoms;
230     matched_atoms.reserve(atoms.size());
231     for (size_t i = 0; i < atoms.size(); ++i)
232       matched_atoms.push_back(static_cast<int>(i));
233     std::vector<int> matches;
234     filter.AllMatches(text, matched_atoms, &matches);
235   }
236 }
237 
238 // Entry point for libFuzzer.
LLVMFuzzerTestOneInput(const uint8_t * data,size_t size)239 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
240   // An input larger than 4 KiB probably isn't interesting. (This limit
241   // allows for fdp.ConsumeRandomLengthString()'s backslash behaviour.)
242   if (size == 0 || size > 4096)
243     return 0;
244 
245   FuzzedDataProvider fdp(data, size);
246 
247   // The convention here is that fdp.ConsumeBool() returning false sets
248   // the default value whereas returning true sets the alternate value:
249   // most options default to false and so can be set directly; encoding
250   // defaults to UTF-8; case_sensitive defaults to true. We do NOT want
251   // to log errors. max_mem is 64 MiB because we can afford to use more
252   // RAM in exchange for (hopefully) faster fuzzing.
253   RE2::Options options;
254   options.set_encoding(fdp.ConsumeBool() ? RE2::Options::EncodingLatin1
255                                          : RE2::Options::EncodingUTF8);
256   options.set_posix_syntax(fdp.ConsumeBool());
257   options.set_longest_match(fdp.ConsumeBool());
258   options.set_log_errors(false);
259   options.set_max_mem(64 << 20);
260   options.set_literal(fdp.ConsumeBool());
261   options.set_never_nl(fdp.ConsumeBool());
262   options.set_dot_nl(fdp.ConsumeBool());
263   options.set_never_capture(fdp.ConsumeBool());
264   options.set_case_sensitive(!fdp.ConsumeBool());
265   options.set_perl_classes(fdp.ConsumeBool());
266   options.set_word_boundary(fdp.ConsumeBool());
267   options.set_one_line(fdp.ConsumeBool());
268 
269   // ConsumeEnum<RE2::Anchor>() would require RE2::Anchor to specify
270   // kMaxValue, so just use PickValueInArray<RE2::Anchor>() instead.
271   RE2::Anchor anchor = fdp.PickValueInArray<RE2::Anchor>({
272       RE2::UNANCHORED,
273       RE2::ANCHOR_START,
274       RE2::ANCHOR_BOTH,
275   });
276 
277   std::string pattern = fdp.ConsumeRandomLengthString(999);
278   std::string text = fdp.ConsumeRandomLengthString(999);
279 
280   TestOneInput(pattern, options, anchor, text);
281   return 0;
282 }
283