xref: /aosp_15_r20/external/regex-re2/re2/re2.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2003-2009 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi // Regular expression interface RE2.
6*ccdc9c3eSSadaf Ebrahimi //
7*ccdc9c3eSSadaf Ebrahimi // Originally the PCRE C++ wrapper, but adapted to use
8*ccdc9c3eSSadaf Ebrahimi // the new automata-based regular expression engines.
9*ccdc9c3eSSadaf Ebrahimi 
10*ccdc9c3eSSadaf Ebrahimi #include "re2/re2.h"
11*ccdc9c3eSSadaf Ebrahimi 
12*ccdc9c3eSSadaf Ebrahimi #include <assert.h>
13*ccdc9c3eSSadaf Ebrahimi #include <ctype.h>
14*ccdc9c3eSSadaf Ebrahimi #include <errno.h>
15*ccdc9c3eSSadaf Ebrahimi #include <stdint.h>
16*ccdc9c3eSSadaf Ebrahimi #include <stdlib.h>
17*ccdc9c3eSSadaf Ebrahimi #include <string.h>
18*ccdc9c3eSSadaf Ebrahimi #include <algorithm>
19*ccdc9c3eSSadaf Ebrahimi #include <iterator>
20*ccdc9c3eSSadaf Ebrahimi #include <mutex>
21*ccdc9c3eSSadaf Ebrahimi #include <string>
22*ccdc9c3eSSadaf Ebrahimi #include <utility>
23*ccdc9c3eSSadaf Ebrahimi #include <vector>
24*ccdc9c3eSSadaf Ebrahimi 
25*ccdc9c3eSSadaf Ebrahimi #include "util/util.h"
26*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
27*ccdc9c3eSSadaf Ebrahimi #include "util/sparse_array.h"
28*ccdc9c3eSSadaf Ebrahimi #include "util/strutil.h"
29*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h"
30*ccdc9c3eSSadaf Ebrahimi #include "re2/prog.h"
31*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
32*ccdc9c3eSSadaf Ebrahimi 
33*ccdc9c3eSSadaf Ebrahimi namespace re2 {
34*ccdc9c3eSSadaf Ebrahimi 
35*ccdc9c3eSSadaf Ebrahimi // Maximum number of args we can set
36*ccdc9c3eSSadaf Ebrahimi static const int kMaxArgs = 16;
37*ccdc9c3eSSadaf Ebrahimi static const int kVecSize = 1+kMaxArgs;
38*ccdc9c3eSSadaf Ebrahimi 
39*ccdc9c3eSSadaf Ebrahimi const int RE2::Options::kDefaultMaxMem;  // initialized in re2.h
40*ccdc9c3eSSadaf Ebrahimi 
Options(RE2::CannedOptions opt)41*ccdc9c3eSSadaf Ebrahimi RE2::Options::Options(RE2::CannedOptions opt)
42*ccdc9c3eSSadaf Ebrahimi   : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
43*ccdc9c3eSSadaf Ebrahimi     posix_syntax_(opt == RE2::POSIX),
44*ccdc9c3eSSadaf Ebrahimi     longest_match_(opt == RE2::POSIX),
45*ccdc9c3eSSadaf Ebrahimi     log_errors_(opt != RE2::Quiet),
46*ccdc9c3eSSadaf Ebrahimi     max_mem_(kDefaultMaxMem),
47*ccdc9c3eSSadaf Ebrahimi     literal_(false),
48*ccdc9c3eSSadaf Ebrahimi     never_nl_(false),
49*ccdc9c3eSSadaf Ebrahimi     dot_nl_(false),
50*ccdc9c3eSSadaf Ebrahimi     never_capture_(false),
51*ccdc9c3eSSadaf Ebrahimi     case_sensitive_(true),
52*ccdc9c3eSSadaf Ebrahimi     perl_classes_(false),
53*ccdc9c3eSSadaf Ebrahimi     word_boundary_(false),
54*ccdc9c3eSSadaf Ebrahimi     one_line_(false) {
55*ccdc9c3eSSadaf Ebrahimi }
56*ccdc9c3eSSadaf Ebrahimi 
57*ccdc9c3eSSadaf Ebrahimi // static empty objects for use as const references.
58*ccdc9c3eSSadaf Ebrahimi // To avoid global constructors, allocated in RE2::Init().
59*ccdc9c3eSSadaf Ebrahimi static const string* empty_string;
60*ccdc9c3eSSadaf Ebrahimi static const std::map<string, int>* empty_named_groups;
61*ccdc9c3eSSadaf Ebrahimi static const std::map<int, string>* empty_group_names;
62*ccdc9c3eSSadaf Ebrahimi 
63*ccdc9c3eSSadaf Ebrahimi // Converts from Regexp error code to RE2 error code.
64*ccdc9c3eSSadaf Ebrahimi // Maybe some day they will diverge.  In any event, this
65*ccdc9c3eSSadaf Ebrahimi // hides the existence of Regexp from RE2 users.
RegexpErrorToRE2(re2::RegexpStatusCode code)66*ccdc9c3eSSadaf Ebrahimi static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
67*ccdc9c3eSSadaf Ebrahimi   switch (code) {
68*ccdc9c3eSSadaf Ebrahimi     case re2::kRegexpSuccess:
69*ccdc9c3eSSadaf Ebrahimi       return RE2::NoError;
70*ccdc9c3eSSadaf Ebrahimi     case re2::kRegexpInternalError:
71*ccdc9c3eSSadaf Ebrahimi       return RE2::ErrorInternal;
72*ccdc9c3eSSadaf Ebrahimi     case re2::kRegexpBadEscape:
73*ccdc9c3eSSadaf Ebrahimi       return RE2::ErrorBadEscape;
74*ccdc9c3eSSadaf Ebrahimi     case re2::kRegexpBadCharClass:
75*ccdc9c3eSSadaf Ebrahimi       return RE2::ErrorBadCharClass;
76*ccdc9c3eSSadaf Ebrahimi     case re2::kRegexpBadCharRange:
77*ccdc9c3eSSadaf Ebrahimi       return RE2::ErrorBadCharRange;
78*ccdc9c3eSSadaf Ebrahimi     case re2::kRegexpMissingBracket:
79*ccdc9c3eSSadaf Ebrahimi       return RE2::ErrorMissingBracket;
80*ccdc9c3eSSadaf Ebrahimi     case re2::kRegexpMissingParen:
81*ccdc9c3eSSadaf Ebrahimi       return RE2::ErrorMissingParen;
82*ccdc9c3eSSadaf Ebrahimi     case re2::kRegexpTrailingBackslash:
83*ccdc9c3eSSadaf Ebrahimi       return RE2::ErrorTrailingBackslash;
84*ccdc9c3eSSadaf Ebrahimi     case re2::kRegexpRepeatArgument:
85*ccdc9c3eSSadaf Ebrahimi       return RE2::ErrorRepeatArgument;
86*ccdc9c3eSSadaf Ebrahimi     case re2::kRegexpRepeatSize:
87*ccdc9c3eSSadaf Ebrahimi       return RE2::ErrorRepeatSize;
88*ccdc9c3eSSadaf Ebrahimi     case re2::kRegexpRepeatOp:
89*ccdc9c3eSSadaf Ebrahimi       return RE2::ErrorRepeatOp;
90*ccdc9c3eSSadaf Ebrahimi     case re2::kRegexpBadPerlOp:
91*ccdc9c3eSSadaf Ebrahimi       return RE2::ErrorBadPerlOp;
92*ccdc9c3eSSadaf Ebrahimi     case re2::kRegexpBadUTF8:
93*ccdc9c3eSSadaf Ebrahimi       return RE2::ErrorBadUTF8;
94*ccdc9c3eSSadaf Ebrahimi     case re2::kRegexpBadNamedCapture:
95*ccdc9c3eSSadaf Ebrahimi       return RE2::ErrorBadNamedCapture;
96*ccdc9c3eSSadaf Ebrahimi   }
97*ccdc9c3eSSadaf Ebrahimi   return RE2::ErrorInternal;
98*ccdc9c3eSSadaf Ebrahimi }
99*ccdc9c3eSSadaf Ebrahimi 
trunc(const StringPiece & pattern)100*ccdc9c3eSSadaf Ebrahimi static string trunc(const StringPiece& pattern) {
101*ccdc9c3eSSadaf Ebrahimi   if (pattern.size() < 100)
102*ccdc9c3eSSadaf Ebrahimi     return string(pattern);
103*ccdc9c3eSSadaf Ebrahimi   return string(pattern.substr(0, 100)) + "...";
104*ccdc9c3eSSadaf Ebrahimi }
105*ccdc9c3eSSadaf Ebrahimi 
106*ccdc9c3eSSadaf Ebrahimi 
RE2(const char * pattern)107*ccdc9c3eSSadaf Ebrahimi RE2::RE2(const char* pattern) {
108*ccdc9c3eSSadaf Ebrahimi   Init(pattern, DefaultOptions);
109*ccdc9c3eSSadaf Ebrahimi }
110*ccdc9c3eSSadaf Ebrahimi 
RE2(const string & pattern)111*ccdc9c3eSSadaf Ebrahimi RE2::RE2(const string& pattern) {
112*ccdc9c3eSSadaf Ebrahimi   Init(pattern, DefaultOptions);
113*ccdc9c3eSSadaf Ebrahimi }
114*ccdc9c3eSSadaf Ebrahimi 
RE2(const StringPiece & pattern)115*ccdc9c3eSSadaf Ebrahimi RE2::RE2(const StringPiece& pattern) {
116*ccdc9c3eSSadaf Ebrahimi   Init(pattern, DefaultOptions);
117*ccdc9c3eSSadaf Ebrahimi }
118*ccdc9c3eSSadaf Ebrahimi 
RE2(const StringPiece & pattern,const Options & options)119*ccdc9c3eSSadaf Ebrahimi RE2::RE2(const StringPiece& pattern, const Options& options) {
120*ccdc9c3eSSadaf Ebrahimi   Init(pattern, options);
121*ccdc9c3eSSadaf Ebrahimi }
122*ccdc9c3eSSadaf Ebrahimi 
ParseFlags() const123*ccdc9c3eSSadaf Ebrahimi int RE2::Options::ParseFlags() const {
124*ccdc9c3eSSadaf Ebrahimi   int flags = Regexp::ClassNL;
125*ccdc9c3eSSadaf Ebrahimi   switch (encoding()) {
126*ccdc9c3eSSadaf Ebrahimi     default:
127*ccdc9c3eSSadaf Ebrahimi       if (log_errors())
128*ccdc9c3eSSadaf Ebrahimi         LOG(ERROR) << "Unknown encoding " << encoding();
129*ccdc9c3eSSadaf Ebrahimi       break;
130*ccdc9c3eSSadaf Ebrahimi     case RE2::Options::EncodingUTF8:
131*ccdc9c3eSSadaf Ebrahimi       break;
132*ccdc9c3eSSadaf Ebrahimi     case RE2::Options::EncodingLatin1:
133*ccdc9c3eSSadaf Ebrahimi       flags |= Regexp::Latin1;
134*ccdc9c3eSSadaf Ebrahimi       break;
135*ccdc9c3eSSadaf Ebrahimi   }
136*ccdc9c3eSSadaf Ebrahimi 
137*ccdc9c3eSSadaf Ebrahimi   if (!posix_syntax())
138*ccdc9c3eSSadaf Ebrahimi     flags |= Regexp::LikePerl;
139*ccdc9c3eSSadaf Ebrahimi 
140*ccdc9c3eSSadaf Ebrahimi   if (literal())
141*ccdc9c3eSSadaf Ebrahimi     flags |= Regexp::Literal;
142*ccdc9c3eSSadaf Ebrahimi 
143*ccdc9c3eSSadaf Ebrahimi   if (never_nl())
144*ccdc9c3eSSadaf Ebrahimi     flags |= Regexp::NeverNL;
145*ccdc9c3eSSadaf Ebrahimi 
146*ccdc9c3eSSadaf Ebrahimi   if (dot_nl())
147*ccdc9c3eSSadaf Ebrahimi     flags |= Regexp::DotNL;
148*ccdc9c3eSSadaf Ebrahimi 
149*ccdc9c3eSSadaf Ebrahimi   if (never_capture())
150*ccdc9c3eSSadaf Ebrahimi     flags |= Regexp::NeverCapture;
151*ccdc9c3eSSadaf Ebrahimi 
152*ccdc9c3eSSadaf Ebrahimi   if (!case_sensitive())
153*ccdc9c3eSSadaf Ebrahimi     flags |= Regexp::FoldCase;
154*ccdc9c3eSSadaf Ebrahimi 
155*ccdc9c3eSSadaf Ebrahimi   if (perl_classes())
156*ccdc9c3eSSadaf Ebrahimi     flags |= Regexp::PerlClasses;
157*ccdc9c3eSSadaf Ebrahimi 
158*ccdc9c3eSSadaf Ebrahimi   if (word_boundary())
159*ccdc9c3eSSadaf Ebrahimi     flags |= Regexp::PerlB;
160*ccdc9c3eSSadaf Ebrahimi 
161*ccdc9c3eSSadaf Ebrahimi   if (one_line())
162*ccdc9c3eSSadaf Ebrahimi     flags |= Regexp::OneLine;
163*ccdc9c3eSSadaf Ebrahimi 
164*ccdc9c3eSSadaf Ebrahimi   return flags;
165*ccdc9c3eSSadaf Ebrahimi }
166*ccdc9c3eSSadaf Ebrahimi 
Init(const StringPiece & pattern,const Options & options)167*ccdc9c3eSSadaf Ebrahimi void RE2::Init(const StringPiece& pattern, const Options& options) {
168*ccdc9c3eSSadaf Ebrahimi   static std::once_flag empty_once;
169*ccdc9c3eSSadaf Ebrahimi   std::call_once(empty_once, []() {
170*ccdc9c3eSSadaf Ebrahimi     empty_string = new string;
171*ccdc9c3eSSadaf Ebrahimi     empty_named_groups = new std::map<string, int>;
172*ccdc9c3eSSadaf Ebrahimi     empty_group_names = new std::map<int, string>;
173*ccdc9c3eSSadaf Ebrahimi   });
174*ccdc9c3eSSadaf Ebrahimi 
175*ccdc9c3eSSadaf Ebrahimi   pattern_ = string(pattern);
176*ccdc9c3eSSadaf Ebrahimi   options_.Copy(options);
177*ccdc9c3eSSadaf Ebrahimi   entire_regexp_ = NULL;
178*ccdc9c3eSSadaf Ebrahimi   suffix_regexp_ = NULL;
179*ccdc9c3eSSadaf Ebrahimi   prog_ = NULL;
180*ccdc9c3eSSadaf Ebrahimi   num_captures_ = -1;
181*ccdc9c3eSSadaf Ebrahimi   rprog_ = NULL;
182*ccdc9c3eSSadaf Ebrahimi   error_ = empty_string;
183*ccdc9c3eSSadaf Ebrahimi   error_code_ = NoError;
184*ccdc9c3eSSadaf Ebrahimi   named_groups_ = NULL;
185*ccdc9c3eSSadaf Ebrahimi   group_names_ = NULL;
186*ccdc9c3eSSadaf Ebrahimi 
187*ccdc9c3eSSadaf Ebrahimi   RegexpStatus status;
188*ccdc9c3eSSadaf Ebrahimi   entire_regexp_ = Regexp::Parse(
189*ccdc9c3eSSadaf Ebrahimi     pattern_,
190*ccdc9c3eSSadaf Ebrahimi     static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
191*ccdc9c3eSSadaf Ebrahimi     &status);
192*ccdc9c3eSSadaf Ebrahimi   if (entire_regexp_ == NULL) {
193*ccdc9c3eSSadaf Ebrahimi     if (options_.log_errors()) {
194*ccdc9c3eSSadaf Ebrahimi       LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
195*ccdc9c3eSSadaf Ebrahimi                  << status.Text();
196*ccdc9c3eSSadaf Ebrahimi     }
197*ccdc9c3eSSadaf Ebrahimi     error_ = new string(status.Text());
198*ccdc9c3eSSadaf Ebrahimi     error_code_ = RegexpErrorToRE2(status.code());
199*ccdc9c3eSSadaf Ebrahimi     error_arg_ = string(status.error_arg());
200*ccdc9c3eSSadaf Ebrahimi     return;
201*ccdc9c3eSSadaf Ebrahimi   }
202*ccdc9c3eSSadaf Ebrahimi 
203*ccdc9c3eSSadaf Ebrahimi   re2::Regexp* suffix;
204*ccdc9c3eSSadaf Ebrahimi   if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
205*ccdc9c3eSSadaf Ebrahimi     suffix_regexp_ = suffix;
206*ccdc9c3eSSadaf Ebrahimi   else
207*ccdc9c3eSSadaf Ebrahimi     suffix_regexp_ = entire_regexp_->Incref();
208*ccdc9c3eSSadaf Ebrahimi 
209*ccdc9c3eSSadaf Ebrahimi   // Two thirds of the memory goes to the forward Prog,
210*ccdc9c3eSSadaf Ebrahimi   // one third to the reverse prog, because the forward
211*ccdc9c3eSSadaf Ebrahimi   // Prog has two DFAs but the reverse prog has one.
212*ccdc9c3eSSadaf Ebrahimi   prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
213*ccdc9c3eSSadaf Ebrahimi   if (prog_ == NULL) {
214*ccdc9c3eSSadaf Ebrahimi     if (options_.log_errors())
215*ccdc9c3eSSadaf Ebrahimi       LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
216*ccdc9c3eSSadaf Ebrahimi     error_ = new string("pattern too large - compile failed");
217*ccdc9c3eSSadaf Ebrahimi     error_code_ = RE2::ErrorPatternTooLarge;
218*ccdc9c3eSSadaf Ebrahimi     return;
219*ccdc9c3eSSadaf Ebrahimi   }
220*ccdc9c3eSSadaf Ebrahimi 
221*ccdc9c3eSSadaf Ebrahimi   // We used to compute this lazily, but it's used during the
222*ccdc9c3eSSadaf Ebrahimi   // typical control flow for a match call, so we now compute
223*ccdc9c3eSSadaf Ebrahimi   // it eagerly, which avoids the overhead of std::once_flag.
224*ccdc9c3eSSadaf Ebrahimi   num_captures_ = suffix_regexp_->NumCaptures();
225*ccdc9c3eSSadaf Ebrahimi 
226*ccdc9c3eSSadaf Ebrahimi   // Could delay this until the first match call that
227*ccdc9c3eSSadaf Ebrahimi   // cares about submatch information, but the one-pass
228*ccdc9c3eSSadaf Ebrahimi   // machine's memory gets cut from the DFA memory budget,
229*ccdc9c3eSSadaf Ebrahimi   // and that is harder to do if the DFA has already
230*ccdc9c3eSSadaf Ebrahimi   // been built.
231*ccdc9c3eSSadaf Ebrahimi   is_one_pass_ = prog_->IsOnePass();
232*ccdc9c3eSSadaf Ebrahimi }
233*ccdc9c3eSSadaf Ebrahimi 
234*ccdc9c3eSSadaf Ebrahimi // Returns rprog_, computing it if needed.
ReverseProg() const235*ccdc9c3eSSadaf Ebrahimi re2::Prog* RE2::ReverseProg() const {
236*ccdc9c3eSSadaf Ebrahimi   std::call_once(rprog_once_, [](const RE2* re) {
237*ccdc9c3eSSadaf Ebrahimi     re->rprog_ =
238*ccdc9c3eSSadaf Ebrahimi         re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3);
239*ccdc9c3eSSadaf Ebrahimi     if (re->rprog_ == NULL) {
240*ccdc9c3eSSadaf Ebrahimi       if (re->options_.log_errors())
241*ccdc9c3eSSadaf Ebrahimi         LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
242*ccdc9c3eSSadaf Ebrahimi       re->error_ = new string("pattern too large - reverse compile failed");
243*ccdc9c3eSSadaf Ebrahimi       re->error_code_ = RE2::ErrorPatternTooLarge;
244*ccdc9c3eSSadaf Ebrahimi     }
245*ccdc9c3eSSadaf Ebrahimi   }, this);
246*ccdc9c3eSSadaf Ebrahimi   return rprog_;
247*ccdc9c3eSSadaf Ebrahimi }
248*ccdc9c3eSSadaf Ebrahimi 
~RE2()249*ccdc9c3eSSadaf Ebrahimi RE2::~RE2() {
250*ccdc9c3eSSadaf Ebrahimi   if (suffix_regexp_)
251*ccdc9c3eSSadaf Ebrahimi     suffix_regexp_->Decref();
252*ccdc9c3eSSadaf Ebrahimi   if (entire_regexp_)
253*ccdc9c3eSSadaf Ebrahimi     entire_regexp_->Decref();
254*ccdc9c3eSSadaf Ebrahimi   delete prog_;
255*ccdc9c3eSSadaf Ebrahimi   delete rprog_;
256*ccdc9c3eSSadaf Ebrahimi   if (error_ != empty_string)
257*ccdc9c3eSSadaf Ebrahimi     delete error_;
258*ccdc9c3eSSadaf Ebrahimi   if (named_groups_ != NULL && named_groups_ != empty_named_groups)
259*ccdc9c3eSSadaf Ebrahimi     delete named_groups_;
260*ccdc9c3eSSadaf Ebrahimi   if (group_names_ != NULL &&  group_names_ != empty_group_names)
261*ccdc9c3eSSadaf Ebrahimi     delete group_names_;
262*ccdc9c3eSSadaf Ebrahimi }
263*ccdc9c3eSSadaf Ebrahimi 
ProgramSize() const264*ccdc9c3eSSadaf Ebrahimi int RE2::ProgramSize() const {
265*ccdc9c3eSSadaf Ebrahimi   if (prog_ == NULL)
266*ccdc9c3eSSadaf Ebrahimi     return -1;
267*ccdc9c3eSSadaf Ebrahimi   return prog_->size();
268*ccdc9c3eSSadaf Ebrahimi }
269*ccdc9c3eSSadaf Ebrahimi 
ReverseProgramSize() const270*ccdc9c3eSSadaf Ebrahimi int RE2::ReverseProgramSize() const {
271*ccdc9c3eSSadaf Ebrahimi   if (prog_ == NULL)
272*ccdc9c3eSSadaf Ebrahimi     return -1;
273*ccdc9c3eSSadaf Ebrahimi   Prog* prog = ReverseProg();
274*ccdc9c3eSSadaf Ebrahimi   if (prog == NULL)
275*ccdc9c3eSSadaf Ebrahimi     return -1;
276*ccdc9c3eSSadaf Ebrahimi   return prog->size();
277*ccdc9c3eSSadaf Ebrahimi }
278*ccdc9c3eSSadaf Ebrahimi 
Fanout(Prog * prog,std::map<int,int> * histogram)279*ccdc9c3eSSadaf Ebrahimi static int Fanout(Prog* prog, std::map<int, int>* histogram) {
280*ccdc9c3eSSadaf Ebrahimi   SparseArray<int> fanout(prog->size());
281*ccdc9c3eSSadaf Ebrahimi   prog->Fanout(&fanout);
282*ccdc9c3eSSadaf Ebrahimi   histogram->clear();
283*ccdc9c3eSSadaf Ebrahimi   for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
284*ccdc9c3eSSadaf Ebrahimi     // TODO(junyer): Optimise this?
285*ccdc9c3eSSadaf Ebrahimi     int bucket = 0;
286*ccdc9c3eSSadaf Ebrahimi     while (1 << bucket < i->value()) {
287*ccdc9c3eSSadaf Ebrahimi       bucket++;
288*ccdc9c3eSSadaf Ebrahimi     }
289*ccdc9c3eSSadaf Ebrahimi     (*histogram)[bucket]++;
290*ccdc9c3eSSadaf Ebrahimi   }
291*ccdc9c3eSSadaf Ebrahimi   return histogram->rbegin()->first;
292*ccdc9c3eSSadaf Ebrahimi }
293*ccdc9c3eSSadaf Ebrahimi 
ProgramFanout(std::map<int,int> * histogram) const294*ccdc9c3eSSadaf Ebrahimi int RE2::ProgramFanout(std::map<int, int>* histogram) const {
295*ccdc9c3eSSadaf Ebrahimi   if (prog_ == NULL)
296*ccdc9c3eSSadaf Ebrahimi     return -1;
297*ccdc9c3eSSadaf Ebrahimi   return Fanout(prog_, histogram);
298*ccdc9c3eSSadaf Ebrahimi }
299*ccdc9c3eSSadaf Ebrahimi 
ReverseProgramFanout(std::map<int,int> * histogram) const300*ccdc9c3eSSadaf Ebrahimi int RE2::ReverseProgramFanout(std::map<int, int>* histogram) const {
301*ccdc9c3eSSadaf Ebrahimi   if (prog_ == NULL)
302*ccdc9c3eSSadaf Ebrahimi     return -1;
303*ccdc9c3eSSadaf Ebrahimi   Prog* prog = ReverseProg();
304*ccdc9c3eSSadaf Ebrahimi   if (prog == NULL)
305*ccdc9c3eSSadaf Ebrahimi     return -1;
306*ccdc9c3eSSadaf Ebrahimi   return Fanout(prog, histogram);
307*ccdc9c3eSSadaf Ebrahimi }
308*ccdc9c3eSSadaf Ebrahimi 
309*ccdc9c3eSSadaf Ebrahimi // Returns named_groups_, computing it if needed.
NamedCapturingGroups() const310*ccdc9c3eSSadaf Ebrahimi const std::map<string, int>& RE2::NamedCapturingGroups() const {
311*ccdc9c3eSSadaf Ebrahimi   std::call_once(named_groups_once_, [](const RE2* re) {
312*ccdc9c3eSSadaf Ebrahimi     if (re->suffix_regexp_ != NULL)
313*ccdc9c3eSSadaf Ebrahimi       re->named_groups_ = re->suffix_regexp_->NamedCaptures();
314*ccdc9c3eSSadaf Ebrahimi     if (re->named_groups_ == NULL)
315*ccdc9c3eSSadaf Ebrahimi       re->named_groups_ = empty_named_groups;
316*ccdc9c3eSSadaf Ebrahimi   }, this);
317*ccdc9c3eSSadaf Ebrahimi   return *named_groups_;
318*ccdc9c3eSSadaf Ebrahimi }
319*ccdc9c3eSSadaf Ebrahimi 
320*ccdc9c3eSSadaf Ebrahimi // Returns group_names_, computing it if needed.
CapturingGroupNames() const321*ccdc9c3eSSadaf Ebrahimi const std::map<int, string>& RE2::CapturingGroupNames() const {
322*ccdc9c3eSSadaf Ebrahimi   std::call_once(group_names_once_, [](const RE2* re) {
323*ccdc9c3eSSadaf Ebrahimi     if (re->suffix_regexp_ != NULL)
324*ccdc9c3eSSadaf Ebrahimi       re->group_names_ = re->suffix_regexp_->CaptureNames();
325*ccdc9c3eSSadaf Ebrahimi     if (re->group_names_ == NULL)
326*ccdc9c3eSSadaf Ebrahimi       re->group_names_ = empty_group_names;
327*ccdc9c3eSSadaf Ebrahimi   }, this);
328*ccdc9c3eSSadaf Ebrahimi   return *group_names_;
329*ccdc9c3eSSadaf Ebrahimi }
330*ccdc9c3eSSadaf Ebrahimi 
331*ccdc9c3eSSadaf Ebrahimi /***** Convenience interfaces *****/
332*ccdc9c3eSSadaf Ebrahimi 
FullMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)333*ccdc9c3eSSadaf Ebrahimi bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
334*ccdc9c3eSSadaf Ebrahimi                      const Arg* const args[], int n) {
335*ccdc9c3eSSadaf Ebrahimi   return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
336*ccdc9c3eSSadaf Ebrahimi }
337*ccdc9c3eSSadaf Ebrahimi 
PartialMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)338*ccdc9c3eSSadaf Ebrahimi bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
339*ccdc9c3eSSadaf Ebrahimi                         const Arg* const args[], int n) {
340*ccdc9c3eSSadaf Ebrahimi   return re.DoMatch(text, UNANCHORED, NULL, args, n);
341*ccdc9c3eSSadaf Ebrahimi }
342*ccdc9c3eSSadaf Ebrahimi 
ConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)343*ccdc9c3eSSadaf Ebrahimi bool RE2::ConsumeN(StringPiece* input, const RE2& re,
344*ccdc9c3eSSadaf Ebrahimi                    const Arg* const args[], int n) {
345*ccdc9c3eSSadaf Ebrahimi   size_t consumed;
346*ccdc9c3eSSadaf Ebrahimi   if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
347*ccdc9c3eSSadaf Ebrahimi     input->remove_prefix(consumed);
348*ccdc9c3eSSadaf Ebrahimi     return true;
349*ccdc9c3eSSadaf Ebrahimi   } else {
350*ccdc9c3eSSadaf Ebrahimi     return false;
351*ccdc9c3eSSadaf Ebrahimi   }
352*ccdc9c3eSSadaf Ebrahimi }
353*ccdc9c3eSSadaf Ebrahimi 
FindAndConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)354*ccdc9c3eSSadaf Ebrahimi bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
355*ccdc9c3eSSadaf Ebrahimi                           const Arg* const args[], int n) {
356*ccdc9c3eSSadaf Ebrahimi   size_t consumed;
357*ccdc9c3eSSadaf Ebrahimi   if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
358*ccdc9c3eSSadaf Ebrahimi     input->remove_prefix(consumed);
359*ccdc9c3eSSadaf Ebrahimi     return true;
360*ccdc9c3eSSadaf Ebrahimi   } else {
361*ccdc9c3eSSadaf Ebrahimi     return false;
362*ccdc9c3eSSadaf Ebrahimi   }
363*ccdc9c3eSSadaf Ebrahimi }
364*ccdc9c3eSSadaf Ebrahimi 
Replace(string * str,const RE2 & re,const StringPiece & rewrite)365*ccdc9c3eSSadaf Ebrahimi bool RE2::Replace(string* str,
366*ccdc9c3eSSadaf Ebrahimi                   const RE2& re,
367*ccdc9c3eSSadaf Ebrahimi                   const StringPiece& rewrite) {
368*ccdc9c3eSSadaf Ebrahimi   StringPiece vec[kVecSize];
369*ccdc9c3eSSadaf Ebrahimi   int nvec = 1 + MaxSubmatch(rewrite);
370*ccdc9c3eSSadaf Ebrahimi   if (nvec > arraysize(vec))
371*ccdc9c3eSSadaf Ebrahimi     return false;
372*ccdc9c3eSSadaf Ebrahimi   if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
373*ccdc9c3eSSadaf Ebrahimi     return false;
374*ccdc9c3eSSadaf Ebrahimi 
375*ccdc9c3eSSadaf Ebrahimi   string s;
376*ccdc9c3eSSadaf Ebrahimi   if (!re.Rewrite(&s, rewrite, vec, nvec))
377*ccdc9c3eSSadaf Ebrahimi     return false;
378*ccdc9c3eSSadaf Ebrahimi 
379*ccdc9c3eSSadaf Ebrahimi   assert(vec[0].begin() >= str->data());
380*ccdc9c3eSSadaf Ebrahimi   assert(vec[0].end() <= str->data()+str->size());
381*ccdc9c3eSSadaf Ebrahimi   str->replace(vec[0].data() - str->data(), vec[0].size(), s);
382*ccdc9c3eSSadaf Ebrahimi   return true;
383*ccdc9c3eSSadaf Ebrahimi }
384*ccdc9c3eSSadaf Ebrahimi 
GlobalReplace(string * str,const RE2 & re,const StringPiece & rewrite)385*ccdc9c3eSSadaf Ebrahimi int RE2::GlobalReplace(string* str,
386*ccdc9c3eSSadaf Ebrahimi                        const RE2& re,
387*ccdc9c3eSSadaf Ebrahimi                        const StringPiece& rewrite) {
388*ccdc9c3eSSadaf Ebrahimi   StringPiece vec[kVecSize];
389*ccdc9c3eSSadaf Ebrahimi   int nvec = 1 + MaxSubmatch(rewrite);
390*ccdc9c3eSSadaf Ebrahimi   if (nvec > arraysize(vec))
391*ccdc9c3eSSadaf Ebrahimi     return false;
392*ccdc9c3eSSadaf Ebrahimi 
393*ccdc9c3eSSadaf Ebrahimi   const char* p = str->data();
394*ccdc9c3eSSadaf Ebrahimi   const char* ep = p + str->size();
395*ccdc9c3eSSadaf Ebrahimi   const char* lastend = NULL;
396*ccdc9c3eSSadaf Ebrahimi   string out;
397*ccdc9c3eSSadaf Ebrahimi   int count = 0;
398*ccdc9c3eSSadaf Ebrahimi #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
399*ccdc9c3eSSadaf Ebrahimi   // Iterate just once when fuzzing. Otherwise, we easily get bogged down
400*ccdc9c3eSSadaf Ebrahimi   // and coverage is unlikely to improve despite significant expense.
401*ccdc9c3eSSadaf Ebrahimi   while (p == str->data()) {
402*ccdc9c3eSSadaf Ebrahimi #else
403*ccdc9c3eSSadaf Ebrahimi   while (p <= ep) {
404*ccdc9c3eSSadaf Ebrahimi #endif
405*ccdc9c3eSSadaf Ebrahimi     if (!re.Match(*str, static_cast<size_t>(p - str->data()),
406*ccdc9c3eSSadaf Ebrahimi                   str->size(), UNANCHORED, vec, nvec))
407*ccdc9c3eSSadaf Ebrahimi       break;
408*ccdc9c3eSSadaf Ebrahimi     if (p < vec[0].begin())
409*ccdc9c3eSSadaf Ebrahimi       out.append(p, vec[0].begin() - p);
410*ccdc9c3eSSadaf Ebrahimi     if (vec[0].begin() == lastend && vec[0].size() == 0) {
411*ccdc9c3eSSadaf Ebrahimi       // Disallow empty match at end of last match: skip ahead.
412*ccdc9c3eSSadaf Ebrahimi       //
413*ccdc9c3eSSadaf Ebrahimi       // fullrune() takes int, not size_t. However, it just looks
414*ccdc9c3eSSadaf Ebrahimi       // at the leading byte and treats any length >= 4 the same.
415*ccdc9c3eSSadaf Ebrahimi       if (re.options().encoding() == RE2::Options::EncodingUTF8 &&
416*ccdc9c3eSSadaf Ebrahimi           fullrune(p, static_cast<int>(std::min(static_cast<ptrdiff_t>(4),
417*ccdc9c3eSSadaf Ebrahimi                                                 ep - p)))) {
418*ccdc9c3eSSadaf Ebrahimi         // re is in UTF-8 mode and there is enough left of str
419*ccdc9c3eSSadaf Ebrahimi         // to allow us to advance by up to UTFmax bytes.
420*ccdc9c3eSSadaf Ebrahimi         Rune r;
421*ccdc9c3eSSadaf Ebrahimi         int n = chartorune(&r, p);
422*ccdc9c3eSSadaf Ebrahimi         // Some copies of chartorune have a bug that accepts
423*ccdc9c3eSSadaf Ebrahimi         // encodings of values in (10FFFF, 1FFFFF] as valid.
424*ccdc9c3eSSadaf Ebrahimi         if (r > Runemax) {
425*ccdc9c3eSSadaf Ebrahimi           n = 1;
426*ccdc9c3eSSadaf Ebrahimi           r = Runeerror;
427*ccdc9c3eSSadaf Ebrahimi         }
428*ccdc9c3eSSadaf Ebrahimi         if (!(n == 1 && r == Runeerror)) {  // no decoding error
429*ccdc9c3eSSadaf Ebrahimi           out.append(p, n);
430*ccdc9c3eSSadaf Ebrahimi           p += n;
431*ccdc9c3eSSadaf Ebrahimi           continue;
432*ccdc9c3eSSadaf Ebrahimi         }
433*ccdc9c3eSSadaf Ebrahimi       }
434*ccdc9c3eSSadaf Ebrahimi       // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode,
435*ccdc9c3eSSadaf Ebrahimi       // we fell through from above and the GIGO principle applies.
436*ccdc9c3eSSadaf Ebrahimi       if (p < ep)
437*ccdc9c3eSSadaf Ebrahimi         out.append(p, 1);
438*ccdc9c3eSSadaf Ebrahimi       p++;
439*ccdc9c3eSSadaf Ebrahimi       continue;
440*ccdc9c3eSSadaf Ebrahimi     }
441*ccdc9c3eSSadaf Ebrahimi     re.Rewrite(&out, rewrite, vec, nvec);
442*ccdc9c3eSSadaf Ebrahimi     p = vec[0].end();
443*ccdc9c3eSSadaf Ebrahimi     lastend = p;
444*ccdc9c3eSSadaf Ebrahimi     count++;
445*ccdc9c3eSSadaf Ebrahimi   }
446*ccdc9c3eSSadaf Ebrahimi 
447*ccdc9c3eSSadaf Ebrahimi   if (count == 0)
448*ccdc9c3eSSadaf Ebrahimi     return 0;
449*ccdc9c3eSSadaf Ebrahimi 
450*ccdc9c3eSSadaf Ebrahimi   if (p < ep)
451*ccdc9c3eSSadaf Ebrahimi     out.append(p, ep - p);
452*ccdc9c3eSSadaf Ebrahimi   using std::swap;
453*ccdc9c3eSSadaf Ebrahimi   swap(out, *str);
454*ccdc9c3eSSadaf Ebrahimi   return count;
455*ccdc9c3eSSadaf Ebrahimi }
456*ccdc9c3eSSadaf Ebrahimi 
457*ccdc9c3eSSadaf Ebrahimi bool RE2::Extract(const StringPiece& text,
458*ccdc9c3eSSadaf Ebrahimi                   const RE2& re,
459*ccdc9c3eSSadaf Ebrahimi                   const StringPiece& rewrite,
460*ccdc9c3eSSadaf Ebrahimi                   string* out) {
461*ccdc9c3eSSadaf Ebrahimi   StringPiece vec[kVecSize];
462*ccdc9c3eSSadaf Ebrahimi   int nvec = 1 + MaxSubmatch(rewrite);
463*ccdc9c3eSSadaf Ebrahimi   if (nvec > arraysize(vec))
464*ccdc9c3eSSadaf Ebrahimi     return false;
465*ccdc9c3eSSadaf Ebrahimi 
466*ccdc9c3eSSadaf Ebrahimi   if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
467*ccdc9c3eSSadaf Ebrahimi     return false;
468*ccdc9c3eSSadaf Ebrahimi 
469*ccdc9c3eSSadaf Ebrahimi   out->clear();
470*ccdc9c3eSSadaf Ebrahimi   return re.Rewrite(out, rewrite, vec, nvec);
471*ccdc9c3eSSadaf Ebrahimi }
472*ccdc9c3eSSadaf Ebrahimi 
473*ccdc9c3eSSadaf Ebrahimi string RE2::QuoteMeta(const StringPiece& unquoted) {
474*ccdc9c3eSSadaf Ebrahimi   string result;
475*ccdc9c3eSSadaf Ebrahimi   result.reserve(unquoted.size() << 1);
476*ccdc9c3eSSadaf Ebrahimi 
477*ccdc9c3eSSadaf Ebrahimi   // Escape any ascii character not in [A-Za-z_0-9].
478*ccdc9c3eSSadaf Ebrahimi   //
479*ccdc9c3eSSadaf Ebrahimi   // Note that it's legal to escape a character even if it has no
480*ccdc9c3eSSadaf Ebrahimi   // special meaning in a regular expression -- so this function does
481*ccdc9c3eSSadaf Ebrahimi   // that.  (This also makes it identical to the perl function of the
482*ccdc9c3eSSadaf Ebrahimi   // same name except for the null-character special case;
483*ccdc9c3eSSadaf Ebrahimi   // see `perldoc -f quotemeta`.)
484*ccdc9c3eSSadaf Ebrahimi   for (size_t ii = 0; ii < unquoted.size(); ++ii) {
485*ccdc9c3eSSadaf Ebrahimi     // Note that using 'isalnum' here raises the benchmark time from
486*ccdc9c3eSSadaf Ebrahimi     // 32ns to 58ns:
487*ccdc9c3eSSadaf Ebrahimi     if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
488*ccdc9c3eSSadaf Ebrahimi         (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
489*ccdc9c3eSSadaf Ebrahimi         (unquoted[ii] < '0' || unquoted[ii] > '9') &&
490*ccdc9c3eSSadaf Ebrahimi         unquoted[ii] != '_' &&
491*ccdc9c3eSSadaf Ebrahimi         // If this is the part of a UTF8 or Latin1 character, we need
492*ccdc9c3eSSadaf Ebrahimi         // to copy this byte without escaping.  Experimentally this is
493*ccdc9c3eSSadaf Ebrahimi         // what works correctly with the regexp library.
494*ccdc9c3eSSadaf Ebrahimi         !(unquoted[ii] & 128)) {
495*ccdc9c3eSSadaf Ebrahimi       if (unquoted[ii] == '\0') {  // Special handling for null chars.
496*ccdc9c3eSSadaf Ebrahimi         // Note that this special handling is not strictly required for RE2,
497*ccdc9c3eSSadaf Ebrahimi         // but this quoting is required for other regexp libraries such as
498*ccdc9c3eSSadaf Ebrahimi         // PCRE.
499*ccdc9c3eSSadaf Ebrahimi         // Can't use "\\0" since the next character might be a digit.
500*ccdc9c3eSSadaf Ebrahimi         result += "\\x00";
501*ccdc9c3eSSadaf Ebrahimi         continue;
502*ccdc9c3eSSadaf Ebrahimi       }
503*ccdc9c3eSSadaf Ebrahimi       result += '\\';
504*ccdc9c3eSSadaf Ebrahimi     }
505*ccdc9c3eSSadaf Ebrahimi     result += unquoted[ii];
506*ccdc9c3eSSadaf Ebrahimi   }
507*ccdc9c3eSSadaf Ebrahimi 
508*ccdc9c3eSSadaf Ebrahimi   return result;
509*ccdc9c3eSSadaf Ebrahimi }
510*ccdc9c3eSSadaf Ebrahimi 
511*ccdc9c3eSSadaf Ebrahimi bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
512*ccdc9c3eSSadaf Ebrahimi   if (prog_ == NULL)
513*ccdc9c3eSSadaf Ebrahimi     return false;
514*ccdc9c3eSSadaf Ebrahimi 
515*ccdc9c3eSSadaf Ebrahimi   int n = static_cast<int>(prefix_.size());
516*ccdc9c3eSSadaf Ebrahimi   if (n > maxlen)
517*ccdc9c3eSSadaf Ebrahimi     n = maxlen;
518*ccdc9c3eSSadaf Ebrahimi 
519*ccdc9c3eSSadaf Ebrahimi   // Determine initial min max from prefix_ literal.
520*ccdc9c3eSSadaf Ebrahimi   *min = prefix_.substr(0, n);
521*ccdc9c3eSSadaf Ebrahimi   *max = prefix_.substr(0, n);
522*ccdc9c3eSSadaf Ebrahimi   if (prefix_foldcase_) {
523*ccdc9c3eSSadaf Ebrahimi     // prefix is ASCII lowercase; change *min to uppercase.
524*ccdc9c3eSSadaf Ebrahimi     for (int i = 0; i < n; i++) {
525*ccdc9c3eSSadaf Ebrahimi       char& c = (*min)[i];
526*ccdc9c3eSSadaf Ebrahimi       if ('a' <= c && c <= 'z')
527*ccdc9c3eSSadaf Ebrahimi         c += 'A' - 'a';
528*ccdc9c3eSSadaf Ebrahimi     }
529*ccdc9c3eSSadaf Ebrahimi   }
530*ccdc9c3eSSadaf Ebrahimi 
531*ccdc9c3eSSadaf Ebrahimi   // Add to prefix min max using PossibleMatchRange on regexp.
532*ccdc9c3eSSadaf Ebrahimi   string dmin, dmax;
533*ccdc9c3eSSadaf Ebrahimi   maxlen -= n;
534*ccdc9c3eSSadaf Ebrahimi   if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
535*ccdc9c3eSSadaf Ebrahimi     min->append(dmin);
536*ccdc9c3eSSadaf Ebrahimi     max->append(dmax);
537*ccdc9c3eSSadaf Ebrahimi   } else if (!max->empty()) {
538*ccdc9c3eSSadaf Ebrahimi     // prog_->PossibleMatchRange has failed us,
539*ccdc9c3eSSadaf Ebrahimi     // but we still have useful information from prefix_.
540*ccdc9c3eSSadaf Ebrahimi     // Round up *max to allow any possible suffix.
541*ccdc9c3eSSadaf Ebrahimi     PrefixSuccessor(max);
542*ccdc9c3eSSadaf Ebrahimi   } else {
543*ccdc9c3eSSadaf Ebrahimi     // Nothing useful.
544*ccdc9c3eSSadaf Ebrahimi     *min = "";
545*ccdc9c3eSSadaf Ebrahimi     *max = "";
546*ccdc9c3eSSadaf Ebrahimi     return false;
547*ccdc9c3eSSadaf Ebrahimi   }
548*ccdc9c3eSSadaf Ebrahimi 
549*ccdc9c3eSSadaf Ebrahimi   return true;
550*ccdc9c3eSSadaf Ebrahimi }
551*ccdc9c3eSSadaf Ebrahimi 
552*ccdc9c3eSSadaf Ebrahimi // Avoid possible locale nonsense in standard strcasecmp.
553*ccdc9c3eSSadaf Ebrahimi // The string a is known to be all lowercase.
554*ccdc9c3eSSadaf Ebrahimi static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
555*ccdc9c3eSSadaf Ebrahimi   const char* ae = a + len;
556*ccdc9c3eSSadaf Ebrahimi 
557*ccdc9c3eSSadaf Ebrahimi   for (; a < ae; a++, b++) {
558*ccdc9c3eSSadaf Ebrahimi     uint8_t x = *a;
559*ccdc9c3eSSadaf Ebrahimi     uint8_t y = *b;
560*ccdc9c3eSSadaf Ebrahimi     if ('A' <= y && y <= 'Z')
561*ccdc9c3eSSadaf Ebrahimi       y += 'a' - 'A';
562*ccdc9c3eSSadaf Ebrahimi     if (x != y)
563*ccdc9c3eSSadaf Ebrahimi       return x - y;
564*ccdc9c3eSSadaf Ebrahimi   }
565*ccdc9c3eSSadaf Ebrahimi   return 0;
566*ccdc9c3eSSadaf Ebrahimi }
567*ccdc9c3eSSadaf Ebrahimi 
568*ccdc9c3eSSadaf Ebrahimi 
569*ccdc9c3eSSadaf Ebrahimi /***** Actual matching and rewriting code *****/
570*ccdc9c3eSSadaf Ebrahimi 
571*ccdc9c3eSSadaf Ebrahimi bool RE2::Match(const StringPiece& text,
572*ccdc9c3eSSadaf Ebrahimi                 size_t startpos,
573*ccdc9c3eSSadaf Ebrahimi                 size_t endpos,
574*ccdc9c3eSSadaf Ebrahimi                 Anchor re_anchor,
575*ccdc9c3eSSadaf Ebrahimi                 StringPiece* submatch,
576*ccdc9c3eSSadaf Ebrahimi                 int nsubmatch) const {
577*ccdc9c3eSSadaf Ebrahimi   if (!ok()) {
578*ccdc9c3eSSadaf Ebrahimi     if (options_.log_errors())
579*ccdc9c3eSSadaf Ebrahimi       LOG(ERROR) << "Invalid RE2: " << *error_;
580*ccdc9c3eSSadaf Ebrahimi     return false;
581*ccdc9c3eSSadaf Ebrahimi   }
582*ccdc9c3eSSadaf Ebrahimi 
583*ccdc9c3eSSadaf Ebrahimi   if (startpos > endpos || endpos > text.size()) {
584*ccdc9c3eSSadaf Ebrahimi     if (options_.log_errors())
585*ccdc9c3eSSadaf Ebrahimi       LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
586*ccdc9c3eSSadaf Ebrahimi                  << "startpos: " << startpos << ", "
587*ccdc9c3eSSadaf Ebrahimi                  << "endpos: " << endpos << ", "
588*ccdc9c3eSSadaf Ebrahimi                  << "text size: " << text.size() << "]";
589*ccdc9c3eSSadaf Ebrahimi     return false;
590*ccdc9c3eSSadaf Ebrahimi   }
591*ccdc9c3eSSadaf Ebrahimi 
592*ccdc9c3eSSadaf Ebrahimi   StringPiece subtext = text;
593*ccdc9c3eSSadaf Ebrahimi   subtext.remove_prefix(startpos);
594*ccdc9c3eSSadaf Ebrahimi   subtext.remove_suffix(text.size() - endpos);
595*ccdc9c3eSSadaf Ebrahimi 
596*ccdc9c3eSSadaf Ebrahimi   // Use DFAs to find exact location of match, filter out non-matches.
597*ccdc9c3eSSadaf Ebrahimi 
598*ccdc9c3eSSadaf Ebrahimi   // Don't ask for the location if we won't use it.
599*ccdc9c3eSSadaf Ebrahimi   // SearchDFA can do extra optimizations in that case.
600*ccdc9c3eSSadaf Ebrahimi   StringPiece match;
601*ccdc9c3eSSadaf Ebrahimi   StringPiece* matchp = &match;
602*ccdc9c3eSSadaf Ebrahimi   if (nsubmatch == 0)
603*ccdc9c3eSSadaf Ebrahimi     matchp = NULL;
604*ccdc9c3eSSadaf Ebrahimi 
605*ccdc9c3eSSadaf Ebrahimi   int ncap = 1 + NumberOfCapturingGroups();
606*ccdc9c3eSSadaf Ebrahimi   if (ncap > nsubmatch)
607*ccdc9c3eSSadaf Ebrahimi     ncap = nsubmatch;
608*ccdc9c3eSSadaf Ebrahimi 
609*ccdc9c3eSSadaf Ebrahimi   // If the regexp is anchored explicitly, must not be in middle of text.
610*ccdc9c3eSSadaf Ebrahimi   if (prog_->anchor_start() && startpos != 0)
611*ccdc9c3eSSadaf Ebrahimi     return false;
612*ccdc9c3eSSadaf Ebrahimi 
613*ccdc9c3eSSadaf Ebrahimi   // If the regexp is anchored explicitly, update re_anchor
614*ccdc9c3eSSadaf Ebrahimi   // so that we can potentially fall into a faster case below.
615*ccdc9c3eSSadaf Ebrahimi   if (prog_->anchor_start() && prog_->anchor_end())
616*ccdc9c3eSSadaf Ebrahimi     re_anchor = ANCHOR_BOTH;
617*ccdc9c3eSSadaf Ebrahimi   else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
618*ccdc9c3eSSadaf Ebrahimi     re_anchor = ANCHOR_START;
619*ccdc9c3eSSadaf Ebrahimi 
620*ccdc9c3eSSadaf Ebrahimi   // Check for the required prefix, if any.
621*ccdc9c3eSSadaf Ebrahimi   size_t prefixlen = 0;
622*ccdc9c3eSSadaf Ebrahimi   if (!prefix_.empty()) {
623*ccdc9c3eSSadaf Ebrahimi     if (startpos != 0)
624*ccdc9c3eSSadaf Ebrahimi       return false;
625*ccdc9c3eSSadaf Ebrahimi     prefixlen = prefix_.size();
626*ccdc9c3eSSadaf Ebrahimi     if (prefixlen > subtext.size())
627*ccdc9c3eSSadaf Ebrahimi       return false;
628*ccdc9c3eSSadaf Ebrahimi     if (prefix_foldcase_) {
629*ccdc9c3eSSadaf Ebrahimi       if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
630*ccdc9c3eSSadaf Ebrahimi         return false;
631*ccdc9c3eSSadaf Ebrahimi     } else {
632*ccdc9c3eSSadaf Ebrahimi       if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
633*ccdc9c3eSSadaf Ebrahimi         return false;
634*ccdc9c3eSSadaf Ebrahimi     }
635*ccdc9c3eSSadaf Ebrahimi     subtext.remove_prefix(prefixlen);
636*ccdc9c3eSSadaf Ebrahimi     // If there is a required prefix, the anchor must be at least ANCHOR_START.
637*ccdc9c3eSSadaf Ebrahimi     if (re_anchor != ANCHOR_BOTH)
638*ccdc9c3eSSadaf Ebrahimi       re_anchor = ANCHOR_START;
639*ccdc9c3eSSadaf Ebrahimi   }
640*ccdc9c3eSSadaf Ebrahimi 
641*ccdc9c3eSSadaf Ebrahimi   Prog::Anchor anchor = Prog::kUnanchored;
642*ccdc9c3eSSadaf Ebrahimi   Prog::MatchKind kind = Prog::kFirstMatch;
643*ccdc9c3eSSadaf Ebrahimi   if (options_.longest_match())
644*ccdc9c3eSSadaf Ebrahimi     kind = Prog::kLongestMatch;
645*ccdc9c3eSSadaf Ebrahimi   bool skipped_test = false;
646*ccdc9c3eSSadaf Ebrahimi 
647*ccdc9c3eSSadaf Ebrahimi   bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
648*ccdc9c3eSSadaf Ebrahimi 
649*ccdc9c3eSSadaf Ebrahimi   // SearchBitState allocates a bit vector of size prog_->size() * text.size().
650*ccdc9c3eSSadaf Ebrahimi   // It also allocates a stack of 3-word structures which could potentially
651*ccdc9c3eSSadaf Ebrahimi   // grow as large as prog_->size() * text.size() but in practice is much
652*ccdc9c3eSSadaf Ebrahimi   // smaller.
653*ccdc9c3eSSadaf Ebrahimi   // Conditions for using SearchBitState:
654*ccdc9c3eSSadaf Ebrahimi   const int MaxBitStateProg = 500;   // prog_->size() <= Max.
655*ccdc9c3eSSadaf Ebrahimi   const int MaxBitStateVector = 256*1024;  // bit vector size <= Max (bits)
656*ccdc9c3eSSadaf Ebrahimi   bool can_bit_state = prog_->size() <= MaxBitStateProg;
657*ccdc9c3eSSadaf Ebrahimi   size_t bit_state_text_max = MaxBitStateVector / prog_->size();
658*ccdc9c3eSSadaf Ebrahimi 
659*ccdc9c3eSSadaf Ebrahimi   bool dfa_failed = false;
660*ccdc9c3eSSadaf Ebrahimi   switch (re_anchor) {
661*ccdc9c3eSSadaf Ebrahimi     default:
662*ccdc9c3eSSadaf Ebrahimi     case UNANCHORED: {
663*ccdc9c3eSSadaf Ebrahimi       if (!prog_->SearchDFA(subtext, text, anchor, kind,
664*ccdc9c3eSSadaf Ebrahimi                             matchp, &dfa_failed, NULL)) {
665*ccdc9c3eSSadaf Ebrahimi         if (dfa_failed) {
666*ccdc9c3eSSadaf Ebrahimi           if (options_.log_errors())
667*ccdc9c3eSSadaf Ebrahimi             LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
668*ccdc9c3eSSadaf Ebrahimi                        << "bytemap range " << prog_->bytemap_range() << ", "
669*ccdc9c3eSSadaf Ebrahimi                        << "list count " << prog_->list_count();
670*ccdc9c3eSSadaf Ebrahimi           // Fall back to NFA below.
671*ccdc9c3eSSadaf Ebrahimi           skipped_test = true;
672*ccdc9c3eSSadaf Ebrahimi           break;
673*ccdc9c3eSSadaf Ebrahimi         }
674*ccdc9c3eSSadaf Ebrahimi         return false;
675*ccdc9c3eSSadaf Ebrahimi       }
676*ccdc9c3eSSadaf Ebrahimi       if (matchp == NULL)  // Matched.  Don't care where
677*ccdc9c3eSSadaf Ebrahimi         return true;
678*ccdc9c3eSSadaf Ebrahimi       // SearchDFA set match[0].end() but didn't know where the
679*ccdc9c3eSSadaf Ebrahimi       // match started.  Run the regexp backward from match[0].end()
680*ccdc9c3eSSadaf Ebrahimi       // to find the longest possible match -- that's where it started.
681*ccdc9c3eSSadaf Ebrahimi       Prog* prog = ReverseProg();
682*ccdc9c3eSSadaf Ebrahimi       if (prog == NULL)
683*ccdc9c3eSSadaf Ebrahimi         return false;
684*ccdc9c3eSSadaf Ebrahimi       if (!prog->SearchDFA(match, text, Prog::kAnchored,
685*ccdc9c3eSSadaf Ebrahimi                            Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
686*ccdc9c3eSSadaf Ebrahimi         if (dfa_failed) {
687*ccdc9c3eSSadaf Ebrahimi           if (options_.log_errors())
688*ccdc9c3eSSadaf Ebrahimi             LOG(ERROR) << "DFA out of memory: size " << prog->size() << ", "
689*ccdc9c3eSSadaf Ebrahimi                        << "bytemap range " << prog->bytemap_range() << ", "
690*ccdc9c3eSSadaf Ebrahimi                        << "list count " << prog->list_count();
691*ccdc9c3eSSadaf Ebrahimi           // Fall back to NFA below.
692*ccdc9c3eSSadaf Ebrahimi           skipped_test = true;
693*ccdc9c3eSSadaf Ebrahimi           break;
694*ccdc9c3eSSadaf Ebrahimi         }
695*ccdc9c3eSSadaf Ebrahimi         if (options_.log_errors())
696*ccdc9c3eSSadaf Ebrahimi           LOG(ERROR) << "SearchDFA inconsistency";
697*ccdc9c3eSSadaf Ebrahimi         return false;
698*ccdc9c3eSSadaf Ebrahimi       }
699*ccdc9c3eSSadaf Ebrahimi       break;
700*ccdc9c3eSSadaf Ebrahimi     }
701*ccdc9c3eSSadaf Ebrahimi 
702*ccdc9c3eSSadaf Ebrahimi     case ANCHOR_BOTH:
703*ccdc9c3eSSadaf Ebrahimi     case ANCHOR_START:
704*ccdc9c3eSSadaf Ebrahimi       if (re_anchor == ANCHOR_BOTH)
705*ccdc9c3eSSadaf Ebrahimi         kind = Prog::kFullMatch;
706*ccdc9c3eSSadaf Ebrahimi       anchor = Prog::kAnchored;
707*ccdc9c3eSSadaf Ebrahimi 
708*ccdc9c3eSSadaf Ebrahimi       // If only a small amount of text and need submatch
709*ccdc9c3eSSadaf Ebrahimi       // information anyway and we're going to use OnePass or BitState
710*ccdc9c3eSSadaf Ebrahimi       // to get it, we might as well not even bother with the DFA:
711*ccdc9c3eSSadaf Ebrahimi       // OnePass or BitState will be fast enough.
712*ccdc9c3eSSadaf Ebrahimi       // On tiny texts, OnePass outruns even the DFA, and
713*ccdc9c3eSSadaf Ebrahimi       // it doesn't have the shared state and occasional mutex that
714*ccdc9c3eSSadaf Ebrahimi       // the DFA does.
715*ccdc9c3eSSadaf Ebrahimi       if (can_one_pass && text.size() <= 4096 &&
716*ccdc9c3eSSadaf Ebrahimi           (ncap > 1 || text.size() <= 8)) {
717*ccdc9c3eSSadaf Ebrahimi         skipped_test = true;
718*ccdc9c3eSSadaf Ebrahimi         break;
719*ccdc9c3eSSadaf Ebrahimi       }
720*ccdc9c3eSSadaf Ebrahimi       if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
721*ccdc9c3eSSadaf Ebrahimi         skipped_test = true;
722*ccdc9c3eSSadaf Ebrahimi         break;
723*ccdc9c3eSSadaf Ebrahimi       }
724*ccdc9c3eSSadaf Ebrahimi       if (!prog_->SearchDFA(subtext, text, anchor, kind,
725*ccdc9c3eSSadaf Ebrahimi                             &match, &dfa_failed, NULL)) {
726*ccdc9c3eSSadaf Ebrahimi         if (dfa_failed) {
727*ccdc9c3eSSadaf Ebrahimi           if (options_.log_errors())
728*ccdc9c3eSSadaf Ebrahimi             LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
729*ccdc9c3eSSadaf Ebrahimi                        << "bytemap range " << prog_->bytemap_range() << ", "
730*ccdc9c3eSSadaf Ebrahimi                        << "list count " << prog_->list_count();
731*ccdc9c3eSSadaf Ebrahimi           // Fall back to NFA below.
732*ccdc9c3eSSadaf Ebrahimi           skipped_test = true;
733*ccdc9c3eSSadaf Ebrahimi           break;
734*ccdc9c3eSSadaf Ebrahimi         }
735*ccdc9c3eSSadaf Ebrahimi         return false;
736*ccdc9c3eSSadaf Ebrahimi       }
737*ccdc9c3eSSadaf Ebrahimi       break;
738*ccdc9c3eSSadaf Ebrahimi   }
739*ccdc9c3eSSadaf Ebrahimi 
740*ccdc9c3eSSadaf Ebrahimi   if (!skipped_test && ncap <= 1) {
741*ccdc9c3eSSadaf Ebrahimi     // We know exactly where it matches.  That's enough.
742*ccdc9c3eSSadaf Ebrahimi     if (ncap == 1)
743*ccdc9c3eSSadaf Ebrahimi       submatch[0] = match;
744*ccdc9c3eSSadaf Ebrahimi   } else {
745*ccdc9c3eSSadaf Ebrahimi     StringPiece subtext1;
746*ccdc9c3eSSadaf Ebrahimi     if (skipped_test) {
747*ccdc9c3eSSadaf Ebrahimi       // DFA ran out of memory or was skipped:
748*ccdc9c3eSSadaf Ebrahimi       // need to search in entire original text.
749*ccdc9c3eSSadaf Ebrahimi       subtext1 = subtext;
750*ccdc9c3eSSadaf Ebrahimi     } else {
751*ccdc9c3eSSadaf Ebrahimi       // DFA found the exact match location:
752*ccdc9c3eSSadaf Ebrahimi       // let NFA run an anchored, full match search
753*ccdc9c3eSSadaf Ebrahimi       // to find submatch locations.
754*ccdc9c3eSSadaf Ebrahimi       subtext1 = match;
755*ccdc9c3eSSadaf Ebrahimi       anchor = Prog::kAnchored;
756*ccdc9c3eSSadaf Ebrahimi       kind = Prog::kFullMatch;
757*ccdc9c3eSSadaf Ebrahimi     }
758*ccdc9c3eSSadaf Ebrahimi 
759*ccdc9c3eSSadaf Ebrahimi     if (can_one_pass && anchor != Prog::kUnanchored) {
760*ccdc9c3eSSadaf Ebrahimi       if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
761*ccdc9c3eSSadaf Ebrahimi         if (!skipped_test && options_.log_errors())
762*ccdc9c3eSSadaf Ebrahimi           LOG(ERROR) << "SearchOnePass inconsistency";
763*ccdc9c3eSSadaf Ebrahimi         return false;
764*ccdc9c3eSSadaf Ebrahimi       }
765*ccdc9c3eSSadaf Ebrahimi     } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
766*ccdc9c3eSSadaf Ebrahimi       if (!prog_->SearchBitState(subtext1, text, anchor,
767*ccdc9c3eSSadaf Ebrahimi                                  kind, submatch, ncap)) {
768*ccdc9c3eSSadaf Ebrahimi         if (!skipped_test && options_.log_errors())
769*ccdc9c3eSSadaf Ebrahimi           LOG(ERROR) << "SearchBitState inconsistency";
770*ccdc9c3eSSadaf Ebrahimi         return false;
771*ccdc9c3eSSadaf Ebrahimi       }
772*ccdc9c3eSSadaf Ebrahimi     } else {
773*ccdc9c3eSSadaf Ebrahimi       if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
774*ccdc9c3eSSadaf Ebrahimi         if (!skipped_test && options_.log_errors())
775*ccdc9c3eSSadaf Ebrahimi           LOG(ERROR) << "SearchNFA inconsistency";
776*ccdc9c3eSSadaf Ebrahimi         return false;
777*ccdc9c3eSSadaf Ebrahimi       }
778*ccdc9c3eSSadaf Ebrahimi     }
779*ccdc9c3eSSadaf Ebrahimi   }
780*ccdc9c3eSSadaf Ebrahimi 
781*ccdc9c3eSSadaf Ebrahimi   // Adjust overall match for required prefix that we stripped off.
782*ccdc9c3eSSadaf Ebrahimi   if (prefixlen > 0 && nsubmatch > 0)
783*ccdc9c3eSSadaf Ebrahimi     submatch[0] = StringPiece(submatch[0].data() - prefixlen,
784*ccdc9c3eSSadaf Ebrahimi                               submatch[0].size() + prefixlen);
785*ccdc9c3eSSadaf Ebrahimi 
786*ccdc9c3eSSadaf Ebrahimi   // Zero submatches that don't exist in the regexp.
787*ccdc9c3eSSadaf Ebrahimi   for (int i = ncap; i < nsubmatch; i++)
788*ccdc9c3eSSadaf Ebrahimi     submatch[i] = StringPiece();
789*ccdc9c3eSSadaf Ebrahimi   return true;
790*ccdc9c3eSSadaf Ebrahimi }
791*ccdc9c3eSSadaf Ebrahimi 
792*ccdc9c3eSSadaf Ebrahimi // Internal matcher - like Match() but takes Args not StringPieces.
793*ccdc9c3eSSadaf Ebrahimi bool RE2::DoMatch(const StringPiece& text,
794*ccdc9c3eSSadaf Ebrahimi                   Anchor re_anchor,
795*ccdc9c3eSSadaf Ebrahimi                   size_t* consumed,
796*ccdc9c3eSSadaf Ebrahimi                   const Arg* const* args,
797*ccdc9c3eSSadaf Ebrahimi                   int n) const {
798*ccdc9c3eSSadaf Ebrahimi   if (!ok()) {
799*ccdc9c3eSSadaf Ebrahimi     if (options_.log_errors())
800*ccdc9c3eSSadaf Ebrahimi       LOG(ERROR) << "Invalid RE2: " << *error_;
801*ccdc9c3eSSadaf Ebrahimi     return false;
802*ccdc9c3eSSadaf Ebrahimi   }
803*ccdc9c3eSSadaf Ebrahimi 
804*ccdc9c3eSSadaf Ebrahimi   if (NumberOfCapturingGroups() < n) {
805*ccdc9c3eSSadaf Ebrahimi     // RE has fewer capturing groups than number of Arg pointers passed in.
806*ccdc9c3eSSadaf Ebrahimi     return false;
807*ccdc9c3eSSadaf Ebrahimi   }
808*ccdc9c3eSSadaf Ebrahimi 
809*ccdc9c3eSSadaf Ebrahimi   // Count number of capture groups needed.
810*ccdc9c3eSSadaf Ebrahimi   int nvec;
811*ccdc9c3eSSadaf Ebrahimi   if (n == 0 && consumed == NULL)
812*ccdc9c3eSSadaf Ebrahimi     nvec = 0;
813*ccdc9c3eSSadaf Ebrahimi   else
814*ccdc9c3eSSadaf Ebrahimi     nvec = n+1;
815*ccdc9c3eSSadaf Ebrahimi 
816*ccdc9c3eSSadaf Ebrahimi   StringPiece* vec;
817*ccdc9c3eSSadaf Ebrahimi   StringPiece stkvec[kVecSize];
818*ccdc9c3eSSadaf Ebrahimi   StringPiece* heapvec = NULL;
819*ccdc9c3eSSadaf Ebrahimi 
820*ccdc9c3eSSadaf Ebrahimi   if (nvec <= arraysize(stkvec)) {
821*ccdc9c3eSSadaf Ebrahimi     vec = stkvec;
822*ccdc9c3eSSadaf Ebrahimi   } else {
823*ccdc9c3eSSadaf Ebrahimi     vec = new StringPiece[nvec];
824*ccdc9c3eSSadaf Ebrahimi     heapvec = vec;
825*ccdc9c3eSSadaf Ebrahimi   }
826*ccdc9c3eSSadaf Ebrahimi 
827*ccdc9c3eSSadaf Ebrahimi   if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) {
828*ccdc9c3eSSadaf Ebrahimi     delete[] heapvec;
829*ccdc9c3eSSadaf Ebrahimi     return false;
830*ccdc9c3eSSadaf Ebrahimi   }
831*ccdc9c3eSSadaf Ebrahimi 
832*ccdc9c3eSSadaf Ebrahimi   if (consumed != NULL)
833*ccdc9c3eSSadaf Ebrahimi     *consumed = static_cast<size_t>(vec[0].end() - text.begin());
834*ccdc9c3eSSadaf Ebrahimi 
835*ccdc9c3eSSadaf Ebrahimi   if (n == 0 || args == NULL) {
836*ccdc9c3eSSadaf Ebrahimi     // We are not interested in results
837*ccdc9c3eSSadaf Ebrahimi     delete[] heapvec;
838*ccdc9c3eSSadaf Ebrahimi     return true;
839*ccdc9c3eSSadaf Ebrahimi   }
840*ccdc9c3eSSadaf Ebrahimi 
841*ccdc9c3eSSadaf Ebrahimi   // If we got here, we must have matched the whole pattern.
842*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < n; i++) {
843*ccdc9c3eSSadaf Ebrahimi     const StringPiece& s = vec[i+1];
844*ccdc9c3eSSadaf Ebrahimi     if (!args[i]->Parse(s.data(), s.size())) {
845*ccdc9c3eSSadaf Ebrahimi       // TODO: Should we indicate what the error was?
846*ccdc9c3eSSadaf Ebrahimi       delete[] heapvec;
847*ccdc9c3eSSadaf Ebrahimi       return false;
848*ccdc9c3eSSadaf Ebrahimi     }
849*ccdc9c3eSSadaf Ebrahimi   }
850*ccdc9c3eSSadaf Ebrahimi 
851*ccdc9c3eSSadaf Ebrahimi   delete[] heapvec;
852*ccdc9c3eSSadaf Ebrahimi   return true;
853*ccdc9c3eSSadaf Ebrahimi }
854*ccdc9c3eSSadaf Ebrahimi 
855*ccdc9c3eSSadaf Ebrahimi // Checks that the rewrite string is well-formed with respect to this
856*ccdc9c3eSSadaf Ebrahimi // regular expression.
857*ccdc9c3eSSadaf Ebrahimi bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
858*ccdc9c3eSSadaf Ebrahimi   int max_token = -1;
859*ccdc9c3eSSadaf Ebrahimi   for (const char *s = rewrite.data(), *end = s + rewrite.size();
860*ccdc9c3eSSadaf Ebrahimi        s < end; s++) {
861*ccdc9c3eSSadaf Ebrahimi     int c = *s;
862*ccdc9c3eSSadaf Ebrahimi     if (c != '\\') {
863*ccdc9c3eSSadaf Ebrahimi       continue;
864*ccdc9c3eSSadaf Ebrahimi     }
865*ccdc9c3eSSadaf Ebrahimi     if (++s == end) {
866*ccdc9c3eSSadaf Ebrahimi       *error = "Rewrite schema error: '\\' not allowed at end.";
867*ccdc9c3eSSadaf Ebrahimi       return false;
868*ccdc9c3eSSadaf Ebrahimi     }
869*ccdc9c3eSSadaf Ebrahimi     c = *s;
870*ccdc9c3eSSadaf Ebrahimi     if (c == '\\') {
871*ccdc9c3eSSadaf Ebrahimi       continue;
872*ccdc9c3eSSadaf Ebrahimi     }
873*ccdc9c3eSSadaf Ebrahimi     if (!isdigit(c)) {
874*ccdc9c3eSSadaf Ebrahimi       *error = "Rewrite schema error: "
875*ccdc9c3eSSadaf Ebrahimi                "'\\' must be followed by a digit or '\\'.";
876*ccdc9c3eSSadaf Ebrahimi       return false;
877*ccdc9c3eSSadaf Ebrahimi     }
878*ccdc9c3eSSadaf Ebrahimi     int n = (c - '0');
879*ccdc9c3eSSadaf Ebrahimi     if (max_token < n) {
880*ccdc9c3eSSadaf Ebrahimi       max_token = n;
881*ccdc9c3eSSadaf Ebrahimi     }
882*ccdc9c3eSSadaf Ebrahimi   }
883*ccdc9c3eSSadaf Ebrahimi 
884*ccdc9c3eSSadaf Ebrahimi   if (max_token > NumberOfCapturingGroups()) {
885*ccdc9c3eSSadaf Ebrahimi     SStringPrintf(error, "Rewrite schema requests %d matches, "
886*ccdc9c3eSSadaf Ebrahimi                   "but the regexp only has %d parenthesized subexpressions.",
887*ccdc9c3eSSadaf Ebrahimi                   max_token, NumberOfCapturingGroups());
888*ccdc9c3eSSadaf Ebrahimi     return false;
889*ccdc9c3eSSadaf Ebrahimi   }
890*ccdc9c3eSSadaf Ebrahimi   return true;
891*ccdc9c3eSSadaf Ebrahimi }
892*ccdc9c3eSSadaf Ebrahimi 
893*ccdc9c3eSSadaf Ebrahimi // Returns the maximum submatch needed for the rewrite to be done by Replace().
894*ccdc9c3eSSadaf Ebrahimi // E.g. if rewrite == "foo \\2,\\1", returns 2.
895*ccdc9c3eSSadaf Ebrahimi int RE2::MaxSubmatch(const StringPiece& rewrite) {
896*ccdc9c3eSSadaf Ebrahimi   int max = 0;
897*ccdc9c3eSSadaf Ebrahimi   for (const char *s = rewrite.data(), *end = s + rewrite.size();
898*ccdc9c3eSSadaf Ebrahimi        s < end; s++) {
899*ccdc9c3eSSadaf Ebrahimi     if (*s == '\\') {
900*ccdc9c3eSSadaf Ebrahimi       s++;
901*ccdc9c3eSSadaf Ebrahimi       int c = (s < end) ? *s : -1;
902*ccdc9c3eSSadaf Ebrahimi       if (isdigit(c)) {
903*ccdc9c3eSSadaf Ebrahimi         int n = (c - '0');
904*ccdc9c3eSSadaf Ebrahimi         if (n > max)
905*ccdc9c3eSSadaf Ebrahimi           max = n;
906*ccdc9c3eSSadaf Ebrahimi       }
907*ccdc9c3eSSadaf Ebrahimi     }
908*ccdc9c3eSSadaf Ebrahimi   }
909*ccdc9c3eSSadaf Ebrahimi   return max;
910*ccdc9c3eSSadaf Ebrahimi }
911*ccdc9c3eSSadaf Ebrahimi 
912*ccdc9c3eSSadaf Ebrahimi // Append the "rewrite" string, with backslash subsitutions from "vec",
913*ccdc9c3eSSadaf Ebrahimi // to string "out".
914*ccdc9c3eSSadaf Ebrahimi bool RE2::Rewrite(string* out,
915*ccdc9c3eSSadaf Ebrahimi                   const StringPiece& rewrite,
916*ccdc9c3eSSadaf Ebrahimi                   const StringPiece* vec,
917*ccdc9c3eSSadaf Ebrahimi                   int veclen) const {
918*ccdc9c3eSSadaf Ebrahimi   for (const char *s = rewrite.data(), *end = s + rewrite.size();
919*ccdc9c3eSSadaf Ebrahimi        s < end; s++) {
920*ccdc9c3eSSadaf Ebrahimi     if (*s != '\\') {
921*ccdc9c3eSSadaf Ebrahimi       out->push_back(*s);
922*ccdc9c3eSSadaf Ebrahimi       continue;
923*ccdc9c3eSSadaf Ebrahimi     }
924*ccdc9c3eSSadaf Ebrahimi     s++;
925*ccdc9c3eSSadaf Ebrahimi     int c = (s < end) ? *s : -1;
926*ccdc9c3eSSadaf Ebrahimi     if (isdigit(c)) {
927*ccdc9c3eSSadaf Ebrahimi       int n = (c - '0');
928*ccdc9c3eSSadaf Ebrahimi       if (n >= veclen) {
929*ccdc9c3eSSadaf Ebrahimi         if (options_.log_errors()) {
930*ccdc9c3eSSadaf Ebrahimi           LOG(ERROR) << "requested group " << n
931*ccdc9c3eSSadaf Ebrahimi                      << " in regexp " << rewrite.data();
932*ccdc9c3eSSadaf Ebrahimi         }
933*ccdc9c3eSSadaf Ebrahimi         return false;
934*ccdc9c3eSSadaf Ebrahimi       }
935*ccdc9c3eSSadaf Ebrahimi       StringPiece snip = vec[n];
936*ccdc9c3eSSadaf Ebrahimi       if (snip.size() > 0)
937*ccdc9c3eSSadaf Ebrahimi         out->append(snip.data(), snip.size());
938*ccdc9c3eSSadaf Ebrahimi     } else if (c == '\\') {
939*ccdc9c3eSSadaf Ebrahimi       out->push_back('\\');
940*ccdc9c3eSSadaf Ebrahimi     } else {
941*ccdc9c3eSSadaf Ebrahimi       if (options_.log_errors())
942*ccdc9c3eSSadaf Ebrahimi         LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
943*ccdc9c3eSSadaf Ebrahimi       return false;
944*ccdc9c3eSSadaf Ebrahimi     }
945*ccdc9c3eSSadaf Ebrahimi   }
946*ccdc9c3eSSadaf Ebrahimi   return true;
947*ccdc9c3eSSadaf Ebrahimi }
948*ccdc9c3eSSadaf Ebrahimi 
949*ccdc9c3eSSadaf Ebrahimi /***** Parsers for various types *****/
950*ccdc9c3eSSadaf Ebrahimi 
951*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_null(const char* str, size_t n, void* dest) {
952*ccdc9c3eSSadaf Ebrahimi   // We fail if somebody asked us to store into a non-NULL void* pointer
953*ccdc9c3eSSadaf Ebrahimi   return (dest == NULL);
954*ccdc9c3eSSadaf Ebrahimi }
955*ccdc9c3eSSadaf Ebrahimi 
956*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_string(const char* str, size_t n, void* dest) {
957*ccdc9c3eSSadaf Ebrahimi   if (dest == NULL) return true;
958*ccdc9c3eSSadaf Ebrahimi   reinterpret_cast<string*>(dest)->assign(str, n);
959*ccdc9c3eSSadaf Ebrahimi   return true;
960*ccdc9c3eSSadaf Ebrahimi }
961*ccdc9c3eSSadaf Ebrahimi 
962*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_stringpiece(const char* str, size_t n, void* dest) {
963*ccdc9c3eSSadaf Ebrahimi   if (dest == NULL) return true;
964*ccdc9c3eSSadaf Ebrahimi   *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n);
965*ccdc9c3eSSadaf Ebrahimi   return true;
966*ccdc9c3eSSadaf Ebrahimi }
967*ccdc9c3eSSadaf Ebrahimi 
968*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_char(const char* str, size_t n, void* dest) {
969*ccdc9c3eSSadaf Ebrahimi   if (n != 1) return false;
970*ccdc9c3eSSadaf Ebrahimi   if (dest == NULL) return true;
971*ccdc9c3eSSadaf Ebrahimi   *(reinterpret_cast<char*>(dest)) = str[0];
972*ccdc9c3eSSadaf Ebrahimi   return true;
973*ccdc9c3eSSadaf Ebrahimi }
974*ccdc9c3eSSadaf Ebrahimi 
975*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_schar(const char* str, size_t n, void* dest) {
976*ccdc9c3eSSadaf Ebrahimi   if (n != 1) return false;
977*ccdc9c3eSSadaf Ebrahimi   if (dest == NULL) return true;
978*ccdc9c3eSSadaf Ebrahimi   *(reinterpret_cast<signed char*>(dest)) = str[0];
979*ccdc9c3eSSadaf Ebrahimi   return true;
980*ccdc9c3eSSadaf Ebrahimi }
981*ccdc9c3eSSadaf Ebrahimi 
982*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_uchar(const char* str, size_t n, void* dest) {
983*ccdc9c3eSSadaf Ebrahimi   if (n != 1) return false;
984*ccdc9c3eSSadaf Ebrahimi   if (dest == NULL) return true;
985*ccdc9c3eSSadaf Ebrahimi   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
986*ccdc9c3eSSadaf Ebrahimi   return true;
987*ccdc9c3eSSadaf Ebrahimi }
988*ccdc9c3eSSadaf Ebrahimi 
989*ccdc9c3eSSadaf Ebrahimi // Largest number spec that we are willing to parse
990*ccdc9c3eSSadaf Ebrahimi static const int kMaxNumberLength = 32;
991*ccdc9c3eSSadaf Ebrahimi 
992*ccdc9c3eSSadaf Ebrahimi // REQUIRES "buf" must have length at least nbuf.
993*ccdc9c3eSSadaf Ebrahimi // Copies "str" into "buf" and null-terminates.
994*ccdc9c3eSSadaf Ebrahimi // Overwrites *np with the new length.
995*ccdc9c3eSSadaf Ebrahimi static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
996*ccdc9c3eSSadaf Ebrahimi                                    size_t* np, bool accept_spaces) {
997*ccdc9c3eSSadaf Ebrahimi   size_t n = *np;
998*ccdc9c3eSSadaf Ebrahimi   if (n == 0) return "";
999*ccdc9c3eSSadaf Ebrahimi   if (n > 0 && isspace(*str)) {
1000*ccdc9c3eSSadaf Ebrahimi     // We are less forgiving than the strtoxxx() routines and do not
1001*ccdc9c3eSSadaf Ebrahimi     // allow leading spaces. We do allow leading spaces for floats.
1002*ccdc9c3eSSadaf Ebrahimi     if (!accept_spaces) {
1003*ccdc9c3eSSadaf Ebrahimi       return "";
1004*ccdc9c3eSSadaf Ebrahimi     }
1005*ccdc9c3eSSadaf Ebrahimi     while (n > 0 && isspace(*str)) {
1006*ccdc9c3eSSadaf Ebrahimi       n--;
1007*ccdc9c3eSSadaf Ebrahimi       str++;
1008*ccdc9c3eSSadaf Ebrahimi     }
1009*ccdc9c3eSSadaf Ebrahimi   }
1010*ccdc9c3eSSadaf Ebrahimi 
1011*ccdc9c3eSSadaf Ebrahimi   // Although buf has a fixed maximum size, we can still handle
1012*ccdc9c3eSSadaf Ebrahimi   // arbitrarily large integers correctly by omitting leading zeros.
1013*ccdc9c3eSSadaf Ebrahimi   // (Numbers that are still too long will be out of range.)
1014*ccdc9c3eSSadaf Ebrahimi   // Before deciding whether str is too long,
1015*ccdc9c3eSSadaf Ebrahimi   // remove leading zeros with s/000+/00/.
1016*ccdc9c3eSSadaf Ebrahimi   // Leaving the leading two zeros in place means that
1017*ccdc9c3eSSadaf Ebrahimi   // we don't change 0000x123 (invalid) into 0x123 (valid).
1018*ccdc9c3eSSadaf Ebrahimi   // Skip over leading - before replacing.
1019*ccdc9c3eSSadaf Ebrahimi   bool neg = false;
1020*ccdc9c3eSSadaf Ebrahimi   if (n >= 1 && str[0] == '-') {
1021*ccdc9c3eSSadaf Ebrahimi     neg = true;
1022*ccdc9c3eSSadaf Ebrahimi     n--;
1023*ccdc9c3eSSadaf Ebrahimi     str++;
1024*ccdc9c3eSSadaf Ebrahimi   }
1025*ccdc9c3eSSadaf Ebrahimi 
1026*ccdc9c3eSSadaf Ebrahimi   if (n >= 3 && str[0] == '0' && str[1] == '0') {
1027*ccdc9c3eSSadaf Ebrahimi     while (n >= 3 && str[2] == '0') {
1028*ccdc9c3eSSadaf Ebrahimi       n--;
1029*ccdc9c3eSSadaf Ebrahimi       str++;
1030*ccdc9c3eSSadaf Ebrahimi     }
1031*ccdc9c3eSSadaf Ebrahimi   }
1032*ccdc9c3eSSadaf Ebrahimi 
1033*ccdc9c3eSSadaf Ebrahimi   if (neg) {  // make room in buf for -
1034*ccdc9c3eSSadaf Ebrahimi     n++;
1035*ccdc9c3eSSadaf Ebrahimi     str--;
1036*ccdc9c3eSSadaf Ebrahimi   }
1037*ccdc9c3eSSadaf Ebrahimi 
1038*ccdc9c3eSSadaf Ebrahimi   if (n > nbuf-1) return "";
1039*ccdc9c3eSSadaf Ebrahimi 
1040*ccdc9c3eSSadaf Ebrahimi   memmove(buf, str, n);
1041*ccdc9c3eSSadaf Ebrahimi   if (neg) {
1042*ccdc9c3eSSadaf Ebrahimi     buf[0] = '-';
1043*ccdc9c3eSSadaf Ebrahimi   }
1044*ccdc9c3eSSadaf Ebrahimi   buf[n] = '\0';
1045*ccdc9c3eSSadaf Ebrahimi   *np = n;
1046*ccdc9c3eSSadaf Ebrahimi   return buf;
1047*ccdc9c3eSSadaf Ebrahimi }
1048*ccdc9c3eSSadaf Ebrahimi 
1049*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_long_radix(const char* str,
1050*ccdc9c3eSSadaf Ebrahimi                                 size_t n,
1051*ccdc9c3eSSadaf Ebrahimi                                 void* dest,
1052*ccdc9c3eSSadaf Ebrahimi                                 int radix) {
1053*ccdc9c3eSSadaf Ebrahimi   if (n == 0) return false;
1054*ccdc9c3eSSadaf Ebrahimi   char buf[kMaxNumberLength+1];
1055*ccdc9c3eSSadaf Ebrahimi   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1056*ccdc9c3eSSadaf Ebrahimi   char* end;
1057*ccdc9c3eSSadaf Ebrahimi   errno = 0;
1058*ccdc9c3eSSadaf Ebrahimi   long r = strtol(str, &end, radix);
1059*ccdc9c3eSSadaf Ebrahimi   if (end != str + n) return false;   // Leftover junk
1060*ccdc9c3eSSadaf Ebrahimi   if (errno) return false;
1061*ccdc9c3eSSadaf Ebrahimi   if (dest == NULL) return true;
1062*ccdc9c3eSSadaf Ebrahimi   *(reinterpret_cast<long*>(dest)) = r;
1063*ccdc9c3eSSadaf Ebrahimi   return true;
1064*ccdc9c3eSSadaf Ebrahimi }
1065*ccdc9c3eSSadaf Ebrahimi 
1066*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_ulong_radix(const char* str,
1067*ccdc9c3eSSadaf Ebrahimi                                  size_t n,
1068*ccdc9c3eSSadaf Ebrahimi                                  void* dest,
1069*ccdc9c3eSSadaf Ebrahimi                                  int radix) {
1070*ccdc9c3eSSadaf Ebrahimi   if (n == 0) return false;
1071*ccdc9c3eSSadaf Ebrahimi   char buf[kMaxNumberLength+1];
1072*ccdc9c3eSSadaf Ebrahimi   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1073*ccdc9c3eSSadaf Ebrahimi   if (str[0] == '-') {
1074*ccdc9c3eSSadaf Ebrahimi     // strtoul() will silently accept negative numbers and parse
1075*ccdc9c3eSSadaf Ebrahimi     // them.  This module is more strict and treats them as errors.
1076*ccdc9c3eSSadaf Ebrahimi     return false;
1077*ccdc9c3eSSadaf Ebrahimi   }
1078*ccdc9c3eSSadaf Ebrahimi 
1079*ccdc9c3eSSadaf Ebrahimi   char* end;
1080*ccdc9c3eSSadaf Ebrahimi   errno = 0;
1081*ccdc9c3eSSadaf Ebrahimi   unsigned long r = strtoul(str, &end, radix);
1082*ccdc9c3eSSadaf Ebrahimi   if (end != str + n) return false;   // Leftover junk
1083*ccdc9c3eSSadaf Ebrahimi   if (errno) return false;
1084*ccdc9c3eSSadaf Ebrahimi   if (dest == NULL) return true;
1085*ccdc9c3eSSadaf Ebrahimi   *(reinterpret_cast<unsigned long*>(dest)) = r;
1086*ccdc9c3eSSadaf Ebrahimi   return true;
1087*ccdc9c3eSSadaf Ebrahimi }
1088*ccdc9c3eSSadaf Ebrahimi 
1089*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_short_radix(const char* str,
1090*ccdc9c3eSSadaf Ebrahimi                                  size_t n,
1091*ccdc9c3eSSadaf Ebrahimi                                  void* dest,
1092*ccdc9c3eSSadaf Ebrahimi                                  int radix) {
1093*ccdc9c3eSSadaf Ebrahimi   long r;
1094*ccdc9c3eSSadaf Ebrahimi   if (!parse_long_radix(str, n, &r, radix)) return false;  // Could not parse
1095*ccdc9c3eSSadaf Ebrahimi   if ((short)r != r) return false;                         // Out of range
1096*ccdc9c3eSSadaf Ebrahimi   if (dest == NULL) return true;
1097*ccdc9c3eSSadaf Ebrahimi   *(reinterpret_cast<short*>(dest)) = (short)r;
1098*ccdc9c3eSSadaf Ebrahimi   return true;
1099*ccdc9c3eSSadaf Ebrahimi }
1100*ccdc9c3eSSadaf Ebrahimi 
1101*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_ushort_radix(const char* str,
1102*ccdc9c3eSSadaf Ebrahimi                                   size_t n,
1103*ccdc9c3eSSadaf Ebrahimi                                   void* dest,
1104*ccdc9c3eSSadaf Ebrahimi                                   int radix) {
1105*ccdc9c3eSSadaf Ebrahimi   unsigned long r;
1106*ccdc9c3eSSadaf Ebrahimi   if (!parse_ulong_radix(str, n, &r, radix)) return false;  // Could not parse
1107*ccdc9c3eSSadaf Ebrahimi   if ((unsigned short)r != r) return false;                 // Out of range
1108*ccdc9c3eSSadaf Ebrahimi   if (dest == NULL) return true;
1109*ccdc9c3eSSadaf Ebrahimi   *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
1110*ccdc9c3eSSadaf Ebrahimi   return true;
1111*ccdc9c3eSSadaf Ebrahimi }
1112*ccdc9c3eSSadaf Ebrahimi 
1113*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_int_radix(const char* str,
1114*ccdc9c3eSSadaf Ebrahimi                                size_t n,
1115*ccdc9c3eSSadaf Ebrahimi                                void* dest,
1116*ccdc9c3eSSadaf Ebrahimi                                int radix) {
1117*ccdc9c3eSSadaf Ebrahimi   long r;
1118*ccdc9c3eSSadaf Ebrahimi   if (!parse_long_radix(str, n, &r, radix)) return false;  // Could not parse
1119*ccdc9c3eSSadaf Ebrahimi   if ((int)r != r) return false;                           // Out of range
1120*ccdc9c3eSSadaf Ebrahimi   if (dest == NULL) return true;
1121*ccdc9c3eSSadaf Ebrahimi   *(reinterpret_cast<int*>(dest)) = (int)r;
1122*ccdc9c3eSSadaf Ebrahimi   return true;
1123*ccdc9c3eSSadaf Ebrahimi }
1124*ccdc9c3eSSadaf Ebrahimi 
1125*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_uint_radix(const char* str,
1126*ccdc9c3eSSadaf Ebrahimi                                 size_t n,
1127*ccdc9c3eSSadaf Ebrahimi                                 void* dest,
1128*ccdc9c3eSSadaf Ebrahimi                                 int radix) {
1129*ccdc9c3eSSadaf Ebrahimi   unsigned long r;
1130*ccdc9c3eSSadaf Ebrahimi   if (!parse_ulong_radix(str, n, &r, radix)) return false;  // Could not parse
1131*ccdc9c3eSSadaf Ebrahimi   if ((unsigned int)r != r) return false;                   // Out of range
1132*ccdc9c3eSSadaf Ebrahimi   if (dest == NULL) return true;
1133*ccdc9c3eSSadaf Ebrahimi   *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
1134*ccdc9c3eSSadaf Ebrahimi   return true;
1135*ccdc9c3eSSadaf Ebrahimi }
1136*ccdc9c3eSSadaf Ebrahimi 
1137*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_longlong_radix(const char* str,
1138*ccdc9c3eSSadaf Ebrahimi                                     size_t n,
1139*ccdc9c3eSSadaf Ebrahimi                                     void* dest,
1140*ccdc9c3eSSadaf Ebrahimi                                     int radix) {
1141*ccdc9c3eSSadaf Ebrahimi   if (n == 0) return false;
1142*ccdc9c3eSSadaf Ebrahimi   char buf[kMaxNumberLength+1];
1143*ccdc9c3eSSadaf Ebrahimi   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1144*ccdc9c3eSSadaf Ebrahimi   char* end;
1145*ccdc9c3eSSadaf Ebrahimi   errno = 0;
1146*ccdc9c3eSSadaf Ebrahimi   long long r = strtoll(str, &end, radix);
1147*ccdc9c3eSSadaf Ebrahimi   if (end != str + n) return false;   // Leftover junk
1148*ccdc9c3eSSadaf Ebrahimi   if (errno) return false;
1149*ccdc9c3eSSadaf Ebrahimi   if (dest == NULL) return true;
1150*ccdc9c3eSSadaf Ebrahimi   *(reinterpret_cast<long long*>(dest)) = r;
1151*ccdc9c3eSSadaf Ebrahimi   return true;
1152*ccdc9c3eSSadaf Ebrahimi }
1153*ccdc9c3eSSadaf Ebrahimi 
1154*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_ulonglong_radix(const char* str,
1155*ccdc9c3eSSadaf Ebrahimi                                      size_t n,
1156*ccdc9c3eSSadaf Ebrahimi                                      void* dest,
1157*ccdc9c3eSSadaf Ebrahimi                                      int radix) {
1158*ccdc9c3eSSadaf Ebrahimi   if (n == 0) return false;
1159*ccdc9c3eSSadaf Ebrahimi   char buf[kMaxNumberLength+1];
1160*ccdc9c3eSSadaf Ebrahimi   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1161*ccdc9c3eSSadaf Ebrahimi   if (str[0] == '-') {
1162*ccdc9c3eSSadaf Ebrahimi     // strtoull() will silently accept negative numbers and parse
1163*ccdc9c3eSSadaf Ebrahimi     // them.  This module is more strict and treats them as errors.
1164*ccdc9c3eSSadaf Ebrahimi     return false;
1165*ccdc9c3eSSadaf Ebrahimi   }
1166*ccdc9c3eSSadaf Ebrahimi   char* end;
1167*ccdc9c3eSSadaf Ebrahimi   errno = 0;
1168*ccdc9c3eSSadaf Ebrahimi   unsigned long long r = strtoull(str, &end, radix);
1169*ccdc9c3eSSadaf Ebrahimi   if (end != str + n) return false;   // Leftover junk
1170*ccdc9c3eSSadaf Ebrahimi   if (errno) return false;
1171*ccdc9c3eSSadaf Ebrahimi   if (dest == NULL) return true;
1172*ccdc9c3eSSadaf Ebrahimi   *(reinterpret_cast<unsigned long long*>(dest)) = r;
1173*ccdc9c3eSSadaf Ebrahimi   return true;
1174*ccdc9c3eSSadaf Ebrahimi }
1175*ccdc9c3eSSadaf Ebrahimi 
1176*ccdc9c3eSSadaf Ebrahimi static bool parse_double_float(const char* str, size_t n, bool isfloat,
1177*ccdc9c3eSSadaf Ebrahimi                                void* dest) {
1178*ccdc9c3eSSadaf Ebrahimi   if (n == 0) return false;
1179*ccdc9c3eSSadaf Ebrahimi   static const int kMaxLength = 200;
1180*ccdc9c3eSSadaf Ebrahimi   char buf[kMaxLength+1];
1181*ccdc9c3eSSadaf Ebrahimi   str = TerminateNumber(buf, sizeof buf, str, &n, true);
1182*ccdc9c3eSSadaf Ebrahimi   char* end;
1183*ccdc9c3eSSadaf Ebrahimi   errno = 0;
1184*ccdc9c3eSSadaf Ebrahimi   double r;
1185*ccdc9c3eSSadaf Ebrahimi   if (isfloat) {
1186*ccdc9c3eSSadaf Ebrahimi     r = strtof(str, &end);
1187*ccdc9c3eSSadaf Ebrahimi   } else {
1188*ccdc9c3eSSadaf Ebrahimi     r = strtod(str, &end);
1189*ccdc9c3eSSadaf Ebrahimi   }
1190*ccdc9c3eSSadaf Ebrahimi   if (end != str + n) return false;   // Leftover junk
1191*ccdc9c3eSSadaf Ebrahimi   if (errno) return false;
1192*ccdc9c3eSSadaf Ebrahimi   if (dest == NULL) return true;
1193*ccdc9c3eSSadaf Ebrahimi   if (isfloat) {
1194*ccdc9c3eSSadaf Ebrahimi     *(reinterpret_cast<float*>(dest)) = (float)r;
1195*ccdc9c3eSSadaf Ebrahimi   } else {
1196*ccdc9c3eSSadaf Ebrahimi     *(reinterpret_cast<double*>(dest)) = r;
1197*ccdc9c3eSSadaf Ebrahimi   }
1198*ccdc9c3eSSadaf Ebrahimi   return true;
1199*ccdc9c3eSSadaf Ebrahimi }
1200*ccdc9c3eSSadaf Ebrahimi 
1201*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_double(const char* str, size_t n, void* dest) {
1202*ccdc9c3eSSadaf Ebrahimi   return parse_double_float(str, n, false, dest);
1203*ccdc9c3eSSadaf Ebrahimi }
1204*ccdc9c3eSSadaf Ebrahimi 
1205*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_float(const char* str, size_t n, void* dest) {
1206*ccdc9c3eSSadaf Ebrahimi   return parse_double_float(str, n, true, dest);
1207*ccdc9c3eSSadaf Ebrahimi }
1208*ccdc9c3eSSadaf Ebrahimi 
1209*ccdc9c3eSSadaf Ebrahimi #define DEFINE_INTEGER_PARSER(name)                                            \
1210*ccdc9c3eSSadaf Ebrahimi   bool RE2::Arg::parse_##name(const char* str, size_t n, void* dest) {         \
1211*ccdc9c3eSSadaf Ebrahimi     return parse_##name##_radix(str, n, dest, 10);                             \
1212*ccdc9c3eSSadaf Ebrahimi   }                                                                            \
1213*ccdc9c3eSSadaf Ebrahimi   bool RE2::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) {   \
1214*ccdc9c3eSSadaf Ebrahimi     return parse_##name##_radix(str, n, dest, 16);                             \
1215*ccdc9c3eSSadaf Ebrahimi   }                                                                            \
1216*ccdc9c3eSSadaf Ebrahimi   bool RE2::Arg::parse_##name##_octal(const char* str, size_t n, void* dest) { \
1217*ccdc9c3eSSadaf Ebrahimi     return parse_##name##_radix(str, n, dest, 8);                              \
1218*ccdc9c3eSSadaf Ebrahimi   }                                                                            \
1219*ccdc9c3eSSadaf Ebrahimi   bool RE2::Arg::parse_##name##_cradix(const char* str, size_t n,              \
1220*ccdc9c3eSSadaf Ebrahimi                                        void* dest) {                           \
1221*ccdc9c3eSSadaf Ebrahimi     return parse_##name##_radix(str, n, dest, 0);                              \
1222*ccdc9c3eSSadaf Ebrahimi   }
1223*ccdc9c3eSSadaf Ebrahimi 
1224*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(short);
1225*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(ushort);
1226*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(int);
1227*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(uint);
1228*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(long);
1229*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(ulong);
1230*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(longlong);
1231*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(ulonglong);
1232*ccdc9c3eSSadaf Ebrahimi 
1233*ccdc9c3eSSadaf Ebrahimi #undef DEFINE_INTEGER_PARSER
1234*ccdc9c3eSSadaf Ebrahimi 
1235*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
1236