1*ccdc9c3eSSadaf Ebrahimi // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi
5*ccdc9c3eSSadaf Ebrahimi // Regular expression interface RE2.
6*ccdc9c3eSSadaf Ebrahimi //
7*ccdc9c3eSSadaf Ebrahimi // Originally the PCRE C++ wrapper, but adapted to use
8*ccdc9c3eSSadaf Ebrahimi // the new automata-based regular expression engines.
9*ccdc9c3eSSadaf Ebrahimi
10*ccdc9c3eSSadaf Ebrahimi #include "re2/re2.h"
11*ccdc9c3eSSadaf Ebrahimi
12*ccdc9c3eSSadaf Ebrahimi #include <assert.h>
13*ccdc9c3eSSadaf Ebrahimi #include <ctype.h>
14*ccdc9c3eSSadaf Ebrahimi #include <errno.h>
15*ccdc9c3eSSadaf Ebrahimi #include <stdint.h>
16*ccdc9c3eSSadaf Ebrahimi #include <stdlib.h>
17*ccdc9c3eSSadaf Ebrahimi #include <string.h>
18*ccdc9c3eSSadaf Ebrahimi #include <algorithm>
19*ccdc9c3eSSadaf Ebrahimi #include <iterator>
20*ccdc9c3eSSadaf Ebrahimi #include <mutex>
21*ccdc9c3eSSadaf Ebrahimi #include <string>
22*ccdc9c3eSSadaf Ebrahimi #include <utility>
23*ccdc9c3eSSadaf Ebrahimi #include <vector>
24*ccdc9c3eSSadaf Ebrahimi
25*ccdc9c3eSSadaf Ebrahimi #include "util/util.h"
26*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
27*ccdc9c3eSSadaf Ebrahimi #include "util/sparse_array.h"
28*ccdc9c3eSSadaf Ebrahimi #include "util/strutil.h"
29*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h"
30*ccdc9c3eSSadaf Ebrahimi #include "re2/prog.h"
31*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
32*ccdc9c3eSSadaf Ebrahimi
33*ccdc9c3eSSadaf Ebrahimi namespace re2 {
34*ccdc9c3eSSadaf Ebrahimi
35*ccdc9c3eSSadaf Ebrahimi // Maximum number of args we can set
36*ccdc9c3eSSadaf Ebrahimi static const int kMaxArgs = 16;
37*ccdc9c3eSSadaf Ebrahimi static const int kVecSize = 1+kMaxArgs;
38*ccdc9c3eSSadaf Ebrahimi
39*ccdc9c3eSSadaf Ebrahimi const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
40*ccdc9c3eSSadaf Ebrahimi
Options(RE2::CannedOptions opt)41*ccdc9c3eSSadaf Ebrahimi RE2::Options::Options(RE2::CannedOptions opt)
42*ccdc9c3eSSadaf Ebrahimi : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
43*ccdc9c3eSSadaf Ebrahimi posix_syntax_(opt == RE2::POSIX),
44*ccdc9c3eSSadaf Ebrahimi longest_match_(opt == RE2::POSIX),
45*ccdc9c3eSSadaf Ebrahimi log_errors_(opt != RE2::Quiet),
46*ccdc9c3eSSadaf Ebrahimi max_mem_(kDefaultMaxMem),
47*ccdc9c3eSSadaf Ebrahimi literal_(false),
48*ccdc9c3eSSadaf Ebrahimi never_nl_(false),
49*ccdc9c3eSSadaf Ebrahimi dot_nl_(false),
50*ccdc9c3eSSadaf Ebrahimi never_capture_(false),
51*ccdc9c3eSSadaf Ebrahimi case_sensitive_(true),
52*ccdc9c3eSSadaf Ebrahimi perl_classes_(false),
53*ccdc9c3eSSadaf Ebrahimi word_boundary_(false),
54*ccdc9c3eSSadaf Ebrahimi one_line_(false) {
55*ccdc9c3eSSadaf Ebrahimi }
56*ccdc9c3eSSadaf Ebrahimi
57*ccdc9c3eSSadaf Ebrahimi // static empty objects for use as const references.
58*ccdc9c3eSSadaf Ebrahimi // To avoid global constructors, allocated in RE2::Init().
59*ccdc9c3eSSadaf Ebrahimi static const string* empty_string;
60*ccdc9c3eSSadaf Ebrahimi static const std::map<string, int>* empty_named_groups;
61*ccdc9c3eSSadaf Ebrahimi static const std::map<int, string>* empty_group_names;
62*ccdc9c3eSSadaf Ebrahimi
63*ccdc9c3eSSadaf Ebrahimi // Converts from Regexp error code to RE2 error code.
64*ccdc9c3eSSadaf Ebrahimi // Maybe some day they will diverge. In any event, this
65*ccdc9c3eSSadaf Ebrahimi // hides the existence of Regexp from RE2 users.
RegexpErrorToRE2(re2::RegexpStatusCode code)66*ccdc9c3eSSadaf Ebrahimi static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
67*ccdc9c3eSSadaf Ebrahimi switch (code) {
68*ccdc9c3eSSadaf Ebrahimi case re2::kRegexpSuccess:
69*ccdc9c3eSSadaf Ebrahimi return RE2::NoError;
70*ccdc9c3eSSadaf Ebrahimi case re2::kRegexpInternalError:
71*ccdc9c3eSSadaf Ebrahimi return RE2::ErrorInternal;
72*ccdc9c3eSSadaf Ebrahimi case re2::kRegexpBadEscape:
73*ccdc9c3eSSadaf Ebrahimi return RE2::ErrorBadEscape;
74*ccdc9c3eSSadaf Ebrahimi case re2::kRegexpBadCharClass:
75*ccdc9c3eSSadaf Ebrahimi return RE2::ErrorBadCharClass;
76*ccdc9c3eSSadaf Ebrahimi case re2::kRegexpBadCharRange:
77*ccdc9c3eSSadaf Ebrahimi return RE2::ErrorBadCharRange;
78*ccdc9c3eSSadaf Ebrahimi case re2::kRegexpMissingBracket:
79*ccdc9c3eSSadaf Ebrahimi return RE2::ErrorMissingBracket;
80*ccdc9c3eSSadaf Ebrahimi case re2::kRegexpMissingParen:
81*ccdc9c3eSSadaf Ebrahimi return RE2::ErrorMissingParen;
82*ccdc9c3eSSadaf Ebrahimi case re2::kRegexpTrailingBackslash:
83*ccdc9c3eSSadaf Ebrahimi return RE2::ErrorTrailingBackslash;
84*ccdc9c3eSSadaf Ebrahimi case re2::kRegexpRepeatArgument:
85*ccdc9c3eSSadaf Ebrahimi return RE2::ErrorRepeatArgument;
86*ccdc9c3eSSadaf Ebrahimi case re2::kRegexpRepeatSize:
87*ccdc9c3eSSadaf Ebrahimi return RE2::ErrorRepeatSize;
88*ccdc9c3eSSadaf Ebrahimi case re2::kRegexpRepeatOp:
89*ccdc9c3eSSadaf Ebrahimi return RE2::ErrorRepeatOp;
90*ccdc9c3eSSadaf Ebrahimi case re2::kRegexpBadPerlOp:
91*ccdc9c3eSSadaf Ebrahimi return RE2::ErrorBadPerlOp;
92*ccdc9c3eSSadaf Ebrahimi case re2::kRegexpBadUTF8:
93*ccdc9c3eSSadaf Ebrahimi return RE2::ErrorBadUTF8;
94*ccdc9c3eSSadaf Ebrahimi case re2::kRegexpBadNamedCapture:
95*ccdc9c3eSSadaf Ebrahimi return RE2::ErrorBadNamedCapture;
96*ccdc9c3eSSadaf Ebrahimi }
97*ccdc9c3eSSadaf Ebrahimi return RE2::ErrorInternal;
98*ccdc9c3eSSadaf Ebrahimi }
99*ccdc9c3eSSadaf Ebrahimi
trunc(const StringPiece & pattern)100*ccdc9c3eSSadaf Ebrahimi static string trunc(const StringPiece& pattern) {
101*ccdc9c3eSSadaf Ebrahimi if (pattern.size() < 100)
102*ccdc9c3eSSadaf Ebrahimi return string(pattern);
103*ccdc9c3eSSadaf Ebrahimi return string(pattern.substr(0, 100)) + "...";
104*ccdc9c3eSSadaf Ebrahimi }
105*ccdc9c3eSSadaf Ebrahimi
106*ccdc9c3eSSadaf Ebrahimi
RE2(const char * pattern)107*ccdc9c3eSSadaf Ebrahimi RE2::RE2(const char* pattern) {
108*ccdc9c3eSSadaf Ebrahimi Init(pattern, DefaultOptions);
109*ccdc9c3eSSadaf Ebrahimi }
110*ccdc9c3eSSadaf Ebrahimi
RE2(const string & pattern)111*ccdc9c3eSSadaf Ebrahimi RE2::RE2(const string& pattern) {
112*ccdc9c3eSSadaf Ebrahimi Init(pattern, DefaultOptions);
113*ccdc9c3eSSadaf Ebrahimi }
114*ccdc9c3eSSadaf Ebrahimi
RE2(const StringPiece & pattern)115*ccdc9c3eSSadaf Ebrahimi RE2::RE2(const StringPiece& pattern) {
116*ccdc9c3eSSadaf Ebrahimi Init(pattern, DefaultOptions);
117*ccdc9c3eSSadaf Ebrahimi }
118*ccdc9c3eSSadaf Ebrahimi
RE2(const StringPiece & pattern,const Options & options)119*ccdc9c3eSSadaf Ebrahimi RE2::RE2(const StringPiece& pattern, const Options& options) {
120*ccdc9c3eSSadaf Ebrahimi Init(pattern, options);
121*ccdc9c3eSSadaf Ebrahimi }
122*ccdc9c3eSSadaf Ebrahimi
ParseFlags() const123*ccdc9c3eSSadaf Ebrahimi int RE2::Options::ParseFlags() const {
124*ccdc9c3eSSadaf Ebrahimi int flags = Regexp::ClassNL;
125*ccdc9c3eSSadaf Ebrahimi switch (encoding()) {
126*ccdc9c3eSSadaf Ebrahimi default:
127*ccdc9c3eSSadaf Ebrahimi if (log_errors())
128*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "Unknown encoding " << encoding();
129*ccdc9c3eSSadaf Ebrahimi break;
130*ccdc9c3eSSadaf Ebrahimi case RE2::Options::EncodingUTF8:
131*ccdc9c3eSSadaf Ebrahimi break;
132*ccdc9c3eSSadaf Ebrahimi case RE2::Options::EncodingLatin1:
133*ccdc9c3eSSadaf Ebrahimi flags |= Regexp::Latin1;
134*ccdc9c3eSSadaf Ebrahimi break;
135*ccdc9c3eSSadaf Ebrahimi }
136*ccdc9c3eSSadaf Ebrahimi
137*ccdc9c3eSSadaf Ebrahimi if (!posix_syntax())
138*ccdc9c3eSSadaf Ebrahimi flags |= Regexp::LikePerl;
139*ccdc9c3eSSadaf Ebrahimi
140*ccdc9c3eSSadaf Ebrahimi if (literal())
141*ccdc9c3eSSadaf Ebrahimi flags |= Regexp::Literal;
142*ccdc9c3eSSadaf Ebrahimi
143*ccdc9c3eSSadaf Ebrahimi if (never_nl())
144*ccdc9c3eSSadaf Ebrahimi flags |= Regexp::NeverNL;
145*ccdc9c3eSSadaf Ebrahimi
146*ccdc9c3eSSadaf Ebrahimi if (dot_nl())
147*ccdc9c3eSSadaf Ebrahimi flags |= Regexp::DotNL;
148*ccdc9c3eSSadaf Ebrahimi
149*ccdc9c3eSSadaf Ebrahimi if (never_capture())
150*ccdc9c3eSSadaf Ebrahimi flags |= Regexp::NeverCapture;
151*ccdc9c3eSSadaf Ebrahimi
152*ccdc9c3eSSadaf Ebrahimi if (!case_sensitive())
153*ccdc9c3eSSadaf Ebrahimi flags |= Regexp::FoldCase;
154*ccdc9c3eSSadaf Ebrahimi
155*ccdc9c3eSSadaf Ebrahimi if (perl_classes())
156*ccdc9c3eSSadaf Ebrahimi flags |= Regexp::PerlClasses;
157*ccdc9c3eSSadaf Ebrahimi
158*ccdc9c3eSSadaf Ebrahimi if (word_boundary())
159*ccdc9c3eSSadaf Ebrahimi flags |= Regexp::PerlB;
160*ccdc9c3eSSadaf Ebrahimi
161*ccdc9c3eSSadaf Ebrahimi if (one_line())
162*ccdc9c3eSSadaf Ebrahimi flags |= Regexp::OneLine;
163*ccdc9c3eSSadaf Ebrahimi
164*ccdc9c3eSSadaf Ebrahimi return flags;
165*ccdc9c3eSSadaf Ebrahimi }
166*ccdc9c3eSSadaf Ebrahimi
Init(const StringPiece & pattern,const Options & options)167*ccdc9c3eSSadaf Ebrahimi void RE2::Init(const StringPiece& pattern, const Options& options) {
168*ccdc9c3eSSadaf Ebrahimi static std::once_flag empty_once;
169*ccdc9c3eSSadaf Ebrahimi std::call_once(empty_once, []() {
170*ccdc9c3eSSadaf Ebrahimi empty_string = new string;
171*ccdc9c3eSSadaf Ebrahimi empty_named_groups = new std::map<string, int>;
172*ccdc9c3eSSadaf Ebrahimi empty_group_names = new std::map<int, string>;
173*ccdc9c3eSSadaf Ebrahimi });
174*ccdc9c3eSSadaf Ebrahimi
175*ccdc9c3eSSadaf Ebrahimi pattern_ = string(pattern);
176*ccdc9c3eSSadaf Ebrahimi options_.Copy(options);
177*ccdc9c3eSSadaf Ebrahimi entire_regexp_ = NULL;
178*ccdc9c3eSSadaf Ebrahimi suffix_regexp_ = NULL;
179*ccdc9c3eSSadaf Ebrahimi prog_ = NULL;
180*ccdc9c3eSSadaf Ebrahimi num_captures_ = -1;
181*ccdc9c3eSSadaf Ebrahimi rprog_ = NULL;
182*ccdc9c3eSSadaf Ebrahimi error_ = empty_string;
183*ccdc9c3eSSadaf Ebrahimi error_code_ = NoError;
184*ccdc9c3eSSadaf Ebrahimi named_groups_ = NULL;
185*ccdc9c3eSSadaf Ebrahimi group_names_ = NULL;
186*ccdc9c3eSSadaf Ebrahimi
187*ccdc9c3eSSadaf Ebrahimi RegexpStatus status;
188*ccdc9c3eSSadaf Ebrahimi entire_regexp_ = Regexp::Parse(
189*ccdc9c3eSSadaf Ebrahimi pattern_,
190*ccdc9c3eSSadaf Ebrahimi static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
191*ccdc9c3eSSadaf Ebrahimi &status);
192*ccdc9c3eSSadaf Ebrahimi if (entire_regexp_ == NULL) {
193*ccdc9c3eSSadaf Ebrahimi if (options_.log_errors()) {
194*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
195*ccdc9c3eSSadaf Ebrahimi << status.Text();
196*ccdc9c3eSSadaf Ebrahimi }
197*ccdc9c3eSSadaf Ebrahimi error_ = new string(status.Text());
198*ccdc9c3eSSadaf Ebrahimi error_code_ = RegexpErrorToRE2(status.code());
199*ccdc9c3eSSadaf Ebrahimi error_arg_ = string(status.error_arg());
200*ccdc9c3eSSadaf Ebrahimi return;
201*ccdc9c3eSSadaf Ebrahimi }
202*ccdc9c3eSSadaf Ebrahimi
203*ccdc9c3eSSadaf Ebrahimi re2::Regexp* suffix;
204*ccdc9c3eSSadaf Ebrahimi if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
205*ccdc9c3eSSadaf Ebrahimi suffix_regexp_ = suffix;
206*ccdc9c3eSSadaf Ebrahimi else
207*ccdc9c3eSSadaf Ebrahimi suffix_regexp_ = entire_regexp_->Incref();
208*ccdc9c3eSSadaf Ebrahimi
209*ccdc9c3eSSadaf Ebrahimi // Two thirds of the memory goes to the forward Prog,
210*ccdc9c3eSSadaf Ebrahimi // one third to the reverse prog, because the forward
211*ccdc9c3eSSadaf Ebrahimi // Prog has two DFAs but the reverse prog has one.
212*ccdc9c3eSSadaf Ebrahimi prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
213*ccdc9c3eSSadaf Ebrahimi if (prog_ == NULL) {
214*ccdc9c3eSSadaf Ebrahimi if (options_.log_errors())
215*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
216*ccdc9c3eSSadaf Ebrahimi error_ = new string("pattern too large - compile failed");
217*ccdc9c3eSSadaf Ebrahimi error_code_ = RE2::ErrorPatternTooLarge;
218*ccdc9c3eSSadaf Ebrahimi return;
219*ccdc9c3eSSadaf Ebrahimi }
220*ccdc9c3eSSadaf Ebrahimi
221*ccdc9c3eSSadaf Ebrahimi // We used to compute this lazily, but it's used during the
222*ccdc9c3eSSadaf Ebrahimi // typical control flow for a match call, so we now compute
223*ccdc9c3eSSadaf Ebrahimi // it eagerly, which avoids the overhead of std::once_flag.
224*ccdc9c3eSSadaf Ebrahimi num_captures_ = suffix_regexp_->NumCaptures();
225*ccdc9c3eSSadaf Ebrahimi
226*ccdc9c3eSSadaf Ebrahimi // Could delay this until the first match call that
227*ccdc9c3eSSadaf Ebrahimi // cares about submatch information, but the one-pass
228*ccdc9c3eSSadaf Ebrahimi // machine's memory gets cut from the DFA memory budget,
229*ccdc9c3eSSadaf Ebrahimi // and that is harder to do if the DFA has already
230*ccdc9c3eSSadaf Ebrahimi // been built.
231*ccdc9c3eSSadaf Ebrahimi is_one_pass_ = prog_->IsOnePass();
232*ccdc9c3eSSadaf Ebrahimi }
233*ccdc9c3eSSadaf Ebrahimi
234*ccdc9c3eSSadaf Ebrahimi // Returns rprog_, computing it if needed.
ReverseProg() const235*ccdc9c3eSSadaf Ebrahimi re2::Prog* RE2::ReverseProg() const {
236*ccdc9c3eSSadaf Ebrahimi std::call_once(rprog_once_, [](const RE2* re) {
237*ccdc9c3eSSadaf Ebrahimi re->rprog_ =
238*ccdc9c3eSSadaf Ebrahimi re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3);
239*ccdc9c3eSSadaf Ebrahimi if (re->rprog_ == NULL) {
240*ccdc9c3eSSadaf Ebrahimi if (re->options_.log_errors())
241*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
242*ccdc9c3eSSadaf Ebrahimi re->error_ = new string("pattern too large - reverse compile failed");
243*ccdc9c3eSSadaf Ebrahimi re->error_code_ = RE2::ErrorPatternTooLarge;
244*ccdc9c3eSSadaf Ebrahimi }
245*ccdc9c3eSSadaf Ebrahimi }, this);
246*ccdc9c3eSSadaf Ebrahimi return rprog_;
247*ccdc9c3eSSadaf Ebrahimi }
248*ccdc9c3eSSadaf Ebrahimi
~RE2()249*ccdc9c3eSSadaf Ebrahimi RE2::~RE2() {
250*ccdc9c3eSSadaf Ebrahimi if (suffix_regexp_)
251*ccdc9c3eSSadaf Ebrahimi suffix_regexp_->Decref();
252*ccdc9c3eSSadaf Ebrahimi if (entire_regexp_)
253*ccdc9c3eSSadaf Ebrahimi entire_regexp_->Decref();
254*ccdc9c3eSSadaf Ebrahimi delete prog_;
255*ccdc9c3eSSadaf Ebrahimi delete rprog_;
256*ccdc9c3eSSadaf Ebrahimi if (error_ != empty_string)
257*ccdc9c3eSSadaf Ebrahimi delete error_;
258*ccdc9c3eSSadaf Ebrahimi if (named_groups_ != NULL && named_groups_ != empty_named_groups)
259*ccdc9c3eSSadaf Ebrahimi delete named_groups_;
260*ccdc9c3eSSadaf Ebrahimi if (group_names_ != NULL && group_names_ != empty_group_names)
261*ccdc9c3eSSadaf Ebrahimi delete group_names_;
262*ccdc9c3eSSadaf Ebrahimi }
263*ccdc9c3eSSadaf Ebrahimi
ProgramSize() const264*ccdc9c3eSSadaf Ebrahimi int RE2::ProgramSize() const {
265*ccdc9c3eSSadaf Ebrahimi if (prog_ == NULL)
266*ccdc9c3eSSadaf Ebrahimi return -1;
267*ccdc9c3eSSadaf Ebrahimi return prog_->size();
268*ccdc9c3eSSadaf Ebrahimi }
269*ccdc9c3eSSadaf Ebrahimi
ReverseProgramSize() const270*ccdc9c3eSSadaf Ebrahimi int RE2::ReverseProgramSize() const {
271*ccdc9c3eSSadaf Ebrahimi if (prog_ == NULL)
272*ccdc9c3eSSadaf Ebrahimi return -1;
273*ccdc9c3eSSadaf Ebrahimi Prog* prog = ReverseProg();
274*ccdc9c3eSSadaf Ebrahimi if (prog == NULL)
275*ccdc9c3eSSadaf Ebrahimi return -1;
276*ccdc9c3eSSadaf Ebrahimi return prog->size();
277*ccdc9c3eSSadaf Ebrahimi }
278*ccdc9c3eSSadaf Ebrahimi
Fanout(Prog * prog,std::map<int,int> * histogram)279*ccdc9c3eSSadaf Ebrahimi static int Fanout(Prog* prog, std::map<int, int>* histogram) {
280*ccdc9c3eSSadaf Ebrahimi SparseArray<int> fanout(prog->size());
281*ccdc9c3eSSadaf Ebrahimi prog->Fanout(&fanout);
282*ccdc9c3eSSadaf Ebrahimi histogram->clear();
283*ccdc9c3eSSadaf Ebrahimi for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
284*ccdc9c3eSSadaf Ebrahimi // TODO(junyer): Optimise this?
285*ccdc9c3eSSadaf Ebrahimi int bucket = 0;
286*ccdc9c3eSSadaf Ebrahimi while (1 << bucket < i->value()) {
287*ccdc9c3eSSadaf Ebrahimi bucket++;
288*ccdc9c3eSSadaf Ebrahimi }
289*ccdc9c3eSSadaf Ebrahimi (*histogram)[bucket]++;
290*ccdc9c3eSSadaf Ebrahimi }
291*ccdc9c3eSSadaf Ebrahimi return histogram->rbegin()->first;
292*ccdc9c3eSSadaf Ebrahimi }
293*ccdc9c3eSSadaf Ebrahimi
ProgramFanout(std::map<int,int> * histogram) const294*ccdc9c3eSSadaf Ebrahimi int RE2::ProgramFanout(std::map<int, int>* histogram) const {
295*ccdc9c3eSSadaf Ebrahimi if (prog_ == NULL)
296*ccdc9c3eSSadaf Ebrahimi return -1;
297*ccdc9c3eSSadaf Ebrahimi return Fanout(prog_, histogram);
298*ccdc9c3eSSadaf Ebrahimi }
299*ccdc9c3eSSadaf Ebrahimi
ReverseProgramFanout(std::map<int,int> * histogram) const300*ccdc9c3eSSadaf Ebrahimi int RE2::ReverseProgramFanout(std::map<int, int>* histogram) const {
301*ccdc9c3eSSadaf Ebrahimi if (prog_ == NULL)
302*ccdc9c3eSSadaf Ebrahimi return -1;
303*ccdc9c3eSSadaf Ebrahimi Prog* prog = ReverseProg();
304*ccdc9c3eSSadaf Ebrahimi if (prog == NULL)
305*ccdc9c3eSSadaf Ebrahimi return -1;
306*ccdc9c3eSSadaf Ebrahimi return Fanout(prog, histogram);
307*ccdc9c3eSSadaf Ebrahimi }
308*ccdc9c3eSSadaf Ebrahimi
309*ccdc9c3eSSadaf Ebrahimi // Returns named_groups_, computing it if needed.
NamedCapturingGroups() const310*ccdc9c3eSSadaf Ebrahimi const std::map<string, int>& RE2::NamedCapturingGroups() const {
311*ccdc9c3eSSadaf Ebrahimi std::call_once(named_groups_once_, [](const RE2* re) {
312*ccdc9c3eSSadaf Ebrahimi if (re->suffix_regexp_ != NULL)
313*ccdc9c3eSSadaf Ebrahimi re->named_groups_ = re->suffix_regexp_->NamedCaptures();
314*ccdc9c3eSSadaf Ebrahimi if (re->named_groups_ == NULL)
315*ccdc9c3eSSadaf Ebrahimi re->named_groups_ = empty_named_groups;
316*ccdc9c3eSSadaf Ebrahimi }, this);
317*ccdc9c3eSSadaf Ebrahimi return *named_groups_;
318*ccdc9c3eSSadaf Ebrahimi }
319*ccdc9c3eSSadaf Ebrahimi
320*ccdc9c3eSSadaf Ebrahimi // Returns group_names_, computing it if needed.
CapturingGroupNames() const321*ccdc9c3eSSadaf Ebrahimi const std::map<int, string>& RE2::CapturingGroupNames() const {
322*ccdc9c3eSSadaf Ebrahimi std::call_once(group_names_once_, [](const RE2* re) {
323*ccdc9c3eSSadaf Ebrahimi if (re->suffix_regexp_ != NULL)
324*ccdc9c3eSSadaf Ebrahimi re->group_names_ = re->suffix_regexp_->CaptureNames();
325*ccdc9c3eSSadaf Ebrahimi if (re->group_names_ == NULL)
326*ccdc9c3eSSadaf Ebrahimi re->group_names_ = empty_group_names;
327*ccdc9c3eSSadaf Ebrahimi }, this);
328*ccdc9c3eSSadaf Ebrahimi return *group_names_;
329*ccdc9c3eSSadaf Ebrahimi }
330*ccdc9c3eSSadaf Ebrahimi
331*ccdc9c3eSSadaf Ebrahimi /***** Convenience interfaces *****/
332*ccdc9c3eSSadaf Ebrahimi
FullMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)333*ccdc9c3eSSadaf Ebrahimi bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
334*ccdc9c3eSSadaf Ebrahimi const Arg* const args[], int n) {
335*ccdc9c3eSSadaf Ebrahimi return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
336*ccdc9c3eSSadaf Ebrahimi }
337*ccdc9c3eSSadaf Ebrahimi
PartialMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)338*ccdc9c3eSSadaf Ebrahimi bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
339*ccdc9c3eSSadaf Ebrahimi const Arg* const args[], int n) {
340*ccdc9c3eSSadaf Ebrahimi return re.DoMatch(text, UNANCHORED, NULL, args, n);
341*ccdc9c3eSSadaf Ebrahimi }
342*ccdc9c3eSSadaf Ebrahimi
ConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)343*ccdc9c3eSSadaf Ebrahimi bool RE2::ConsumeN(StringPiece* input, const RE2& re,
344*ccdc9c3eSSadaf Ebrahimi const Arg* const args[], int n) {
345*ccdc9c3eSSadaf Ebrahimi size_t consumed;
346*ccdc9c3eSSadaf Ebrahimi if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
347*ccdc9c3eSSadaf Ebrahimi input->remove_prefix(consumed);
348*ccdc9c3eSSadaf Ebrahimi return true;
349*ccdc9c3eSSadaf Ebrahimi } else {
350*ccdc9c3eSSadaf Ebrahimi return false;
351*ccdc9c3eSSadaf Ebrahimi }
352*ccdc9c3eSSadaf Ebrahimi }
353*ccdc9c3eSSadaf Ebrahimi
FindAndConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)354*ccdc9c3eSSadaf Ebrahimi bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
355*ccdc9c3eSSadaf Ebrahimi const Arg* const args[], int n) {
356*ccdc9c3eSSadaf Ebrahimi size_t consumed;
357*ccdc9c3eSSadaf Ebrahimi if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
358*ccdc9c3eSSadaf Ebrahimi input->remove_prefix(consumed);
359*ccdc9c3eSSadaf Ebrahimi return true;
360*ccdc9c3eSSadaf Ebrahimi } else {
361*ccdc9c3eSSadaf Ebrahimi return false;
362*ccdc9c3eSSadaf Ebrahimi }
363*ccdc9c3eSSadaf Ebrahimi }
364*ccdc9c3eSSadaf Ebrahimi
Replace(string * str,const RE2 & re,const StringPiece & rewrite)365*ccdc9c3eSSadaf Ebrahimi bool RE2::Replace(string* str,
366*ccdc9c3eSSadaf Ebrahimi const RE2& re,
367*ccdc9c3eSSadaf Ebrahimi const StringPiece& rewrite) {
368*ccdc9c3eSSadaf Ebrahimi StringPiece vec[kVecSize];
369*ccdc9c3eSSadaf Ebrahimi int nvec = 1 + MaxSubmatch(rewrite);
370*ccdc9c3eSSadaf Ebrahimi if (nvec > arraysize(vec))
371*ccdc9c3eSSadaf Ebrahimi return false;
372*ccdc9c3eSSadaf Ebrahimi if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
373*ccdc9c3eSSadaf Ebrahimi return false;
374*ccdc9c3eSSadaf Ebrahimi
375*ccdc9c3eSSadaf Ebrahimi string s;
376*ccdc9c3eSSadaf Ebrahimi if (!re.Rewrite(&s, rewrite, vec, nvec))
377*ccdc9c3eSSadaf Ebrahimi return false;
378*ccdc9c3eSSadaf Ebrahimi
379*ccdc9c3eSSadaf Ebrahimi assert(vec[0].begin() >= str->data());
380*ccdc9c3eSSadaf Ebrahimi assert(vec[0].end() <= str->data()+str->size());
381*ccdc9c3eSSadaf Ebrahimi str->replace(vec[0].data() - str->data(), vec[0].size(), s);
382*ccdc9c3eSSadaf Ebrahimi return true;
383*ccdc9c3eSSadaf Ebrahimi }
384*ccdc9c3eSSadaf Ebrahimi
GlobalReplace(string * str,const RE2 & re,const StringPiece & rewrite)385*ccdc9c3eSSadaf Ebrahimi int RE2::GlobalReplace(string* str,
386*ccdc9c3eSSadaf Ebrahimi const RE2& re,
387*ccdc9c3eSSadaf Ebrahimi const StringPiece& rewrite) {
388*ccdc9c3eSSadaf Ebrahimi StringPiece vec[kVecSize];
389*ccdc9c3eSSadaf Ebrahimi int nvec = 1 + MaxSubmatch(rewrite);
390*ccdc9c3eSSadaf Ebrahimi if (nvec > arraysize(vec))
391*ccdc9c3eSSadaf Ebrahimi return false;
392*ccdc9c3eSSadaf Ebrahimi
393*ccdc9c3eSSadaf Ebrahimi const char* p = str->data();
394*ccdc9c3eSSadaf Ebrahimi const char* ep = p + str->size();
395*ccdc9c3eSSadaf Ebrahimi const char* lastend = NULL;
396*ccdc9c3eSSadaf Ebrahimi string out;
397*ccdc9c3eSSadaf Ebrahimi int count = 0;
398*ccdc9c3eSSadaf Ebrahimi #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
399*ccdc9c3eSSadaf Ebrahimi // Iterate just once when fuzzing. Otherwise, we easily get bogged down
400*ccdc9c3eSSadaf Ebrahimi // and coverage is unlikely to improve despite significant expense.
401*ccdc9c3eSSadaf Ebrahimi while (p == str->data()) {
402*ccdc9c3eSSadaf Ebrahimi #else
403*ccdc9c3eSSadaf Ebrahimi while (p <= ep) {
404*ccdc9c3eSSadaf Ebrahimi #endif
405*ccdc9c3eSSadaf Ebrahimi if (!re.Match(*str, static_cast<size_t>(p - str->data()),
406*ccdc9c3eSSadaf Ebrahimi str->size(), UNANCHORED, vec, nvec))
407*ccdc9c3eSSadaf Ebrahimi break;
408*ccdc9c3eSSadaf Ebrahimi if (p < vec[0].begin())
409*ccdc9c3eSSadaf Ebrahimi out.append(p, vec[0].begin() - p);
410*ccdc9c3eSSadaf Ebrahimi if (vec[0].begin() == lastend && vec[0].size() == 0) {
411*ccdc9c3eSSadaf Ebrahimi // Disallow empty match at end of last match: skip ahead.
412*ccdc9c3eSSadaf Ebrahimi //
413*ccdc9c3eSSadaf Ebrahimi // fullrune() takes int, not size_t. However, it just looks
414*ccdc9c3eSSadaf Ebrahimi // at the leading byte and treats any length >= 4 the same.
415*ccdc9c3eSSadaf Ebrahimi if (re.options().encoding() == RE2::Options::EncodingUTF8 &&
416*ccdc9c3eSSadaf Ebrahimi fullrune(p, static_cast<int>(std::min(static_cast<ptrdiff_t>(4),
417*ccdc9c3eSSadaf Ebrahimi ep - p)))) {
418*ccdc9c3eSSadaf Ebrahimi // re is in UTF-8 mode and there is enough left of str
419*ccdc9c3eSSadaf Ebrahimi // to allow us to advance by up to UTFmax bytes.
420*ccdc9c3eSSadaf Ebrahimi Rune r;
421*ccdc9c3eSSadaf Ebrahimi int n = chartorune(&r, p);
422*ccdc9c3eSSadaf Ebrahimi // Some copies of chartorune have a bug that accepts
423*ccdc9c3eSSadaf Ebrahimi // encodings of values in (10FFFF, 1FFFFF] as valid.
424*ccdc9c3eSSadaf Ebrahimi if (r > Runemax) {
425*ccdc9c3eSSadaf Ebrahimi n = 1;
426*ccdc9c3eSSadaf Ebrahimi r = Runeerror;
427*ccdc9c3eSSadaf Ebrahimi }
428*ccdc9c3eSSadaf Ebrahimi if (!(n == 1 && r == Runeerror)) { // no decoding error
429*ccdc9c3eSSadaf Ebrahimi out.append(p, n);
430*ccdc9c3eSSadaf Ebrahimi p += n;
431*ccdc9c3eSSadaf Ebrahimi continue;
432*ccdc9c3eSSadaf Ebrahimi }
433*ccdc9c3eSSadaf Ebrahimi }
434*ccdc9c3eSSadaf Ebrahimi // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode,
435*ccdc9c3eSSadaf Ebrahimi // we fell through from above and the GIGO principle applies.
436*ccdc9c3eSSadaf Ebrahimi if (p < ep)
437*ccdc9c3eSSadaf Ebrahimi out.append(p, 1);
438*ccdc9c3eSSadaf Ebrahimi p++;
439*ccdc9c3eSSadaf Ebrahimi continue;
440*ccdc9c3eSSadaf Ebrahimi }
441*ccdc9c3eSSadaf Ebrahimi re.Rewrite(&out, rewrite, vec, nvec);
442*ccdc9c3eSSadaf Ebrahimi p = vec[0].end();
443*ccdc9c3eSSadaf Ebrahimi lastend = p;
444*ccdc9c3eSSadaf Ebrahimi count++;
445*ccdc9c3eSSadaf Ebrahimi }
446*ccdc9c3eSSadaf Ebrahimi
447*ccdc9c3eSSadaf Ebrahimi if (count == 0)
448*ccdc9c3eSSadaf Ebrahimi return 0;
449*ccdc9c3eSSadaf Ebrahimi
450*ccdc9c3eSSadaf Ebrahimi if (p < ep)
451*ccdc9c3eSSadaf Ebrahimi out.append(p, ep - p);
452*ccdc9c3eSSadaf Ebrahimi using std::swap;
453*ccdc9c3eSSadaf Ebrahimi swap(out, *str);
454*ccdc9c3eSSadaf Ebrahimi return count;
455*ccdc9c3eSSadaf Ebrahimi }
456*ccdc9c3eSSadaf Ebrahimi
457*ccdc9c3eSSadaf Ebrahimi bool RE2::Extract(const StringPiece& text,
458*ccdc9c3eSSadaf Ebrahimi const RE2& re,
459*ccdc9c3eSSadaf Ebrahimi const StringPiece& rewrite,
460*ccdc9c3eSSadaf Ebrahimi string* out) {
461*ccdc9c3eSSadaf Ebrahimi StringPiece vec[kVecSize];
462*ccdc9c3eSSadaf Ebrahimi int nvec = 1 + MaxSubmatch(rewrite);
463*ccdc9c3eSSadaf Ebrahimi if (nvec > arraysize(vec))
464*ccdc9c3eSSadaf Ebrahimi return false;
465*ccdc9c3eSSadaf Ebrahimi
466*ccdc9c3eSSadaf Ebrahimi if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
467*ccdc9c3eSSadaf Ebrahimi return false;
468*ccdc9c3eSSadaf Ebrahimi
469*ccdc9c3eSSadaf Ebrahimi out->clear();
470*ccdc9c3eSSadaf Ebrahimi return re.Rewrite(out, rewrite, vec, nvec);
471*ccdc9c3eSSadaf Ebrahimi }
472*ccdc9c3eSSadaf Ebrahimi
473*ccdc9c3eSSadaf Ebrahimi string RE2::QuoteMeta(const StringPiece& unquoted) {
474*ccdc9c3eSSadaf Ebrahimi string result;
475*ccdc9c3eSSadaf Ebrahimi result.reserve(unquoted.size() << 1);
476*ccdc9c3eSSadaf Ebrahimi
477*ccdc9c3eSSadaf Ebrahimi // Escape any ascii character not in [A-Za-z_0-9].
478*ccdc9c3eSSadaf Ebrahimi //
479*ccdc9c3eSSadaf Ebrahimi // Note that it's legal to escape a character even if it has no
480*ccdc9c3eSSadaf Ebrahimi // special meaning in a regular expression -- so this function does
481*ccdc9c3eSSadaf Ebrahimi // that. (This also makes it identical to the perl function of the
482*ccdc9c3eSSadaf Ebrahimi // same name except for the null-character special case;
483*ccdc9c3eSSadaf Ebrahimi // see `perldoc -f quotemeta`.)
484*ccdc9c3eSSadaf Ebrahimi for (size_t ii = 0; ii < unquoted.size(); ++ii) {
485*ccdc9c3eSSadaf Ebrahimi // Note that using 'isalnum' here raises the benchmark time from
486*ccdc9c3eSSadaf Ebrahimi // 32ns to 58ns:
487*ccdc9c3eSSadaf Ebrahimi if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
488*ccdc9c3eSSadaf Ebrahimi (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
489*ccdc9c3eSSadaf Ebrahimi (unquoted[ii] < '0' || unquoted[ii] > '9') &&
490*ccdc9c3eSSadaf Ebrahimi unquoted[ii] != '_' &&
491*ccdc9c3eSSadaf Ebrahimi // If this is the part of a UTF8 or Latin1 character, we need
492*ccdc9c3eSSadaf Ebrahimi // to copy this byte without escaping. Experimentally this is
493*ccdc9c3eSSadaf Ebrahimi // what works correctly with the regexp library.
494*ccdc9c3eSSadaf Ebrahimi !(unquoted[ii] & 128)) {
495*ccdc9c3eSSadaf Ebrahimi if (unquoted[ii] == '\0') { // Special handling for null chars.
496*ccdc9c3eSSadaf Ebrahimi // Note that this special handling is not strictly required for RE2,
497*ccdc9c3eSSadaf Ebrahimi // but this quoting is required for other regexp libraries such as
498*ccdc9c3eSSadaf Ebrahimi // PCRE.
499*ccdc9c3eSSadaf Ebrahimi // Can't use "\\0" since the next character might be a digit.
500*ccdc9c3eSSadaf Ebrahimi result += "\\x00";
501*ccdc9c3eSSadaf Ebrahimi continue;
502*ccdc9c3eSSadaf Ebrahimi }
503*ccdc9c3eSSadaf Ebrahimi result += '\\';
504*ccdc9c3eSSadaf Ebrahimi }
505*ccdc9c3eSSadaf Ebrahimi result += unquoted[ii];
506*ccdc9c3eSSadaf Ebrahimi }
507*ccdc9c3eSSadaf Ebrahimi
508*ccdc9c3eSSadaf Ebrahimi return result;
509*ccdc9c3eSSadaf Ebrahimi }
510*ccdc9c3eSSadaf Ebrahimi
511*ccdc9c3eSSadaf Ebrahimi bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
512*ccdc9c3eSSadaf Ebrahimi if (prog_ == NULL)
513*ccdc9c3eSSadaf Ebrahimi return false;
514*ccdc9c3eSSadaf Ebrahimi
515*ccdc9c3eSSadaf Ebrahimi int n = static_cast<int>(prefix_.size());
516*ccdc9c3eSSadaf Ebrahimi if (n > maxlen)
517*ccdc9c3eSSadaf Ebrahimi n = maxlen;
518*ccdc9c3eSSadaf Ebrahimi
519*ccdc9c3eSSadaf Ebrahimi // Determine initial min max from prefix_ literal.
520*ccdc9c3eSSadaf Ebrahimi *min = prefix_.substr(0, n);
521*ccdc9c3eSSadaf Ebrahimi *max = prefix_.substr(0, n);
522*ccdc9c3eSSadaf Ebrahimi if (prefix_foldcase_) {
523*ccdc9c3eSSadaf Ebrahimi // prefix is ASCII lowercase; change *min to uppercase.
524*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < n; i++) {
525*ccdc9c3eSSadaf Ebrahimi char& c = (*min)[i];
526*ccdc9c3eSSadaf Ebrahimi if ('a' <= c && c <= 'z')
527*ccdc9c3eSSadaf Ebrahimi c += 'A' - 'a';
528*ccdc9c3eSSadaf Ebrahimi }
529*ccdc9c3eSSadaf Ebrahimi }
530*ccdc9c3eSSadaf Ebrahimi
531*ccdc9c3eSSadaf Ebrahimi // Add to prefix min max using PossibleMatchRange on regexp.
532*ccdc9c3eSSadaf Ebrahimi string dmin, dmax;
533*ccdc9c3eSSadaf Ebrahimi maxlen -= n;
534*ccdc9c3eSSadaf Ebrahimi if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
535*ccdc9c3eSSadaf Ebrahimi min->append(dmin);
536*ccdc9c3eSSadaf Ebrahimi max->append(dmax);
537*ccdc9c3eSSadaf Ebrahimi } else if (!max->empty()) {
538*ccdc9c3eSSadaf Ebrahimi // prog_->PossibleMatchRange has failed us,
539*ccdc9c3eSSadaf Ebrahimi // but we still have useful information from prefix_.
540*ccdc9c3eSSadaf Ebrahimi // Round up *max to allow any possible suffix.
541*ccdc9c3eSSadaf Ebrahimi PrefixSuccessor(max);
542*ccdc9c3eSSadaf Ebrahimi } else {
543*ccdc9c3eSSadaf Ebrahimi // Nothing useful.
544*ccdc9c3eSSadaf Ebrahimi *min = "";
545*ccdc9c3eSSadaf Ebrahimi *max = "";
546*ccdc9c3eSSadaf Ebrahimi return false;
547*ccdc9c3eSSadaf Ebrahimi }
548*ccdc9c3eSSadaf Ebrahimi
549*ccdc9c3eSSadaf Ebrahimi return true;
550*ccdc9c3eSSadaf Ebrahimi }
551*ccdc9c3eSSadaf Ebrahimi
552*ccdc9c3eSSadaf Ebrahimi // Avoid possible locale nonsense in standard strcasecmp.
553*ccdc9c3eSSadaf Ebrahimi // The string a is known to be all lowercase.
554*ccdc9c3eSSadaf Ebrahimi static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
555*ccdc9c3eSSadaf Ebrahimi const char* ae = a + len;
556*ccdc9c3eSSadaf Ebrahimi
557*ccdc9c3eSSadaf Ebrahimi for (; a < ae; a++, b++) {
558*ccdc9c3eSSadaf Ebrahimi uint8_t x = *a;
559*ccdc9c3eSSadaf Ebrahimi uint8_t y = *b;
560*ccdc9c3eSSadaf Ebrahimi if ('A' <= y && y <= 'Z')
561*ccdc9c3eSSadaf Ebrahimi y += 'a' - 'A';
562*ccdc9c3eSSadaf Ebrahimi if (x != y)
563*ccdc9c3eSSadaf Ebrahimi return x - y;
564*ccdc9c3eSSadaf Ebrahimi }
565*ccdc9c3eSSadaf Ebrahimi return 0;
566*ccdc9c3eSSadaf Ebrahimi }
567*ccdc9c3eSSadaf Ebrahimi
568*ccdc9c3eSSadaf Ebrahimi
569*ccdc9c3eSSadaf Ebrahimi /***** Actual matching and rewriting code *****/
570*ccdc9c3eSSadaf Ebrahimi
571*ccdc9c3eSSadaf Ebrahimi bool RE2::Match(const StringPiece& text,
572*ccdc9c3eSSadaf Ebrahimi size_t startpos,
573*ccdc9c3eSSadaf Ebrahimi size_t endpos,
574*ccdc9c3eSSadaf Ebrahimi Anchor re_anchor,
575*ccdc9c3eSSadaf Ebrahimi StringPiece* submatch,
576*ccdc9c3eSSadaf Ebrahimi int nsubmatch) const {
577*ccdc9c3eSSadaf Ebrahimi if (!ok()) {
578*ccdc9c3eSSadaf Ebrahimi if (options_.log_errors())
579*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "Invalid RE2: " << *error_;
580*ccdc9c3eSSadaf Ebrahimi return false;
581*ccdc9c3eSSadaf Ebrahimi }
582*ccdc9c3eSSadaf Ebrahimi
583*ccdc9c3eSSadaf Ebrahimi if (startpos > endpos || endpos > text.size()) {
584*ccdc9c3eSSadaf Ebrahimi if (options_.log_errors())
585*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
586*ccdc9c3eSSadaf Ebrahimi << "startpos: " << startpos << ", "
587*ccdc9c3eSSadaf Ebrahimi << "endpos: " << endpos << ", "
588*ccdc9c3eSSadaf Ebrahimi << "text size: " << text.size() << "]";
589*ccdc9c3eSSadaf Ebrahimi return false;
590*ccdc9c3eSSadaf Ebrahimi }
591*ccdc9c3eSSadaf Ebrahimi
592*ccdc9c3eSSadaf Ebrahimi StringPiece subtext = text;
593*ccdc9c3eSSadaf Ebrahimi subtext.remove_prefix(startpos);
594*ccdc9c3eSSadaf Ebrahimi subtext.remove_suffix(text.size() - endpos);
595*ccdc9c3eSSadaf Ebrahimi
596*ccdc9c3eSSadaf Ebrahimi // Use DFAs to find exact location of match, filter out non-matches.
597*ccdc9c3eSSadaf Ebrahimi
598*ccdc9c3eSSadaf Ebrahimi // Don't ask for the location if we won't use it.
599*ccdc9c3eSSadaf Ebrahimi // SearchDFA can do extra optimizations in that case.
600*ccdc9c3eSSadaf Ebrahimi StringPiece match;
601*ccdc9c3eSSadaf Ebrahimi StringPiece* matchp = &match;
602*ccdc9c3eSSadaf Ebrahimi if (nsubmatch == 0)
603*ccdc9c3eSSadaf Ebrahimi matchp = NULL;
604*ccdc9c3eSSadaf Ebrahimi
605*ccdc9c3eSSadaf Ebrahimi int ncap = 1 + NumberOfCapturingGroups();
606*ccdc9c3eSSadaf Ebrahimi if (ncap > nsubmatch)
607*ccdc9c3eSSadaf Ebrahimi ncap = nsubmatch;
608*ccdc9c3eSSadaf Ebrahimi
609*ccdc9c3eSSadaf Ebrahimi // If the regexp is anchored explicitly, must not be in middle of text.
610*ccdc9c3eSSadaf Ebrahimi if (prog_->anchor_start() && startpos != 0)
611*ccdc9c3eSSadaf Ebrahimi return false;
612*ccdc9c3eSSadaf Ebrahimi
613*ccdc9c3eSSadaf Ebrahimi // If the regexp is anchored explicitly, update re_anchor
614*ccdc9c3eSSadaf Ebrahimi // so that we can potentially fall into a faster case below.
615*ccdc9c3eSSadaf Ebrahimi if (prog_->anchor_start() && prog_->anchor_end())
616*ccdc9c3eSSadaf Ebrahimi re_anchor = ANCHOR_BOTH;
617*ccdc9c3eSSadaf Ebrahimi else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
618*ccdc9c3eSSadaf Ebrahimi re_anchor = ANCHOR_START;
619*ccdc9c3eSSadaf Ebrahimi
620*ccdc9c3eSSadaf Ebrahimi // Check for the required prefix, if any.
621*ccdc9c3eSSadaf Ebrahimi size_t prefixlen = 0;
622*ccdc9c3eSSadaf Ebrahimi if (!prefix_.empty()) {
623*ccdc9c3eSSadaf Ebrahimi if (startpos != 0)
624*ccdc9c3eSSadaf Ebrahimi return false;
625*ccdc9c3eSSadaf Ebrahimi prefixlen = prefix_.size();
626*ccdc9c3eSSadaf Ebrahimi if (prefixlen > subtext.size())
627*ccdc9c3eSSadaf Ebrahimi return false;
628*ccdc9c3eSSadaf Ebrahimi if (prefix_foldcase_) {
629*ccdc9c3eSSadaf Ebrahimi if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
630*ccdc9c3eSSadaf Ebrahimi return false;
631*ccdc9c3eSSadaf Ebrahimi } else {
632*ccdc9c3eSSadaf Ebrahimi if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
633*ccdc9c3eSSadaf Ebrahimi return false;
634*ccdc9c3eSSadaf Ebrahimi }
635*ccdc9c3eSSadaf Ebrahimi subtext.remove_prefix(prefixlen);
636*ccdc9c3eSSadaf Ebrahimi // If there is a required prefix, the anchor must be at least ANCHOR_START.
637*ccdc9c3eSSadaf Ebrahimi if (re_anchor != ANCHOR_BOTH)
638*ccdc9c3eSSadaf Ebrahimi re_anchor = ANCHOR_START;
639*ccdc9c3eSSadaf Ebrahimi }
640*ccdc9c3eSSadaf Ebrahimi
641*ccdc9c3eSSadaf Ebrahimi Prog::Anchor anchor = Prog::kUnanchored;
642*ccdc9c3eSSadaf Ebrahimi Prog::MatchKind kind = Prog::kFirstMatch;
643*ccdc9c3eSSadaf Ebrahimi if (options_.longest_match())
644*ccdc9c3eSSadaf Ebrahimi kind = Prog::kLongestMatch;
645*ccdc9c3eSSadaf Ebrahimi bool skipped_test = false;
646*ccdc9c3eSSadaf Ebrahimi
647*ccdc9c3eSSadaf Ebrahimi bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
648*ccdc9c3eSSadaf Ebrahimi
649*ccdc9c3eSSadaf Ebrahimi // SearchBitState allocates a bit vector of size prog_->size() * text.size().
650*ccdc9c3eSSadaf Ebrahimi // It also allocates a stack of 3-word structures which could potentially
651*ccdc9c3eSSadaf Ebrahimi // grow as large as prog_->size() * text.size() but in practice is much
652*ccdc9c3eSSadaf Ebrahimi // smaller.
653*ccdc9c3eSSadaf Ebrahimi // Conditions for using SearchBitState:
654*ccdc9c3eSSadaf Ebrahimi const int MaxBitStateProg = 500; // prog_->size() <= Max.
655*ccdc9c3eSSadaf Ebrahimi const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits)
656*ccdc9c3eSSadaf Ebrahimi bool can_bit_state = prog_->size() <= MaxBitStateProg;
657*ccdc9c3eSSadaf Ebrahimi size_t bit_state_text_max = MaxBitStateVector / prog_->size();
658*ccdc9c3eSSadaf Ebrahimi
659*ccdc9c3eSSadaf Ebrahimi bool dfa_failed = false;
660*ccdc9c3eSSadaf Ebrahimi switch (re_anchor) {
661*ccdc9c3eSSadaf Ebrahimi default:
662*ccdc9c3eSSadaf Ebrahimi case UNANCHORED: {
663*ccdc9c3eSSadaf Ebrahimi if (!prog_->SearchDFA(subtext, text, anchor, kind,
664*ccdc9c3eSSadaf Ebrahimi matchp, &dfa_failed, NULL)) {
665*ccdc9c3eSSadaf Ebrahimi if (dfa_failed) {
666*ccdc9c3eSSadaf Ebrahimi if (options_.log_errors())
667*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
668*ccdc9c3eSSadaf Ebrahimi << "bytemap range " << prog_->bytemap_range() << ", "
669*ccdc9c3eSSadaf Ebrahimi << "list count " << prog_->list_count();
670*ccdc9c3eSSadaf Ebrahimi // Fall back to NFA below.
671*ccdc9c3eSSadaf Ebrahimi skipped_test = true;
672*ccdc9c3eSSadaf Ebrahimi break;
673*ccdc9c3eSSadaf Ebrahimi }
674*ccdc9c3eSSadaf Ebrahimi return false;
675*ccdc9c3eSSadaf Ebrahimi }
676*ccdc9c3eSSadaf Ebrahimi if (matchp == NULL) // Matched. Don't care where
677*ccdc9c3eSSadaf Ebrahimi return true;
678*ccdc9c3eSSadaf Ebrahimi // SearchDFA set match[0].end() but didn't know where the
679*ccdc9c3eSSadaf Ebrahimi // match started. Run the regexp backward from match[0].end()
680*ccdc9c3eSSadaf Ebrahimi // to find the longest possible match -- that's where it started.
681*ccdc9c3eSSadaf Ebrahimi Prog* prog = ReverseProg();
682*ccdc9c3eSSadaf Ebrahimi if (prog == NULL)
683*ccdc9c3eSSadaf Ebrahimi return false;
684*ccdc9c3eSSadaf Ebrahimi if (!prog->SearchDFA(match, text, Prog::kAnchored,
685*ccdc9c3eSSadaf Ebrahimi Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
686*ccdc9c3eSSadaf Ebrahimi if (dfa_failed) {
687*ccdc9c3eSSadaf Ebrahimi if (options_.log_errors())
688*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "DFA out of memory: size " << prog->size() << ", "
689*ccdc9c3eSSadaf Ebrahimi << "bytemap range " << prog->bytemap_range() << ", "
690*ccdc9c3eSSadaf Ebrahimi << "list count " << prog->list_count();
691*ccdc9c3eSSadaf Ebrahimi // Fall back to NFA below.
692*ccdc9c3eSSadaf Ebrahimi skipped_test = true;
693*ccdc9c3eSSadaf Ebrahimi break;
694*ccdc9c3eSSadaf Ebrahimi }
695*ccdc9c3eSSadaf Ebrahimi if (options_.log_errors())
696*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "SearchDFA inconsistency";
697*ccdc9c3eSSadaf Ebrahimi return false;
698*ccdc9c3eSSadaf Ebrahimi }
699*ccdc9c3eSSadaf Ebrahimi break;
700*ccdc9c3eSSadaf Ebrahimi }
701*ccdc9c3eSSadaf Ebrahimi
702*ccdc9c3eSSadaf Ebrahimi case ANCHOR_BOTH:
703*ccdc9c3eSSadaf Ebrahimi case ANCHOR_START:
704*ccdc9c3eSSadaf Ebrahimi if (re_anchor == ANCHOR_BOTH)
705*ccdc9c3eSSadaf Ebrahimi kind = Prog::kFullMatch;
706*ccdc9c3eSSadaf Ebrahimi anchor = Prog::kAnchored;
707*ccdc9c3eSSadaf Ebrahimi
708*ccdc9c3eSSadaf Ebrahimi // If only a small amount of text and need submatch
709*ccdc9c3eSSadaf Ebrahimi // information anyway and we're going to use OnePass or BitState
710*ccdc9c3eSSadaf Ebrahimi // to get it, we might as well not even bother with the DFA:
711*ccdc9c3eSSadaf Ebrahimi // OnePass or BitState will be fast enough.
712*ccdc9c3eSSadaf Ebrahimi // On tiny texts, OnePass outruns even the DFA, and
713*ccdc9c3eSSadaf Ebrahimi // it doesn't have the shared state and occasional mutex that
714*ccdc9c3eSSadaf Ebrahimi // the DFA does.
715*ccdc9c3eSSadaf Ebrahimi if (can_one_pass && text.size() <= 4096 &&
716*ccdc9c3eSSadaf Ebrahimi (ncap > 1 || text.size() <= 8)) {
717*ccdc9c3eSSadaf Ebrahimi skipped_test = true;
718*ccdc9c3eSSadaf Ebrahimi break;
719*ccdc9c3eSSadaf Ebrahimi }
720*ccdc9c3eSSadaf Ebrahimi if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
721*ccdc9c3eSSadaf Ebrahimi skipped_test = true;
722*ccdc9c3eSSadaf Ebrahimi break;
723*ccdc9c3eSSadaf Ebrahimi }
724*ccdc9c3eSSadaf Ebrahimi if (!prog_->SearchDFA(subtext, text, anchor, kind,
725*ccdc9c3eSSadaf Ebrahimi &match, &dfa_failed, NULL)) {
726*ccdc9c3eSSadaf Ebrahimi if (dfa_failed) {
727*ccdc9c3eSSadaf Ebrahimi if (options_.log_errors())
728*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
729*ccdc9c3eSSadaf Ebrahimi << "bytemap range " << prog_->bytemap_range() << ", "
730*ccdc9c3eSSadaf Ebrahimi << "list count " << prog_->list_count();
731*ccdc9c3eSSadaf Ebrahimi // Fall back to NFA below.
732*ccdc9c3eSSadaf Ebrahimi skipped_test = true;
733*ccdc9c3eSSadaf Ebrahimi break;
734*ccdc9c3eSSadaf Ebrahimi }
735*ccdc9c3eSSadaf Ebrahimi return false;
736*ccdc9c3eSSadaf Ebrahimi }
737*ccdc9c3eSSadaf Ebrahimi break;
738*ccdc9c3eSSadaf Ebrahimi }
739*ccdc9c3eSSadaf Ebrahimi
740*ccdc9c3eSSadaf Ebrahimi if (!skipped_test && ncap <= 1) {
741*ccdc9c3eSSadaf Ebrahimi // We know exactly where it matches. That's enough.
742*ccdc9c3eSSadaf Ebrahimi if (ncap == 1)
743*ccdc9c3eSSadaf Ebrahimi submatch[0] = match;
744*ccdc9c3eSSadaf Ebrahimi } else {
745*ccdc9c3eSSadaf Ebrahimi StringPiece subtext1;
746*ccdc9c3eSSadaf Ebrahimi if (skipped_test) {
747*ccdc9c3eSSadaf Ebrahimi // DFA ran out of memory or was skipped:
748*ccdc9c3eSSadaf Ebrahimi // need to search in entire original text.
749*ccdc9c3eSSadaf Ebrahimi subtext1 = subtext;
750*ccdc9c3eSSadaf Ebrahimi } else {
751*ccdc9c3eSSadaf Ebrahimi // DFA found the exact match location:
752*ccdc9c3eSSadaf Ebrahimi // let NFA run an anchored, full match search
753*ccdc9c3eSSadaf Ebrahimi // to find submatch locations.
754*ccdc9c3eSSadaf Ebrahimi subtext1 = match;
755*ccdc9c3eSSadaf Ebrahimi anchor = Prog::kAnchored;
756*ccdc9c3eSSadaf Ebrahimi kind = Prog::kFullMatch;
757*ccdc9c3eSSadaf Ebrahimi }
758*ccdc9c3eSSadaf Ebrahimi
759*ccdc9c3eSSadaf Ebrahimi if (can_one_pass && anchor != Prog::kUnanchored) {
760*ccdc9c3eSSadaf Ebrahimi if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
761*ccdc9c3eSSadaf Ebrahimi if (!skipped_test && options_.log_errors())
762*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "SearchOnePass inconsistency";
763*ccdc9c3eSSadaf Ebrahimi return false;
764*ccdc9c3eSSadaf Ebrahimi }
765*ccdc9c3eSSadaf Ebrahimi } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
766*ccdc9c3eSSadaf Ebrahimi if (!prog_->SearchBitState(subtext1, text, anchor,
767*ccdc9c3eSSadaf Ebrahimi kind, submatch, ncap)) {
768*ccdc9c3eSSadaf Ebrahimi if (!skipped_test && options_.log_errors())
769*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "SearchBitState inconsistency";
770*ccdc9c3eSSadaf Ebrahimi return false;
771*ccdc9c3eSSadaf Ebrahimi }
772*ccdc9c3eSSadaf Ebrahimi } else {
773*ccdc9c3eSSadaf Ebrahimi if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
774*ccdc9c3eSSadaf Ebrahimi if (!skipped_test && options_.log_errors())
775*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "SearchNFA inconsistency";
776*ccdc9c3eSSadaf Ebrahimi return false;
777*ccdc9c3eSSadaf Ebrahimi }
778*ccdc9c3eSSadaf Ebrahimi }
779*ccdc9c3eSSadaf Ebrahimi }
780*ccdc9c3eSSadaf Ebrahimi
781*ccdc9c3eSSadaf Ebrahimi // Adjust overall match for required prefix that we stripped off.
782*ccdc9c3eSSadaf Ebrahimi if (prefixlen > 0 && nsubmatch > 0)
783*ccdc9c3eSSadaf Ebrahimi submatch[0] = StringPiece(submatch[0].data() - prefixlen,
784*ccdc9c3eSSadaf Ebrahimi submatch[0].size() + prefixlen);
785*ccdc9c3eSSadaf Ebrahimi
786*ccdc9c3eSSadaf Ebrahimi // Zero submatches that don't exist in the regexp.
787*ccdc9c3eSSadaf Ebrahimi for (int i = ncap; i < nsubmatch; i++)
788*ccdc9c3eSSadaf Ebrahimi submatch[i] = StringPiece();
789*ccdc9c3eSSadaf Ebrahimi return true;
790*ccdc9c3eSSadaf Ebrahimi }
791*ccdc9c3eSSadaf Ebrahimi
792*ccdc9c3eSSadaf Ebrahimi // Internal matcher - like Match() but takes Args not StringPieces.
793*ccdc9c3eSSadaf Ebrahimi bool RE2::DoMatch(const StringPiece& text,
794*ccdc9c3eSSadaf Ebrahimi Anchor re_anchor,
795*ccdc9c3eSSadaf Ebrahimi size_t* consumed,
796*ccdc9c3eSSadaf Ebrahimi const Arg* const* args,
797*ccdc9c3eSSadaf Ebrahimi int n) const {
798*ccdc9c3eSSadaf Ebrahimi if (!ok()) {
799*ccdc9c3eSSadaf Ebrahimi if (options_.log_errors())
800*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "Invalid RE2: " << *error_;
801*ccdc9c3eSSadaf Ebrahimi return false;
802*ccdc9c3eSSadaf Ebrahimi }
803*ccdc9c3eSSadaf Ebrahimi
804*ccdc9c3eSSadaf Ebrahimi if (NumberOfCapturingGroups() < n) {
805*ccdc9c3eSSadaf Ebrahimi // RE has fewer capturing groups than number of Arg pointers passed in.
806*ccdc9c3eSSadaf Ebrahimi return false;
807*ccdc9c3eSSadaf Ebrahimi }
808*ccdc9c3eSSadaf Ebrahimi
809*ccdc9c3eSSadaf Ebrahimi // Count number of capture groups needed.
810*ccdc9c3eSSadaf Ebrahimi int nvec;
811*ccdc9c3eSSadaf Ebrahimi if (n == 0 && consumed == NULL)
812*ccdc9c3eSSadaf Ebrahimi nvec = 0;
813*ccdc9c3eSSadaf Ebrahimi else
814*ccdc9c3eSSadaf Ebrahimi nvec = n+1;
815*ccdc9c3eSSadaf Ebrahimi
816*ccdc9c3eSSadaf Ebrahimi StringPiece* vec;
817*ccdc9c3eSSadaf Ebrahimi StringPiece stkvec[kVecSize];
818*ccdc9c3eSSadaf Ebrahimi StringPiece* heapvec = NULL;
819*ccdc9c3eSSadaf Ebrahimi
820*ccdc9c3eSSadaf Ebrahimi if (nvec <= arraysize(stkvec)) {
821*ccdc9c3eSSadaf Ebrahimi vec = stkvec;
822*ccdc9c3eSSadaf Ebrahimi } else {
823*ccdc9c3eSSadaf Ebrahimi vec = new StringPiece[nvec];
824*ccdc9c3eSSadaf Ebrahimi heapvec = vec;
825*ccdc9c3eSSadaf Ebrahimi }
826*ccdc9c3eSSadaf Ebrahimi
827*ccdc9c3eSSadaf Ebrahimi if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) {
828*ccdc9c3eSSadaf Ebrahimi delete[] heapvec;
829*ccdc9c3eSSadaf Ebrahimi return false;
830*ccdc9c3eSSadaf Ebrahimi }
831*ccdc9c3eSSadaf Ebrahimi
832*ccdc9c3eSSadaf Ebrahimi if (consumed != NULL)
833*ccdc9c3eSSadaf Ebrahimi *consumed = static_cast<size_t>(vec[0].end() - text.begin());
834*ccdc9c3eSSadaf Ebrahimi
835*ccdc9c3eSSadaf Ebrahimi if (n == 0 || args == NULL) {
836*ccdc9c3eSSadaf Ebrahimi // We are not interested in results
837*ccdc9c3eSSadaf Ebrahimi delete[] heapvec;
838*ccdc9c3eSSadaf Ebrahimi return true;
839*ccdc9c3eSSadaf Ebrahimi }
840*ccdc9c3eSSadaf Ebrahimi
841*ccdc9c3eSSadaf Ebrahimi // If we got here, we must have matched the whole pattern.
842*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < n; i++) {
843*ccdc9c3eSSadaf Ebrahimi const StringPiece& s = vec[i+1];
844*ccdc9c3eSSadaf Ebrahimi if (!args[i]->Parse(s.data(), s.size())) {
845*ccdc9c3eSSadaf Ebrahimi // TODO: Should we indicate what the error was?
846*ccdc9c3eSSadaf Ebrahimi delete[] heapvec;
847*ccdc9c3eSSadaf Ebrahimi return false;
848*ccdc9c3eSSadaf Ebrahimi }
849*ccdc9c3eSSadaf Ebrahimi }
850*ccdc9c3eSSadaf Ebrahimi
851*ccdc9c3eSSadaf Ebrahimi delete[] heapvec;
852*ccdc9c3eSSadaf Ebrahimi return true;
853*ccdc9c3eSSadaf Ebrahimi }
854*ccdc9c3eSSadaf Ebrahimi
855*ccdc9c3eSSadaf Ebrahimi // Checks that the rewrite string is well-formed with respect to this
856*ccdc9c3eSSadaf Ebrahimi // regular expression.
857*ccdc9c3eSSadaf Ebrahimi bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
858*ccdc9c3eSSadaf Ebrahimi int max_token = -1;
859*ccdc9c3eSSadaf Ebrahimi for (const char *s = rewrite.data(), *end = s + rewrite.size();
860*ccdc9c3eSSadaf Ebrahimi s < end; s++) {
861*ccdc9c3eSSadaf Ebrahimi int c = *s;
862*ccdc9c3eSSadaf Ebrahimi if (c != '\\') {
863*ccdc9c3eSSadaf Ebrahimi continue;
864*ccdc9c3eSSadaf Ebrahimi }
865*ccdc9c3eSSadaf Ebrahimi if (++s == end) {
866*ccdc9c3eSSadaf Ebrahimi *error = "Rewrite schema error: '\\' not allowed at end.";
867*ccdc9c3eSSadaf Ebrahimi return false;
868*ccdc9c3eSSadaf Ebrahimi }
869*ccdc9c3eSSadaf Ebrahimi c = *s;
870*ccdc9c3eSSadaf Ebrahimi if (c == '\\') {
871*ccdc9c3eSSadaf Ebrahimi continue;
872*ccdc9c3eSSadaf Ebrahimi }
873*ccdc9c3eSSadaf Ebrahimi if (!isdigit(c)) {
874*ccdc9c3eSSadaf Ebrahimi *error = "Rewrite schema error: "
875*ccdc9c3eSSadaf Ebrahimi "'\\' must be followed by a digit or '\\'.";
876*ccdc9c3eSSadaf Ebrahimi return false;
877*ccdc9c3eSSadaf Ebrahimi }
878*ccdc9c3eSSadaf Ebrahimi int n = (c - '0');
879*ccdc9c3eSSadaf Ebrahimi if (max_token < n) {
880*ccdc9c3eSSadaf Ebrahimi max_token = n;
881*ccdc9c3eSSadaf Ebrahimi }
882*ccdc9c3eSSadaf Ebrahimi }
883*ccdc9c3eSSadaf Ebrahimi
884*ccdc9c3eSSadaf Ebrahimi if (max_token > NumberOfCapturingGroups()) {
885*ccdc9c3eSSadaf Ebrahimi SStringPrintf(error, "Rewrite schema requests %d matches, "
886*ccdc9c3eSSadaf Ebrahimi "but the regexp only has %d parenthesized subexpressions.",
887*ccdc9c3eSSadaf Ebrahimi max_token, NumberOfCapturingGroups());
888*ccdc9c3eSSadaf Ebrahimi return false;
889*ccdc9c3eSSadaf Ebrahimi }
890*ccdc9c3eSSadaf Ebrahimi return true;
891*ccdc9c3eSSadaf Ebrahimi }
892*ccdc9c3eSSadaf Ebrahimi
893*ccdc9c3eSSadaf Ebrahimi // Returns the maximum submatch needed for the rewrite to be done by Replace().
894*ccdc9c3eSSadaf Ebrahimi // E.g. if rewrite == "foo \\2,\\1", returns 2.
895*ccdc9c3eSSadaf Ebrahimi int RE2::MaxSubmatch(const StringPiece& rewrite) {
896*ccdc9c3eSSadaf Ebrahimi int max = 0;
897*ccdc9c3eSSadaf Ebrahimi for (const char *s = rewrite.data(), *end = s + rewrite.size();
898*ccdc9c3eSSadaf Ebrahimi s < end; s++) {
899*ccdc9c3eSSadaf Ebrahimi if (*s == '\\') {
900*ccdc9c3eSSadaf Ebrahimi s++;
901*ccdc9c3eSSadaf Ebrahimi int c = (s < end) ? *s : -1;
902*ccdc9c3eSSadaf Ebrahimi if (isdigit(c)) {
903*ccdc9c3eSSadaf Ebrahimi int n = (c - '0');
904*ccdc9c3eSSadaf Ebrahimi if (n > max)
905*ccdc9c3eSSadaf Ebrahimi max = n;
906*ccdc9c3eSSadaf Ebrahimi }
907*ccdc9c3eSSadaf Ebrahimi }
908*ccdc9c3eSSadaf Ebrahimi }
909*ccdc9c3eSSadaf Ebrahimi return max;
910*ccdc9c3eSSadaf Ebrahimi }
911*ccdc9c3eSSadaf Ebrahimi
912*ccdc9c3eSSadaf Ebrahimi // Append the "rewrite" string, with backslash subsitutions from "vec",
913*ccdc9c3eSSadaf Ebrahimi // to string "out".
914*ccdc9c3eSSadaf Ebrahimi bool RE2::Rewrite(string* out,
915*ccdc9c3eSSadaf Ebrahimi const StringPiece& rewrite,
916*ccdc9c3eSSadaf Ebrahimi const StringPiece* vec,
917*ccdc9c3eSSadaf Ebrahimi int veclen) const {
918*ccdc9c3eSSadaf Ebrahimi for (const char *s = rewrite.data(), *end = s + rewrite.size();
919*ccdc9c3eSSadaf Ebrahimi s < end; s++) {
920*ccdc9c3eSSadaf Ebrahimi if (*s != '\\') {
921*ccdc9c3eSSadaf Ebrahimi out->push_back(*s);
922*ccdc9c3eSSadaf Ebrahimi continue;
923*ccdc9c3eSSadaf Ebrahimi }
924*ccdc9c3eSSadaf Ebrahimi s++;
925*ccdc9c3eSSadaf Ebrahimi int c = (s < end) ? *s : -1;
926*ccdc9c3eSSadaf Ebrahimi if (isdigit(c)) {
927*ccdc9c3eSSadaf Ebrahimi int n = (c - '0');
928*ccdc9c3eSSadaf Ebrahimi if (n >= veclen) {
929*ccdc9c3eSSadaf Ebrahimi if (options_.log_errors()) {
930*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "requested group " << n
931*ccdc9c3eSSadaf Ebrahimi << " in regexp " << rewrite.data();
932*ccdc9c3eSSadaf Ebrahimi }
933*ccdc9c3eSSadaf Ebrahimi return false;
934*ccdc9c3eSSadaf Ebrahimi }
935*ccdc9c3eSSadaf Ebrahimi StringPiece snip = vec[n];
936*ccdc9c3eSSadaf Ebrahimi if (snip.size() > 0)
937*ccdc9c3eSSadaf Ebrahimi out->append(snip.data(), snip.size());
938*ccdc9c3eSSadaf Ebrahimi } else if (c == '\\') {
939*ccdc9c3eSSadaf Ebrahimi out->push_back('\\');
940*ccdc9c3eSSadaf Ebrahimi } else {
941*ccdc9c3eSSadaf Ebrahimi if (options_.log_errors())
942*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
943*ccdc9c3eSSadaf Ebrahimi return false;
944*ccdc9c3eSSadaf Ebrahimi }
945*ccdc9c3eSSadaf Ebrahimi }
946*ccdc9c3eSSadaf Ebrahimi return true;
947*ccdc9c3eSSadaf Ebrahimi }
948*ccdc9c3eSSadaf Ebrahimi
949*ccdc9c3eSSadaf Ebrahimi /***** Parsers for various types *****/
950*ccdc9c3eSSadaf Ebrahimi
951*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_null(const char* str, size_t n, void* dest) {
952*ccdc9c3eSSadaf Ebrahimi // We fail if somebody asked us to store into a non-NULL void* pointer
953*ccdc9c3eSSadaf Ebrahimi return (dest == NULL);
954*ccdc9c3eSSadaf Ebrahimi }
955*ccdc9c3eSSadaf Ebrahimi
956*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_string(const char* str, size_t n, void* dest) {
957*ccdc9c3eSSadaf Ebrahimi if (dest == NULL) return true;
958*ccdc9c3eSSadaf Ebrahimi reinterpret_cast<string*>(dest)->assign(str, n);
959*ccdc9c3eSSadaf Ebrahimi return true;
960*ccdc9c3eSSadaf Ebrahimi }
961*ccdc9c3eSSadaf Ebrahimi
962*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_stringpiece(const char* str, size_t n, void* dest) {
963*ccdc9c3eSSadaf Ebrahimi if (dest == NULL) return true;
964*ccdc9c3eSSadaf Ebrahimi *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n);
965*ccdc9c3eSSadaf Ebrahimi return true;
966*ccdc9c3eSSadaf Ebrahimi }
967*ccdc9c3eSSadaf Ebrahimi
968*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_char(const char* str, size_t n, void* dest) {
969*ccdc9c3eSSadaf Ebrahimi if (n != 1) return false;
970*ccdc9c3eSSadaf Ebrahimi if (dest == NULL) return true;
971*ccdc9c3eSSadaf Ebrahimi *(reinterpret_cast<char*>(dest)) = str[0];
972*ccdc9c3eSSadaf Ebrahimi return true;
973*ccdc9c3eSSadaf Ebrahimi }
974*ccdc9c3eSSadaf Ebrahimi
975*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_schar(const char* str, size_t n, void* dest) {
976*ccdc9c3eSSadaf Ebrahimi if (n != 1) return false;
977*ccdc9c3eSSadaf Ebrahimi if (dest == NULL) return true;
978*ccdc9c3eSSadaf Ebrahimi *(reinterpret_cast<signed char*>(dest)) = str[0];
979*ccdc9c3eSSadaf Ebrahimi return true;
980*ccdc9c3eSSadaf Ebrahimi }
981*ccdc9c3eSSadaf Ebrahimi
982*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_uchar(const char* str, size_t n, void* dest) {
983*ccdc9c3eSSadaf Ebrahimi if (n != 1) return false;
984*ccdc9c3eSSadaf Ebrahimi if (dest == NULL) return true;
985*ccdc9c3eSSadaf Ebrahimi *(reinterpret_cast<unsigned char*>(dest)) = str[0];
986*ccdc9c3eSSadaf Ebrahimi return true;
987*ccdc9c3eSSadaf Ebrahimi }
988*ccdc9c3eSSadaf Ebrahimi
989*ccdc9c3eSSadaf Ebrahimi // Largest number spec that we are willing to parse
990*ccdc9c3eSSadaf Ebrahimi static const int kMaxNumberLength = 32;
991*ccdc9c3eSSadaf Ebrahimi
992*ccdc9c3eSSadaf Ebrahimi // REQUIRES "buf" must have length at least nbuf.
993*ccdc9c3eSSadaf Ebrahimi // Copies "str" into "buf" and null-terminates.
994*ccdc9c3eSSadaf Ebrahimi // Overwrites *np with the new length.
995*ccdc9c3eSSadaf Ebrahimi static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
996*ccdc9c3eSSadaf Ebrahimi size_t* np, bool accept_spaces) {
997*ccdc9c3eSSadaf Ebrahimi size_t n = *np;
998*ccdc9c3eSSadaf Ebrahimi if (n == 0) return "";
999*ccdc9c3eSSadaf Ebrahimi if (n > 0 && isspace(*str)) {
1000*ccdc9c3eSSadaf Ebrahimi // We are less forgiving than the strtoxxx() routines and do not
1001*ccdc9c3eSSadaf Ebrahimi // allow leading spaces. We do allow leading spaces for floats.
1002*ccdc9c3eSSadaf Ebrahimi if (!accept_spaces) {
1003*ccdc9c3eSSadaf Ebrahimi return "";
1004*ccdc9c3eSSadaf Ebrahimi }
1005*ccdc9c3eSSadaf Ebrahimi while (n > 0 && isspace(*str)) {
1006*ccdc9c3eSSadaf Ebrahimi n--;
1007*ccdc9c3eSSadaf Ebrahimi str++;
1008*ccdc9c3eSSadaf Ebrahimi }
1009*ccdc9c3eSSadaf Ebrahimi }
1010*ccdc9c3eSSadaf Ebrahimi
1011*ccdc9c3eSSadaf Ebrahimi // Although buf has a fixed maximum size, we can still handle
1012*ccdc9c3eSSadaf Ebrahimi // arbitrarily large integers correctly by omitting leading zeros.
1013*ccdc9c3eSSadaf Ebrahimi // (Numbers that are still too long will be out of range.)
1014*ccdc9c3eSSadaf Ebrahimi // Before deciding whether str is too long,
1015*ccdc9c3eSSadaf Ebrahimi // remove leading zeros with s/000+/00/.
1016*ccdc9c3eSSadaf Ebrahimi // Leaving the leading two zeros in place means that
1017*ccdc9c3eSSadaf Ebrahimi // we don't change 0000x123 (invalid) into 0x123 (valid).
1018*ccdc9c3eSSadaf Ebrahimi // Skip over leading - before replacing.
1019*ccdc9c3eSSadaf Ebrahimi bool neg = false;
1020*ccdc9c3eSSadaf Ebrahimi if (n >= 1 && str[0] == '-') {
1021*ccdc9c3eSSadaf Ebrahimi neg = true;
1022*ccdc9c3eSSadaf Ebrahimi n--;
1023*ccdc9c3eSSadaf Ebrahimi str++;
1024*ccdc9c3eSSadaf Ebrahimi }
1025*ccdc9c3eSSadaf Ebrahimi
1026*ccdc9c3eSSadaf Ebrahimi if (n >= 3 && str[0] == '0' && str[1] == '0') {
1027*ccdc9c3eSSadaf Ebrahimi while (n >= 3 && str[2] == '0') {
1028*ccdc9c3eSSadaf Ebrahimi n--;
1029*ccdc9c3eSSadaf Ebrahimi str++;
1030*ccdc9c3eSSadaf Ebrahimi }
1031*ccdc9c3eSSadaf Ebrahimi }
1032*ccdc9c3eSSadaf Ebrahimi
1033*ccdc9c3eSSadaf Ebrahimi if (neg) { // make room in buf for -
1034*ccdc9c3eSSadaf Ebrahimi n++;
1035*ccdc9c3eSSadaf Ebrahimi str--;
1036*ccdc9c3eSSadaf Ebrahimi }
1037*ccdc9c3eSSadaf Ebrahimi
1038*ccdc9c3eSSadaf Ebrahimi if (n > nbuf-1) return "";
1039*ccdc9c3eSSadaf Ebrahimi
1040*ccdc9c3eSSadaf Ebrahimi memmove(buf, str, n);
1041*ccdc9c3eSSadaf Ebrahimi if (neg) {
1042*ccdc9c3eSSadaf Ebrahimi buf[0] = '-';
1043*ccdc9c3eSSadaf Ebrahimi }
1044*ccdc9c3eSSadaf Ebrahimi buf[n] = '\0';
1045*ccdc9c3eSSadaf Ebrahimi *np = n;
1046*ccdc9c3eSSadaf Ebrahimi return buf;
1047*ccdc9c3eSSadaf Ebrahimi }
1048*ccdc9c3eSSadaf Ebrahimi
1049*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_long_radix(const char* str,
1050*ccdc9c3eSSadaf Ebrahimi size_t n,
1051*ccdc9c3eSSadaf Ebrahimi void* dest,
1052*ccdc9c3eSSadaf Ebrahimi int radix) {
1053*ccdc9c3eSSadaf Ebrahimi if (n == 0) return false;
1054*ccdc9c3eSSadaf Ebrahimi char buf[kMaxNumberLength+1];
1055*ccdc9c3eSSadaf Ebrahimi str = TerminateNumber(buf, sizeof buf, str, &n, false);
1056*ccdc9c3eSSadaf Ebrahimi char* end;
1057*ccdc9c3eSSadaf Ebrahimi errno = 0;
1058*ccdc9c3eSSadaf Ebrahimi long r = strtol(str, &end, radix);
1059*ccdc9c3eSSadaf Ebrahimi if (end != str + n) return false; // Leftover junk
1060*ccdc9c3eSSadaf Ebrahimi if (errno) return false;
1061*ccdc9c3eSSadaf Ebrahimi if (dest == NULL) return true;
1062*ccdc9c3eSSadaf Ebrahimi *(reinterpret_cast<long*>(dest)) = r;
1063*ccdc9c3eSSadaf Ebrahimi return true;
1064*ccdc9c3eSSadaf Ebrahimi }
1065*ccdc9c3eSSadaf Ebrahimi
1066*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_ulong_radix(const char* str,
1067*ccdc9c3eSSadaf Ebrahimi size_t n,
1068*ccdc9c3eSSadaf Ebrahimi void* dest,
1069*ccdc9c3eSSadaf Ebrahimi int radix) {
1070*ccdc9c3eSSadaf Ebrahimi if (n == 0) return false;
1071*ccdc9c3eSSadaf Ebrahimi char buf[kMaxNumberLength+1];
1072*ccdc9c3eSSadaf Ebrahimi str = TerminateNumber(buf, sizeof buf, str, &n, false);
1073*ccdc9c3eSSadaf Ebrahimi if (str[0] == '-') {
1074*ccdc9c3eSSadaf Ebrahimi // strtoul() will silently accept negative numbers and parse
1075*ccdc9c3eSSadaf Ebrahimi // them. This module is more strict and treats them as errors.
1076*ccdc9c3eSSadaf Ebrahimi return false;
1077*ccdc9c3eSSadaf Ebrahimi }
1078*ccdc9c3eSSadaf Ebrahimi
1079*ccdc9c3eSSadaf Ebrahimi char* end;
1080*ccdc9c3eSSadaf Ebrahimi errno = 0;
1081*ccdc9c3eSSadaf Ebrahimi unsigned long r = strtoul(str, &end, radix);
1082*ccdc9c3eSSadaf Ebrahimi if (end != str + n) return false; // Leftover junk
1083*ccdc9c3eSSadaf Ebrahimi if (errno) return false;
1084*ccdc9c3eSSadaf Ebrahimi if (dest == NULL) return true;
1085*ccdc9c3eSSadaf Ebrahimi *(reinterpret_cast<unsigned long*>(dest)) = r;
1086*ccdc9c3eSSadaf Ebrahimi return true;
1087*ccdc9c3eSSadaf Ebrahimi }
1088*ccdc9c3eSSadaf Ebrahimi
1089*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_short_radix(const char* str,
1090*ccdc9c3eSSadaf Ebrahimi size_t n,
1091*ccdc9c3eSSadaf Ebrahimi void* dest,
1092*ccdc9c3eSSadaf Ebrahimi int radix) {
1093*ccdc9c3eSSadaf Ebrahimi long r;
1094*ccdc9c3eSSadaf Ebrahimi if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1095*ccdc9c3eSSadaf Ebrahimi if ((short)r != r) return false; // Out of range
1096*ccdc9c3eSSadaf Ebrahimi if (dest == NULL) return true;
1097*ccdc9c3eSSadaf Ebrahimi *(reinterpret_cast<short*>(dest)) = (short)r;
1098*ccdc9c3eSSadaf Ebrahimi return true;
1099*ccdc9c3eSSadaf Ebrahimi }
1100*ccdc9c3eSSadaf Ebrahimi
1101*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_ushort_radix(const char* str,
1102*ccdc9c3eSSadaf Ebrahimi size_t n,
1103*ccdc9c3eSSadaf Ebrahimi void* dest,
1104*ccdc9c3eSSadaf Ebrahimi int radix) {
1105*ccdc9c3eSSadaf Ebrahimi unsigned long r;
1106*ccdc9c3eSSadaf Ebrahimi if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1107*ccdc9c3eSSadaf Ebrahimi if ((unsigned short)r != r) return false; // Out of range
1108*ccdc9c3eSSadaf Ebrahimi if (dest == NULL) return true;
1109*ccdc9c3eSSadaf Ebrahimi *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
1110*ccdc9c3eSSadaf Ebrahimi return true;
1111*ccdc9c3eSSadaf Ebrahimi }
1112*ccdc9c3eSSadaf Ebrahimi
1113*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_int_radix(const char* str,
1114*ccdc9c3eSSadaf Ebrahimi size_t n,
1115*ccdc9c3eSSadaf Ebrahimi void* dest,
1116*ccdc9c3eSSadaf Ebrahimi int radix) {
1117*ccdc9c3eSSadaf Ebrahimi long r;
1118*ccdc9c3eSSadaf Ebrahimi if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1119*ccdc9c3eSSadaf Ebrahimi if ((int)r != r) return false; // Out of range
1120*ccdc9c3eSSadaf Ebrahimi if (dest == NULL) return true;
1121*ccdc9c3eSSadaf Ebrahimi *(reinterpret_cast<int*>(dest)) = (int)r;
1122*ccdc9c3eSSadaf Ebrahimi return true;
1123*ccdc9c3eSSadaf Ebrahimi }
1124*ccdc9c3eSSadaf Ebrahimi
1125*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_uint_radix(const char* str,
1126*ccdc9c3eSSadaf Ebrahimi size_t n,
1127*ccdc9c3eSSadaf Ebrahimi void* dest,
1128*ccdc9c3eSSadaf Ebrahimi int radix) {
1129*ccdc9c3eSSadaf Ebrahimi unsigned long r;
1130*ccdc9c3eSSadaf Ebrahimi if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1131*ccdc9c3eSSadaf Ebrahimi if ((unsigned int)r != r) return false; // Out of range
1132*ccdc9c3eSSadaf Ebrahimi if (dest == NULL) return true;
1133*ccdc9c3eSSadaf Ebrahimi *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
1134*ccdc9c3eSSadaf Ebrahimi return true;
1135*ccdc9c3eSSadaf Ebrahimi }
1136*ccdc9c3eSSadaf Ebrahimi
1137*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_longlong_radix(const char* str,
1138*ccdc9c3eSSadaf Ebrahimi size_t n,
1139*ccdc9c3eSSadaf Ebrahimi void* dest,
1140*ccdc9c3eSSadaf Ebrahimi int radix) {
1141*ccdc9c3eSSadaf Ebrahimi if (n == 0) return false;
1142*ccdc9c3eSSadaf Ebrahimi char buf[kMaxNumberLength+1];
1143*ccdc9c3eSSadaf Ebrahimi str = TerminateNumber(buf, sizeof buf, str, &n, false);
1144*ccdc9c3eSSadaf Ebrahimi char* end;
1145*ccdc9c3eSSadaf Ebrahimi errno = 0;
1146*ccdc9c3eSSadaf Ebrahimi long long r = strtoll(str, &end, radix);
1147*ccdc9c3eSSadaf Ebrahimi if (end != str + n) return false; // Leftover junk
1148*ccdc9c3eSSadaf Ebrahimi if (errno) return false;
1149*ccdc9c3eSSadaf Ebrahimi if (dest == NULL) return true;
1150*ccdc9c3eSSadaf Ebrahimi *(reinterpret_cast<long long*>(dest)) = r;
1151*ccdc9c3eSSadaf Ebrahimi return true;
1152*ccdc9c3eSSadaf Ebrahimi }
1153*ccdc9c3eSSadaf Ebrahimi
1154*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_ulonglong_radix(const char* str,
1155*ccdc9c3eSSadaf Ebrahimi size_t n,
1156*ccdc9c3eSSadaf Ebrahimi void* dest,
1157*ccdc9c3eSSadaf Ebrahimi int radix) {
1158*ccdc9c3eSSadaf Ebrahimi if (n == 0) return false;
1159*ccdc9c3eSSadaf Ebrahimi char buf[kMaxNumberLength+1];
1160*ccdc9c3eSSadaf Ebrahimi str = TerminateNumber(buf, sizeof buf, str, &n, false);
1161*ccdc9c3eSSadaf Ebrahimi if (str[0] == '-') {
1162*ccdc9c3eSSadaf Ebrahimi // strtoull() will silently accept negative numbers and parse
1163*ccdc9c3eSSadaf Ebrahimi // them. This module is more strict and treats them as errors.
1164*ccdc9c3eSSadaf Ebrahimi return false;
1165*ccdc9c3eSSadaf Ebrahimi }
1166*ccdc9c3eSSadaf Ebrahimi char* end;
1167*ccdc9c3eSSadaf Ebrahimi errno = 0;
1168*ccdc9c3eSSadaf Ebrahimi unsigned long long r = strtoull(str, &end, radix);
1169*ccdc9c3eSSadaf Ebrahimi if (end != str + n) return false; // Leftover junk
1170*ccdc9c3eSSadaf Ebrahimi if (errno) return false;
1171*ccdc9c3eSSadaf Ebrahimi if (dest == NULL) return true;
1172*ccdc9c3eSSadaf Ebrahimi *(reinterpret_cast<unsigned long long*>(dest)) = r;
1173*ccdc9c3eSSadaf Ebrahimi return true;
1174*ccdc9c3eSSadaf Ebrahimi }
1175*ccdc9c3eSSadaf Ebrahimi
1176*ccdc9c3eSSadaf Ebrahimi static bool parse_double_float(const char* str, size_t n, bool isfloat,
1177*ccdc9c3eSSadaf Ebrahimi void* dest) {
1178*ccdc9c3eSSadaf Ebrahimi if (n == 0) return false;
1179*ccdc9c3eSSadaf Ebrahimi static const int kMaxLength = 200;
1180*ccdc9c3eSSadaf Ebrahimi char buf[kMaxLength+1];
1181*ccdc9c3eSSadaf Ebrahimi str = TerminateNumber(buf, sizeof buf, str, &n, true);
1182*ccdc9c3eSSadaf Ebrahimi char* end;
1183*ccdc9c3eSSadaf Ebrahimi errno = 0;
1184*ccdc9c3eSSadaf Ebrahimi double r;
1185*ccdc9c3eSSadaf Ebrahimi if (isfloat) {
1186*ccdc9c3eSSadaf Ebrahimi r = strtof(str, &end);
1187*ccdc9c3eSSadaf Ebrahimi } else {
1188*ccdc9c3eSSadaf Ebrahimi r = strtod(str, &end);
1189*ccdc9c3eSSadaf Ebrahimi }
1190*ccdc9c3eSSadaf Ebrahimi if (end != str + n) return false; // Leftover junk
1191*ccdc9c3eSSadaf Ebrahimi if (errno) return false;
1192*ccdc9c3eSSadaf Ebrahimi if (dest == NULL) return true;
1193*ccdc9c3eSSadaf Ebrahimi if (isfloat) {
1194*ccdc9c3eSSadaf Ebrahimi *(reinterpret_cast<float*>(dest)) = (float)r;
1195*ccdc9c3eSSadaf Ebrahimi } else {
1196*ccdc9c3eSSadaf Ebrahimi *(reinterpret_cast<double*>(dest)) = r;
1197*ccdc9c3eSSadaf Ebrahimi }
1198*ccdc9c3eSSadaf Ebrahimi return true;
1199*ccdc9c3eSSadaf Ebrahimi }
1200*ccdc9c3eSSadaf Ebrahimi
1201*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_double(const char* str, size_t n, void* dest) {
1202*ccdc9c3eSSadaf Ebrahimi return parse_double_float(str, n, false, dest);
1203*ccdc9c3eSSadaf Ebrahimi }
1204*ccdc9c3eSSadaf Ebrahimi
1205*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_float(const char* str, size_t n, void* dest) {
1206*ccdc9c3eSSadaf Ebrahimi return parse_double_float(str, n, true, dest);
1207*ccdc9c3eSSadaf Ebrahimi }
1208*ccdc9c3eSSadaf Ebrahimi
1209*ccdc9c3eSSadaf Ebrahimi #define DEFINE_INTEGER_PARSER(name) \
1210*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_##name(const char* str, size_t n, void* dest) { \
1211*ccdc9c3eSSadaf Ebrahimi return parse_##name##_radix(str, n, dest, 10); \
1212*ccdc9c3eSSadaf Ebrahimi } \
1213*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \
1214*ccdc9c3eSSadaf Ebrahimi return parse_##name##_radix(str, n, dest, 16); \
1215*ccdc9c3eSSadaf Ebrahimi } \
1216*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_##name##_octal(const char* str, size_t n, void* dest) { \
1217*ccdc9c3eSSadaf Ebrahimi return parse_##name##_radix(str, n, dest, 8); \
1218*ccdc9c3eSSadaf Ebrahimi } \
1219*ccdc9c3eSSadaf Ebrahimi bool RE2::Arg::parse_##name##_cradix(const char* str, size_t n, \
1220*ccdc9c3eSSadaf Ebrahimi void* dest) { \
1221*ccdc9c3eSSadaf Ebrahimi return parse_##name##_radix(str, n, dest, 0); \
1222*ccdc9c3eSSadaf Ebrahimi }
1223*ccdc9c3eSSadaf Ebrahimi
1224*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(short);
1225*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(ushort);
1226*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(int);
1227*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(uint);
1228*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(long);
1229*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(ulong);
1230*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(longlong);
1231*ccdc9c3eSSadaf Ebrahimi DEFINE_INTEGER_PARSER(ulonglong);
1232*ccdc9c3eSSadaf Ebrahimi
1233*ccdc9c3eSSadaf Ebrahimi #undef DEFINE_INTEGER_PARSER
1234*ccdc9c3eSSadaf Ebrahimi
1235*ccdc9c3eSSadaf Ebrahimi } // namespace re2
1236