xref: /aosp_15_r20/external/cronet/third_party/re2/src/util/pcre.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2003-2009 Google Inc.  All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // This is a variant of PCRE's pcrecpp.cc, originally written at Google.
6 // The main changes are the addition of the HitLimit method and
7 // compilation as PCRE in namespace re2.
8 
9 #include <assert.h>
10 #include <ctype.h>
11 #include <errno.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <limits>
15 #include <string>
16 #include <utility>
17 
18 #include "absl/flags/flag.h"
19 #include "absl/strings/str_format.h"
20 #include "util/logging.h"
21 #include "util/pcre.h"
22 
23 // Silence warnings about the wacky formatting in the operator() functions.
24 #if defined(__GNUC__)
25 #pragma GCC diagnostic ignored "-Wmisleading-indentation"
26 #endif
27 
28 #define PCREPORT(level) LOG(level)
29 
30 // Default PCRE limits.
31 // Defaults chosen to allow a plausible amount of CPU and
32 // not exceed main thread stacks.  Note that other threads
33 // often have smaller stacks, and therefore tightening
34 // regexp_stack_limit may frequently be necessary.
35 ABSL_FLAG(int, regexp_stack_limit, 256 << 10,
36           "default PCRE stack limit (bytes)");
37 ABSL_FLAG(int, regexp_match_limit, 1000000,
38           "default PCRE match limit (function calls)");
39 
40 #ifndef USEPCRE
41 
42 // Fake just enough of the PCRE API to allow this file to build. :)
43 
44 struct pcre_extra {
45   int flags;
46   int match_limit;
47   int match_limit_recursion;
48 };
49 
50 #define PCRE_EXTRA_MATCH_LIMIT 0
51 #define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
52 #define PCRE_ANCHORED 0
53 #define PCRE_NOTEMPTY 0
54 #define PCRE_ERROR_NOMATCH 1
55 #define PCRE_ERROR_MATCHLIMIT 2
56 #define PCRE_ERROR_RECURSIONLIMIT 3
57 #define PCRE_INFO_CAPTURECOUNT 0
58 
pcre_free(void *)59 void pcre_free(void*) {
60 }
61 
pcre_compile(const char *,int,const char **,int *,const unsigned char *)62 pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) {
63   return NULL;
64 }
65 
pcre_exec(const pcre *,const pcre_extra *,const char *,int,int,int,int *,int)66 int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) {
67   return 0;
68 }
69 
pcre_fullinfo(const pcre *,const pcre_extra *,int,void *)70 int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) {
71   return 0;
72 }
73 
74 #endif
75 
76 namespace re2 {
77 
78 // Maximum number of args we can set
79 static const int kMaxArgs = 16;
80 static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
81 
82 // Approximate size of a recursive invocation of PCRE's
83 // internal "match()" frame.  This varies depending on the
84 // compiler and architecture, of course, so the constant is
85 // just a conservative estimate.  To find the exact number,
86 // run regexp_unittest with --regexp_stack_limit=0 under
87 // a debugger and look at the frames when it crashes.
88 // The exact frame size was 656 in production on 2008/02/03.
89 static const int kPCREFrameSize = 700;
90 
91 // Special name for missing C++ arguments.
92 PCRE::Arg PCRE::no_more_args((void*)NULL);
93 
94 const PCRE::PartialMatchFunctor PCRE::PartialMatch = { };
95 const PCRE::FullMatchFunctor PCRE::FullMatch = { } ;
96 const PCRE::ConsumeFunctor PCRE::Consume = { };
97 const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { };
98 
99 // If a regular expression has no error, its error_ field points here
100 static const std::string empty_string;
101 
Init(const char * pattern,Option options,int match_limit,int stack_limit,bool report_errors)102 void PCRE::Init(const char* pattern, Option options, int match_limit,
103               int stack_limit, bool report_errors) {
104   pattern_ = pattern;
105   options_ = options;
106   match_limit_ = match_limit;
107   stack_limit_ = stack_limit;
108   hit_limit_ = false;
109   error_ = &empty_string;
110   report_errors_ = report_errors;
111   re_full_ = NULL;
112   re_partial_ = NULL;
113 
114   if (options & ~(EnabledCompileOptions | EnabledExecOptions)) {
115     error_ = new std::string("illegal regexp option");
116     PCREPORT(ERROR)
117         << "Error compiling '" << pattern << "': illegal regexp option";
118   } else {
119     re_partial_ = Compile(UNANCHORED);
120     if (re_partial_ != NULL) {
121       re_full_ = Compile(ANCHOR_BOTH);
122     }
123   }
124 }
125 
PCRE(const char * pattern)126 PCRE::PCRE(const char* pattern) {
127   Init(pattern, None, 0, 0, true);
128 }
PCRE(const char * pattern,Option option)129 PCRE::PCRE(const char* pattern, Option option) {
130   Init(pattern, option, 0, 0, true);
131 }
PCRE(const std::string & pattern)132 PCRE::PCRE(const std::string& pattern) {
133   Init(pattern.c_str(), None, 0, 0, true);
134 }
PCRE(const std::string & pattern,Option option)135 PCRE::PCRE(const std::string& pattern, Option option) {
136   Init(pattern.c_str(), option, 0, 0, true);
137 }
PCRE(const std::string & pattern,const PCRE_Options & re_option)138 PCRE::PCRE(const std::string& pattern, const PCRE_Options& re_option) {
139   Init(pattern.c_str(), re_option.option(), re_option.match_limit(),
140        re_option.stack_limit(), re_option.report_errors());
141 }
142 
PCRE(const char * pattern,const PCRE_Options & re_option)143 PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) {
144   Init(pattern, re_option.option(), re_option.match_limit(),
145        re_option.stack_limit(), re_option.report_errors());
146 }
147 
~PCRE()148 PCRE::~PCRE() {
149   if (re_full_ != NULL)         pcre_free(re_full_);
150   if (re_partial_ != NULL)      pcre_free(re_partial_);
151   if (error_ != &empty_string)  delete error_;
152 }
153 
Compile(Anchor anchor)154 pcre* PCRE::Compile(Anchor anchor) {
155   // Special treatment for anchoring.  This is needed because at
156   // runtime pcre only provides an option for anchoring at the
157   // beginning of a string.
158   //
159   // There are three types of anchoring we want:
160   //    UNANCHORED      Compile the original pattern, and use
161   //                    a pcre unanchored match.
162   //    ANCHOR_START    Compile the original pattern, and use
163   //                    a pcre anchored match.
164   //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
165   //                    and use a pcre anchored match.
166 
167   const char* error = "";
168   int eoffset;
169   pcre* re;
170   if (anchor != ANCHOR_BOTH) {
171     re = pcre_compile(pattern_.c_str(),
172                       (options_ & EnabledCompileOptions),
173                       &error, &eoffset, NULL);
174   } else {
175     // Tack a '\z' at the end of PCRE.  Parenthesize it first so that
176     // the '\z' applies to all top-level alternatives in the regexp.
177     std::string wrapped = "(?:";  // A non-counting grouping operator
178     wrapped += pattern_;
179     wrapped += ")\\z";
180     re = pcre_compile(wrapped.c_str(),
181                       (options_ & EnabledCompileOptions),
182                       &error, &eoffset, NULL);
183   }
184   if (re == NULL) {
185     if (error_ == &empty_string) error_ = new std::string(error);
186     PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error;
187   }
188   return re;
189 }
190 
191 /***** Convenience interfaces *****/
192 
operator ()(absl::string_view text,const PCRE & re,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const193 bool PCRE::FullMatchFunctor::operator()(
194     absl::string_view text, const PCRE& re, const Arg& a0, const Arg& a1,
195     const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
196     const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
197     const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
198   const Arg* args[kMaxArgs];
199   int n = 0;
200   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
201   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
202   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
203   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
204   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
205   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
206   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
207   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
208   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
209   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
210   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
211   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
212   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
213   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
214   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
215   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
216 done:
217 
218   size_t consumed;
219   int vec[kVecSize] = {};
220   return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
221 }
222 
operator ()(absl::string_view text,const PCRE & re,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const223 bool PCRE::PartialMatchFunctor::operator()(
224     absl::string_view text, const PCRE& re, const Arg& a0, const Arg& a1,
225     const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
226     const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
227     const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
228   const Arg* args[kMaxArgs];
229   int n = 0;
230   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
231   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
232   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
233   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
234   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
235   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
236   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
237   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
238   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
239   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
240   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
241   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
242   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
243   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
244   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
245   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
246 done:
247 
248   size_t consumed;
249   int vec[kVecSize] = {};
250   return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
251 }
252 
operator ()(absl::string_view * input,const PCRE & pattern,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const253 bool PCRE::ConsumeFunctor::operator()(
254     absl::string_view* input, const PCRE& pattern, const Arg& a0, const Arg& a1,
255     const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
256     const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
257     const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
258   const Arg* args[kMaxArgs];
259   int n = 0;
260   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
261   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
262   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
263   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
264   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
265   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
266   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
267   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
268   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
269   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
270   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
271   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
272   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
273   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
274   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
275   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
276 done:
277 
278   size_t consumed;
279   int vec[kVecSize] = {};
280   if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed,
281                           args, n, vec, kVecSize)) {
282     input->remove_prefix(consumed);
283     return true;
284   } else {
285     return false;
286   }
287 }
288 
operator ()(absl::string_view * input,const PCRE & pattern,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const289 bool PCRE::FindAndConsumeFunctor::operator()(
290     absl::string_view* input, const PCRE& pattern, const Arg& a0, const Arg& a1,
291     const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
292     const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
293     const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
294   const Arg* args[kMaxArgs];
295   int n = 0;
296   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
297   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
298   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
299   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
300   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
301   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
302   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
303   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
304   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
305   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
306   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
307   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
308   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
309   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
310   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
311   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
312 done:
313 
314   size_t consumed;
315   int vec[kVecSize] = {};
316   if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed,
317                           args, n, vec, kVecSize)) {
318     input->remove_prefix(consumed);
319     return true;
320   } else {
321     return false;
322   }
323 }
324 
Replace(std::string * str,const PCRE & pattern,absl::string_view rewrite)325 bool PCRE::Replace(std::string* str, const PCRE& pattern,
326                    absl::string_view rewrite) {
327   int vec[kVecSize] = {};
328   int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
329   if (matches == 0)
330     return false;
331 
332   std::string s;
333   if (!pattern.Rewrite(&s, rewrite, *str, vec, matches))
334     return false;
335 
336   assert(vec[0] >= 0);
337   assert(vec[1] >= 0);
338   str->replace(vec[0], vec[1] - vec[0], s);
339   return true;
340 }
341 
GlobalReplace(std::string * str,const PCRE & pattern,absl::string_view rewrite)342 int PCRE::GlobalReplace(std::string* str, const PCRE& pattern,
343                         absl::string_view rewrite) {
344   int count = 0;
345   int vec[kVecSize] = {};
346   std::string out;
347   size_t start = 0;
348   bool last_match_was_empty_string = false;
349 
350   while (start <= str->size()) {
351     // If the previous match was for the empty string, we shouldn't
352     // just match again: we'll match in the same way and get an
353     // infinite loop.  Instead, we do the match in a special way:
354     // anchored -- to force another try at the same position --
355     // and with a flag saying that this time, ignore empty matches.
356     // If this special match returns, that means there's a non-empty
357     // match at this position as well, and we can continue.  If not,
358     // we do what perl does, and just advance by one.
359     // Notice that perl prints '@@@' for this;
360     //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
361     int matches;
362     if (last_match_was_empty_string) {
363       matches = pattern.TryMatch(*str, start, ANCHOR_START, false,
364                                  vec, kVecSize);
365       if (matches <= 0) {
366         if (start < str->size())
367           out.push_back((*str)[start]);
368         start++;
369         last_match_was_empty_string = false;
370         continue;
371       }
372     } else {
373       matches = pattern.TryMatch(*str, start, UNANCHORED, true,
374                                  vec, kVecSize);
375       if (matches <= 0)
376         break;
377     }
378     size_t matchstart = vec[0], matchend = vec[1];
379     assert(matchstart >= start);
380     assert(matchend >= matchstart);
381 
382     out.append(*str, start, matchstart - start);
383     pattern.Rewrite(&out, rewrite, *str, vec, matches);
384     start = matchend;
385     count++;
386     last_match_was_empty_string = (matchstart == matchend);
387   }
388 
389   if (count == 0)
390     return 0;
391 
392   if (start < str->size())
393     out.append(*str, start, str->size() - start);
394   using std::swap;
395   swap(out, *str);
396   return count;
397 }
398 
Extract(absl::string_view text,const PCRE & pattern,absl::string_view rewrite,std::string * out)399 bool PCRE::Extract(absl::string_view text, const PCRE& pattern,
400                    absl::string_view rewrite, std::string* out) {
401   int vec[kVecSize] = {};
402   int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
403   if (matches == 0)
404     return false;
405   out->clear();
406   return pattern.Rewrite(out, rewrite, text, vec, matches);
407 }
408 
QuoteMeta(absl::string_view unquoted)409 std::string PCRE::QuoteMeta(absl::string_view unquoted) {
410   std::string result;
411   result.reserve(unquoted.size() << 1);
412 
413   // Escape any ascii character not in [A-Za-z_0-9].
414   //
415   // Note that it's legal to escape a character even if it has no
416   // special meaning in a regular expression -- so this function does
417   // that.  (This also makes it identical to the perl function of the
418   // same name except for the null-character special case;
419   // see `perldoc -f quotemeta`.)
420   for (size_t ii = 0; ii < unquoted.size(); ++ii) {
421     // Note that using 'isalnum' here raises the benchmark time from
422     // 32ns to 58ns:
423     if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
424         (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
425         (unquoted[ii] < '0' || unquoted[ii] > '9') &&
426         unquoted[ii] != '_' &&
427         // If this is the part of a UTF8 or Latin1 character, we need
428         // to copy this byte without escaping.  Experimentally this is
429         // what works correctly with the regexp library.
430         !(unquoted[ii] & 128)) {
431       if (unquoted[ii] == '\0') {  // Special handling for null chars.
432         // Can't use "\\0" since the next character might be a digit.
433         result += "\\x00";
434         continue;
435       }
436       result += '\\';
437     }
438     result += unquoted[ii];
439   }
440 
441   return result;
442 }
443 
444 /***** Actual matching and rewriting code *****/
445 
HitLimit()446 bool PCRE::HitLimit() {
447   return hit_limit_ != 0;
448 }
449 
ClearHitLimit()450 void PCRE::ClearHitLimit() {
451   hit_limit_ = 0;
452 }
453 
TryMatch(absl::string_view text,size_t startpos,Anchor anchor,bool empty_ok,int * vec,int vecsize) const454 int PCRE::TryMatch(absl::string_view text, size_t startpos, Anchor anchor,
455                    bool empty_ok, int* vec, int vecsize) const {
456   pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
457   if (re == NULL) {
458     PCREPORT(ERROR) << "Matching against invalid re: " << *error_;
459     return 0;
460   }
461 
462   int match_limit = match_limit_;
463   if (match_limit <= 0) {
464     match_limit = absl::GetFlag(FLAGS_regexp_match_limit);
465   }
466 
467   int stack_limit = stack_limit_;
468   if (stack_limit <= 0) {
469     stack_limit = absl::GetFlag(FLAGS_regexp_stack_limit);
470   }
471 
472   pcre_extra extra = { 0 };
473   if (match_limit > 0) {
474     extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
475     extra.match_limit = match_limit;
476   }
477   if (stack_limit > 0) {
478     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
479     extra.match_limit_recursion = stack_limit / kPCREFrameSize;
480   }
481 
482   int options = 0;
483   if (anchor != UNANCHORED)
484     options |= PCRE_ANCHORED;
485   if (!empty_ok)
486     options |= PCRE_NOTEMPTY;
487 
488   int rc = pcre_exec(re,              // The regular expression object
489                      &extra,
490                      (text.data() == NULL) ? "" : text.data(),
491                      static_cast<int>(text.size()),
492                      static_cast<int>(startpos),
493                      options,
494                      vec,
495                      vecsize);
496 
497   // Handle errors
498   if (rc == 0) {
499     // pcre_exec() returns 0 as a special case when the number of
500     // capturing subpatterns exceeds the size of the vector.
501     // When this happens, there is a match and the output vector
502     // is filled, but we miss out on the positions of the extra subpatterns.
503     rc = vecsize / 2;
504   } else if (rc < 0) {
505     switch (rc) {
506       case PCRE_ERROR_NOMATCH:
507         return 0;
508       case PCRE_ERROR_MATCHLIMIT:
509         // Writing to hit_limit is not safe if multiple threads
510         // are using the PCRE, but the flag is only intended
511         // for use by unit tests anyway, so we let it go.
512         hit_limit_ = true;
513         PCREPORT(WARNING) << "Exceeded match limit of " << match_limit
514                         << " when matching '" << pattern_ << "'"
515                         << " against text that is " << text.size() << " bytes.";
516         return 0;
517       case PCRE_ERROR_RECURSIONLIMIT:
518         // See comment about hit_limit above.
519         hit_limit_ = true;
520         PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit
521                         << " when matching '" << pattern_ << "'"
522                         << " against text that is " << text.size() << " bytes.";
523         return 0;
524       default:
525         // There are other return codes from pcre.h :
526         // PCRE_ERROR_NULL           (-2)
527         // PCRE_ERROR_BADOPTION      (-3)
528         // PCRE_ERROR_BADMAGIC       (-4)
529         // PCRE_ERROR_UNKNOWN_NODE   (-5)
530         // PCRE_ERROR_NOMEMORY       (-6)
531         // PCRE_ERROR_NOSUBSTRING    (-7)
532         // ...
533         PCREPORT(ERROR) << "Unexpected return code: " << rc
534                       << " when matching '" << pattern_ << "'"
535                       << ", re=" << re
536                       << ", text=" << text
537                       << ", vec=" << vec
538                       << ", vecsize=" << vecsize;
539         return 0;
540     }
541   }
542 
543   return rc;
544 }
545 
DoMatchImpl(absl::string_view text,Anchor anchor,size_t * consumed,const Arg * const * args,int n,int * vec,int vecsize) const546 bool PCRE::DoMatchImpl(absl::string_view text, Anchor anchor, size_t* consumed,
547                        const Arg* const* args, int n, int* vec,
548                        int vecsize) const {
549   assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
550   if (NumberOfCapturingGroups() < n) {
551     // RE has fewer capturing groups than number of Arg pointers passed in.
552     return false;
553   }
554 
555   int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
556   assert(matches >= 0);  // TryMatch never returns negatives
557   if (matches == 0)
558     return false;
559 
560   *consumed = vec[1];
561 
562   if (n == 0 || args == NULL) {
563     // We are not interested in results
564     return true;
565   }
566 
567   // If we got here, we must have matched the whole pattern.
568   // We do not need (can not do) any more checks on the value of 'matches' here
569   // -- see the comment for TryMatch.
570   for (int i = 0; i < n; i++) {
571     const int start = vec[2*(i+1)];
572     const int limit = vec[2*(i+1)+1];
573 
574     // Avoid invoking undefined behavior when text.data() happens
575     // to be null and start happens to be -1, the latter being the
576     // case for an unmatched subexpression. Even if text.data() is
577     // not null, pointing one byte before was a longstanding bug.
578     const char* addr = NULL;
579     if (start != -1) {
580       addr = text.data() + start;
581     }
582 
583     if (!args[i]->Parse(addr, limit-start)) {
584       // TODO: Should we indicate what the error was?
585       return false;
586     }
587   }
588 
589   return true;
590 }
591 
DoMatch(absl::string_view text,Anchor anchor,size_t * consumed,const Arg * const args[],int n) const592 bool PCRE::DoMatch(absl::string_view text, Anchor anchor, size_t* consumed,
593                    const Arg* const args[], int n) const {
594   assert(n >= 0);
595   const int vecsize = (1 + n) * 3;  // results + PCRE workspace
596                                     // (as for kVecSize)
597   int* vec = new int[vecsize];
598   bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
599   delete[] vec;
600   return b;
601 }
602 
Rewrite(std::string * out,absl::string_view rewrite,absl::string_view text,int * vec,int veclen) const603 bool PCRE::Rewrite(std::string* out, absl::string_view rewrite,
604                    absl::string_view text, int* vec, int veclen) const {
605   int number_of_capturing_groups = NumberOfCapturingGroups();
606   for (const char *s = rewrite.data(), *end = s + rewrite.size();
607        s < end; s++) {
608     int c = *s;
609     if (c == '\\') {
610       c = *++s;
611       if (isdigit(c)) {
612         int n = (c - '0');
613         if (n >= veclen) {
614           if (n <= number_of_capturing_groups) {
615             // unmatched optional capturing group. treat
616             // its value as empty string; i.e., nothing to append.
617           } else {
618             PCREPORT(ERROR) << "requested group " << n
619                           << " in regexp " << rewrite.data();
620             return false;
621           }
622         }
623         int start = vec[2 * n];
624         if (start >= 0)
625           out->append(text.data() + start, vec[2 * n + 1] - start);
626       } else if (c == '\\') {
627         out->push_back('\\');
628       } else {
629         PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data();
630         return false;
631       }
632     } else {
633       out->push_back(c);
634     }
635   }
636   return true;
637 }
638 
CheckRewriteString(absl::string_view rewrite,std::string * error) const639 bool PCRE::CheckRewriteString(absl::string_view rewrite,
640                               std::string* error) const {
641   int max_token = -1;
642   for (const char *s = rewrite.data(), *end = s + rewrite.size();
643        s < end; s++) {
644     int c = *s;
645     if (c != '\\') {
646       continue;
647     }
648     if (++s == end) {
649       *error = "Rewrite schema error: '\\' not allowed at end.";
650       return false;
651     }
652     c = *s;
653     if (c == '\\') {
654       continue;
655     }
656     if (!isdigit(c)) {
657       *error = "Rewrite schema error: "
658                "'\\' must be followed by a digit or '\\'.";
659       return false;
660     }
661     int n = (c - '0');
662     if (max_token < n) {
663       max_token = n;
664     }
665   }
666 
667   if (max_token > NumberOfCapturingGroups()) {
668     *error = absl::StrFormat(
669         "Rewrite schema requests %d matches, but the regexp only has %d "
670         "parenthesized subexpressions.",
671         max_token, NumberOfCapturingGroups());
672     return false;
673   }
674   return true;
675 }
676 
677 // Return the number of capturing subpatterns, or -1 if the
678 // regexp wasn't valid on construction.
NumberOfCapturingGroups() const679 int PCRE::NumberOfCapturingGroups() const {
680   if (re_partial_ == NULL) return -1;
681 
682   int result;
683   int rc = pcre_fullinfo(re_partial_,       // The regular expression object
684                          NULL,              // We did not study the pattern
685                          PCRE_INFO_CAPTURECOUNT,
686                          &result);
687   if (rc != 0) {
688     PCREPORT(ERROR) << "Unexpected return code: " << rc;
689     return -1;
690   }
691   return result;
692 }
693 
694 
695 /***** Parsers for various types *****/
696 
parse_null(const char * str,size_t n,void * dest)697 bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) {
698   // We fail if somebody asked us to store into a non-NULL void* pointer
699   return (dest == NULL);
700 }
701 
parse_string(const char * str,size_t n,void * dest)702 bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) {
703   if (dest == NULL) return true;
704   reinterpret_cast<std::string*>(dest)->assign(str, n);
705   return true;
706 }
707 
parse_string_view(const char * str,size_t n,void * dest)708 bool PCRE::Arg::parse_string_view(const char* str, size_t n, void* dest) {
709   if (dest == NULL) return true;
710   *(reinterpret_cast<absl::string_view*>(dest)) = absl::string_view(str, n);
711   return true;
712 }
713 
parse_char(const char * str,size_t n,void * dest)714 bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) {
715   if (n != 1) return false;
716   if (dest == NULL) return true;
717   *(reinterpret_cast<char*>(dest)) = str[0];
718   return true;
719 }
720 
parse_schar(const char * str,size_t n,void * dest)721 bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) {
722   if (n != 1) return false;
723   if (dest == NULL) return true;
724   *(reinterpret_cast<signed char*>(dest)) = str[0];
725   return true;
726 }
727 
parse_uchar(const char * str,size_t n,void * dest)728 bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) {
729   if (n != 1) return false;
730   if (dest == NULL) return true;
731   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
732   return true;
733 }
734 
735 // Largest number spec that we are willing to parse
736 static const int kMaxNumberLength = 32;
737 
738 // PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1
739 // PCREQUIPCRES "n > 0"
740 // Copies "str" into "buf" and null-terminates if necessary.
741 // Returns one of:
742 //      a. "str" if no termination is needed
743 //      b. "buf" if the string was copied and null-terminated
744 //      c. "" if the input was invalid and has no hope of being parsed
TerminateNumber(char * buf,const char * str,size_t n)745 static const char* TerminateNumber(char* buf, const char* str, size_t n) {
746   if ((n > 0) && isspace(*str)) {
747     // We are less forgiving than the strtoxxx() routines and do not
748     // allow leading spaces.
749     return "";
750   }
751 
752   // See if the character right after the input text may potentially
753   // look like a digit.
754   if (isdigit(str[n]) ||
755       ((str[n] >= 'a') && (str[n] <= 'f')) ||
756       ((str[n] >= 'A') && (str[n] <= 'F'))) {
757     if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
758     memcpy(buf, str, n);
759     buf[n] = '\0';
760     return buf;
761   } else {
762     // We can parse right out of the supplied string, so return it.
763     return str;
764   }
765 }
766 
parse_long_radix(const char * str,size_t n,void * dest,int radix)767 bool PCRE::Arg::parse_long_radix(const char* str,
768                                  size_t n,
769                                  void* dest,
770                                  int radix) {
771   if (n == 0) return false;
772   char buf[kMaxNumberLength+1];
773   str = TerminateNumber(buf, str, n);
774   char* end;
775   errno = 0;
776   long r = strtol(str, &end, radix);
777   if (end != str + n) return false;   // Leftover junk
778   if (errno) return false;
779   if (dest == NULL) return true;
780   *(reinterpret_cast<long*>(dest)) = r;
781   return true;
782 }
783 
parse_ulong_radix(const char * str,size_t n,void * dest,int radix)784 bool PCRE::Arg::parse_ulong_radix(const char* str,
785                                   size_t n,
786                                   void* dest,
787                                   int radix) {
788   if (n == 0) return false;
789   char buf[kMaxNumberLength+1];
790   str = TerminateNumber(buf, str, n);
791   if (str[0] == '-') {
792     // strtoul() will silently accept negative numbers and parse
793     // them.  This module is more strict and treats them as errors.
794     return false;
795   }
796 
797   char* end;
798   errno = 0;
799   unsigned long r = strtoul(str, &end, radix);
800   if (end != str + n) return false;   // Leftover junk
801   if (errno) return false;
802   if (dest == NULL) return true;
803   *(reinterpret_cast<unsigned long*>(dest)) = r;
804   return true;
805 }
806 
parse_short_radix(const char * str,size_t n,void * dest,int radix)807 bool PCRE::Arg::parse_short_radix(const char* str,
808                                   size_t n,
809                                   void* dest,
810                                   int radix) {
811   long r;
812   if (!parse_long_radix(str, n, &r, radix)) return false;  // Could not parse
813   if ((short)r != r) return false;                         // Out of range
814   if (dest == NULL) return true;
815   *(reinterpret_cast<short*>(dest)) = (short)r;
816   return true;
817 }
818 
parse_ushort_radix(const char * str,size_t n,void * dest,int radix)819 bool PCRE::Arg::parse_ushort_radix(const char* str,
820                                    size_t n,
821                                    void* dest,
822                                    int radix) {
823   unsigned long r;
824   if (!parse_ulong_radix(str, n, &r, radix)) return false;  // Could not parse
825   if ((unsigned short)r != r) return false;                 // Out of range
826   if (dest == NULL) return true;
827   *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
828   return true;
829 }
830 
parse_int_radix(const char * str,size_t n,void * dest,int radix)831 bool PCRE::Arg::parse_int_radix(const char* str,
832                                 size_t n,
833                                 void* dest,
834                                 int radix) {
835   long r;
836   if (!parse_long_radix(str, n, &r, radix)) return false;  // Could not parse
837   if ((int)r != r) return false;                           // Out of range
838   if (dest == NULL) return true;
839   *(reinterpret_cast<int*>(dest)) = (int)r;
840   return true;
841 }
842 
parse_uint_radix(const char * str,size_t n,void * dest,int radix)843 bool PCRE::Arg::parse_uint_radix(const char* str,
844                                  size_t n,
845                                  void* dest,
846                                  int radix) {
847   unsigned long r;
848   if (!parse_ulong_radix(str, n, &r, radix)) return false;  // Could not parse
849   if ((unsigned int)r != r) return false;                   // Out of range
850   if (dest == NULL) return true;
851   *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
852   return true;
853 }
854 
parse_longlong_radix(const char * str,size_t n,void * dest,int radix)855 bool PCRE::Arg::parse_longlong_radix(const char* str,
856                                      size_t n,
857                                      void* dest,
858                                      int radix) {
859   if (n == 0) return false;
860   char buf[kMaxNumberLength+1];
861   str = TerminateNumber(buf, str, n);
862   char* end;
863   errno = 0;
864   long long r = strtoll(str, &end, radix);
865   if (end != str + n) return false;   // Leftover junk
866   if (errno) return false;
867   if (dest == NULL) return true;
868   *(reinterpret_cast<long long*>(dest)) = r;
869   return true;
870 }
871 
parse_ulonglong_radix(const char * str,size_t n,void * dest,int radix)872 bool PCRE::Arg::parse_ulonglong_radix(const char* str,
873                                       size_t n,
874                                       void* dest,
875                                       int radix) {
876   if (n == 0) return false;
877   char buf[kMaxNumberLength+1];
878   str = TerminateNumber(buf, str, n);
879   if (str[0] == '-') {
880     // strtoull() will silently accept negative numbers and parse
881     // them.  This module is more strict and treats them as errors.
882     return false;
883   }
884   char* end;
885   errno = 0;
886   unsigned long long r = strtoull(str, &end, radix);
887   if (end != str + n) return false;   // Leftover junk
888   if (errno) return false;
889   if (dest == NULL) return true;
890   *(reinterpret_cast<unsigned long long*>(dest)) = r;
891   return true;
892 }
893 
parse_double_float(const char * str,size_t n,bool isfloat,void * dest)894 static bool parse_double_float(const char* str, size_t n, bool isfloat,
895                                void* dest) {
896   if (n == 0) return false;
897   static const int kMaxLength = 200;
898   char buf[kMaxLength];
899   if (n >= kMaxLength) return false;
900   memcpy(buf, str, n);
901   buf[n] = '\0';
902   char* end;
903   errno = 0;
904   double r;
905   if (isfloat) {
906     r = strtof(buf, &end);
907   } else {
908     r = strtod(buf, &end);
909   }
910   if (end != buf + n) return false;   // Leftover junk
911   if (errno) return false;
912   if (dest == NULL) return true;
913   if (isfloat) {
914     *(reinterpret_cast<float*>(dest)) = (float)r;
915   } else {
916     *(reinterpret_cast<double*>(dest)) = r;
917   }
918   return true;
919 }
920 
parse_double(const char * str,size_t n,void * dest)921 bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) {
922   return parse_double_float(str, n, false, dest);
923 }
924 
parse_float(const char * str,size_t n,void * dest)925 bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) {
926   return parse_double_float(str, n, true, dest);
927 }
928 
929 #define DEFINE_INTEGER_PARSER(name)                                           \
930   bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) {       \
931     return parse_##name##_radix(str, n, dest, 10);                            \
932   }                                                                           \
933   bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \
934     return parse_##name##_radix(str, n, dest, 16);                            \
935   }                                                                           \
936   bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n,             \
937                                        void* dest) {                          \
938     return parse_##name##_radix(str, n, dest, 8);                             \
939   }                                                                           \
940   bool PCRE::Arg::parse_##name##_cradix(const char* str, size_t n,            \
941                                         void* dest) {                         \
942     return parse_##name##_radix(str, n, dest, 0);                             \
943   }
944 
945 DEFINE_INTEGER_PARSER(short);
946 DEFINE_INTEGER_PARSER(ushort);
947 DEFINE_INTEGER_PARSER(int);
948 DEFINE_INTEGER_PARSER(uint);
949 DEFINE_INTEGER_PARSER(long);
950 DEFINE_INTEGER_PARSER(ulong);
951 DEFINE_INTEGER_PARSER(longlong);
952 DEFINE_INTEGER_PARSER(ulonglong);
953 
954 #undef DEFINE_INTEGER_PARSER
955 
956 }  // namespace re2
957