xref: /aosp_15_r20/external/regex-re2/re2/compile.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi // Compile regular expression to Prog.
6*ccdc9c3eSSadaf Ebrahimi //
7*ccdc9c3eSSadaf Ebrahimi // Prog and Inst are defined in prog.h.
8*ccdc9c3eSSadaf Ebrahimi // This file's external interface is just Regexp::CompileToProg.
9*ccdc9c3eSSadaf Ebrahimi // The Compiler class defined in this file is private.
10*ccdc9c3eSSadaf Ebrahimi 
11*ccdc9c3eSSadaf Ebrahimi #include <stdint.h>
12*ccdc9c3eSSadaf Ebrahimi #include <string.h>
13*ccdc9c3eSSadaf Ebrahimi #include <unordered_map>
14*ccdc9c3eSSadaf Ebrahimi #include <utility>
15*ccdc9c3eSSadaf Ebrahimi 
16*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
17*ccdc9c3eSSadaf Ebrahimi #include "util/pod_array.h"
18*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h"
19*ccdc9c3eSSadaf Ebrahimi #include "re2/prog.h"
20*ccdc9c3eSSadaf Ebrahimi #include "re2/re2.h"
21*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
22*ccdc9c3eSSadaf Ebrahimi #include "re2/walker-inl.h"
23*ccdc9c3eSSadaf Ebrahimi 
24*ccdc9c3eSSadaf Ebrahimi namespace re2 {
25*ccdc9c3eSSadaf Ebrahimi 
26*ccdc9c3eSSadaf Ebrahimi // List of pointers to Inst* that need to be filled in (patched).
27*ccdc9c3eSSadaf Ebrahimi // Because the Inst* haven't been filled in yet,
28*ccdc9c3eSSadaf Ebrahimi // we can use the Inst* word to hold the list's "next" pointer.
29*ccdc9c3eSSadaf Ebrahimi // It's kind of sleazy, but it works well in practice.
30*ccdc9c3eSSadaf Ebrahimi // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
31*ccdc9c3eSSadaf Ebrahimi //
32*ccdc9c3eSSadaf Ebrahimi // Because the out and out1 fields in Inst are no longer pointers,
33*ccdc9c3eSSadaf Ebrahimi // we can't use pointers directly here either.  Instead, p refers
34*ccdc9c3eSSadaf Ebrahimi // to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).
35*ccdc9c3eSSadaf Ebrahimi // p == 0 represents the NULL list.  This is okay because instruction #0
36*ccdc9c3eSSadaf Ebrahimi // is always the fail instruction, which never appears on a list.
37*ccdc9c3eSSadaf Ebrahimi 
38*ccdc9c3eSSadaf Ebrahimi struct PatchList {
39*ccdc9c3eSSadaf Ebrahimi   uint32_t p;
40*ccdc9c3eSSadaf Ebrahimi 
41*ccdc9c3eSSadaf Ebrahimi   // Returns patch list containing just p.
42*ccdc9c3eSSadaf Ebrahimi   static PatchList Mk(uint32_t p);
43*ccdc9c3eSSadaf Ebrahimi 
44*ccdc9c3eSSadaf Ebrahimi   // Patches all the entries on l to have value v.
45*ccdc9c3eSSadaf Ebrahimi   // Caller must not ever use patch list again.
46*ccdc9c3eSSadaf Ebrahimi   static void Patch(Prog::Inst *inst0, PatchList l, uint32_t v);
47*ccdc9c3eSSadaf Ebrahimi 
48*ccdc9c3eSSadaf Ebrahimi   // Deref returns the next pointer pointed at by p.
49*ccdc9c3eSSadaf Ebrahimi   static PatchList Deref(Prog::Inst *inst0, PatchList l);
50*ccdc9c3eSSadaf Ebrahimi 
51*ccdc9c3eSSadaf Ebrahimi   // Appends two patch lists and returns result.
52*ccdc9c3eSSadaf Ebrahimi   static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);
53*ccdc9c3eSSadaf Ebrahimi };
54*ccdc9c3eSSadaf Ebrahimi 
55*ccdc9c3eSSadaf Ebrahimi static PatchList nullPatchList = { 0 };
56*ccdc9c3eSSadaf Ebrahimi 
57*ccdc9c3eSSadaf Ebrahimi // Returns patch list containing just p.
Mk(uint32_t p)58*ccdc9c3eSSadaf Ebrahimi PatchList PatchList::Mk(uint32_t p) {
59*ccdc9c3eSSadaf Ebrahimi   PatchList l;
60*ccdc9c3eSSadaf Ebrahimi   l.p = p;
61*ccdc9c3eSSadaf Ebrahimi   return l;
62*ccdc9c3eSSadaf Ebrahimi }
63*ccdc9c3eSSadaf Ebrahimi 
64*ccdc9c3eSSadaf Ebrahimi // Returns the next pointer pointed at by l.
Deref(Prog::Inst * inst0,PatchList l)65*ccdc9c3eSSadaf Ebrahimi PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
66*ccdc9c3eSSadaf Ebrahimi   Prog::Inst* ip = &inst0[l.p>>1];
67*ccdc9c3eSSadaf Ebrahimi   if (l.p&1)
68*ccdc9c3eSSadaf Ebrahimi     l.p = ip->out1();
69*ccdc9c3eSSadaf Ebrahimi   else
70*ccdc9c3eSSadaf Ebrahimi     l.p = ip->out();
71*ccdc9c3eSSadaf Ebrahimi   return l;
72*ccdc9c3eSSadaf Ebrahimi }
73*ccdc9c3eSSadaf Ebrahimi 
74*ccdc9c3eSSadaf Ebrahimi // Patches all the entries on l to have value v.
Patch(Prog::Inst * inst0,PatchList l,uint32_t val)75*ccdc9c3eSSadaf Ebrahimi void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32_t val) {
76*ccdc9c3eSSadaf Ebrahimi   while (l.p != 0) {
77*ccdc9c3eSSadaf Ebrahimi     Prog::Inst* ip = &inst0[l.p>>1];
78*ccdc9c3eSSadaf Ebrahimi     if (l.p&1) {
79*ccdc9c3eSSadaf Ebrahimi       l.p = ip->out1();
80*ccdc9c3eSSadaf Ebrahimi       ip->out1_ = val;
81*ccdc9c3eSSadaf Ebrahimi     } else {
82*ccdc9c3eSSadaf Ebrahimi       l.p = ip->out();
83*ccdc9c3eSSadaf Ebrahimi       ip->set_out(val);
84*ccdc9c3eSSadaf Ebrahimi     }
85*ccdc9c3eSSadaf Ebrahimi   }
86*ccdc9c3eSSadaf Ebrahimi }
87*ccdc9c3eSSadaf Ebrahimi 
88*ccdc9c3eSSadaf Ebrahimi // Appends two patch lists and returns result.
Append(Prog::Inst * inst0,PatchList l1,PatchList l2)89*ccdc9c3eSSadaf Ebrahimi PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
90*ccdc9c3eSSadaf Ebrahimi   if (l1.p == 0)
91*ccdc9c3eSSadaf Ebrahimi     return l2;
92*ccdc9c3eSSadaf Ebrahimi   if (l2.p == 0)
93*ccdc9c3eSSadaf Ebrahimi     return l1;
94*ccdc9c3eSSadaf Ebrahimi 
95*ccdc9c3eSSadaf Ebrahimi   PatchList l = l1;
96*ccdc9c3eSSadaf Ebrahimi   for (;;) {
97*ccdc9c3eSSadaf Ebrahimi     PatchList next = PatchList::Deref(inst0, l);
98*ccdc9c3eSSadaf Ebrahimi     if (next.p == 0)
99*ccdc9c3eSSadaf Ebrahimi       break;
100*ccdc9c3eSSadaf Ebrahimi     l = next;
101*ccdc9c3eSSadaf Ebrahimi   }
102*ccdc9c3eSSadaf Ebrahimi 
103*ccdc9c3eSSadaf Ebrahimi   Prog::Inst* ip = &inst0[l.p>>1];
104*ccdc9c3eSSadaf Ebrahimi   if (l.p&1)
105*ccdc9c3eSSadaf Ebrahimi     ip->out1_ = l2.p;
106*ccdc9c3eSSadaf Ebrahimi   else
107*ccdc9c3eSSadaf Ebrahimi     ip->set_out(l2.p);
108*ccdc9c3eSSadaf Ebrahimi 
109*ccdc9c3eSSadaf Ebrahimi   return l1;
110*ccdc9c3eSSadaf Ebrahimi }
111*ccdc9c3eSSadaf Ebrahimi 
112*ccdc9c3eSSadaf Ebrahimi // Compiled program fragment.
113*ccdc9c3eSSadaf Ebrahimi struct Frag {
114*ccdc9c3eSSadaf Ebrahimi   uint32_t begin;
115*ccdc9c3eSSadaf Ebrahimi   PatchList end;
116*ccdc9c3eSSadaf Ebrahimi 
Fragre2::Frag117*ccdc9c3eSSadaf Ebrahimi   Frag() : begin(0) { end.p = 0; }  // needed so Frag can go in vector
Fragre2::Frag118*ccdc9c3eSSadaf Ebrahimi   Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {}
119*ccdc9c3eSSadaf Ebrahimi };
120*ccdc9c3eSSadaf Ebrahimi 
121*ccdc9c3eSSadaf Ebrahimi // Input encodings.
122*ccdc9c3eSSadaf Ebrahimi enum Encoding {
123*ccdc9c3eSSadaf Ebrahimi   kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
124*ccdc9c3eSSadaf Ebrahimi   kEncodingLatin1,    // Latin-1 (0-FF)
125*ccdc9c3eSSadaf Ebrahimi };
126*ccdc9c3eSSadaf Ebrahimi 
127*ccdc9c3eSSadaf Ebrahimi class Compiler : public Regexp::Walker<Frag> {
128*ccdc9c3eSSadaf Ebrahimi  public:
129*ccdc9c3eSSadaf Ebrahimi   explicit Compiler();
130*ccdc9c3eSSadaf Ebrahimi   ~Compiler();
131*ccdc9c3eSSadaf Ebrahimi 
132*ccdc9c3eSSadaf Ebrahimi   // Compiles Regexp to a new Prog.
133*ccdc9c3eSSadaf Ebrahimi   // Caller is responsible for deleting Prog when finished with it.
134*ccdc9c3eSSadaf Ebrahimi   // If reversed is true, compiles for walking over the input
135*ccdc9c3eSSadaf Ebrahimi   // string backward (reverses all concatenations).
136*ccdc9c3eSSadaf Ebrahimi   static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem);
137*ccdc9c3eSSadaf Ebrahimi 
138*ccdc9c3eSSadaf Ebrahimi   // Compiles alternation of all the re to a new Prog.
139*ccdc9c3eSSadaf Ebrahimi   // Each re has a match with an id equal to its index in the vector.
140*ccdc9c3eSSadaf Ebrahimi   static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
141*ccdc9c3eSSadaf Ebrahimi 
142*ccdc9c3eSSadaf Ebrahimi   // Interface for Regexp::Walker, which helps traverse the Regexp.
143*ccdc9c3eSSadaf Ebrahimi   // The walk is purely post-recursive: given the machines for the
144*ccdc9c3eSSadaf Ebrahimi   // children, PostVisit combines them to create the machine for
145*ccdc9c3eSSadaf Ebrahimi   // the current node.  The child_args are Frags.
146*ccdc9c3eSSadaf Ebrahimi   // The Compiler traverses the Regexp parse tree, visiting
147*ccdc9c3eSSadaf Ebrahimi   // each node in depth-first order.  It invokes PreVisit before
148*ccdc9c3eSSadaf Ebrahimi   // visiting the node's children and PostVisit after visiting
149*ccdc9c3eSSadaf Ebrahimi   // the children.
150*ccdc9c3eSSadaf Ebrahimi   Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
151*ccdc9c3eSSadaf Ebrahimi   Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
152*ccdc9c3eSSadaf Ebrahimi                  int nchild_args);
153*ccdc9c3eSSadaf Ebrahimi   Frag ShortVisit(Regexp* re, Frag parent_arg);
154*ccdc9c3eSSadaf Ebrahimi   Frag Copy(Frag arg);
155*ccdc9c3eSSadaf Ebrahimi 
156*ccdc9c3eSSadaf Ebrahimi   // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
157*ccdc9c3eSSadaf Ebrahimi   Frag Plus(Frag a, bool nongreedy);
158*ccdc9c3eSSadaf Ebrahimi   Frag Star(Frag a, bool nongreedy);
159*ccdc9c3eSSadaf Ebrahimi   Frag Quest(Frag a, bool nongreedy);
160*ccdc9c3eSSadaf Ebrahimi 
161*ccdc9c3eSSadaf Ebrahimi   // Given fragment a, returns (a) capturing as \n.
162*ccdc9c3eSSadaf Ebrahimi   Frag Capture(Frag a, int n);
163*ccdc9c3eSSadaf Ebrahimi 
164*ccdc9c3eSSadaf Ebrahimi   // Given fragments a and b, returns ab; a|b
165*ccdc9c3eSSadaf Ebrahimi   Frag Cat(Frag a, Frag b);
166*ccdc9c3eSSadaf Ebrahimi   Frag Alt(Frag a, Frag b);
167*ccdc9c3eSSadaf Ebrahimi 
168*ccdc9c3eSSadaf Ebrahimi   // Returns a fragment that can't match anything.
169*ccdc9c3eSSadaf Ebrahimi   Frag NoMatch();
170*ccdc9c3eSSadaf Ebrahimi 
171*ccdc9c3eSSadaf Ebrahimi   // Returns a fragment that matches the empty string.
172*ccdc9c3eSSadaf Ebrahimi   Frag Match(int32_t id);
173*ccdc9c3eSSadaf Ebrahimi 
174*ccdc9c3eSSadaf Ebrahimi   // Returns a no-op fragment.
175*ccdc9c3eSSadaf Ebrahimi   Frag Nop();
176*ccdc9c3eSSadaf Ebrahimi 
177*ccdc9c3eSSadaf Ebrahimi   // Returns a fragment matching the byte range lo-hi.
178*ccdc9c3eSSadaf Ebrahimi   Frag ByteRange(int lo, int hi, bool foldcase);
179*ccdc9c3eSSadaf Ebrahimi 
180*ccdc9c3eSSadaf Ebrahimi   // Returns a fragment matching an empty-width special op.
181*ccdc9c3eSSadaf Ebrahimi   Frag EmptyWidth(EmptyOp op);
182*ccdc9c3eSSadaf Ebrahimi 
183*ccdc9c3eSSadaf Ebrahimi   // Adds n instructions to the program.
184*ccdc9c3eSSadaf Ebrahimi   // Returns the index of the first one.
185*ccdc9c3eSSadaf Ebrahimi   // Returns -1 if no more instructions are available.
186*ccdc9c3eSSadaf Ebrahimi   int AllocInst(int n);
187*ccdc9c3eSSadaf Ebrahimi 
188*ccdc9c3eSSadaf Ebrahimi   // Rune range compiler.
189*ccdc9c3eSSadaf Ebrahimi 
190*ccdc9c3eSSadaf Ebrahimi   // Begins a new alternation.
191*ccdc9c3eSSadaf Ebrahimi   void BeginRange();
192*ccdc9c3eSSadaf Ebrahimi 
193*ccdc9c3eSSadaf Ebrahimi   // Adds a fragment matching the rune range lo-hi.
194*ccdc9c3eSSadaf Ebrahimi   void AddRuneRange(Rune lo, Rune hi, bool foldcase);
195*ccdc9c3eSSadaf Ebrahimi   void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
196*ccdc9c3eSSadaf Ebrahimi   void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
197*ccdc9c3eSSadaf Ebrahimi   void Add_80_10ffff();
198*ccdc9c3eSSadaf Ebrahimi 
199*ccdc9c3eSSadaf Ebrahimi   // New suffix that matches the byte range lo-hi, then goes to next.
200*ccdc9c3eSSadaf Ebrahimi   int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
201*ccdc9c3eSSadaf Ebrahimi   int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
202*ccdc9c3eSSadaf Ebrahimi 
203*ccdc9c3eSSadaf Ebrahimi   // Returns true iff the suffix is cached.
204*ccdc9c3eSSadaf Ebrahimi   bool IsCachedRuneByteSuffix(int id);
205*ccdc9c3eSSadaf Ebrahimi 
206*ccdc9c3eSSadaf Ebrahimi   // Adds a suffix to alternation.
207*ccdc9c3eSSadaf Ebrahimi   void AddSuffix(int id);
208*ccdc9c3eSSadaf Ebrahimi 
209*ccdc9c3eSSadaf Ebrahimi   // Adds a suffix to the trie starting from the given root node.
210*ccdc9c3eSSadaf Ebrahimi   // Returns zero iff allocating an instruction fails. Otherwise, returns
211*ccdc9c3eSSadaf Ebrahimi   // the current root node, which might be different from what was given.
212*ccdc9c3eSSadaf Ebrahimi   int AddSuffixRecursive(int root, int id);
213*ccdc9c3eSSadaf Ebrahimi 
214*ccdc9c3eSSadaf Ebrahimi   // Finds the trie node for the given suffix. Returns a Frag in order to
215*ccdc9c3eSSadaf Ebrahimi   // distinguish between pointing at the root node directly (end.p == 0)
216*ccdc9c3eSSadaf Ebrahimi   // and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively).
217*ccdc9c3eSSadaf Ebrahimi   Frag FindByteRange(int root, int id);
218*ccdc9c3eSSadaf Ebrahimi 
219*ccdc9c3eSSadaf Ebrahimi   // Compares two ByteRanges and returns true iff they are equal.
220*ccdc9c3eSSadaf Ebrahimi   bool ByteRangeEqual(int id1, int id2);
221*ccdc9c3eSSadaf Ebrahimi 
222*ccdc9c3eSSadaf Ebrahimi   // Returns the alternation of all the added suffixes.
223*ccdc9c3eSSadaf Ebrahimi   Frag EndRange();
224*ccdc9c3eSSadaf Ebrahimi 
225*ccdc9c3eSSadaf Ebrahimi   // Single rune.
226*ccdc9c3eSSadaf Ebrahimi   Frag Literal(Rune r, bool foldcase);
227*ccdc9c3eSSadaf Ebrahimi 
228*ccdc9c3eSSadaf Ebrahimi   void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor);
229*ccdc9c3eSSadaf Ebrahimi   Prog* Finish();
230*ccdc9c3eSSadaf Ebrahimi 
231*ccdc9c3eSSadaf Ebrahimi   // Returns .* where dot = any byte
232*ccdc9c3eSSadaf Ebrahimi   Frag DotStar();
233*ccdc9c3eSSadaf Ebrahimi 
234*ccdc9c3eSSadaf Ebrahimi  private:
235*ccdc9c3eSSadaf Ebrahimi   Prog* prog_;         // Program being built.
236*ccdc9c3eSSadaf Ebrahimi   bool failed_;        // Did we give up compiling?
237*ccdc9c3eSSadaf Ebrahimi   Encoding encoding_;  // Input encoding
238*ccdc9c3eSSadaf Ebrahimi   bool reversed_;      // Should program run backward over text?
239*ccdc9c3eSSadaf Ebrahimi 
240*ccdc9c3eSSadaf Ebrahimi   PODArray<Prog::Inst> inst_;
241*ccdc9c3eSSadaf Ebrahimi   int ninst_;          // Number of instructions used.
242*ccdc9c3eSSadaf Ebrahimi   int max_ninst_;      // Maximum number of instructions.
243*ccdc9c3eSSadaf Ebrahimi 
244*ccdc9c3eSSadaf Ebrahimi   int64_t max_mem_;    // Total memory budget.
245*ccdc9c3eSSadaf Ebrahimi 
246*ccdc9c3eSSadaf Ebrahimi   std::unordered_map<uint64_t, int> rune_cache_;
247*ccdc9c3eSSadaf Ebrahimi   Frag rune_range_;
248*ccdc9c3eSSadaf Ebrahimi 
249*ccdc9c3eSSadaf Ebrahimi   RE2::Anchor anchor_;  // anchor mode for RE2::Set
250*ccdc9c3eSSadaf Ebrahimi 
251*ccdc9c3eSSadaf Ebrahimi   Compiler(const Compiler&) = delete;
252*ccdc9c3eSSadaf Ebrahimi   Compiler& operator=(const Compiler&) = delete;
253*ccdc9c3eSSadaf Ebrahimi };
254*ccdc9c3eSSadaf Ebrahimi 
Compiler()255*ccdc9c3eSSadaf Ebrahimi Compiler::Compiler() {
256*ccdc9c3eSSadaf Ebrahimi   prog_ = new Prog();
257*ccdc9c3eSSadaf Ebrahimi   failed_ = false;
258*ccdc9c3eSSadaf Ebrahimi   encoding_ = kEncodingUTF8;
259*ccdc9c3eSSadaf Ebrahimi   reversed_ = false;
260*ccdc9c3eSSadaf Ebrahimi   ninst_ = 0;
261*ccdc9c3eSSadaf Ebrahimi   max_ninst_ = 1;  // make AllocInst for fail instruction okay
262*ccdc9c3eSSadaf Ebrahimi   max_mem_ = 0;
263*ccdc9c3eSSadaf Ebrahimi   int fail = AllocInst(1);
264*ccdc9c3eSSadaf Ebrahimi   inst_[fail].InitFail();
265*ccdc9c3eSSadaf Ebrahimi   max_ninst_ = 0;  // Caller must change
266*ccdc9c3eSSadaf Ebrahimi }
267*ccdc9c3eSSadaf Ebrahimi 
~Compiler()268*ccdc9c3eSSadaf Ebrahimi Compiler::~Compiler() {
269*ccdc9c3eSSadaf Ebrahimi   delete prog_;
270*ccdc9c3eSSadaf Ebrahimi }
271*ccdc9c3eSSadaf Ebrahimi 
AllocInst(int n)272*ccdc9c3eSSadaf Ebrahimi int Compiler::AllocInst(int n) {
273*ccdc9c3eSSadaf Ebrahimi   if (failed_ || ninst_ + n > max_ninst_) {
274*ccdc9c3eSSadaf Ebrahimi     failed_ = true;
275*ccdc9c3eSSadaf Ebrahimi     return -1;
276*ccdc9c3eSSadaf Ebrahimi   }
277*ccdc9c3eSSadaf Ebrahimi 
278*ccdc9c3eSSadaf Ebrahimi   if (ninst_ + n > inst_.size()) {
279*ccdc9c3eSSadaf Ebrahimi     int cap = inst_.size();
280*ccdc9c3eSSadaf Ebrahimi     if (cap == 0)
281*ccdc9c3eSSadaf Ebrahimi       cap = 8;
282*ccdc9c3eSSadaf Ebrahimi     while (ninst_ + n > cap)
283*ccdc9c3eSSadaf Ebrahimi       cap *= 2;
284*ccdc9c3eSSadaf Ebrahimi     PODArray<Prog::Inst> inst(cap);
285*ccdc9c3eSSadaf Ebrahimi     if (inst_.data() != NULL)
286*ccdc9c3eSSadaf Ebrahimi       memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]);
287*ccdc9c3eSSadaf Ebrahimi     memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]);
288*ccdc9c3eSSadaf Ebrahimi     inst_ = std::move(inst);
289*ccdc9c3eSSadaf Ebrahimi   }
290*ccdc9c3eSSadaf Ebrahimi   int id = ninst_;
291*ccdc9c3eSSadaf Ebrahimi   ninst_ += n;
292*ccdc9c3eSSadaf Ebrahimi   return id;
293*ccdc9c3eSSadaf Ebrahimi }
294*ccdc9c3eSSadaf Ebrahimi 
295*ccdc9c3eSSadaf Ebrahimi // These routines are somewhat hard to visualize in text --
296*ccdc9c3eSSadaf Ebrahimi // see http://swtch.com/~rsc/regexp/regexp1.html for
297*ccdc9c3eSSadaf Ebrahimi // pictures explaining what is going on here.
298*ccdc9c3eSSadaf Ebrahimi 
299*ccdc9c3eSSadaf Ebrahimi // Returns an unmatchable fragment.
NoMatch()300*ccdc9c3eSSadaf Ebrahimi Frag Compiler::NoMatch() {
301*ccdc9c3eSSadaf Ebrahimi   return Frag(0, nullPatchList);
302*ccdc9c3eSSadaf Ebrahimi }
303*ccdc9c3eSSadaf Ebrahimi 
304*ccdc9c3eSSadaf Ebrahimi // Is a an unmatchable fragment?
IsNoMatch(Frag a)305*ccdc9c3eSSadaf Ebrahimi static bool IsNoMatch(Frag a) {
306*ccdc9c3eSSadaf Ebrahimi   return a.begin == 0;
307*ccdc9c3eSSadaf Ebrahimi }
308*ccdc9c3eSSadaf Ebrahimi 
309*ccdc9c3eSSadaf Ebrahimi // Given fragments a and b, returns fragment for ab.
Cat(Frag a,Frag b)310*ccdc9c3eSSadaf Ebrahimi Frag Compiler::Cat(Frag a, Frag b) {
311*ccdc9c3eSSadaf Ebrahimi   if (IsNoMatch(a) || IsNoMatch(b))
312*ccdc9c3eSSadaf Ebrahimi     return NoMatch();
313*ccdc9c3eSSadaf Ebrahimi 
314*ccdc9c3eSSadaf Ebrahimi   // Elide no-op.
315*ccdc9c3eSSadaf Ebrahimi   Prog::Inst* begin = &inst_[a.begin];
316*ccdc9c3eSSadaf Ebrahimi   if (begin->opcode() == kInstNop &&
317*ccdc9c3eSSadaf Ebrahimi       a.end.p == (a.begin << 1) &&
318*ccdc9c3eSSadaf Ebrahimi       begin->out() == 0) {
319*ccdc9c3eSSadaf Ebrahimi     // in case refs to a somewhere
320*ccdc9c3eSSadaf Ebrahimi     PatchList::Patch(inst_.data(), a.end, b.begin);
321*ccdc9c3eSSadaf Ebrahimi     return b;
322*ccdc9c3eSSadaf Ebrahimi   }
323*ccdc9c3eSSadaf Ebrahimi 
324*ccdc9c3eSSadaf Ebrahimi   // To run backward over string, reverse all concatenations.
325*ccdc9c3eSSadaf Ebrahimi   if (reversed_) {
326*ccdc9c3eSSadaf Ebrahimi     PatchList::Patch(inst_.data(), b.end, a.begin);
327*ccdc9c3eSSadaf Ebrahimi     return Frag(b.begin, a.end);
328*ccdc9c3eSSadaf Ebrahimi   }
329*ccdc9c3eSSadaf Ebrahimi 
330*ccdc9c3eSSadaf Ebrahimi   PatchList::Patch(inst_.data(), a.end, b.begin);
331*ccdc9c3eSSadaf Ebrahimi   return Frag(a.begin, b.end);
332*ccdc9c3eSSadaf Ebrahimi }
333*ccdc9c3eSSadaf Ebrahimi 
334*ccdc9c3eSSadaf Ebrahimi // Given fragments for a and b, returns fragment for a|b.
Alt(Frag a,Frag b)335*ccdc9c3eSSadaf Ebrahimi Frag Compiler::Alt(Frag a, Frag b) {
336*ccdc9c3eSSadaf Ebrahimi   // Special case for convenience in loops.
337*ccdc9c3eSSadaf Ebrahimi   if (IsNoMatch(a))
338*ccdc9c3eSSadaf Ebrahimi     return b;
339*ccdc9c3eSSadaf Ebrahimi   if (IsNoMatch(b))
340*ccdc9c3eSSadaf Ebrahimi     return a;
341*ccdc9c3eSSadaf Ebrahimi 
342*ccdc9c3eSSadaf Ebrahimi   int id = AllocInst(1);
343*ccdc9c3eSSadaf Ebrahimi   if (id < 0)
344*ccdc9c3eSSadaf Ebrahimi     return NoMatch();
345*ccdc9c3eSSadaf Ebrahimi 
346*ccdc9c3eSSadaf Ebrahimi   inst_[id].InitAlt(a.begin, b.begin);
347*ccdc9c3eSSadaf Ebrahimi   return Frag(id, PatchList::Append(inst_.data(), a.end, b.end));
348*ccdc9c3eSSadaf Ebrahimi }
349*ccdc9c3eSSadaf Ebrahimi 
350*ccdc9c3eSSadaf Ebrahimi // When capturing submatches in like-Perl mode, a kOpAlt Inst
351*ccdc9c3eSSadaf Ebrahimi // treats out_ as the first choice, out1_ as the second.
352*ccdc9c3eSSadaf Ebrahimi //
353*ccdc9c3eSSadaf Ebrahimi // For *, +, and ?, if out_ causes another repetition,
354*ccdc9c3eSSadaf Ebrahimi // then the operator is greedy.  If out1_ is the repetition
355*ccdc9c3eSSadaf Ebrahimi // (and out_ moves forward), then the operator is non-greedy.
356*ccdc9c3eSSadaf Ebrahimi 
357*ccdc9c3eSSadaf Ebrahimi // Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
Star(Frag a,bool nongreedy)358*ccdc9c3eSSadaf Ebrahimi Frag Compiler::Star(Frag a, bool nongreedy) {
359*ccdc9c3eSSadaf Ebrahimi   int id = AllocInst(1);
360*ccdc9c3eSSadaf Ebrahimi   if (id < 0)
361*ccdc9c3eSSadaf Ebrahimi     return NoMatch();
362*ccdc9c3eSSadaf Ebrahimi   inst_[id].InitAlt(0, 0);
363*ccdc9c3eSSadaf Ebrahimi   PatchList::Patch(inst_.data(), a.end, id);
364*ccdc9c3eSSadaf Ebrahimi   if (nongreedy) {
365*ccdc9c3eSSadaf Ebrahimi     inst_[id].out1_ = a.begin;
366*ccdc9c3eSSadaf Ebrahimi     return Frag(id, PatchList::Mk(id << 1));
367*ccdc9c3eSSadaf Ebrahimi   } else {
368*ccdc9c3eSSadaf Ebrahimi     inst_[id].set_out(a.begin);
369*ccdc9c3eSSadaf Ebrahimi     return Frag(id, PatchList::Mk((id << 1) | 1));
370*ccdc9c3eSSadaf Ebrahimi   }
371*ccdc9c3eSSadaf Ebrahimi }
372*ccdc9c3eSSadaf Ebrahimi 
373*ccdc9c3eSSadaf Ebrahimi // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
Plus(Frag a,bool nongreedy)374*ccdc9c3eSSadaf Ebrahimi Frag Compiler::Plus(Frag a, bool nongreedy) {
375*ccdc9c3eSSadaf Ebrahimi   // a+ is just a* with a different entry point.
376*ccdc9c3eSSadaf Ebrahimi   Frag f = Star(a, nongreedy);
377*ccdc9c3eSSadaf Ebrahimi   return Frag(a.begin, f.end);
378*ccdc9c3eSSadaf Ebrahimi }
379*ccdc9c3eSSadaf Ebrahimi 
380*ccdc9c3eSSadaf Ebrahimi // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
Quest(Frag a,bool nongreedy)381*ccdc9c3eSSadaf Ebrahimi Frag Compiler::Quest(Frag a, bool nongreedy) {
382*ccdc9c3eSSadaf Ebrahimi   if (IsNoMatch(a))
383*ccdc9c3eSSadaf Ebrahimi     return Nop();
384*ccdc9c3eSSadaf Ebrahimi   int id = AllocInst(1);
385*ccdc9c3eSSadaf Ebrahimi   if (id < 0)
386*ccdc9c3eSSadaf Ebrahimi     return NoMatch();
387*ccdc9c3eSSadaf Ebrahimi   PatchList pl;
388*ccdc9c3eSSadaf Ebrahimi   if (nongreedy) {
389*ccdc9c3eSSadaf Ebrahimi     inst_[id].InitAlt(0, a.begin);
390*ccdc9c3eSSadaf Ebrahimi     pl = PatchList::Mk(id << 1);
391*ccdc9c3eSSadaf Ebrahimi   } else {
392*ccdc9c3eSSadaf Ebrahimi     inst_[id].InitAlt(a.begin, 0);
393*ccdc9c3eSSadaf Ebrahimi     pl = PatchList::Mk((id << 1) | 1);
394*ccdc9c3eSSadaf Ebrahimi   }
395*ccdc9c3eSSadaf Ebrahimi   return Frag(id, PatchList::Append(inst_.data(), pl, a.end));
396*ccdc9c3eSSadaf Ebrahimi }
397*ccdc9c3eSSadaf Ebrahimi 
398*ccdc9c3eSSadaf Ebrahimi // Returns a fragment for the byte range lo-hi.
ByteRange(int lo,int hi,bool foldcase)399*ccdc9c3eSSadaf Ebrahimi Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
400*ccdc9c3eSSadaf Ebrahimi   int id = AllocInst(1);
401*ccdc9c3eSSadaf Ebrahimi   if (id < 0)
402*ccdc9c3eSSadaf Ebrahimi     return NoMatch();
403*ccdc9c3eSSadaf Ebrahimi   inst_[id].InitByteRange(lo, hi, foldcase, 0);
404*ccdc9c3eSSadaf Ebrahimi   return Frag(id, PatchList::Mk(id << 1));
405*ccdc9c3eSSadaf Ebrahimi }
406*ccdc9c3eSSadaf Ebrahimi 
407*ccdc9c3eSSadaf Ebrahimi // Returns a no-op fragment.  Sometimes unavoidable.
Nop()408*ccdc9c3eSSadaf Ebrahimi Frag Compiler::Nop() {
409*ccdc9c3eSSadaf Ebrahimi   int id = AllocInst(1);
410*ccdc9c3eSSadaf Ebrahimi   if (id < 0)
411*ccdc9c3eSSadaf Ebrahimi     return NoMatch();
412*ccdc9c3eSSadaf Ebrahimi   inst_[id].InitNop(0);
413*ccdc9c3eSSadaf Ebrahimi   return Frag(id, PatchList::Mk(id << 1));
414*ccdc9c3eSSadaf Ebrahimi }
415*ccdc9c3eSSadaf Ebrahimi 
416*ccdc9c3eSSadaf Ebrahimi // Returns a fragment that signals a match.
Match(int32_t match_id)417*ccdc9c3eSSadaf Ebrahimi Frag Compiler::Match(int32_t match_id) {
418*ccdc9c3eSSadaf Ebrahimi   int id = AllocInst(1);
419*ccdc9c3eSSadaf Ebrahimi   if (id < 0)
420*ccdc9c3eSSadaf Ebrahimi     return NoMatch();
421*ccdc9c3eSSadaf Ebrahimi   inst_[id].InitMatch(match_id);
422*ccdc9c3eSSadaf Ebrahimi   return Frag(id, nullPatchList);
423*ccdc9c3eSSadaf Ebrahimi }
424*ccdc9c3eSSadaf Ebrahimi 
425*ccdc9c3eSSadaf Ebrahimi // Returns a fragment matching a particular empty-width op (like ^ or $)
EmptyWidth(EmptyOp empty)426*ccdc9c3eSSadaf Ebrahimi Frag Compiler::EmptyWidth(EmptyOp empty) {
427*ccdc9c3eSSadaf Ebrahimi   int id = AllocInst(1);
428*ccdc9c3eSSadaf Ebrahimi   if (id < 0)
429*ccdc9c3eSSadaf Ebrahimi     return NoMatch();
430*ccdc9c3eSSadaf Ebrahimi   inst_[id].InitEmptyWidth(empty, 0);
431*ccdc9c3eSSadaf Ebrahimi   return Frag(id, PatchList::Mk(id << 1));
432*ccdc9c3eSSadaf Ebrahimi }
433*ccdc9c3eSSadaf Ebrahimi 
434*ccdc9c3eSSadaf Ebrahimi // Given a fragment a, returns a fragment with capturing parens around a.
Capture(Frag a,int n)435*ccdc9c3eSSadaf Ebrahimi Frag Compiler::Capture(Frag a, int n) {
436*ccdc9c3eSSadaf Ebrahimi   if (IsNoMatch(a))
437*ccdc9c3eSSadaf Ebrahimi     return NoMatch();
438*ccdc9c3eSSadaf Ebrahimi   int id = AllocInst(2);
439*ccdc9c3eSSadaf Ebrahimi   if (id < 0)
440*ccdc9c3eSSadaf Ebrahimi     return NoMatch();
441*ccdc9c3eSSadaf Ebrahimi   inst_[id].InitCapture(2*n, a.begin);
442*ccdc9c3eSSadaf Ebrahimi   inst_[id+1].InitCapture(2*n+1, 0);
443*ccdc9c3eSSadaf Ebrahimi   PatchList::Patch(inst_.data(), a.end, id+1);
444*ccdc9c3eSSadaf Ebrahimi 
445*ccdc9c3eSSadaf Ebrahimi   return Frag(id, PatchList::Mk((id+1) << 1));
446*ccdc9c3eSSadaf Ebrahimi }
447*ccdc9c3eSSadaf Ebrahimi 
448*ccdc9c3eSSadaf Ebrahimi // A Rune is a name for a Unicode code point.
449*ccdc9c3eSSadaf Ebrahimi // Returns maximum rune encoded by UTF-8 sequence of length len.
MaxRune(int len)450*ccdc9c3eSSadaf Ebrahimi static int MaxRune(int len) {
451*ccdc9c3eSSadaf Ebrahimi   int b;  // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
452*ccdc9c3eSSadaf Ebrahimi   if (len == 1)
453*ccdc9c3eSSadaf Ebrahimi     b = 7;
454*ccdc9c3eSSadaf Ebrahimi   else
455*ccdc9c3eSSadaf Ebrahimi     b = 8-(len+1) + 6*(len-1);
456*ccdc9c3eSSadaf Ebrahimi   return (1<<b) - 1;   // maximum Rune for b bits.
457*ccdc9c3eSSadaf Ebrahimi }
458*ccdc9c3eSSadaf Ebrahimi 
459*ccdc9c3eSSadaf Ebrahimi // The rune range compiler caches common suffix fragments,
460*ccdc9c3eSSadaf Ebrahimi // which are very common in UTF-8 (e.g., [80-bf]).
461*ccdc9c3eSSadaf Ebrahimi // The fragment suffixes are identified by their start
462*ccdc9c3eSSadaf Ebrahimi // instructions.  NULL denotes the eventual end match.
463*ccdc9c3eSSadaf Ebrahimi // The Frag accumulates in rune_range_.  Caching common
464*ccdc9c3eSSadaf Ebrahimi // suffixes reduces the UTF-8 "." from 32 to 24 instructions,
465*ccdc9c3eSSadaf Ebrahimi // and it reduces the corresponding one-pass NFA from 16 nodes to 8.
466*ccdc9c3eSSadaf Ebrahimi 
BeginRange()467*ccdc9c3eSSadaf Ebrahimi void Compiler::BeginRange() {
468*ccdc9c3eSSadaf Ebrahimi   rune_cache_.clear();
469*ccdc9c3eSSadaf Ebrahimi   rune_range_.begin = 0;
470*ccdc9c3eSSadaf Ebrahimi   rune_range_.end = nullPatchList;
471*ccdc9c3eSSadaf Ebrahimi }
472*ccdc9c3eSSadaf Ebrahimi 
UncachedRuneByteSuffix(uint8_t lo,uint8_t hi,bool foldcase,int next)473*ccdc9c3eSSadaf Ebrahimi int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
474*ccdc9c3eSSadaf Ebrahimi                                      int next) {
475*ccdc9c3eSSadaf Ebrahimi   Frag f = ByteRange(lo, hi, foldcase);
476*ccdc9c3eSSadaf Ebrahimi   if (next != 0) {
477*ccdc9c3eSSadaf Ebrahimi     PatchList::Patch(inst_.data(), f.end, next);
478*ccdc9c3eSSadaf Ebrahimi   } else {
479*ccdc9c3eSSadaf Ebrahimi     rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end);
480*ccdc9c3eSSadaf Ebrahimi   }
481*ccdc9c3eSSadaf Ebrahimi   return f.begin;
482*ccdc9c3eSSadaf Ebrahimi }
483*ccdc9c3eSSadaf Ebrahimi 
MakeRuneCacheKey(uint8_t lo,uint8_t hi,bool foldcase,int next)484*ccdc9c3eSSadaf Ebrahimi static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
485*ccdc9c3eSSadaf Ebrahimi                                  int next) {
486*ccdc9c3eSSadaf Ebrahimi   return (uint64_t)next << 17 |
487*ccdc9c3eSSadaf Ebrahimi          (uint64_t)lo   <<  9 |
488*ccdc9c3eSSadaf Ebrahimi          (uint64_t)hi   <<  1 |
489*ccdc9c3eSSadaf Ebrahimi          (uint64_t)foldcase;
490*ccdc9c3eSSadaf Ebrahimi }
491*ccdc9c3eSSadaf Ebrahimi 
CachedRuneByteSuffix(uint8_t lo,uint8_t hi,bool foldcase,int next)492*ccdc9c3eSSadaf Ebrahimi int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
493*ccdc9c3eSSadaf Ebrahimi                                    int next) {
494*ccdc9c3eSSadaf Ebrahimi   uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
495*ccdc9c3eSSadaf Ebrahimi   std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
496*ccdc9c3eSSadaf Ebrahimi   if (it != rune_cache_.end())
497*ccdc9c3eSSadaf Ebrahimi     return it->second;
498*ccdc9c3eSSadaf Ebrahimi   int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
499*ccdc9c3eSSadaf Ebrahimi   rune_cache_[key] = id;
500*ccdc9c3eSSadaf Ebrahimi   return id;
501*ccdc9c3eSSadaf Ebrahimi }
502*ccdc9c3eSSadaf Ebrahimi 
IsCachedRuneByteSuffix(int id)503*ccdc9c3eSSadaf Ebrahimi bool Compiler::IsCachedRuneByteSuffix(int id) {
504*ccdc9c3eSSadaf Ebrahimi   uint8_t lo = inst_[id].lo_;
505*ccdc9c3eSSadaf Ebrahimi   uint8_t hi = inst_[id].hi_;
506*ccdc9c3eSSadaf Ebrahimi   bool foldcase = inst_[id].foldcase() != 0;
507*ccdc9c3eSSadaf Ebrahimi   int next = inst_[id].out();
508*ccdc9c3eSSadaf Ebrahimi 
509*ccdc9c3eSSadaf Ebrahimi   uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
510*ccdc9c3eSSadaf Ebrahimi   return rune_cache_.find(key) != rune_cache_.end();
511*ccdc9c3eSSadaf Ebrahimi }
512*ccdc9c3eSSadaf Ebrahimi 
AddSuffix(int id)513*ccdc9c3eSSadaf Ebrahimi void Compiler::AddSuffix(int id) {
514*ccdc9c3eSSadaf Ebrahimi   if (failed_)
515*ccdc9c3eSSadaf Ebrahimi     return;
516*ccdc9c3eSSadaf Ebrahimi 
517*ccdc9c3eSSadaf Ebrahimi   if (rune_range_.begin == 0) {
518*ccdc9c3eSSadaf Ebrahimi     rune_range_.begin = id;
519*ccdc9c3eSSadaf Ebrahimi     return;
520*ccdc9c3eSSadaf Ebrahimi   }
521*ccdc9c3eSSadaf Ebrahimi 
522*ccdc9c3eSSadaf Ebrahimi   if (encoding_ == kEncodingUTF8) {
523*ccdc9c3eSSadaf Ebrahimi     // Build a trie in order to reduce fanout.
524*ccdc9c3eSSadaf Ebrahimi     rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
525*ccdc9c3eSSadaf Ebrahimi     return;
526*ccdc9c3eSSadaf Ebrahimi   }
527*ccdc9c3eSSadaf Ebrahimi 
528*ccdc9c3eSSadaf Ebrahimi   int alt = AllocInst(1);
529*ccdc9c3eSSadaf Ebrahimi   if (alt < 0) {
530*ccdc9c3eSSadaf Ebrahimi     rune_range_.begin = 0;
531*ccdc9c3eSSadaf Ebrahimi     return;
532*ccdc9c3eSSadaf Ebrahimi   }
533*ccdc9c3eSSadaf Ebrahimi   inst_[alt].InitAlt(rune_range_.begin, id);
534*ccdc9c3eSSadaf Ebrahimi   rune_range_.begin = alt;
535*ccdc9c3eSSadaf Ebrahimi }
536*ccdc9c3eSSadaf Ebrahimi 
AddSuffixRecursive(int root,int id)537*ccdc9c3eSSadaf Ebrahimi int Compiler::AddSuffixRecursive(int root, int id) {
538*ccdc9c3eSSadaf Ebrahimi   DCHECK(inst_[root].opcode() == kInstAlt ||
539*ccdc9c3eSSadaf Ebrahimi          inst_[root].opcode() == kInstByteRange);
540*ccdc9c3eSSadaf Ebrahimi 
541*ccdc9c3eSSadaf Ebrahimi   Frag f = FindByteRange(root, id);
542*ccdc9c3eSSadaf Ebrahimi   if (IsNoMatch(f)) {
543*ccdc9c3eSSadaf Ebrahimi     int alt = AllocInst(1);
544*ccdc9c3eSSadaf Ebrahimi     if (alt < 0)
545*ccdc9c3eSSadaf Ebrahimi       return 0;
546*ccdc9c3eSSadaf Ebrahimi     inst_[alt].InitAlt(root, id);
547*ccdc9c3eSSadaf Ebrahimi     return alt;
548*ccdc9c3eSSadaf Ebrahimi   }
549*ccdc9c3eSSadaf Ebrahimi 
550*ccdc9c3eSSadaf Ebrahimi   int br;
551*ccdc9c3eSSadaf Ebrahimi   if (f.end.p == 0)
552*ccdc9c3eSSadaf Ebrahimi     br = root;
553*ccdc9c3eSSadaf Ebrahimi   else if (f.end.p&1)
554*ccdc9c3eSSadaf Ebrahimi     br = inst_[f.begin].out1();
555*ccdc9c3eSSadaf Ebrahimi   else
556*ccdc9c3eSSadaf Ebrahimi     br = inst_[f.begin].out();
557*ccdc9c3eSSadaf Ebrahimi 
558*ccdc9c3eSSadaf Ebrahimi   if (IsCachedRuneByteSuffix(br)) {
559*ccdc9c3eSSadaf Ebrahimi     // We can't fiddle with cached suffixes, so make a clone of the head.
560*ccdc9c3eSSadaf Ebrahimi     int byterange = AllocInst(1);
561*ccdc9c3eSSadaf Ebrahimi     if (byterange < 0)
562*ccdc9c3eSSadaf Ebrahimi       return 0;
563*ccdc9c3eSSadaf Ebrahimi     inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
564*ccdc9c3eSSadaf Ebrahimi                                    inst_[br].foldcase(), inst_[br].out());
565*ccdc9c3eSSadaf Ebrahimi 
566*ccdc9c3eSSadaf Ebrahimi     // Ensure that the parent points to the clone, not to the original.
567*ccdc9c3eSSadaf Ebrahimi     // Note that this could leave the head unreachable except via the cache.
568*ccdc9c3eSSadaf Ebrahimi     br = byterange;
569*ccdc9c3eSSadaf Ebrahimi     if (f.end.p == 0)
570*ccdc9c3eSSadaf Ebrahimi       root = br;
571*ccdc9c3eSSadaf Ebrahimi     else if (f.end.p&1)
572*ccdc9c3eSSadaf Ebrahimi       inst_[f.begin].out1_ = br;
573*ccdc9c3eSSadaf Ebrahimi     else
574*ccdc9c3eSSadaf Ebrahimi       inst_[f.begin].set_out(br);
575*ccdc9c3eSSadaf Ebrahimi   }
576*ccdc9c3eSSadaf Ebrahimi 
577*ccdc9c3eSSadaf Ebrahimi   int out = inst_[id].out();
578*ccdc9c3eSSadaf Ebrahimi   if (!IsCachedRuneByteSuffix(id)) {
579*ccdc9c3eSSadaf Ebrahimi     // The head should be the instruction most recently allocated, so free it
580*ccdc9c3eSSadaf Ebrahimi     // instead of leaving it unreachable.
581*ccdc9c3eSSadaf Ebrahimi     DCHECK_EQ(id, ninst_-1);
582*ccdc9c3eSSadaf Ebrahimi     inst_[id].out_opcode_ = 0;
583*ccdc9c3eSSadaf Ebrahimi     inst_[id].out1_ = 0;
584*ccdc9c3eSSadaf Ebrahimi     ninst_--;
585*ccdc9c3eSSadaf Ebrahimi   }
586*ccdc9c3eSSadaf Ebrahimi 
587*ccdc9c3eSSadaf Ebrahimi   out = AddSuffixRecursive(inst_[br].out(), out);
588*ccdc9c3eSSadaf Ebrahimi   if (out == 0)
589*ccdc9c3eSSadaf Ebrahimi     return 0;
590*ccdc9c3eSSadaf Ebrahimi 
591*ccdc9c3eSSadaf Ebrahimi   inst_[br].set_out(out);
592*ccdc9c3eSSadaf Ebrahimi   return root;
593*ccdc9c3eSSadaf Ebrahimi }
594*ccdc9c3eSSadaf Ebrahimi 
ByteRangeEqual(int id1,int id2)595*ccdc9c3eSSadaf Ebrahimi bool Compiler::ByteRangeEqual(int id1, int id2) {
596*ccdc9c3eSSadaf Ebrahimi   return inst_[id1].lo() == inst_[id2].lo() &&
597*ccdc9c3eSSadaf Ebrahimi          inst_[id1].hi() == inst_[id2].hi() &&
598*ccdc9c3eSSadaf Ebrahimi          inst_[id1].foldcase() == inst_[id2].foldcase();
599*ccdc9c3eSSadaf Ebrahimi }
600*ccdc9c3eSSadaf Ebrahimi 
FindByteRange(int root,int id)601*ccdc9c3eSSadaf Ebrahimi Frag Compiler::FindByteRange(int root, int id) {
602*ccdc9c3eSSadaf Ebrahimi   if (inst_[root].opcode() == kInstByteRange) {
603*ccdc9c3eSSadaf Ebrahimi     if (ByteRangeEqual(root, id))
604*ccdc9c3eSSadaf Ebrahimi       return Frag(root, nullPatchList);
605*ccdc9c3eSSadaf Ebrahimi     else
606*ccdc9c3eSSadaf Ebrahimi       return NoMatch();
607*ccdc9c3eSSadaf Ebrahimi   }
608*ccdc9c3eSSadaf Ebrahimi 
609*ccdc9c3eSSadaf Ebrahimi   while (inst_[root].opcode() == kInstAlt) {
610*ccdc9c3eSSadaf Ebrahimi     int out1 = inst_[root].out1();
611*ccdc9c3eSSadaf Ebrahimi     if (ByteRangeEqual(out1, id))
612*ccdc9c3eSSadaf Ebrahimi       return Frag(root, PatchList::Mk((root << 1) | 1));
613*ccdc9c3eSSadaf Ebrahimi 
614*ccdc9c3eSSadaf Ebrahimi     // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
615*ccdc9c3eSSadaf Ebrahimi     // what we're looking for, then we can stop immediately. Unfortunately, we
616*ccdc9c3eSSadaf Ebrahimi     // can't short-circuit the search in reverse mode.
617*ccdc9c3eSSadaf Ebrahimi     if (!reversed_)
618*ccdc9c3eSSadaf Ebrahimi       return NoMatch();
619*ccdc9c3eSSadaf Ebrahimi 
620*ccdc9c3eSSadaf Ebrahimi     int out = inst_[root].out();
621*ccdc9c3eSSadaf Ebrahimi     if (inst_[out].opcode() == kInstAlt)
622*ccdc9c3eSSadaf Ebrahimi       root = out;
623*ccdc9c3eSSadaf Ebrahimi     else if (ByteRangeEqual(out, id))
624*ccdc9c3eSSadaf Ebrahimi       return Frag(root, PatchList::Mk(root << 1));
625*ccdc9c3eSSadaf Ebrahimi     else
626*ccdc9c3eSSadaf Ebrahimi       return NoMatch();
627*ccdc9c3eSSadaf Ebrahimi   }
628*ccdc9c3eSSadaf Ebrahimi 
629*ccdc9c3eSSadaf Ebrahimi   LOG(DFATAL) << "should never happen";
630*ccdc9c3eSSadaf Ebrahimi   return NoMatch();
631*ccdc9c3eSSadaf Ebrahimi }
632*ccdc9c3eSSadaf Ebrahimi 
EndRange()633*ccdc9c3eSSadaf Ebrahimi Frag Compiler::EndRange() {
634*ccdc9c3eSSadaf Ebrahimi   return rune_range_;
635*ccdc9c3eSSadaf Ebrahimi }
636*ccdc9c3eSSadaf Ebrahimi 
637*ccdc9c3eSSadaf Ebrahimi // Converts rune range lo-hi into a fragment that recognizes
638*ccdc9c3eSSadaf Ebrahimi // the bytes that would make up those runes in the current
639*ccdc9c3eSSadaf Ebrahimi // encoding (Latin 1 or UTF-8).
640*ccdc9c3eSSadaf Ebrahimi // This lets the machine work byte-by-byte even when
641*ccdc9c3eSSadaf Ebrahimi // using multibyte encodings.
642*ccdc9c3eSSadaf Ebrahimi 
AddRuneRange(Rune lo,Rune hi,bool foldcase)643*ccdc9c3eSSadaf Ebrahimi void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
644*ccdc9c3eSSadaf Ebrahimi   switch (encoding_) {
645*ccdc9c3eSSadaf Ebrahimi     default:
646*ccdc9c3eSSadaf Ebrahimi     case kEncodingUTF8:
647*ccdc9c3eSSadaf Ebrahimi       AddRuneRangeUTF8(lo, hi, foldcase);
648*ccdc9c3eSSadaf Ebrahimi       break;
649*ccdc9c3eSSadaf Ebrahimi     case kEncodingLatin1:
650*ccdc9c3eSSadaf Ebrahimi       AddRuneRangeLatin1(lo, hi, foldcase);
651*ccdc9c3eSSadaf Ebrahimi       break;
652*ccdc9c3eSSadaf Ebrahimi   }
653*ccdc9c3eSSadaf Ebrahimi }
654*ccdc9c3eSSadaf Ebrahimi 
AddRuneRangeLatin1(Rune lo,Rune hi,bool foldcase)655*ccdc9c3eSSadaf Ebrahimi void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
656*ccdc9c3eSSadaf Ebrahimi   // Latin-1 is easy: runes *are* bytes.
657*ccdc9c3eSSadaf Ebrahimi   if (lo > hi || lo > 0xFF)
658*ccdc9c3eSSadaf Ebrahimi     return;
659*ccdc9c3eSSadaf Ebrahimi   if (hi > 0xFF)
660*ccdc9c3eSSadaf Ebrahimi     hi = 0xFF;
661*ccdc9c3eSSadaf Ebrahimi   AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
662*ccdc9c3eSSadaf Ebrahimi                                    static_cast<uint8_t>(hi), foldcase, 0));
663*ccdc9c3eSSadaf Ebrahimi }
664*ccdc9c3eSSadaf Ebrahimi 
665*ccdc9c3eSSadaf Ebrahimi // Table describing how to make a UTF-8 matching machine
666*ccdc9c3eSSadaf Ebrahimi // for the rune range 80-10FFFF (Runeself-Runemax).
667*ccdc9c3eSSadaf Ebrahimi // This range happens frequently enough (for example /./ and /[^a-z]/)
668*ccdc9c3eSSadaf Ebrahimi // and the rune_cache_ map is slow enough that this is worth
669*ccdc9c3eSSadaf Ebrahimi // special handling.  Makes compilation of a small expression
670*ccdc9c3eSSadaf Ebrahimi // with a dot in it about 10% faster.
671*ccdc9c3eSSadaf Ebrahimi // The * in the comments below mark whole sequences.
672*ccdc9c3eSSadaf Ebrahimi static struct ByteRangeProg {
673*ccdc9c3eSSadaf Ebrahimi   int next;
674*ccdc9c3eSSadaf Ebrahimi   int lo;
675*ccdc9c3eSSadaf Ebrahimi   int hi;
676*ccdc9c3eSSadaf Ebrahimi } prog_80_10ffff[] = {
677*ccdc9c3eSSadaf Ebrahimi   // Two-byte
678*ccdc9c3eSSadaf Ebrahimi   { -1, 0x80, 0xBF, },  // 0:  80-BF
679*ccdc9c3eSSadaf Ebrahimi   {  0, 0xC2, 0xDF, },  // 1:  C2-DF 80-BF*
680*ccdc9c3eSSadaf Ebrahimi 
681*ccdc9c3eSSadaf Ebrahimi   // Three-byte
682*ccdc9c3eSSadaf Ebrahimi   {  0, 0xA0, 0xBF, },  // 2:  A0-BF 80-BF
683*ccdc9c3eSSadaf Ebrahimi   {  2, 0xE0, 0xE0, },  // 3:  E0 A0-BF 80-BF*
684*ccdc9c3eSSadaf Ebrahimi   {  0, 0x80, 0xBF, },  // 4:  80-BF 80-BF
685*ccdc9c3eSSadaf Ebrahimi   {  4, 0xE1, 0xEF, },  // 5:  E1-EF 80-BF 80-BF*
686*ccdc9c3eSSadaf Ebrahimi 
687*ccdc9c3eSSadaf Ebrahimi   // Four-byte
688*ccdc9c3eSSadaf Ebrahimi   {  4, 0x90, 0xBF, },  // 6:  90-BF 80-BF 80-BF
689*ccdc9c3eSSadaf Ebrahimi   {  6, 0xF0, 0xF0, },  // 7:  F0 90-BF 80-BF 80-BF*
690*ccdc9c3eSSadaf Ebrahimi   {  4, 0x80, 0xBF, },  // 8:  80-BF 80-BF 80-BF
691*ccdc9c3eSSadaf Ebrahimi   {  8, 0xF1, 0xF3, },  // 9: F1-F3 80-BF 80-BF 80-BF*
692*ccdc9c3eSSadaf Ebrahimi   {  4, 0x80, 0x8F, },  // 10: 80-8F 80-BF 80-BF
693*ccdc9c3eSSadaf Ebrahimi   { 10, 0xF4, 0xF4, },  // 11: F4 80-8F 80-BF 80-BF*
694*ccdc9c3eSSadaf Ebrahimi };
695*ccdc9c3eSSadaf Ebrahimi 
Add_80_10ffff()696*ccdc9c3eSSadaf Ebrahimi void Compiler::Add_80_10ffff() {
697*ccdc9c3eSSadaf Ebrahimi   int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning
698*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < arraysize(prog_80_10ffff); i++) {
699*ccdc9c3eSSadaf Ebrahimi     const ByteRangeProg& p = prog_80_10ffff[i];
700*ccdc9c3eSSadaf Ebrahimi     int next = 0;
701*ccdc9c3eSSadaf Ebrahimi     if (p.next >= 0)
702*ccdc9c3eSSadaf Ebrahimi       next = inst[p.next];
703*ccdc9c3eSSadaf Ebrahimi     inst[i] = UncachedRuneByteSuffix(static_cast<uint8_t>(p.lo),
704*ccdc9c3eSSadaf Ebrahimi                                      static_cast<uint8_t>(p.hi), false, next);
705*ccdc9c3eSSadaf Ebrahimi     if ((p.lo & 0xC0) != 0x80)
706*ccdc9c3eSSadaf Ebrahimi       AddSuffix(inst[i]);
707*ccdc9c3eSSadaf Ebrahimi   }
708*ccdc9c3eSSadaf Ebrahimi }
709*ccdc9c3eSSadaf Ebrahimi 
AddRuneRangeUTF8(Rune lo,Rune hi,bool foldcase)710*ccdc9c3eSSadaf Ebrahimi void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
711*ccdc9c3eSSadaf Ebrahimi   if (lo > hi)
712*ccdc9c3eSSadaf Ebrahimi     return;
713*ccdc9c3eSSadaf Ebrahimi 
714*ccdc9c3eSSadaf Ebrahimi   // Pick off 80-10FFFF as a common special case
715*ccdc9c3eSSadaf Ebrahimi   // that can bypass the slow rune_cache_.
716*ccdc9c3eSSadaf Ebrahimi   if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
717*ccdc9c3eSSadaf Ebrahimi     Add_80_10ffff();
718*ccdc9c3eSSadaf Ebrahimi     return;
719*ccdc9c3eSSadaf Ebrahimi   }
720*ccdc9c3eSSadaf Ebrahimi 
721*ccdc9c3eSSadaf Ebrahimi   // Split range into same-length sized ranges.
722*ccdc9c3eSSadaf Ebrahimi   for (int i = 1; i < UTFmax; i++) {
723*ccdc9c3eSSadaf Ebrahimi     Rune max = MaxRune(i);
724*ccdc9c3eSSadaf Ebrahimi     if (lo <= max && max < hi) {
725*ccdc9c3eSSadaf Ebrahimi       AddRuneRangeUTF8(lo, max, foldcase);
726*ccdc9c3eSSadaf Ebrahimi       AddRuneRangeUTF8(max+1, hi, foldcase);
727*ccdc9c3eSSadaf Ebrahimi       return;
728*ccdc9c3eSSadaf Ebrahimi     }
729*ccdc9c3eSSadaf Ebrahimi   }
730*ccdc9c3eSSadaf Ebrahimi 
731*ccdc9c3eSSadaf Ebrahimi   // ASCII range is always a special case.
732*ccdc9c3eSSadaf Ebrahimi   if (hi < Runeself) {
733*ccdc9c3eSSadaf Ebrahimi     AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
734*ccdc9c3eSSadaf Ebrahimi                                      static_cast<uint8_t>(hi), foldcase, 0));
735*ccdc9c3eSSadaf Ebrahimi     return;
736*ccdc9c3eSSadaf Ebrahimi   }
737*ccdc9c3eSSadaf Ebrahimi 
738*ccdc9c3eSSadaf Ebrahimi   // Split range into sections that agree on leading bytes.
739*ccdc9c3eSSadaf Ebrahimi   for (int i = 1; i < UTFmax; i++) {
740*ccdc9c3eSSadaf Ebrahimi     uint32_t m = (1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
741*ccdc9c3eSSadaf Ebrahimi     if ((lo & ~m) != (hi & ~m)) {
742*ccdc9c3eSSadaf Ebrahimi       if ((lo & m) != 0) {
743*ccdc9c3eSSadaf Ebrahimi         AddRuneRangeUTF8(lo, lo|m, foldcase);
744*ccdc9c3eSSadaf Ebrahimi         AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
745*ccdc9c3eSSadaf Ebrahimi         return;
746*ccdc9c3eSSadaf Ebrahimi       }
747*ccdc9c3eSSadaf Ebrahimi       if ((hi & m) != m) {
748*ccdc9c3eSSadaf Ebrahimi         AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
749*ccdc9c3eSSadaf Ebrahimi         AddRuneRangeUTF8(hi&~m, hi, foldcase);
750*ccdc9c3eSSadaf Ebrahimi         return;
751*ccdc9c3eSSadaf Ebrahimi       }
752*ccdc9c3eSSadaf Ebrahimi     }
753*ccdc9c3eSSadaf Ebrahimi   }
754*ccdc9c3eSSadaf Ebrahimi 
755*ccdc9c3eSSadaf Ebrahimi   // Finally.  Generate byte matching equivalent for lo-hi.
756*ccdc9c3eSSadaf Ebrahimi   uint8_t ulo[UTFmax], uhi[UTFmax];
757*ccdc9c3eSSadaf Ebrahimi   int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
758*ccdc9c3eSSadaf Ebrahimi   int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
759*ccdc9c3eSSadaf Ebrahimi   (void)m;  // USED(m)
760*ccdc9c3eSSadaf Ebrahimi   DCHECK_EQ(n, m);
761*ccdc9c3eSSadaf Ebrahimi 
762*ccdc9c3eSSadaf Ebrahimi   // The logic below encodes this thinking:
763*ccdc9c3eSSadaf Ebrahimi   //
764*ccdc9c3eSSadaf Ebrahimi   // 1. When we have built the whole suffix, we know that it cannot
765*ccdc9c3eSSadaf Ebrahimi   // possibly be a suffix of anything longer: in forward mode, nothing
766*ccdc9c3eSSadaf Ebrahimi   // else can occur before the leading byte; in reverse mode, nothing
767*ccdc9c3eSSadaf Ebrahimi   // else can occur after the last continuation byte or else the leading
768*ccdc9c3eSSadaf Ebrahimi   // byte would have to change. Thus, there is no benefit to caching
769*ccdc9c3eSSadaf Ebrahimi   // the first byte of the suffix whereas there is a cost involved in
770*ccdc9c3eSSadaf Ebrahimi   // cloning it if it begins a common prefix, which is fairly likely.
771*ccdc9c3eSSadaf Ebrahimi   //
772*ccdc9c3eSSadaf Ebrahimi   // 2. Conversely, the last byte of the suffix cannot possibly be a
773*ccdc9c3eSSadaf Ebrahimi   // prefix of anything because next == 0, so we will never want to
774*ccdc9c3eSSadaf Ebrahimi   // clone it, but it is fairly likely to be a common suffix. Perhaps
775*ccdc9c3eSSadaf Ebrahimi   // more so in reverse mode than in forward mode because the former is
776*ccdc9c3eSSadaf Ebrahimi   // "converging" towards lower entropy, but caching is still worthwhile
777*ccdc9c3eSSadaf Ebrahimi   // for the latter in cases such as 80-BF.
778*ccdc9c3eSSadaf Ebrahimi   //
779*ccdc9c3eSSadaf Ebrahimi   // 3. Handling the bytes between the first and the last is less
780*ccdc9c3eSSadaf Ebrahimi   // straightforward and, again, the approach depends on whether we are
781*ccdc9c3eSSadaf Ebrahimi   // "converging" towards lower entropy: in forward mode, a single byte
782*ccdc9c3eSSadaf Ebrahimi   // is unlikely to be part of a common suffix whereas a byte range
783*ccdc9c3eSSadaf Ebrahimi   // is more likely so; in reverse mode, a byte range is unlikely to
784*ccdc9c3eSSadaf Ebrahimi   // be part of a common suffix whereas a single byte is more likely
785*ccdc9c3eSSadaf Ebrahimi   // so. The same benefit versus cost argument applies here.
786*ccdc9c3eSSadaf Ebrahimi   int id = 0;
787*ccdc9c3eSSadaf Ebrahimi   if (reversed_) {
788*ccdc9c3eSSadaf Ebrahimi     for (int i = 0; i < n; i++) {
789*ccdc9c3eSSadaf Ebrahimi       // In reverse UTF-8 mode: cache the leading byte; don't cache the last
790*ccdc9c3eSSadaf Ebrahimi       // continuation byte; cache anything else iff it's a single byte (XX-XX).
791*ccdc9c3eSSadaf Ebrahimi       if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
792*ccdc9c3eSSadaf Ebrahimi         id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
793*ccdc9c3eSSadaf Ebrahimi       else
794*ccdc9c3eSSadaf Ebrahimi         id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
795*ccdc9c3eSSadaf Ebrahimi     }
796*ccdc9c3eSSadaf Ebrahimi   } else {
797*ccdc9c3eSSadaf Ebrahimi     for (int i = n-1; i >= 0; i--) {
798*ccdc9c3eSSadaf Ebrahimi       // In forward UTF-8 mode: don't cache the leading byte; cache the last
799*ccdc9c3eSSadaf Ebrahimi       // continuation byte; cache anything else iff it's a byte range (XX-YY).
800*ccdc9c3eSSadaf Ebrahimi       if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
801*ccdc9c3eSSadaf Ebrahimi         id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
802*ccdc9c3eSSadaf Ebrahimi       else
803*ccdc9c3eSSadaf Ebrahimi         id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
804*ccdc9c3eSSadaf Ebrahimi     }
805*ccdc9c3eSSadaf Ebrahimi   }
806*ccdc9c3eSSadaf Ebrahimi   AddSuffix(id);
807*ccdc9c3eSSadaf Ebrahimi }
808*ccdc9c3eSSadaf Ebrahimi 
809*ccdc9c3eSSadaf Ebrahimi // Should not be called.
Copy(Frag arg)810*ccdc9c3eSSadaf Ebrahimi Frag Compiler::Copy(Frag arg) {
811*ccdc9c3eSSadaf Ebrahimi   // We're using WalkExponential; there should be no copying.
812*ccdc9c3eSSadaf Ebrahimi   LOG(DFATAL) << "Compiler::Copy called!";
813*ccdc9c3eSSadaf Ebrahimi   failed_ = true;
814*ccdc9c3eSSadaf Ebrahimi   return NoMatch();
815*ccdc9c3eSSadaf Ebrahimi }
816*ccdc9c3eSSadaf Ebrahimi 
817*ccdc9c3eSSadaf Ebrahimi // Visits a node quickly; called once WalkExponential has
818*ccdc9c3eSSadaf Ebrahimi // decided to cut this walk short.
ShortVisit(Regexp * re,Frag)819*ccdc9c3eSSadaf Ebrahimi Frag Compiler::ShortVisit(Regexp* re, Frag) {
820*ccdc9c3eSSadaf Ebrahimi   failed_ = true;
821*ccdc9c3eSSadaf Ebrahimi   return NoMatch();
822*ccdc9c3eSSadaf Ebrahimi }
823*ccdc9c3eSSadaf Ebrahimi 
824*ccdc9c3eSSadaf Ebrahimi // Called before traversing a node's children during the walk.
PreVisit(Regexp * re,Frag,bool * stop)825*ccdc9c3eSSadaf Ebrahimi Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
826*ccdc9c3eSSadaf Ebrahimi   // Cut off walk if we've already failed.
827*ccdc9c3eSSadaf Ebrahimi   if (failed_)
828*ccdc9c3eSSadaf Ebrahimi     *stop = true;
829*ccdc9c3eSSadaf Ebrahimi 
830*ccdc9c3eSSadaf Ebrahimi   return Frag();  // not used by caller
831*ccdc9c3eSSadaf Ebrahimi }
832*ccdc9c3eSSadaf Ebrahimi 
Literal(Rune r,bool foldcase)833*ccdc9c3eSSadaf Ebrahimi Frag Compiler::Literal(Rune r, bool foldcase) {
834*ccdc9c3eSSadaf Ebrahimi   switch (encoding_) {
835*ccdc9c3eSSadaf Ebrahimi     default:
836*ccdc9c3eSSadaf Ebrahimi       return Frag();
837*ccdc9c3eSSadaf Ebrahimi 
838*ccdc9c3eSSadaf Ebrahimi     case kEncodingLatin1:
839*ccdc9c3eSSadaf Ebrahimi       return ByteRange(r, r, foldcase);
840*ccdc9c3eSSadaf Ebrahimi 
841*ccdc9c3eSSadaf Ebrahimi     case kEncodingUTF8: {
842*ccdc9c3eSSadaf Ebrahimi       if (r < Runeself)  // Make common case fast.
843*ccdc9c3eSSadaf Ebrahimi         return ByteRange(r, r, foldcase);
844*ccdc9c3eSSadaf Ebrahimi       uint8_t buf[UTFmax];
845*ccdc9c3eSSadaf Ebrahimi       int n = runetochar(reinterpret_cast<char*>(buf), &r);
846*ccdc9c3eSSadaf Ebrahimi       Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
847*ccdc9c3eSSadaf Ebrahimi       for (int i = 1; i < n; i++)
848*ccdc9c3eSSadaf Ebrahimi         f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
849*ccdc9c3eSSadaf Ebrahimi       return f;
850*ccdc9c3eSSadaf Ebrahimi     }
851*ccdc9c3eSSadaf Ebrahimi   }
852*ccdc9c3eSSadaf Ebrahimi }
853*ccdc9c3eSSadaf Ebrahimi 
854*ccdc9c3eSSadaf Ebrahimi // Called after traversing the node's children during the walk.
855*ccdc9c3eSSadaf Ebrahimi // Given their frags, build and return the frag for this re.
PostVisit(Regexp * re,Frag,Frag,Frag * child_frags,int nchild_frags)856*ccdc9c3eSSadaf Ebrahimi Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
857*ccdc9c3eSSadaf Ebrahimi                          int nchild_frags) {
858*ccdc9c3eSSadaf Ebrahimi   // If a child failed, don't bother going forward, especially
859*ccdc9c3eSSadaf Ebrahimi   // since the child_frags might contain Frags with NULLs in them.
860*ccdc9c3eSSadaf Ebrahimi   if (failed_)
861*ccdc9c3eSSadaf Ebrahimi     return NoMatch();
862*ccdc9c3eSSadaf Ebrahimi 
863*ccdc9c3eSSadaf Ebrahimi   // Given the child fragments, return the fragment for this node.
864*ccdc9c3eSSadaf Ebrahimi   switch (re->op()) {
865*ccdc9c3eSSadaf Ebrahimi     case kRegexpRepeat:
866*ccdc9c3eSSadaf Ebrahimi       // Should not see; code at bottom of function will print error
867*ccdc9c3eSSadaf Ebrahimi       break;
868*ccdc9c3eSSadaf Ebrahimi 
869*ccdc9c3eSSadaf Ebrahimi     case kRegexpNoMatch:
870*ccdc9c3eSSadaf Ebrahimi       return NoMatch();
871*ccdc9c3eSSadaf Ebrahimi 
872*ccdc9c3eSSadaf Ebrahimi     case kRegexpEmptyMatch:
873*ccdc9c3eSSadaf Ebrahimi       return Nop();
874*ccdc9c3eSSadaf Ebrahimi 
875*ccdc9c3eSSadaf Ebrahimi     case kRegexpHaveMatch: {
876*ccdc9c3eSSadaf Ebrahimi       Frag f = Match(re->match_id());
877*ccdc9c3eSSadaf Ebrahimi       if (anchor_ == RE2::ANCHOR_BOTH) {
878*ccdc9c3eSSadaf Ebrahimi         // Append \z or else the subexpression will effectively be unanchored.
879*ccdc9c3eSSadaf Ebrahimi         // Complemented by the UNANCHORED case in CompileSet().
880*ccdc9c3eSSadaf Ebrahimi         f = Cat(EmptyWidth(kEmptyEndText), f);
881*ccdc9c3eSSadaf Ebrahimi       }
882*ccdc9c3eSSadaf Ebrahimi       return f;
883*ccdc9c3eSSadaf Ebrahimi     }
884*ccdc9c3eSSadaf Ebrahimi 
885*ccdc9c3eSSadaf Ebrahimi     case kRegexpConcat: {
886*ccdc9c3eSSadaf Ebrahimi       Frag f = child_frags[0];
887*ccdc9c3eSSadaf Ebrahimi       for (int i = 1; i < nchild_frags; i++)
888*ccdc9c3eSSadaf Ebrahimi         f = Cat(f, child_frags[i]);
889*ccdc9c3eSSadaf Ebrahimi       return f;
890*ccdc9c3eSSadaf Ebrahimi     }
891*ccdc9c3eSSadaf Ebrahimi 
892*ccdc9c3eSSadaf Ebrahimi     case kRegexpAlternate: {
893*ccdc9c3eSSadaf Ebrahimi       Frag f = child_frags[0];
894*ccdc9c3eSSadaf Ebrahimi       for (int i = 1; i < nchild_frags; i++)
895*ccdc9c3eSSadaf Ebrahimi         f = Alt(f, child_frags[i]);
896*ccdc9c3eSSadaf Ebrahimi       return f;
897*ccdc9c3eSSadaf Ebrahimi     }
898*ccdc9c3eSSadaf Ebrahimi 
899*ccdc9c3eSSadaf Ebrahimi     case kRegexpStar:
900*ccdc9c3eSSadaf Ebrahimi       return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
901*ccdc9c3eSSadaf Ebrahimi 
902*ccdc9c3eSSadaf Ebrahimi     case kRegexpPlus:
903*ccdc9c3eSSadaf Ebrahimi       return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
904*ccdc9c3eSSadaf Ebrahimi 
905*ccdc9c3eSSadaf Ebrahimi     case kRegexpQuest:
906*ccdc9c3eSSadaf Ebrahimi       return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
907*ccdc9c3eSSadaf Ebrahimi 
908*ccdc9c3eSSadaf Ebrahimi     case kRegexpLiteral:
909*ccdc9c3eSSadaf Ebrahimi       return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);
910*ccdc9c3eSSadaf Ebrahimi 
911*ccdc9c3eSSadaf Ebrahimi     case kRegexpLiteralString: {
912*ccdc9c3eSSadaf Ebrahimi       // Concatenation of literals.
913*ccdc9c3eSSadaf Ebrahimi       if (re->nrunes() == 0)
914*ccdc9c3eSSadaf Ebrahimi         return Nop();
915*ccdc9c3eSSadaf Ebrahimi       Frag f;
916*ccdc9c3eSSadaf Ebrahimi       for (int i = 0; i < re->nrunes(); i++) {
917*ccdc9c3eSSadaf Ebrahimi         Frag f1 = Literal(re->runes()[i],
918*ccdc9c3eSSadaf Ebrahimi                           (re->parse_flags()&Regexp::FoldCase) != 0);
919*ccdc9c3eSSadaf Ebrahimi         if (i == 0)
920*ccdc9c3eSSadaf Ebrahimi           f = f1;
921*ccdc9c3eSSadaf Ebrahimi         else
922*ccdc9c3eSSadaf Ebrahimi           f = Cat(f, f1);
923*ccdc9c3eSSadaf Ebrahimi       }
924*ccdc9c3eSSadaf Ebrahimi       return f;
925*ccdc9c3eSSadaf Ebrahimi     }
926*ccdc9c3eSSadaf Ebrahimi 
927*ccdc9c3eSSadaf Ebrahimi     case kRegexpAnyChar:
928*ccdc9c3eSSadaf Ebrahimi       BeginRange();
929*ccdc9c3eSSadaf Ebrahimi       AddRuneRange(0, Runemax, false);
930*ccdc9c3eSSadaf Ebrahimi       return EndRange();
931*ccdc9c3eSSadaf Ebrahimi 
932*ccdc9c3eSSadaf Ebrahimi     case kRegexpAnyByte:
933*ccdc9c3eSSadaf Ebrahimi       return ByteRange(0x00, 0xFF, false);
934*ccdc9c3eSSadaf Ebrahimi 
935*ccdc9c3eSSadaf Ebrahimi     case kRegexpCharClass: {
936*ccdc9c3eSSadaf Ebrahimi       CharClass* cc = re->cc();
937*ccdc9c3eSSadaf Ebrahimi       if (cc->empty()) {
938*ccdc9c3eSSadaf Ebrahimi         // This can't happen.
939*ccdc9c3eSSadaf Ebrahimi         LOG(DFATAL) << "No ranges in char class";
940*ccdc9c3eSSadaf Ebrahimi         failed_ = true;
941*ccdc9c3eSSadaf Ebrahimi         return NoMatch();
942*ccdc9c3eSSadaf Ebrahimi       }
943*ccdc9c3eSSadaf Ebrahimi 
944*ccdc9c3eSSadaf Ebrahimi       // ASCII case-folding optimization: if the char class
945*ccdc9c3eSSadaf Ebrahimi       // behaves the same on A-Z as it does on a-z,
946*ccdc9c3eSSadaf Ebrahimi       // discard any ranges wholly contained in A-Z
947*ccdc9c3eSSadaf Ebrahimi       // and mark the other ranges as foldascii.
948*ccdc9c3eSSadaf Ebrahimi       // This reduces the size of a program for
949*ccdc9c3eSSadaf Ebrahimi       // (?i)abc from 3 insts per letter to 1 per letter.
950*ccdc9c3eSSadaf Ebrahimi       bool foldascii = cc->FoldsASCII();
951*ccdc9c3eSSadaf Ebrahimi 
952*ccdc9c3eSSadaf Ebrahimi       // Character class is just a big OR of the different
953*ccdc9c3eSSadaf Ebrahimi       // character ranges in the class.
954*ccdc9c3eSSadaf Ebrahimi       BeginRange();
955*ccdc9c3eSSadaf Ebrahimi       for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
956*ccdc9c3eSSadaf Ebrahimi         // ASCII case-folding optimization (see above).
957*ccdc9c3eSSadaf Ebrahimi         if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
958*ccdc9c3eSSadaf Ebrahimi           continue;
959*ccdc9c3eSSadaf Ebrahimi 
960*ccdc9c3eSSadaf Ebrahimi         // If this range contains all of A-Za-z or none of it,
961*ccdc9c3eSSadaf Ebrahimi         // the fold flag is unnecessary; don't bother.
962*ccdc9c3eSSadaf Ebrahimi         bool fold = foldascii;
963*ccdc9c3eSSadaf Ebrahimi         if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
964*ccdc9c3eSSadaf Ebrahimi             ('Z' < i->lo && i->hi < 'a'))
965*ccdc9c3eSSadaf Ebrahimi           fold = false;
966*ccdc9c3eSSadaf Ebrahimi 
967*ccdc9c3eSSadaf Ebrahimi         AddRuneRange(i->lo, i->hi, fold);
968*ccdc9c3eSSadaf Ebrahimi       }
969*ccdc9c3eSSadaf Ebrahimi       return EndRange();
970*ccdc9c3eSSadaf Ebrahimi     }
971*ccdc9c3eSSadaf Ebrahimi 
972*ccdc9c3eSSadaf Ebrahimi     case kRegexpCapture:
973*ccdc9c3eSSadaf Ebrahimi       // If this is a non-capturing parenthesis -- (?:foo) --
974*ccdc9c3eSSadaf Ebrahimi       // just use the inner expression.
975*ccdc9c3eSSadaf Ebrahimi       if (re->cap() < 0)
976*ccdc9c3eSSadaf Ebrahimi         return child_frags[0];
977*ccdc9c3eSSadaf Ebrahimi       return Capture(child_frags[0], re->cap());
978*ccdc9c3eSSadaf Ebrahimi 
979*ccdc9c3eSSadaf Ebrahimi     case kRegexpBeginLine:
980*ccdc9c3eSSadaf Ebrahimi       return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
981*ccdc9c3eSSadaf Ebrahimi 
982*ccdc9c3eSSadaf Ebrahimi     case kRegexpEndLine:
983*ccdc9c3eSSadaf Ebrahimi       return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
984*ccdc9c3eSSadaf Ebrahimi 
985*ccdc9c3eSSadaf Ebrahimi     case kRegexpBeginText:
986*ccdc9c3eSSadaf Ebrahimi       return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
987*ccdc9c3eSSadaf Ebrahimi 
988*ccdc9c3eSSadaf Ebrahimi     case kRegexpEndText:
989*ccdc9c3eSSadaf Ebrahimi       return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
990*ccdc9c3eSSadaf Ebrahimi 
991*ccdc9c3eSSadaf Ebrahimi     case kRegexpWordBoundary:
992*ccdc9c3eSSadaf Ebrahimi       return EmptyWidth(kEmptyWordBoundary);
993*ccdc9c3eSSadaf Ebrahimi 
994*ccdc9c3eSSadaf Ebrahimi     case kRegexpNoWordBoundary:
995*ccdc9c3eSSadaf Ebrahimi       return EmptyWidth(kEmptyNonWordBoundary);
996*ccdc9c3eSSadaf Ebrahimi   }
997*ccdc9c3eSSadaf Ebrahimi   LOG(DFATAL) << "Missing case in Compiler: " << re->op();
998*ccdc9c3eSSadaf Ebrahimi   failed_ = true;
999*ccdc9c3eSSadaf Ebrahimi   return NoMatch();
1000*ccdc9c3eSSadaf Ebrahimi }
1001*ccdc9c3eSSadaf Ebrahimi 
1002*ccdc9c3eSSadaf Ebrahimi // Is this regexp required to start at the beginning of the text?
1003*ccdc9c3eSSadaf Ebrahimi // Only approximate; can return false for complicated regexps like (\Aa|\Ab),
1004*ccdc9c3eSSadaf Ebrahimi // but handles (\A(a|b)).  Could use the Walker to write a more exact one.
IsAnchorStart(Regexp ** pre,int depth)1005*ccdc9c3eSSadaf Ebrahimi static bool IsAnchorStart(Regexp** pre, int depth) {
1006*ccdc9c3eSSadaf Ebrahimi   Regexp* re = *pre;
1007*ccdc9c3eSSadaf Ebrahimi   Regexp* sub;
1008*ccdc9c3eSSadaf Ebrahimi   // The depth limit makes sure that we don't overflow
1009*ccdc9c3eSSadaf Ebrahimi   // the stack on a deeply nested regexp.  As the comment
1010*ccdc9c3eSSadaf Ebrahimi   // above says, IsAnchorStart is conservative, so returning
1011*ccdc9c3eSSadaf Ebrahimi   // a false negative is okay.  The exact limit is somewhat arbitrary.
1012*ccdc9c3eSSadaf Ebrahimi   if (re == NULL || depth >= 4)
1013*ccdc9c3eSSadaf Ebrahimi     return false;
1014*ccdc9c3eSSadaf Ebrahimi   switch (re->op()) {
1015*ccdc9c3eSSadaf Ebrahimi     default:
1016*ccdc9c3eSSadaf Ebrahimi       break;
1017*ccdc9c3eSSadaf Ebrahimi     case kRegexpConcat:
1018*ccdc9c3eSSadaf Ebrahimi       if (re->nsub() > 0) {
1019*ccdc9c3eSSadaf Ebrahimi         sub = re->sub()[0]->Incref();
1020*ccdc9c3eSSadaf Ebrahimi         if (IsAnchorStart(&sub, depth+1)) {
1021*ccdc9c3eSSadaf Ebrahimi           PODArray<Regexp*> subcopy(re->nsub());
1022*ccdc9c3eSSadaf Ebrahimi           subcopy[0] = sub;  // already have reference
1023*ccdc9c3eSSadaf Ebrahimi           for (int i = 1; i < re->nsub(); i++)
1024*ccdc9c3eSSadaf Ebrahimi             subcopy[i] = re->sub()[i]->Incref();
1025*ccdc9c3eSSadaf Ebrahimi           *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
1026*ccdc9c3eSSadaf Ebrahimi           re->Decref();
1027*ccdc9c3eSSadaf Ebrahimi           return true;
1028*ccdc9c3eSSadaf Ebrahimi         }
1029*ccdc9c3eSSadaf Ebrahimi         sub->Decref();
1030*ccdc9c3eSSadaf Ebrahimi       }
1031*ccdc9c3eSSadaf Ebrahimi       break;
1032*ccdc9c3eSSadaf Ebrahimi     case kRegexpCapture:
1033*ccdc9c3eSSadaf Ebrahimi       sub = re->sub()[0]->Incref();
1034*ccdc9c3eSSadaf Ebrahimi       if (IsAnchorStart(&sub, depth+1)) {
1035*ccdc9c3eSSadaf Ebrahimi         *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
1036*ccdc9c3eSSadaf Ebrahimi         re->Decref();
1037*ccdc9c3eSSadaf Ebrahimi         return true;
1038*ccdc9c3eSSadaf Ebrahimi       }
1039*ccdc9c3eSSadaf Ebrahimi       sub->Decref();
1040*ccdc9c3eSSadaf Ebrahimi       break;
1041*ccdc9c3eSSadaf Ebrahimi     case kRegexpBeginText:
1042*ccdc9c3eSSadaf Ebrahimi       *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
1043*ccdc9c3eSSadaf Ebrahimi       re->Decref();
1044*ccdc9c3eSSadaf Ebrahimi       return true;
1045*ccdc9c3eSSadaf Ebrahimi   }
1046*ccdc9c3eSSadaf Ebrahimi   return false;
1047*ccdc9c3eSSadaf Ebrahimi }
1048*ccdc9c3eSSadaf Ebrahimi 
1049*ccdc9c3eSSadaf Ebrahimi // Is this regexp required to start at the end of the text?
1050*ccdc9c3eSSadaf Ebrahimi // Only approximate; can return false for complicated regexps like (a\z|b\z),
1051*ccdc9c3eSSadaf Ebrahimi // but handles ((a|b)\z).  Could use the Walker to write a more exact one.
IsAnchorEnd(Regexp ** pre,int depth)1052*ccdc9c3eSSadaf Ebrahimi static bool IsAnchorEnd(Regexp** pre, int depth) {
1053*ccdc9c3eSSadaf Ebrahimi   Regexp* re = *pre;
1054*ccdc9c3eSSadaf Ebrahimi   Regexp* sub;
1055*ccdc9c3eSSadaf Ebrahimi   // The depth limit makes sure that we don't overflow
1056*ccdc9c3eSSadaf Ebrahimi   // the stack on a deeply nested regexp.  As the comment
1057*ccdc9c3eSSadaf Ebrahimi   // above says, IsAnchorEnd is conservative, so returning
1058*ccdc9c3eSSadaf Ebrahimi   // a false negative is okay.  The exact limit is somewhat arbitrary.
1059*ccdc9c3eSSadaf Ebrahimi   if (re == NULL || depth >= 4)
1060*ccdc9c3eSSadaf Ebrahimi     return false;
1061*ccdc9c3eSSadaf Ebrahimi   switch (re->op()) {
1062*ccdc9c3eSSadaf Ebrahimi     default:
1063*ccdc9c3eSSadaf Ebrahimi       break;
1064*ccdc9c3eSSadaf Ebrahimi     case kRegexpConcat:
1065*ccdc9c3eSSadaf Ebrahimi       if (re->nsub() > 0) {
1066*ccdc9c3eSSadaf Ebrahimi         sub = re->sub()[re->nsub() - 1]->Incref();
1067*ccdc9c3eSSadaf Ebrahimi         if (IsAnchorEnd(&sub, depth+1)) {
1068*ccdc9c3eSSadaf Ebrahimi           PODArray<Regexp*> subcopy(re->nsub());
1069*ccdc9c3eSSadaf Ebrahimi           subcopy[re->nsub() - 1] = sub;  // already have reference
1070*ccdc9c3eSSadaf Ebrahimi           for (int i = 0; i < re->nsub() - 1; i++)
1071*ccdc9c3eSSadaf Ebrahimi             subcopy[i] = re->sub()[i]->Incref();
1072*ccdc9c3eSSadaf Ebrahimi           *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
1073*ccdc9c3eSSadaf Ebrahimi           re->Decref();
1074*ccdc9c3eSSadaf Ebrahimi           return true;
1075*ccdc9c3eSSadaf Ebrahimi         }
1076*ccdc9c3eSSadaf Ebrahimi         sub->Decref();
1077*ccdc9c3eSSadaf Ebrahimi       }
1078*ccdc9c3eSSadaf Ebrahimi       break;
1079*ccdc9c3eSSadaf Ebrahimi     case kRegexpCapture:
1080*ccdc9c3eSSadaf Ebrahimi       sub = re->sub()[0]->Incref();
1081*ccdc9c3eSSadaf Ebrahimi       if (IsAnchorEnd(&sub, depth+1)) {
1082*ccdc9c3eSSadaf Ebrahimi         *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
1083*ccdc9c3eSSadaf Ebrahimi         re->Decref();
1084*ccdc9c3eSSadaf Ebrahimi         return true;
1085*ccdc9c3eSSadaf Ebrahimi       }
1086*ccdc9c3eSSadaf Ebrahimi       sub->Decref();
1087*ccdc9c3eSSadaf Ebrahimi       break;
1088*ccdc9c3eSSadaf Ebrahimi     case kRegexpEndText:
1089*ccdc9c3eSSadaf Ebrahimi       *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
1090*ccdc9c3eSSadaf Ebrahimi       re->Decref();
1091*ccdc9c3eSSadaf Ebrahimi       return true;
1092*ccdc9c3eSSadaf Ebrahimi   }
1093*ccdc9c3eSSadaf Ebrahimi   return false;
1094*ccdc9c3eSSadaf Ebrahimi }
1095*ccdc9c3eSSadaf Ebrahimi 
Setup(Regexp::ParseFlags flags,int64_t max_mem,RE2::Anchor anchor)1096*ccdc9c3eSSadaf Ebrahimi void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
1097*ccdc9c3eSSadaf Ebrahimi                      RE2::Anchor anchor) {
1098*ccdc9c3eSSadaf Ebrahimi   prog_->set_flags(flags);
1099*ccdc9c3eSSadaf Ebrahimi 
1100*ccdc9c3eSSadaf Ebrahimi   if (flags & Regexp::Latin1)
1101*ccdc9c3eSSadaf Ebrahimi     encoding_ = kEncodingLatin1;
1102*ccdc9c3eSSadaf Ebrahimi   max_mem_ = max_mem;
1103*ccdc9c3eSSadaf Ebrahimi   if (max_mem <= 0) {
1104*ccdc9c3eSSadaf Ebrahimi     max_ninst_ = 100000;  // more than enough
1105*ccdc9c3eSSadaf Ebrahimi   } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
1106*ccdc9c3eSSadaf Ebrahimi     // No room for anything.
1107*ccdc9c3eSSadaf Ebrahimi     max_ninst_ = 0;
1108*ccdc9c3eSSadaf Ebrahimi   } else {
1109*ccdc9c3eSSadaf Ebrahimi     int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
1110*ccdc9c3eSSadaf Ebrahimi     // Limit instruction count so that inst->id() fits nicely in an int.
1111*ccdc9c3eSSadaf Ebrahimi     // SparseArray also assumes that the indices (inst->id()) are ints.
1112*ccdc9c3eSSadaf Ebrahimi     // The call to WalkExponential uses 2*max_ninst_ below,
1113*ccdc9c3eSSadaf Ebrahimi     // and other places in the code use 2 or 3 * prog->size().
1114*ccdc9c3eSSadaf Ebrahimi     // Limiting to 2^24 should avoid overflow in those places.
1115*ccdc9c3eSSadaf Ebrahimi     // (The point of allowing more than 32 bits of memory is to
1116*ccdc9c3eSSadaf Ebrahimi     // have plenty of room for the DFA states, not to use it up
1117*ccdc9c3eSSadaf Ebrahimi     // on the program.)
1118*ccdc9c3eSSadaf Ebrahimi     if (m >= 1<<24)
1119*ccdc9c3eSSadaf Ebrahimi       m = 1<<24;
1120*ccdc9c3eSSadaf Ebrahimi 
1121*ccdc9c3eSSadaf Ebrahimi     // Inst imposes its own limit (currently bigger than 2^24 but be safe).
1122*ccdc9c3eSSadaf Ebrahimi     if (m > Prog::Inst::kMaxInst)
1123*ccdc9c3eSSadaf Ebrahimi       m = Prog::Inst::kMaxInst;
1124*ccdc9c3eSSadaf Ebrahimi 
1125*ccdc9c3eSSadaf Ebrahimi     max_ninst_ = static_cast<int>(m);
1126*ccdc9c3eSSadaf Ebrahimi   }
1127*ccdc9c3eSSadaf Ebrahimi 
1128*ccdc9c3eSSadaf Ebrahimi   anchor_ = anchor;
1129*ccdc9c3eSSadaf Ebrahimi }
1130*ccdc9c3eSSadaf Ebrahimi 
1131*ccdc9c3eSSadaf Ebrahimi // Compiles re, returning program.
1132*ccdc9c3eSSadaf Ebrahimi // Caller is responsible for deleting prog_.
1133*ccdc9c3eSSadaf Ebrahimi // If reversed is true, compiles a program that expects
1134*ccdc9c3eSSadaf Ebrahimi // to run over the input string backward (reverses all concatenations).
1135*ccdc9c3eSSadaf Ebrahimi // The reversed flag is also recorded in the returned program.
Compile(Regexp * re,bool reversed,int64_t max_mem)1136*ccdc9c3eSSadaf Ebrahimi Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
1137*ccdc9c3eSSadaf Ebrahimi   Compiler c;
1138*ccdc9c3eSSadaf Ebrahimi   c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
1139*ccdc9c3eSSadaf Ebrahimi   c.reversed_ = reversed;
1140*ccdc9c3eSSadaf Ebrahimi 
1141*ccdc9c3eSSadaf Ebrahimi   // Simplify to remove things like counted repetitions
1142*ccdc9c3eSSadaf Ebrahimi   // and character classes like \d.
1143*ccdc9c3eSSadaf Ebrahimi   Regexp* sre = re->Simplify();
1144*ccdc9c3eSSadaf Ebrahimi   if (sre == NULL)
1145*ccdc9c3eSSadaf Ebrahimi     return NULL;
1146*ccdc9c3eSSadaf Ebrahimi 
1147*ccdc9c3eSSadaf Ebrahimi   // Record whether prog is anchored, removing the anchors.
1148*ccdc9c3eSSadaf Ebrahimi   // (They get in the way of other optimizations.)
1149*ccdc9c3eSSadaf Ebrahimi   bool is_anchor_start = IsAnchorStart(&sre, 0);
1150*ccdc9c3eSSadaf Ebrahimi   bool is_anchor_end = IsAnchorEnd(&sre, 0);
1151*ccdc9c3eSSadaf Ebrahimi 
1152*ccdc9c3eSSadaf Ebrahimi   // Generate fragment for entire regexp.
1153*ccdc9c3eSSadaf Ebrahimi   Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
1154*ccdc9c3eSSadaf Ebrahimi   sre->Decref();
1155*ccdc9c3eSSadaf Ebrahimi   if (c.failed_)
1156*ccdc9c3eSSadaf Ebrahimi     return NULL;
1157*ccdc9c3eSSadaf Ebrahimi 
1158*ccdc9c3eSSadaf Ebrahimi   // Success!  Finish by putting Match node at end, and record start.
1159*ccdc9c3eSSadaf Ebrahimi   // Turn off c.reversed_ (if it is set) to force the remaining concatenations
1160*ccdc9c3eSSadaf Ebrahimi   // to behave normally.
1161*ccdc9c3eSSadaf Ebrahimi   c.reversed_ = false;
1162*ccdc9c3eSSadaf Ebrahimi   all = c.Cat(all, c.Match(0));
1163*ccdc9c3eSSadaf Ebrahimi 
1164*ccdc9c3eSSadaf Ebrahimi   c.prog_->set_reversed(reversed);
1165*ccdc9c3eSSadaf Ebrahimi   if (c.prog_->reversed()) {
1166*ccdc9c3eSSadaf Ebrahimi     c.prog_->set_anchor_start(is_anchor_end);
1167*ccdc9c3eSSadaf Ebrahimi     c.prog_->set_anchor_end(is_anchor_start);
1168*ccdc9c3eSSadaf Ebrahimi   } else {
1169*ccdc9c3eSSadaf Ebrahimi     c.prog_->set_anchor_start(is_anchor_start);
1170*ccdc9c3eSSadaf Ebrahimi     c.prog_->set_anchor_end(is_anchor_end);
1171*ccdc9c3eSSadaf Ebrahimi   }
1172*ccdc9c3eSSadaf Ebrahimi 
1173*ccdc9c3eSSadaf Ebrahimi   c.prog_->set_start(all.begin);
1174*ccdc9c3eSSadaf Ebrahimi   if (!c.prog_->anchor_start()) {
1175*ccdc9c3eSSadaf Ebrahimi     // Also create unanchored version, which starts with a .*? loop.
1176*ccdc9c3eSSadaf Ebrahimi     all = c.Cat(c.DotStar(), all);
1177*ccdc9c3eSSadaf Ebrahimi   }
1178*ccdc9c3eSSadaf Ebrahimi   c.prog_->set_start_unanchored(all.begin);
1179*ccdc9c3eSSadaf Ebrahimi 
1180*ccdc9c3eSSadaf Ebrahimi   // Hand ownership of prog_ to caller.
1181*ccdc9c3eSSadaf Ebrahimi   return c.Finish();
1182*ccdc9c3eSSadaf Ebrahimi }
1183*ccdc9c3eSSadaf Ebrahimi 
Finish()1184*ccdc9c3eSSadaf Ebrahimi Prog* Compiler::Finish() {
1185*ccdc9c3eSSadaf Ebrahimi   if (failed_)
1186*ccdc9c3eSSadaf Ebrahimi     return NULL;
1187*ccdc9c3eSSadaf Ebrahimi 
1188*ccdc9c3eSSadaf Ebrahimi   if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
1189*ccdc9c3eSSadaf Ebrahimi     // No possible matches; keep Fail instruction only.
1190*ccdc9c3eSSadaf Ebrahimi     ninst_ = 1;
1191*ccdc9c3eSSadaf Ebrahimi   }
1192*ccdc9c3eSSadaf Ebrahimi 
1193*ccdc9c3eSSadaf Ebrahimi   // Hand off the array to Prog.
1194*ccdc9c3eSSadaf Ebrahimi   prog_->inst_ = std::move(inst_);
1195*ccdc9c3eSSadaf Ebrahimi   prog_->size_ = ninst_;
1196*ccdc9c3eSSadaf Ebrahimi 
1197*ccdc9c3eSSadaf Ebrahimi   prog_->Optimize();
1198*ccdc9c3eSSadaf Ebrahimi   prog_->Flatten();
1199*ccdc9c3eSSadaf Ebrahimi   prog_->ComputeByteMap();
1200*ccdc9c3eSSadaf Ebrahimi 
1201*ccdc9c3eSSadaf Ebrahimi   // Record remaining memory for DFA.
1202*ccdc9c3eSSadaf Ebrahimi   if (max_mem_ <= 0) {
1203*ccdc9c3eSSadaf Ebrahimi     prog_->set_dfa_mem(1<<20);
1204*ccdc9c3eSSadaf Ebrahimi   } else {
1205*ccdc9c3eSSadaf Ebrahimi     int64_t m = max_mem_ - sizeof(Prog) - prog_->size_*sizeof(Prog::Inst);
1206*ccdc9c3eSSadaf Ebrahimi     if (m < 0)
1207*ccdc9c3eSSadaf Ebrahimi       m = 0;
1208*ccdc9c3eSSadaf Ebrahimi     prog_->set_dfa_mem(m);
1209*ccdc9c3eSSadaf Ebrahimi   }
1210*ccdc9c3eSSadaf Ebrahimi 
1211*ccdc9c3eSSadaf Ebrahimi   Prog* p = prog_;
1212*ccdc9c3eSSadaf Ebrahimi   prog_ = NULL;
1213*ccdc9c3eSSadaf Ebrahimi   return p;
1214*ccdc9c3eSSadaf Ebrahimi }
1215*ccdc9c3eSSadaf Ebrahimi 
1216*ccdc9c3eSSadaf Ebrahimi // Converts Regexp to Prog.
CompileToProg(int64_t max_mem)1217*ccdc9c3eSSadaf Ebrahimi Prog* Regexp::CompileToProg(int64_t max_mem) {
1218*ccdc9c3eSSadaf Ebrahimi   return Compiler::Compile(this, false, max_mem);
1219*ccdc9c3eSSadaf Ebrahimi }
1220*ccdc9c3eSSadaf Ebrahimi 
CompileToReverseProg(int64_t max_mem)1221*ccdc9c3eSSadaf Ebrahimi Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
1222*ccdc9c3eSSadaf Ebrahimi   return Compiler::Compile(this, true, max_mem);
1223*ccdc9c3eSSadaf Ebrahimi }
1224*ccdc9c3eSSadaf Ebrahimi 
DotStar()1225*ccdc9c3eSSadaf Ebrahimi Frag Compiler::DotStar() {
1226*ccdc9c3eSSadaf Ebrahimi   return Star(ByteRange(0x00, 0xff, false), true);
1227*ccdc9c3eSSadaf Ebrahimi }
1228*ccdc9c3eSSadaf Ebrahimi 
1229*ccdc9c3eSSadaf Ebrahimi // Compiles RE set to Prog.
CompileSet(Regexp * re,RE2::Anchor anchor,int64_t max_mem)1230*ccdc9c3eSSadaf Ebrahimi Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
1231*ccdc9c3eSSadaf Ebrahimi   Compiler c;
1232*ccdc9c3eSSadaf Ebrahimi   c.Setup(re->parse_flags(), max_mem, anchor);
1233*ccdc9c3eSSadaf Ebrahimi 
1234*ccdc9c3eSSadaf Ebrahimi   Regexp* sre = re->Simplify();
1235*ccdc9c3eSSadaf Ebrahimi   if (sre == NULL)
1236*ccdc9c3eSSadaf Ebrahimi     return NULL;
1237*ccdc9c3eSSadaf Ebrahimi 
1238*ccdc9c3eSSadaf Ebrahimi   Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
1239*ccdc9c3eSSadaf Ebrahimi   sre->Decref();
1240*ccdc9c3eSSadaf Ebrahimi   if (c.failed_)
1241*ccdc9c3eSSadaf Ebrahimi     return NULL;
1242*ccdc9c3eSSadaf Ebrahimi 
1243*ccdc9c3eSSadaf Ebrahimi   c.prog_->set_anchor_start(true);
1244*ccdc9c3eSSadaf Ebrahimi   c.prog_->set_anchor_end(true);
1245*ccdc9c3eSSadaf Ebrahimi 
1246*ccdc9c3eSSadaf Ebrahimi   if (anchor == RE2::UNANCHORED) {
1247*ccdc9c3eSSadaf Ebrahimi     // Prepend .* or else the expression will effectively be anchored.
1248*ccdc9c3eSSadaf Ebrahimi     // Complemented by the ANCHOR_BOTH case in PostVisit().
1249*ccdc9c3eSSadaf Ebrahimi     all = c.Cat(c.DotStar(), all);
1250*ccdc9c3eSSadaf Ebrahimi   }
1251*ccdc9c3eSSadaf Ebrahimi   c.prog_->set_start(all.begin);
1252*ccdc9c3eSSadaf Ebrahimi   c.prog_->set_start_unanchored(all.begin);
1253*ccdc9c3eSSadaf Ebrahimi 
1254*ccdc9c3eSSadaf Ebrahimi   Prog* prog = c.Finish();
1255*ccdc9c3eSSadaf Ebrahimi   if (prog == NULL)
1256*ccdc9c3eSSadaf Ebrahimi     return NULL;
1257*ccdc9c3eSSadaf Ebrahimi 
1258*ccdc9c3eSSadaf Ebrahimi   // Make sure DFA has enough memory to operate,
1259*ccdc9c3eSSadaf Ebrahimi   // since we're not going to fall back to the NFA.
1260*ccdc9c3eSSadaf Ebrahimi   bool dfa_failed = false;
1261*ccdc9c3eSSadaf Ebrahimi   StringPiece sp = "hello, world";
1262*ccdc9c3eSSadaf Ebrahimi   prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
1263*ccdc9c3eSSadaf Ebrahimi                   NULL, &dfa_failed, NULL);
1264*ccdc9c3eSSadaf Ebrahimi   if (dfa_failed) {
1265*ccdc9c3eSSadaf Ebrahimi     delete prog;
1266*ccdc9c3eSSadaf Ebrahimi     return NULL;
1267*ccdc9c3eSSadaf Ebrahimi   }
1268*ccdc9c3eSSadaf Ebrahimi 
1269*ccdc9c3eSSadaf Ebrahimi   return prog;
1270*ccdc9c3eSSadaf Ebrahimi }
1271*ccdc9c3eSSadaf Ebrahimi 
CompileSet(Regexp * re,RE2::Anchor anchor,int64_t max_mem)1272*ccdc9c3eSSadaf Ebrahimi Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
1273*ccdc9c3eSSadaf Ebrahimi   return Compiler::CompileSet(re, anchor, max_mem);
1274*ccdc9c3eSSadaf Ebrahimi }
1275*ccdc9c3eSSadaf Ebrahimi 
1276*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
1277