xref: /aosp_15_r20/external/pcre/src/pcre2_study.c (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi *      Perl-Compatible Regular Expressions       *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi 
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi 
8*22dc650dSSadaf Ebrahimi                        Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi      Original API code Copyright (c) 1997-2012 University of Cambridge
10*22dc650dSSadaf Ebrahimi           New API code Copyright (c) 2016-2023 University of Cambridge
11*22dc650dSSadaf Ebrahimi 
12*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
15*22dc650dSSadaf Ebrahimi 
16*22dc650dSSadaf Ebrahimi     * Redistributions of source code must retain the above copyright notice,
17*22dc650dSSadaf Ebrahimi       this list of conditions and the following disclaimer.
18*22dc650dSSadaf Ebrahimi 
19*22dc650dSSadaf Ebrahimi     * Redistributions in binary form must reproduce the above copyright
20*22dc650dSSadaf Ebrahimi       notice, this list of conditions and the following disclaimer in the
21*22dc650dSSadaf Ebrahimi       documentation and/or other materials provided with the distribution.
22*22dc650dSSadaf Ebrahimi 
23*22dc650dSSadaf Ebrahimi     * Neither the name of the University of Cambridge nor the names of its
24*22dc650dSSadaf Ebrahimi       contributors may be used to endorse or promote products derived from
25*22dc650dSSadaf Ebrahimi       this software without specific prior written permission.
26*22dc650dSSadaf Ebrahimi 
27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
38*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
39*22dc650dSSadaf Ebrahimi */
40*22dc650dSSadaf Ebrahimi 
41*22dc650dSSadaf Ebrahimi /* This module contains functions for scanning a compiled pattern and
42*22dc650dSSadaf Ebrahimi collecting data (e.g. minimum matching length). */
43*22dc650dSSadaf Ebrahimi 
44*22dc650dSSadaf Ebrahimi 
45*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
46*22dc650dSSadaf Ebrahimi #include "config.h"
47*22dc650dSSadaf Ebrahimi #endif
48*22dc650dSSadaf Ebrahimi 
49*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"
50*22dc650dSSadaf Ebrahimi 
51*22dc650dSSadaf Ebrahimi /* The maximum remembered capturing brackets minimum. */
52*22dc650dSSadaf Ebrahimi 
53*22dc650dSSadaf Ebrahimi #define MAX_CACHE_BACKREF 128
54*22dc650dSSadaf Ebrahimi 
55*22dc650dSSadaf Ebrahimi /* Set a bit in the starting code unit bit map. */
56*22dc650dSSadaf Ebrahimi 
57*22dc650dSSadaf Ebrahimi #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
58*22dc650dSSadaf Ebrahimi 
59*22dc650dSSadaf Ebrahimi /* Returns from set_start_bits() */
60*22dc650dSSadaf Ebrahimi 
61*22dc650dSSadaf Ebrahimi enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
62*22dc650dSSadaf Ebrahimi 
63*22dc650dSSadaf Ebrahimi 
64*22dc650dSSadaf Ebrahimi /*************************************************
65*22dc650dSSadaf Ebrahimi *   Find the minimum subject length for a group  *
66*22dc650dSSadaf Ebrahimi *************************************************/
67*22dc650dSSadaf Ebrahimi 
68*22dc650dSSadaf Ebrahimi /* Scan a parenthesized group and compute the minimum length of subject that
69*22dc650dSSadaf Ebrahimi is needed to match it. This is a lower bound; it does not mean there is a
70*22dc650dSSadaf Ebrahimi string of that length that matches. In UTF mode, the result is in characters
71*22dc650dSSadaf Ebrahimi rather than code units. The field in a compiled pattern for storing the minimum
72*22dc650dSSadaf Ebrahimi length is 16-bits long (on the grounds that anything longer than that is
73*22dc650dSSadaf Ebrahimi pathological), so we give up when we reach that amount. This also means that
74*22dc650dSSadaf Ebrahimi integer overflow for really crazy patterns cannot happen.
75*22dc650dSSadaf Ebrahimi 
76*22dc650dSSadaf Ebrahimi Backreference minimum lengths are cached to speed up multiple references. This
77*22dc650dSSadaf Ebrahimi function is called only when the highest back reference in the pattern is less
78*22dc650dSSadaf Ebrahimi than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
79*22dc650dSSadaf Ebrahimi caching vector. The zeroth element contains the number of the highest set
80*22dc650dSSadaf Ebrahimi value.
81*22dc650dSSadaf Ebrahimi 
82*22dc650dSSadaf Ebrahimi Arguments:
83*22dc650dSSadaf Ebrahimi   re              compiled pattern block
84*22dc650dSSadaf Ebrahimi   code            pointer to start of group (the bracket)
85*22dc650dSSadaf Ebrahimi   startcode       pointer to start of the whole pattern's code
86*22dc650dSSadaf Ebrahimi   utf             UTF flag
87*22dc650dSSadaf Ebrahimi   recurses        chain of recurse_check to catch mutual recursion
88*22dc650dSSadaf Ebrahimi   countptr        pointer to call count (to catch over complexity)
89*22dc650dSSadaf Ebrahimi   backref_cache   vector for caching back references.
90*22dc650dSSadaf Ebrahimi 
91*22dc650dSSadaf Ebrahimi This function is no longer called when the pattern contains (*ACCEPT); however,
92*22dc650dSSadaf Ebrahimi the old code for returning -1 is retained, just in case.
93*22dc650dSSadaf Ebrahimi 
94*22dc650dSSadaf Ebrahimi Returns:   the minimum length
95*22dc650dSSadaf Ebrahimi            -1 \C in UTF-8 mode
96*22dc650dSSadaf Ebrahimi               or (*ACCEPT)
97*22dc650dSSadaf Ebrahimi               or pattern too complicated
98*22dc650dSSadaf Ebrahimi            -2 internal error (missing capturing bracket)
99*22dc650dSSadaf Ebrahimi            -3 internal error (opcode not listed)
100*22dc650dSSadaf Ebrahimi */
101*22dc650dSSadaf Ebrahimi 
102*22dc650dSSadaf Ebrahimi static int
find_minlength(const pcre2_real_code * re,PCRE2_SPTR code,PCRE2_SPTR startcode,BOOL utf,recurse_check * recurses,int * countptr,int * backref_cache)103*22dc650dSSadaf Ebrahimi find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
104*22dc650dSSadaf Ebrahimi   PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
105*22dc650dSSadaf Ebrahimi   int *backref_cache)
106*22dc650dSSadaf Ebrahimi {
107*22dc650dSSadaf Ebrahimi int length = -1;
108*22dc650dSSadaf Ebrahimi int branchlength = 0;
109*22dc650dSSadaf Ebrahimi int prev_cap_recno = -1;
110*22dc650dSSadaf Ebrahimi int prev_cap_d = 0;
111*22dc650dSSadaf Ebrahimi int prev_recurse_recno = -1;
112*22dc650dSSadaf Ebrahimi int prev_recurse_d = 0;
113*22dc650dSSadaf Ebrahimi uint32_t once_fudge = 0;
114*22dc650dSSadaf Ebrahimi BOOL had_recurse = FALSE;
115*22dc650dSSadaf Ebrahimi BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
116*22dc650dSSadaf Ebrahimi PCRE2_SPTR nextbranch = code + GET(code, 1);
117*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
118*22dc650dSSadaf Ebrahimi recurse_check this_recurse;
119*22dc650dSSadaf Ebrahimi 
120*22dc650dSSadaf Ebrahimi /* If this is a "could be empty" group, its minimum length is 0. */
121*22dc650dSSadaf Ebrahimi 
122*22dc650dSSadaf Ebrahimi if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
123*22dc650dSSadaf Ebrahimi 
124*22dc650dSSadaf Ebrahimi /* Skip over capturing bracket number */
125*22dc650dSSadaf Ebrahimi 
126*22dc650dSSadaf Ebrahimi if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
127*22dc650dSSadaf Ebrahimi 
128*22dc650dSSadaf Ebrahimi /* A large and/or complex regex can take too long to process. */
129*22dc650dSSadaf Ebrahimi 
130*22dc650dSSadaf Ebrahimi if ((*countptr)++ > 1000) return -1;
131*22dc650dSSadaf Ebrahimi 
132*22dc650dSSadaf Ebrahimi /* Scan along the opcodes for this branch. If we get to the end of the branch,
133*22dc650dSSadaf Ebrahimi check the length against that of the other branches. If the accumulated length
134*22dc650dSSadaf Ebrahimi passes 16-bits, reset to that value and skip the rest of the branch. */
135*22dc650dSSadaf Ebrahimi 
136*22dc650dSSadaf Ebrahimi for (;;)
137*22dc650dSSadaf Ebrahimi   {
138*22dc650dSSadaf Ebrahimi   int d, min, recno;
139*22dc650dSSadaf Ebrahimi   PCRE2_UCHAR op, *cs, *ce;
140*22dc650dSSadaf Ebrahimi 
141*22dc650dSSadaf Ebrahimi   if (branchlength >= UINT16_MAX)
142*22dc650dSSadaf Ebrahimi     {
143*22dc650dSSadaf Ebrahimi     branchlength = UINT16_MAX;
144*22dc650dSSadaf Ebrahimi     cc = (PCRE2_UCHAR *)nextbranch;
145*22dc650dSSadaf Ebrahimi     }
146*22dc650dSSadaf Ebrahimi 
147*22dc650dSSadaf Ebrahimi   op = *cc;
148*22dc650dSSadaf Ebrahimi   switch (op)
149*22dc650dSSadaf Ebrahimi     {
150*22dc650dSSadaf Ebrahimi     case OP_COND:
151*22dc650dSSadaf Ebrahimi     case OP_SCOND:
152*22dc650dSSadaf Ebrahimi 
153*22dc650dSSadaf Ebrahimi     /* If there is only one branch in a condition, the implied branch has zero
154*22dc650dSSadaf Ebrahimi     length, so we don't add anything. This covers the DEFINE "condition"
155*22dc650dSSadaf Ebrahimi     automatically. If there are two branches we can treat it the same as any
156*22dc650dSSadaf Ebrahimi     other non-capturing subpattern. */
157*22dc650dSSadaf Ebrahimi 
158*22dc650dSSadaf Ebrahimi     cs = cc + GET(cc, 1);
159*22dc650dSSadaf Ebrahimi     if (*cs != OP_ALT)
160*22dc650dSSadaf Ebrahimi       {
161*22dc650dSSadaf Ebrahimi       cc = cs + 1 + LINK_SIZE;
162*22dc650dSSadaf Ebrahimi       break;
163*22dc650dSSadaf Ebrahimi       }
164*22dc650dSSadaf Ebrahimi     goto PROCESS_NON_CAPTURE;
165*22dc650dSSadaf Ebrahimi 
166*22dc650dSSadaf Ebrahimi     case OP_BRA:
167*22dc650dSSadaf Ebrahimi     /* There's a special case of OP_BRA, when it is wrapped round a repeated
168*22dc650dSSadaf Ebrahimi     OP_RECURSE. We'd like to process the latter at this level so that
169*22dc650dSSadaf Ebrahimi     remembering the value works for repeated cases. So we do nothing, but
170*22dc650dSSadaf Ebrahimi     set a fudge value to skip over the OP_KET after the recurse. */
171*22dc650dSSadaf Ebrahimi 
172*22dc650dSSadaf Ebrahimi     if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
173*22dc650dSSadaf Ebrahimi       {
174*22dc650dSSadaf Ebrahimi       once_fudge = 1 + LINK_SIZE;
175*22dc650dSSadaf Ebrahimi       cc += 1 + LINK_SIZE;
176*22dc650dSSadaf Ebrahimi       break;
177*22dc650dSSadaf Ebrahimi       }
178*22dc650dSSadaf Ebrahimi     /* Fall through */
179*22dc650dSSadaf Ebrahimi 
180*22dc650dSSadaf Ebrahimi     case OP_ONCE:
181*22dc650dSSadaf Ebrahimi     case OP_SCRIPT_RUN:
182*22dc650dSSadaf Ebrahimi     case OP_SBRA:
183*22dc650dSSadaf Ebrahimi     case OP_BRAPOS:
184*22dc650dSSadaf Ebrahimi     case OP_SBRAPOS:
185*22dc650dSSadaf Ebrahimi     PROCESS_NON_CAPTURE:
186*22dc650dSSadaf Ebrahimi     d = find_minlength(re, cc, startcode, utf, recurses, countptr,
187*22dc650dSSadaf Ebrahimi       backref_cache);
188*22dc650dSSadaf Ebrahimi     if (d < 0) return d;
189*22dc650dSSadaf Ebrahimi     branchlength += d;
190*22dc650dSSadaf Ebrahimi     do cc += GET(cc, 1); while (*cc == OP_ALT);
191*22dc650dSSadaf Ebrahimi     cc += 1 + LINK_SIZE;
192*22dc650dSSadaf Ebrahimi     break;
193*22dc650dSSadaf Ebrahimi 
194*22dc650dSSadaf Ebrahimi     /* To save time for repeated capturing subpatterns, we remember the
195*22dc650dSSadaf Ebrahimi     length of the previous one. Unfortunately we can't do the same for
196*22dc650dSSadaf Ebrahimi     the unnumbered ones above. Nor can we do this if (?| is present in the
197*22dc650dSSadaf Ebrahimi     pattern because captures with the same number are not then identical. */
198*22dc650dSSadaf Ebrahimi 
199*22dc650dSSadaf Ebrahimi     case OP_CBRA:
200*22dc650dSSadaf Ebrahimi     case OP_SCBRA:
201*22dc650dSSadaf Ebrahimi     case OP_CBRAPOS:
202*22dc650dSSadaf Ebrahimi     case OP_SCBRAPOS:
203*22dc650dSSadaf Ebrahimi     recno = (int)GET2(cc, 1+LINK_SIZE);
204*22dc650dSSadaf Ebrahimi     if (dupcapused || recno != prev_cap_recno)
205*22dc650dSSadaf Ebrahimi       {
206*22dc650dSSadaf Ebrahimi       prev_cap_recno = recno;
207*22dc650dSSadaf Ebrahimi       prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
208*22dc650dSSadaf Ebrahimi         backref_cache);
209*22dc650dSSadaf Ebrahimi       if (prev_cap_d < 0) return prev_cap_d;
210*22dc650dSSadaf Ebrahimi       }
211*22dc650dSSadaf Ebrahimi     branchlength += prev_cap_d;
212*22dc650dSSadaf Ebrahimi     do cc += GET(cc, 1); while (*cc == OP_ALT);
213*22dc650dSSadaf Ebrahimi     cc += 1 + LINK_SIZE;
214*22dc650dSSadaf Ebrahimi     break;
215*22dc650dSSadaf Ebrahimi 
216*22dc650dSSadaf Ebrahimi     /* ACCEPT makes things far too complicated; we have to give up. In fact,
217*22dc650dSSadaf Ebrahimi     from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not
218*22dc650dSSadaf Ebrahimi     used. However, leave the code in place, just in case. */
219*22dc650dSSadaf Ebrahimi 
220*22dc650dSSadaf Ebrahimi     case OP_ACCEPT:
221*22dc650dSSadaf Ebrahimi     case OP_ASSERT_ACCEPT:
222*22dc650dSSadaf Ebrahimi     return -1;
223*22dc650dSSadaf Ebrahimi 
224*22dc650dSSadaf Ebrahimi     /* Reached end of a branch; if it's a ket it is the end of a nested
225*22dc650dSSadaf Ebrahimi     call. If it's ALT it is an alternation in a nested call. If it is END it's
226*22dc650dSSadaf Ebrahimi     the end of the outer call. All can be handled by the same code. If the
227*22dc650dSSadaf Ebrahimi     length of any branch is zero, there is no need to scan any subsequent
228*22dc650dSSadaf Ebrahimi     branches. */
229*22dc650dSSadaf Ebrahimi 
230*22dc650dSSadaf Ebrahimi     case OP_ALT:
231*22dc650dSSadaf Ebrahimi     case OP_KET:
232*22dc650dSSadaf Ebrahimi     case OP_KETRMAX:
233*22dc650dSSadaf Ebrahimi     case OP_KETRMIN:
234*22dc650dSSadaf Ebrahimi     case OP_KETRPOS:
235*22dc650dSSadaf Ebrahimi     case OP_END:
236*22dc650dSSadaf Ebrahimi     if (length < 0 || (!had_recurse && branchlength < length))
237*22dc650dSSadaf Ebrahimi       length = branchlength;
238*22dc650dSSadaf Ebrahimi     if (op != OP_ALT || length == 0) return length;
239*22dc650dSSadaf Ebrahimi     nextbranch = cc + GET(cc, 1);
240*22dc650dSSadaf Ebrahimi     cc += 1 + LINK_SIZE;
241*22dc650dSSadaf Ebrahimi     branchlength = 0;
242*22dc650dSSadaf Ebrahimi     had_recurse = FALSE;
243*22dc650dSSadaf Ebrahimi     break;
244*22dc650dSSadaf Ebrahimi 
245*22dc650dSSadaf Ebrahimi     /* Skip over assertive subpatterns */
246*22dc650dSSadaf Ebrahimi 
247*22dc650dSSadaf Ebrahimi     case OP_ASSERT:
248*22dc650dSSadaf Ebrahimi     case OP_ASSERT_NOT:
249*22dc650dSSadaf Ebrahimi     case OP_ASSERTBACK:
250*22dc650dSSadaf Ebrahimi     case OP_ASSERTBACK_NOT:
251*22dc650dSSadaf Ebrahimi     case OP_ASSERT_NA:
252*22dc650dSSadaf Ebrahimi     case OP_ASSERTBACK_NA:
253*22dc650dSSadaf Ebrahimi     do cc += GET(cc, 1); while (*cc == OP_ALT);
254*22dc650dSSadaf Ebrahimi     /* Fall through */
255*22dc650dSSadaf Ebrahimi 
256*22dc650dSSadaf Ebrahimi     /* Skip over things that don't match chars */
257*22dc650dSSadaf Ebrahimi 
258*22dc650dSSadaf Ebrahimi     case OP_REVERSE:
259*22dc650dSSadaf Ebrahimi     case OP_VREVERSE:
260*22dc650dSSadaf Ebrahimi     case OP_CREF:
261*22dc650dSSadaf Ebrahimi     case OP_DNCREF:
262*22dc650dSSadaf Ebrahimi     case OP_RREF:
263*22dc650dSSadaf Ebrahimi     case OP_DNRREF:
264*22dc650dSSadaf Ebrahimi     case OP_FALSE:
265*22dc650dSSadaf Ebrahimi     case OP_TRUE:
266*22dc650dSSadaf Ebrahimi     case OP_CALLOUT:
267*22dc650dSSadaf Ebrahimi     case OP_SOD:
268*22dc650dSSadaf Ebrahimi     case OP_SOM:
269*22dc650dSSadaf Ebrahimi     case OP_EOD:
270*22dc650dSSadaf Ebrahimi     case OP_EODN:
271*22dc650dSSadaf Ebrahimi     case OP_CIRC:
272*22dc650dSSadaf Ebrahimi     case OP_CIRCM:
273*22dc650dSSadaf Ebrahimi     case OP_DOLL:
274*22dc650dSSadaf Ebrahimi     case OP_DOLLM:
275*22dc650dSSadaf Ebrahimi     case OP_NOT_WORD_BOUNDARY:
276*22dc650dSSadaf Ebrahimi     case OP_WORD_BOUNDARY:
277*22dc650dSSadaf Ebrahimi     case OP_NOT_UCP_WORD_BOUNDARY:
278*22dc650dSSadaf Ebrahimi     case OP_UCP_WORD_BOUNDARY:
279*22dc650dSSadaf Ebrahimi     cc += PRIV(OP_lengths)[*cc];
280*22dc650dSSadaf Ebrahimi     break;
281*22dc650dSSadaf Ebrahimi 
282*22dc650dSSadaf Ebrahimi     case OP_CALLOUT_STR:
283*22dc650dSSadaf Ebrahimi     cc += GET(cc, 1 + 2*LINK_SIZE);
284*22dc650dSSadaf Ebrahimi     break;
285*22dc650dSSadaf Ebrahimi 
286*22dc650dSSadaf Ebrahimi     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
287*22dc650dSSadaf Ebrahimi 
288*22dc650dSSadaf Ebrahimi     case OP_BRAZERO:
289*22dc650dSSadaf Ebrahimi     case OP_BRAMINZERO:
290*22dc650dSSadaf Ebrahimi     case OP_BRAPOSZERO:
291*22dc650dSSadaf Ebrahimi     case OP_SKIPZERO:
292*22dc650dSSadaf Ebrahimi     cc += PRIV(OP_lengths)[*cc];
293*22dc650dSSadaf Ebrahimi     do cc += GET(cc, 1); while (*cc == OP_ALT);
294*22dc650dSSadaf Ebrahimi     cc += 1 + LINK_SIZE;
295*22dc650dSSadaf Ebrahimi     break;
296*22dc650dSSadaf Ebrahimi 
297*22dc650dSSadaf Ebrahimi     /* Handle literal characters and + repetitions */
298*22dc650dSSadaf Ebrahimi 
299*22dc650dSSadaf Ebrahimi     case OP_CHAR:
300*22dc650dSSadaf Ebrahimi     case OP_CHARI:
301*22dc650dSSadaf Ebrahimi     case OP_NOT:
302*22dc650dSSadaf Ebrahimi     case OP_NOTI:
303*22dc650dSSadaf Ebrahimi     case OP_PLUS:
304*22dc650dSSadaf Ebrahimi     case OP_PLUSI:
305*22dc650dSSadaf Ebrahimi     case OP_MINPLUS:
306*22dc650dSSadaf Ebrahimi     case OP_MINPLUSI:
307*22dc650dSSadaf Ebrahimi     case OP_POSPLUS:
308*22dc650dSSadaf Ebrahimi     case OP_POSPLUSI:
309*22dc650dSSadaf Ebrahimi     case OP_NOTPLUS:
310*22dc650dSSadaf Ebrahimi     case OP_NOTPLUSI:
311*22dc650dSSadaf Ebrahimi     case OP_NOTMINPLUS:
312*22dc650dSSadaf Ebrahimi     case OP_NOTMINPLUSI:
313*22dc650dSSadaf Ebrahimi     case OP_NOTPOSPLUS:
314*22dc650dSSadaf Ebrahimi     case OP_NOTPOSPLUSI:
315*22dc650dSSadaf Ebrahimi     branchlength++;
316*22dc650dSSadaf Ebrahimi     cc += 2;
317*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
318*22dc650dSSadaf Ebrahimi     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
319*22dc650dSSadaf Ebrahimi #endif
320*22dc650dSSadaf Ebrahimi     break;
321*22dc650dSSadaf Ebrahimi 
322*22dc650dSSadaf Ebrahimi     case OP_TYPEPLUS:
323*22dc650dSSadaf Ebrahimi     case OP_TYPEMINPLUS:
324*22dc650dSSadaf Ebrahimi     case OP_TYPEPOSPLUS:
325*22dc650dSSadaf Ebrahimi     branchlength++;
326*22dc650dSSadaf Ebrahimi     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
327*22dc650dSSadaf Ebrahimi     break;
328*22dc650dSSadaf Ebrahimi 
329*22dc650dSSadaf Ebrahimi     /* Handle exact repetitions. The count is already in characters, but we
330*22dc650dSSadaf Ebrahimi     may need to skip over a multibyte character in UTF mode.  */
331*22dc650dSSadaf Ebrahimi 
332*22dc650dSSadaf Ebrahimi     case OP_EXACT:
333*22dc650dSSadaf Ebrahimi     case OP_EXACTI:
334*22dc650dSSadaf Ebrahimi     case OP_NOTEXACT:
335*22dc650dSSadaf Ebrahimi     case OP_NOTEXACTI:
336*22dc650dSSadaf Ebrahimi     branchlength += GET2(cc,1);
337*22dc650dSSadaf Ebrahimi     cc += 2 + IMM2_SIZE;
338*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
339*22dc650dSSadaf Ebrahimi     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
340*22dc650dSSadaf Ebrahimi #endif
341*22dc650dSSadaf Ebrahimi     break;
342*22dc650dSSadaf Ebrahimi 
343*22dc650dSSadaf Ebrahimi     case OP_TYPEEXACT:
344*22dc650dSSadaf Ebrahimi     branchlength += GET2(cc,1);
345*22dc650dSSadaf Ebrahimi     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
346*22dc650dSSadaf Ebrahimi       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
347*22dc650dSSadaf Ebrahimi     break;
348*22dc650dSSadaf Ebrahimi 
349*22dc650dSSadaf Ebrahimi     /* Handle single-char non-literal matchers */
350*22dc650dSSadaf Ebrahimi 
351*22dc650dSSadaf Ebrahimi     case OP_PROP:
352*22dc650dSSadaf Ebrahimi     case OP_NOTPROP:
353*22dc650dSSadaf Ebrahimi     cc += 2;
354*22dc650dSSadaf Ebrahimi     /* Fall through */
355*22dc650dSSadaf Ebrahimi 
356*22dc650dSSadaf Ebrahimi     case OP_NOT_DIGIT:
357*22dc650dSSadaf Ebrahimi     case OP_DIGIT:
358*22dc650dSSadaf Ebrahimi     case OP_NOT_WHITESPACE:
359*22dc650dSSadaf Ebrahimi     case OP_WHITESPACE:
360*22dc650dSSadaf Ebrahimi     case OP_NOT_WORDCHAR:
361*22dc650dSSadaf Ebrahimi     case OP_WORDCHAR:
362*22dc650dSSadaf Ebrahimi     case OP_ANY:
363*22dc650dSSadaf Ebrahimi     case OP_ALLANY:
364*22dc650dSSadaf Ebrahimi     case OP_EXTUNI:
365*22dc650dSSadaf Ebrahimi     case OP_HSPACE:
366*22dc650dSSadaf Ebrahimi     case OP_NOT_HSPACE:
367*22dc650dSSadaf Ebrahimi     case OP_VSPACE:
368*22dc650dSSadaf Ebrahimi     case OP_NOT_VSPACE:
369*22dc650dSSadaf Ebrahimi     branchlength++;
370*22dc650dSSadaf Ebrahimi     cc++;
371*22dc650dSSadaf Ebrahimi     break;
372*22dc650dSSadaf Ebrahimi 
373*22dc650dSSadaf Ebrahimi     /* "Any newline" might match two characters, but it also might match just
374*22dc650dSSadaf Ebrahimi     one. */
375*22dc650dSSadaf Ebrahimi 
376*22dc650dSSadaf Ebrahimi     case OP_ANYNL:
377*22dc650dSSadaf Ebrahimi     branchlength += 1;
378*22dc650dSSadaf Ebrahimi     cc++;
379*22dc650dSSadaf Ebrahimi     break;
380*22dc650dSSadaf Ebrahimi 
381*22dc650dSSadaf Ebrahimi     /* The single-byte matcher means we can't proceed in UTF mode. (In
382*22dc650dSSadaf Ebrahimi     non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
383*22dc650dSSadaf Ebrahimi     appear, but leave the code, just in case.) */
384*22dc650dSSadaf Ebrahimi 
385*22dc650dSSadaf Ebrahimi     case OP_ANYBYTE:
386*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
387*22dc650dSSadaf Ebrahimi     if (utf) return -1;
388*22dc650dSSadaf Ebrahimi #endif
389*22dc650dSSadaf Ebrahimi     branchlength++;
390*22dc650dSSadaf Ebrahimi     cc++;
391*22dc650dSSadaf Ebrahimi     break;
392*22dc650dSSadaf Ebrahimi 
393*22dc650dSSadaf Ebrahimi     /* For repeated character types, we have to test for \p and \P, which have
394*22dc650dSSadaf Ebrahimi     an extra two bytes of parameters. */
395*22dc650dSSadaf Ebrahimi 
396*22dc650dSSadaf Ebrahimi     case OP_TYPESTAR:
397*22dc650dSSadaf Ebrahimi     case OP_TYPEMINSTAR:
398*22dc650dSSadaf Ebrahimi     case OP_TYPEQUERY:
399*22dc650dSSadaf Ebrahimi     case OP_TYPEMINQUERY:
400*22dc650dSSadaf Ebrahimi     case OP_TYPEPOSSTAR:
401*22dc650dSSadaf Ebrahimi     case OP_TYPEPOSQUERY:
402*22dc650dSSadaf Ebrahimi     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
403*22dc650dSSadaf Ebrahimi     cc += PRIV(OP_lengths)[op];
404*22dc650dSSadaf Ebrahimi     break;
405*22dc650dSSadaf Ebrahimi 
406*22dc650dSSadaf Ebrahimi     case OP_TYPEUPTO:
407*22dc650dSSadaf Ebrahimi     case OP_TYPEMINUPTO:
408*22dc650dSSadaf Ebrahimi     case OP_TYPEPOSUPTO:
409*22dc650dSSadaf Ebrahimi     if (cc[1 + IMM2_SIZE] == OP_PROP
410*22dc650dSSadaf Ebrahimi       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
411*22dc650dSSadaf Ebrahimi     cc += PRIV(OP_lengths)[op];
412*22dc650dSSadaf Ebrahimi     break;
413*22dc650dSSadaf Ebrahimi 
414*22dc650dSSadaf Ebrahimi     /* Check a class for variable quantification */
415*22dc650dSSadaf Ebrahimi 
416*22dc650dSSadaf Ebrahimi     case OP_CLASS:
417*22dc650dSSadaf Ebrahimi     case OP_NCLASS:
418*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
419*22dc650dSSadaf Ebrahimi     case OP_XCLASS:
420*22dc650dSSadaf Ebrahimi     /* The original code caused an unsigned overflow in 64 bit systems,
421*22dc650dSSadaf Ebrahimi     so now we use a conditional statement. */
422*22dc650dSSadaf Ebrahimi     if (op == OP_XCLASS)
423*22dc650dSSadaf Ebrahimi       cc += GET(cc, 1);
424*22dc650dSSadaf Ebrahimi     else
425*22dc650dSSadaf Ebrahimi       cc += PRIV(OP_lengths)[OP_CLASS];
426*22dc650dSSadaf Ebrahimi #else
427*22dc650dSSadaf Ebrahimi     cc += PRIV(OP_lengths)[OP_CLASS];
428*22dc650dSSadaf Ebrahimi #endif
429*22dc650dSSadaf Ebrahimi 
430*22dc650dSSadaf Ebrahimi     switch (*cc)
431*22dc650dSSadaf Ebrahimi       {
432*22dc650dSSadaf Ebrahimi       case OP_CRPLUS:
433*22dc650dSSadaf Ebrahimi       case OP_CRMINPLUS:
434*22dc650dSSadaf Ebrahimi       case OP_CRPOSPLUS:
435*22dc650dSSadaf Ebrahimi       branchlength++;
436*22dc650dSSadaf Ebrahimi       /* Fall through */
437*22dc650dSSadaf Ebrahimi 
438*22dc650dSSadaf Ebrahimi       case OP_CRSTAR:
439*22dc650dSSadaf Ebrahimi       case OP_CRMINSTAR:
440*22dc650dSSadaf Ebrahimi       case OP_CRQUERY:
441*22dc650dSSadaf Ebrahimi       case OP_CRMINQUERY:
442*22dc650dSSadaf Ebrahimi       case OP_CRPOSSTAR:
443*22dc650dSSadaf Ebrahimi       case OP_CRPOSQUERY:
444*22dc650dSSadaf Ebrahimi       cc++;
445*22dc650dSSadaf Ebrahimi       break;
446*22dc650dSSadaf Ebrahimi 
447*22dc650dSSadaf Ebrahimi       case OP_CRRANGE:
448*22dc650dSSadaf Ebrahimi       case OP_CRMINRANGE:
449*22dc650dSSadaf Ebrahimi       case OP_CRPOSRANGE:
450*22dc650dSSadaf Ebrahimi       branchlength += GET2(cc,1);
451*22dc650dSSadaf Ebrahimi       cc += 1 + 2 * IMM2_SIZE;
452*22dc650dSSadaf Ebrahimi       break;
453*22dc650dSSadaf Ebrahimi 
454*22dc650dSSadaf Ebrahimi       default:
455*22dc650dSSadaf Ebrahimi       branchlength++;
456*22dc650dSSadaf Ebrahimi       break;
457*22dc650dSSadaf Ebrahimi       }
458*22dc650dSSadaf Ebrahimi     break;
459*22dc650dSSadaf Ebrahimi 
460*22dc650dSSadaf Ebrahimi     /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
461*22dc650dSSadaf Ebrahimi     way: we find the minimum length for the subpattern. A recursion
462*22dc650dSSadaf Ebrahimi     (backreference or subroutine) causes an a flag to be set that causes the
463*22dc650dSSadaf Ebrahimi     length of this branch to be ignored. The logic is that a recursion can only
464*22dc650dSSadaf Ebrahimi     make sense if there is another alternative that stops the recursing. That
465*22dc650dSSadaf Ebrahimi     will provide the minimum length (when no recursion happens).
466*22dc650dSSadaf Ebrahimi 
467*22dc650dSSadaf Ebrahimi     If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
468*22dc650dSSadaf Ebrahimi     matches an empty string (by default it causes a matching failure), so in
469*22dc650dSSadaf Ebrahimi     that case we must set the minimum length to zero.
470*22dc650dSSadaf Ebrahimi 
471*22dc650dSSadaf Ebrahimi     For backreferenes, if duplicate numbers are present in the pattern we check
472*22dc650dSSadaf Ebrahimi     for a reference to a duplicate. If it is, we don't know which version will
473*22dc650dSSadaf Ebrahimi     be referenced, so we have to set the minimum length to zero. */
474*22dc650dSSadaf Ebrahimi 
475*22dc650dSSadaf Ebrahimi     /* Duplicate named pattern back reference. */
476*22dc650dSSadaf Ebrahimi 
477*22dc650dSSadaf Ebrahimi     case OP_DNREF:
478*22dc650dSSadaf Ebrahimi     case OP_DNREFI:
479*22dc650dSSadaf Ebrahimi     if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
480*22dc650dSSadaf Ebrahimi       {
481*22dc650dSSadaf Ebrahimi       int count = GET2(cc, 1+IMM2_SIZE);
482*22dc650dSSadaf Ebrahimi       PCRE2_UCHAR *slot =
483*22dc650dSSadaf Ebrahimi         (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
484*22dc650dSSadaf Ebrahimi           GET2(cc, 1) * re->name_entry_size;
485*22dc650dSSadaf Ebrahimi 
486*22dc650dSSadaf Ebrahimi       d = INT_MAX;
487*22dc650dSSadaf Ebrahimi 
488*22dc650dSSadaf Ebrahimi       /* Scan all groups with the same name; find the shortest. */
489*22dc650dSSadaf Ebrahimi 
490*22dc650dSSadaf Ebrahimi       while (count-- > 0)
491*22dc650dSSadaf Ebrahimi         {
492*22dc650dSSadaf Ebrahimi         int dd, i;
493*22dc650dSSadaf Ebrahimi         recno = GET2(slot, 0);
494*22dc650dSSadaf Ebrahimi 
495*22dc650dSSadaf Ebrahimi         if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
496*22dc650dSSadaf Ebrahimi           dd = backref_cache[recno];
497*22dc650dSSadaf Ebrahimi         else
498*22dc650dSSadaf Ebrahimi           {
499*22dc650dSSadaf Ebrahimi           ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
500*22dc650dSSadaf Ebrahimi           if (cs == NULL) return -2;
501*22dc650dSSadaf Ebrahimi           do ce += GET(ce, 1); while (*ce == OP_ALT);
502*22dc650dSSadaf Ebrahimi 
503*22dc650dSSadaf Ebrahimi           dd = 0;
504*22dc650dSSadaf Ebrahimi           if (!dupcapused ||
505*22dc650dSSadaf Ebrahimi               (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
506*22dc650dSSadaf Ebrahimi             {
507*22dc650dSSadaf Ebrahimi             if (cc > cs && cc < ce)    /* Simple recursion */
508*22dc650dSSadaf Ebrahimi               {
509*22dc650dSSadaf Ebrahimi               had_recurse = TRUE;
510*22dc650dSSadaf Ebrahimi               }
511*22dc650dSSadaf Ebrahimi             else
512*22dc650dSSadaf Ebrahimi               {
513*22dc650dSSadaf Ebrahimi               recurse_check *r = recurses;
514*22dc650dSSadaf Ebrahimi               for (r = recurses; r != NULL; r = r->prev)
515*22dc650dSSadaf Ebrahimi                 if (r->group == cs) break;
516*22dc650dSSadaf Ebrahimi               if (r != NULL)           /* Mutual recursion */
517*22dc650dSSadaf Ebrahimi                 {
518*22dc650dSSadaf Ebrahimi                 had_recurse = TRUE;
519*22dc650dSSadaf Ebrahimi                 }
520*22dc650dSSadaf Ebrahimi               else
521*22dc650dSSadaf Ebrahimi                 {
522*22dc650dSSadaf Ebrahimi                 this_recurse.prev = recurses;  /* No recursion */
523*22dc650dSSadaf Ebrahimi                 this_recurse.group = cs;
524*22dc650dSSadaf Ebrahimi                 dd = find_minlength(re, cs, startcode, utf, &this_recurse,
525*22dc650dSSadaf Ebrahimi                   countptr, backref_cache);
526*22dc650dSSadaf Ebrahimi                 if (dd < 0) return dd;
527*22dc650dSSadaf Ebrahimi                 }
528*22dc650dSSadaf Ebrahimi               }
529*22dc650dSSadaf Ebrahimi             }
530*22dc650dSSadaf Ebrahimi 
531*22dc650dSSadaf Ebrahimi           backref_cache[recno] = dd;
532*22dc650dSSadaf Ebrahimi           for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
533*22dc650dSSadaf Ebrahimi           backref_cache[0] = recno;
534*22dc650dSSadaf Ebrahimi           }
535*22dc650dSSadaf Ebrahimi 
536*22dc650dSSadaf Ebrahimi         if (dd < d) d = dd;
537*22dc650dSSadaf Ebrahimi         if (d <= 0) break;    /* No point looking at any more */
538*22dc650dSSadaf Ebrahimi         slot += re->name_entry_size;
539*22dc650dSSadaf Ebrahimi         }
540*22dc650dSSadaf Ebrahimi       }
541*22dc650dSSadaf Ebrahimi     else d = 0;
542*22dc650dSSadaf Ebrahimi     cc += 1 + 2*IMM2_SIZE;
543*22dc650dSSadaf Ebrahimi     goto REPEAT_BACK_REFERENCE;
544*22dc650dSSadaf Ebrahimi 
545*22dc650dSSadaf Ebrahimi     /* Single back reference by number. References by name are converted to by
546*22dc650dSSadaf Ebrahimi     number when there is no duplication. */
547*22dc650dSSadaf Ebrahimi 
548*22dc650dSSadaf Ebrahimi     case OP_REF:
549*22dc650dSSadaf Ebrahimi     case OP_REFI:
550*22dc650dSSadaf Ebrahimi     recno = GET2(cc, 1);
551*22dc650dSSadaf Ebrahimi     if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
552*22dc650dSSadaf Ebrahimi       d = backref_cache[recno];
553*22dc650dSSadaf Ebrahimi     else
554*22dc650dSSadaf Ebrahimi       {
555*22dc650dSSadaf Ebrahimi       int i;
556*22dc650dSSadaf Ebrahimi       d = 0;
557*22dc650dSSadaf Ebrahimi 
558*22dc650dSSadaf Ebrahimi       if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
559*22dc650dSSadaf Ebrahimi         {
560*22dc650dSSadaf Ebrahimi         ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
561*22dc650dSSadaf Ebrahimi         if (cs == NULL) return -2;
562*22dc650dSSadaf Ebrahimi         do ce += GET(ce, 1); while (*ce == OP_ALT);
563*22dc650dSSadaf Ebrahimi 
564*22dc650dSSadaf Ebrahimi         if (!dupcapused ||
565*22dc650dSSadaf Ebrahimi             (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
566*22dc650dSSadaf Ebrahimi           {
567*22dc650dSSadaf Ebrahimi           if (cc > cs && cc < ce)    /* Simple recursion */
568*22dc650dSSadaf Ebrahimi             {
569*22dc650dSSadaf Ebrahimi             had_recurse = TRUE;
570*22dc650dSSadaf Ebrahimi             }
571*22dc650dSSadaf Ebrahimi           else
572*22dc650dSSadaf Ebrahimi             {
573*22dc650dSSadaf Ebrahimi             recurse_check *r = recurses;
574*22dc650dSSadaf Ebrahimi             for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
575*22dc650dSSadaf Ebrahimi             if (r != NULL)           /* Mutual recursion */
576*22dc650dSSadaf Ebrahimi               {
577*22dc650dSSadaf Ebrahimi               had_recurse = TRUE;
578*22dc650dSSadaf Ebrahimi               }
579*22dc650dSSadaf Ebrahimi             else                     /* No recursion */
580*22dc650dSSadaf Ebrahimi               {
581*22dc650dSSadaf Ebrahimi               this_recurse.prev = recurses;
582*22dc650dSSadaf Ebrahimi               this_recurse.group = cs;
583*22dc650dSSadaf Ebrahimi               d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
584*22dc650dSSadaf Ebrahimi                 backref_cache);
585*22dc650dSSadaf Ebrahimi               if (d < 0) return d;
586*22dc650dSSadaf Ebrahimi               }
587*22dc650dSSadaf Ebrahimi             }
588*22dc650dSSadaf Ebrahimi           }
589*22dc650dSSadaf Ebrahimi         }
590*22dc650dSSadaf Ebrahimi 
591*22dc650dSSadaf Ebrahimi       backref_cache[recno] = d;
592*22dc650dSSadaf Ebrahimi       for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
593*22dc650dSSadaf Ebrahimi       backref_cache[0] = recno;
594*22dc650dSSadaf Ebrahimi       }
595*22dc650dSSadaf Ebrahimi 
596*22dc650dSSadaf Ebrahimi     cc += 1 + IMM2_SIZE;
597*22dc650dSSadaf Ebrahimi 
598*22dc650dSSadaf Ebrahimi     /* Handle repeated back references */
599*22dc650dSSadaf Ebrahimi 
600*22dc650dSSadaf Ebrahimi     REPEAT_BACK_REFERENCE:
601*22dc650dSSadaf Ebrahimi     switch (*cc)
602*22dc650dSSadaf Ebrahimi       {
603*22dc650dSSadaf Ebrahimi       case OP_CRSTAR:
604*22dc650dSSadaf Ebrahimi       case OP_CRMINSTAR:
605*22dc650dSSadaf Ebrahimi       case OP_CRQUERY:
606*22dc650dSSadaf Ebrahimi       case OP_CRMINQUERY:
607*22dc650dSSadaf Ebrahimi       case OP_CRPOSSTAR:
608*22dc650dSSadaf Ebrahimi       case OP_CRPOSQUERY:
609*22dc650dSSadaf Ebrahimi       min = 0;
610*22dc650dSSadaf Ebrahimi       cc++;
611*22dc650dSSadaf Ebrahimi       break;
612*22dc650dSSadaf Ebrahimi 
613*22dc650dSSadaf Ebrahimi       case OP_CRPLUS:
614*22dc650dSSadaf Ebrahimi       case OP_CRMINPLUS:
615*22dc650dSSadaf Ebrahimi       case OP_CRPOSPLUS:
616*22dc650dSSadaf Ebrahimi       min = 1;
617*22dc650dSSadaf Ebrahimi       cc++;
618*22dc650dSSadaf Ebrahimi       break;
619*22dc650dSSadaf Ebrahimi 
620*22dc650dSSadaf Ebrahimi       case OP_CRRANGE:
621*22dc650dSSadaf Ebrahimi       case OP_CRMINRANGE:
622*22dc650dSSadaf Ebrahimi       case OP_CRPOSRANGE:
623*22dc650dSSadaf Ebrahimi       min = GET2(cc, 1);
624*22dc650dSSadaf Ebrahimi       cc += 1 + 2 * IMM2_SIZE;
625*22dc650dSSadaf Ebrahimi       break;
626*22dc650dSSadaf Ebrahimi 
627*22dc650dSSadaf Ebrahimi       default:
628*22dc650dSSadaf Ebrahimi       min = 1;
629*22dc650dSSadaf Ebrahimi       break;
630*22dc650dSSadaf Ebrahimi       }
631*22dc650dSSadaf Ebrahimi 
632*22dc650dSSadaf Ebrahimi      /* Take care not to overflow: (1) min and d are ints, so check that their
633*22dc650dSSadaf Ebrahimi      product is not greater than INT_MAX. (2) branchlength is limited to
634*22dc650dSSadaf Ebrahimi      UINT16_MAX (checked at the top of the loop). */
635*22dc650dSSadaf Ebrahimi 
636*22dc650dSSadaf Ebrahimi     if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
637*22dc650dSSadaf Ebrahimi       branchlength = UINT16_MAX;
638*22dc650dSSadaf Ebrahimi     else branchlength += min * d;
639*22dc650dSSadaf Ebrahimi     break;
640*22dc650dSSadaf Ebrahimi 
641*22dc650dSSadaf Ebrahimi     /* Recursion always refers to the first occurrence of a subpattern with a
642*22dc650dSSadaf Ebrahimi     given number. Therefore, we can always make use of caching, even when the
643*22dc650dSSadaf Ebrahimi     pattern contains multiple subpatterns with the same number. */
644*22dc650dSSadaf Ebrahimi 
645*22dc650dSSadaf Ebrahimi     case OP_RECURSE:
646*22dc650dSSadaf Ebrahimi     cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
647*22dc650dSSadaf Ebrahimi     recno = GET2(cs, 1+LINK_SIZE);
648*22dc650dSSadaf Ebrahimi     if (recno == prev_recurse_recno)
649*22dc650dSSadaf Ebrahimi       {
650*22dc650dSSadaf Ebrahimi       branchlength += prev_recurse_d;
651*22dc650dSSadaf Ebrahimi       }
652*22dc650dSSadaf Ebrahimi     else
653*22dc650dSSadaf Ebrahimi       {
654*22dc650dSSadaf Ebrahimi       do ce += GET(ce, 1); while (*ce == OP_ALT);
655*22dc650dSSadaf Ebrahimi       if (cc > cs && cc < ce)    /* Simple recursion */
656*22dc650dSSadaf Ebrahimi         had_recurse = TRUE;
657*22dc650dSSadaf Ebrahimi       else
658*22dc650dSSadaf Ebrahimi         {
659*22dc650dSSadaf Ebrahimi         recurse_check *r = recurses;
660*22dc650dSSadaf Ebrahimi         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
661*22dc650dSSadaf Ebrahimi         if (r != NULL)          /* Mutual recursion */
662*22dc650dSSadaf Ebrahimi           had_recurse = TRUE;
663*22dc650dSSadaf Ebrahimi         else
664*22dc650dSSadaf Ebrahimi           {
665*22dc650dSSadaf Ebrahimi           this_recurse.prev = recurses;
666*22dc650dSSadaf Ebrahimi           this_recurse.group = cs;
667*22dc650dSSadaf Ebrahimi           prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
668*22dc650dSSadaf Ebrahimi             countptr, backref_cache);
669*22dc650dSSadaf Ebrahimi           if (prev_recurse_d < 0) return prev_recurse_d;
670*22dc650dSSadaf Ebrahimi           prev_recurse_recno = recno;
671*22dc650dSSadaf Ebrahimi           branchlength += prev_recurse_d;
672*22dc650dSSadaf Ebrahimi           }
673*22dc650dSSadaf Ebrahimi         }
674*22dc650dSSadaf Ebrahimi       }
675*22dc650dSSadaf Ebrahimi     cc += 1 + LINK_SIZE + once_fudge;
676*22dc650dSSadaf Ebrahimi     once_fudge = 0;
677*22dc650dSSadaf Ebrahimi     break;
678*22dc650dSSadaf Ebrahimi 
679*22dc650dSSadaf Ebrahimi     /* Anything else does not or need not match a character. We can get the
680*22dc650dSSadaf Ebrahimi     item's length from the table, but for those that can match zero occurrences
681*22dc650dSSadaf Ebrahimi     of a character, we must take special action for UTF-8 characters. As it
682*22dc650dSSadaf Ebrahimi     happens, the "NOT" versions of these opcodes are used at present only for
683*22dc650dSSadaf Ebrahimi     ASCII characters, so they could be omitted from this list. However, in
684*22dc650dSSadaf Ebrahimi     future that may change, so we include them here so as not to leave a
685*22dc650dSSadaf Ebrahimi     gotcha for a future maintainer. */
686*22dc650dSSadaf Ebrahimi 
687*22dc650dSSadaf Ebrahimi     case OP_UPTO:
688*22dc650dSSadaf Ebrahimi     case OP_UPTOI:
689*22dc650dSSadaf Ebrahimi     case OP_NOTUPTO:
690*22dc650dSSadaf Ebrahimi     case OP_NOTUPTOI:
691*22dc650dSSadaf Ebrahimi     case OP_MINUPTO:
692*22dc650dSSadaf Ebrahimi     case OP_MINUPTOI:
693*22dc650dSSadaf Ebrahimi     case OP_NOTMINUPTO:
694*22dc650dSSadaf Ebrahimi     case OP_NOTMINUPTOI:
695*22dc650dSSadaf Ebrahimi     case OP_POSUPTO:
696*22dc650dSSadaf Ebrahimi     case OP_POSUPTOI:
697*22dc650dSSadaf Ebrahimi     case OP_NOTPOSUPTO:
698*22dc650dSSadaf Ebrahimi     case OP_NOTPOSUPTOI:
699*22dc650dSSadaf Ebrahimi 
700*22dc650dSSadaf Ebrahimi     case OP_STAR:
701*22dc650dSSadaf Ebrahimi     case OP_STARI:
702*22dc650dSSadaf Ebrahimi     case OP_NOTSTAR:
703*22dc650dSSadaf Ebrahimi     case OP_NOTSTARI:
704*22dc650dSSadaf Ebrahimi     case OP_MINSTAR:
705*22dc650dSSadaf Ebrahimi     case OP_MINSTARI:
706*22dc650dSSadaf Ebrahimi     case OP_NOTMINSTAR:
707*22dc650dSSadaf Ebrahimi     case OP_NOTMINSTARI:
708*22dc650dSSadaf Ebrahimi     case OP_POSSTAR:
709*22dc650dSSadaf Ebrahimi     case OP_POSSTARI:
710*22dc650dSSadaf Ebrahimi     case OP_NOTPOSSTAR:
711*22dc650dSSadaf Ebrahimi     case OP_NOTPOSSTARI:
712*22dc650dSSadaf Ebrahimi 
713*22dc650dSSadaf Ebrahimi     case OP_QUERY:
714*22dc650dSSadaf Ebrahimi     case OP_QUERYI:
715*22dc650dSSadaf Ebrahimi     case OP_NOTQUERY:
716*22dc650dSSadaf Ebrahimi     case OP_NOTQUERYI:
717*22dc650dSSadaf Ebrahimi     case OP_MINQUERY:
718*22dc650dSSadaf Ebrahimi     case OP_MINQUERYI:
719*22dc650dSSadaf Ebrahimi     case OP_NOTMINQUERY:
720*22dc650dSSadaf Ebrahimi     case OP_NOTMINQUERYI:
721*22dc650dSSadaf Ebrahimi     case OP_POSQUERY:
722*22dc650dSSadaf Ebrahimi     case OP_POSQUERYI:
723*22dc650dSSadaf Ebrahimi     case OP_NOTPOSQUERY:
724*22dc650dSSadaf Ebrahimi     case OP_NOTPOSQUERYI:
725*22dc650dSSadaf Ebrahimi 
726*22dc650dSSadaf Ebrahimi     cc += PRIV(OP_lengths)[op];
727*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
728*22dc650dSSadaf Ebrahimi     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
729*22dc650dSSadaf Ebrahimi #endif
730*22dc650dSSadaf Ebrahimi     break;
731*22dc650dSSadaf Ebrahimi 
732*22dc650dSSadaf Ebrahimi     /* Skip these, but we need to add in the name length. */
733*22dc650dSSadaf Ebrahimi 
734*22dc650dSSadaf Ebrahimi     case OP_MARK:
735*22dc650dSSadaf Ebrahimi     case OP_COMMIT_ARG:
736*22dc650dSSadaf Ebrahimi     case OP_PRUNE_ARG:
737*22dc650dSSadaf Ebrahimi     case OP_SKIP_ARG:
738*22dc650dSSadaf Ebrahimi     case OP_THEN_ARG:
739*22dc650dSSadaf Ebrahimi     cc += PRIV(OP_lengths)[op] + cc[1];
740*22dc650dSSadaf Ebrahimi     break;
741*22dc650dSSadaf Ebrahimi 
742*22dc650dSSadaf Ebrahimi     /* The remaining opcodes are just skipped over. */
743*22dc650dSSadaf Ebrahimi 
744*22dc650dSSadaf Ebrahimi     case OP_CLOSE:
745*22dc650dSSadaf Ebrahimi     case OP_COMMIT:
746*22dc650dSSadaf Ebrahimi     case OP_FAIL:
747*22dc650dSSadaf Ebrahimi     case OP_PRUNE:
748*22dc650dSSadaf Ebrahimi     case OP_SET_SOM:
749*22dc650dSSadaf Ebrahimi     case OP_SKIP:
750*22dc650dSSadaf Ebrahimi     case OP_THEN:
751*22dc650dSSadaf Ebrahimi     cc += PRIV(OP_lengths)[op];
752*22dc650dSSadaf Ebrahimi     break;
753*22dc650dSSadaf Ebrahimi 
754*22dc650dSSadaf Ebrahimi     /* This should not occur: we list all opcodes explicitly so that when
755*22dc650dSSadaf Ebrahimi     new ones get added they are properly considered. */
756*22dc650dSSadaf Ebrahimi 
757*22dc650dSSadaf Ebrahimi     default:
758*22dc650dSSadaf Ebrahimi     return -3;
759*22dc650dSSadaf Ebrahimi     }
760*22dc650dSSadaf Ebrahimi   }
761*22dc650dSSadaf Ebrahimi /* Control never gets here */
762*22dc650dSSadaf Ebrahimi }
763*22dc650dSSadaf Ebrahimi 
764*22dc650dSSadaf Ebrahimi 
765*22dc650dSSadaf Ebrahimi 
766*22dc650dSSadaf Ebrahimi /*************************************************
767*22dc650dSSadaf Ebrahimi *      Set a bit and maybe its alternate case    *
768*22dc650dSSadaf Ebrahimi *************************************************/
769*22dc650dSSadaf Ebrahimi 
770*22dc650dSSadaf Ebrahimi /* Given a character, set its first code unit's bit in the table, and also the
771*22dc650dSSadaf Ebrahimi corresponding bit for the other version of a letter if we are caseless.
772*22dc650dSSadaf Ebrahimi 
773*22dc650dSSadaf Ebrahimi Arguments:
774*22dc650dSSadaf Ebrahimi   re            points to the regex block
775*22dc650dSSadaf Ebrahimi   p             points to the first code unit of the character
776*22dc650dSSadaf Ebrahimi   caseless      TRUE if caseless
777*22dc650dSSadaf Ebrahimi   utf           TRUE for UTF mode
778*22dc650dSSadaf Ebrahimi   ucp           TRUE for UCP mode
779*22dc650dSSadaf Ebrahimi 
780*22dc650dSSadaf Ebrahimi Returns:        pointer after the character
781*22dc650dSSadaf Ebrahimi */
782*22dc650dSSadaf Ebrahimi 
783*22dc650dSSadaf Ebrahimi static PCRE2_SPTR
set_table_bit(pcre2_real_code * re,PCRE2_SPTR p,BOOL caseless,BOOL utf,BOOL ucp)784*22dc650dSSadaf Ebrahimi set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
785*22dc650dSSadaf Ebrahimi   BOOL ucp)
786*22dc650dSSadaf Ebrahimi {
787*22dc650dSSadaf Ebrahimi uint32_t c = *p++;   /* First code unit */
788*22dc650dSSadaf Ebrahimi 
789*22dc650dSSadaf Ebrahimi (void)utf;           /* Stop compiler warnings when UTF not supported */
790*22dc650dSSadaf Ebrahimi (void)ucp;
791*22dc650dSSadaf Ebrahimi 
792*22dc650dSSadaf Ebrahimi /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
793*22dc650dSSadaf Ebrahimi 0xff. */
794*22dc650dSSadaf Ebrahimi 
795*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
796*22dc650dSSadaf Ebrahimi if (c > 0xff) SET_BIT(0xff); else
797*22dc650dSSadaf Ebrahimi #endif
798*22dc650dSSadaf Ebrahimi 
799*22dc650dSSadaf Ebrahimi SET_BIT(c);
800*22dc650dSSadaf Ebrahimi 
801*22dc650dSSadaf Ebrahimi /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
802*22dc650dSSadaf Ebrahimi the end of the character, even when caseless. */
803*22dc650dSSadaf Ebrahimi 
804*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
805*22dc650dSSadaf Ebrahimi if (utf)
806*22dc650dSSadaf Ebrahimi   {
807*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
808*22dc650dSSadaf Ebrahimi   if (c >= 0xc0) GETUTF8INC(c, p);
809*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 16
810*22dc650dSSadaf Ebrahimi   if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
811*22dc650dSSadaf Ebrahimi #endif
812*22dc650dSSadaf Ebrahimi   }
813*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
814*22dc650dSSadaf Ebrahimi 
815*22dc650dSSadaf Ebrahimi /* If caseless, handle the other case of the character. */
816*22dc650dSSadaf Ebrahimi 
817*22dc650dSSadaf Ebrahimi if (caseless)
818*22dc650dSSadaf Ebrahimi   {
819*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
820*22dc650dSSadaf Ebrahimi   if (utf || ucp)
821*22dc650dSSadaf Ebrahimi     {
822*22dc650dSSadaf Ebrahimi     c = UCD_OTHERCASE(c);
823*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
824*22dc650dSSadaf Ebrahimi     if (utf)
825*22dc650dSSadaf Ebrahimi       {
826*22dc650dSSadaf Ebrahimi       PCRE2_UCHAR buff[6];
827*22dc650dSSadaf Ebrahimi       (void)PRIV(ord2utf)(c, buff);
828*22dc650dSSadaf Ebrahimi       SET_BIT(buff[0]);
829*22dc650dSSadaf Ebrahimi       }
830*22dc650dSSadaf Ebrahimi     else if (c < 256) SET_BIT(c);
831*22dc650dSSadaf Ebrahimi #else  /* 16-bit or 32-bit mode */
832*22dc650dSSadaf Ebrahimi     if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
833*22dc650dSSadaf Ebrahimi #endif
834*22dc650dSSadaf Ebrahimi     }
835*22dc650dSSadaf Ebrahimi 
836*22dc650dSSadaf Ebrahimi   else
837*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
838*22dc650dSSadaf Ebrahimi 
839*22dc650dSSadaf Ebrahimi   /* Not UTF or UCP */
840*22dc650dSSadaf Ebrahimi 
841*22dc650dSSadaf Ebrahimi   if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
842*22dc650dSSadaf Ebrahimi   }
843*22dc650dSSadaf Ebrahimi 
844*22dc650dSSadaf Ebrahimi return p;
845*22dc650dSSadaf Ebrahimi }
846*22dc650dSSadaf Ebrahimi 
847*22dc650dSSadaf Ebrahimi 
848*22dc650dSSadaf Ebrahimi 
849*22dc650dSSadaf Ebrahimi /*************************************************
850*22dc650dSSadaf Ebrahimi *     Set bits for a positive character type     *
851*22dc650dSSadaf Ebrahimi *************************************************/
852*22dc650dSSadaf Ebrahimi 
853*22dc650dSSadaf Ebrahimi /* This function sets starting bits for a character type. In UTF-8 mode, we can
854*22dc650dSSadaf Ebrahimi only do a direct setting for bytes less than 128, as otherwise there can be
855*22dc650dSSadaf Ebrahimi confusion with bytes in the middle of UTF-8 characters. In a "traditional"
856*22dc650dSSadaf Ebrahimi environment, the tables will only recognize ASCII characters anyway, but in at
857*22dc650dSSadaf Ebrahimi least one Windows environment, some higher bytes bits were set in the tables.
858*22dc650dSSadaf Ebrahimi So we deal with that case by considering the UTF-8 encoding.
859*22dc650dSSadaf Ebrahimi 
860*22dc650dSSadaf Ebrahimi Arguments:
861*22dc650dSSadaf Ebrahimi   re             the regex block
862*22dc650dSSadaf Ebrahimi   cbit type      the type of character wanted
863*22dc650dSSadaf Ebrahimi   table_limit    32 for non-UTF-8; 16 for UTF-8
864*22dc650dSSadaf Ebrahimi 
865*22dc650dSSadaf Ebrahimi Returns:         nothing
866*22dc650dSSadaf Ebrahimi */
867*22dc650dSSadaf Ebrahimi 
868*22dc650dSSadaf Ebrahimi static void
set_type_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)869*22dc650dSSadaf Ebrahimi set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
870*22dc650dSSadaf Ebrahimi {
871*22dc650dSSadaf Ebrahimi uint32_t c;
872*22dc650dSSadaf Ebrahimi for (c = 0; c < table_limit; c++)
873*22dc650dSSadaf Ebrahimi   re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
874*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
875*22dc650dSSadaf Ebrahimi if (table_limit == 32) return;
876*22dc650dSSadaf Ebrahimi for (c = 128; c < 256; c++)
877*22dc650dSSadaf Ebrahimi   {
878*22dc650dSSadaf Ebrahimi   if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
879*22dc650dSSadaf Ebrahimi     {
880*22dc650dSSadaf Ebrahimi     PCRE2_UCHAR buff[6];
881*22dc650dSSadaf Ebrahimi     (void)PRIV(ord2utf)(c, buff);
882*22dc650dSSadaf Ebrahimi     SET_BIT(buff[0]);
883*22dc650dSSadaf Ebrahimi     }
884*22dc650dSSadaf Ebrahimi   }
885*22dc650dSSadaf Ebrahimi #endif  /* UTF-8 */
886*22dc650dSSadaf Ebrahimi }
887*22dc650dSSadaf Ebrahimi 
888*22dc650dSSadaf Ebrahimi 
889*22dc650dSSadaf Ebrahimi /*************************************************
890*22dc650dSSadaf Ebrahimi *     Set bits for a negative character type     *
891*22dc650dSSadaf Ebrahimi *************************************************/
892*22dc650dSSadaf Ebrahimi 
893*22dc650dSSadaf Ebrahimi /* This function sets starting bits for a negative character type such as \D.
894*22dc650dSSadaf Ebrahimi In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
895*22dc650dSSadaf Ebrahimi otherwise there can be confusion with bytes in the middle of UTF-8 characters.
896*22dc650dSSadaf Ebrahimi Unlike in the positive case, where we can set appropriate starting bits for
897*22dc650dSSadaf Ebrahimi specific high-valued UTF-8 characters, in this case we have to set the bits for
898*22dc650dSSadaf Ebrahimi all high-valued characters. The lowest is 0xc2, but we overkill by starting at
899*22dc650dSSadaf Ebrahimi 0xc0 (192) for simplicity.
900*22dc650dSSadaf Ebrahimi 
901*22dc650dSSadaf Ebrahimi Arguments:
902*22dc650dSSadaf Ebrahimi   re             the regex block
903*22dc650dSSadaf Ebrahimi   cbit type      the type of character wanted
904*22dc650dSSadaf Ebrahimi   table_limit    32 for non-UTF-8; 16 for UTF-8
905*22dc650dSSadaf Ebrahimi 
906*22dc650dSSadaf Ebrahimi Returns:         nothing
907*22dc650dSSadaf Ebrahimi */
908*22dc650dSSadaf Ebrahimi 
909*22dc650dSSadaf Ebrahimi static void
set_nottype_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)910*22dc650dSSadaf Ebrahimi set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
911*22dc650dSSadaf Ebrahimi {
912*22dc650dSSadaf Ebrahimi uint32_t c;
913*22dc650dSSadaf Ebrahimi for (c = 0; c < table_limit; c++)
914*22dc650dSSadaf Ebrahimi   re->start_bitmap[c] |= (uint8_t)(~(re->tables[c+cbits_offset+cbit_type]));
915*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
916*22dc650dSSadaf Ebrahimi if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
917*22dc650dSSadaf Ebrahimi #endif
918*22dc650dSSadaf Ebrahimi }
919*22dc650dSSadaf Ebrahimi 
920*22dc650dSSadaf Ebrahimi 
921*22dc650dSSadaf Ebrahimi 
922*22dc650dSSadaf Ebrahimi /*************************************************
923*22dc650dSSadaf Ebrahimi *      Create bitmap of starting code units      *
924*22dc650dSSadaf Ebrahimi *************************************************/
925*22dc650dSSadaf Ebrahimi 
926*22dc650dSSadaf Ebrahimi /* This function scans a compiled unanchored expression recursively and
927*22dc650dSSadaf Ebrahimi attempts to build a bitmap of the set of possible starting code units whose
928*22dc650dSSadaf Ebrahimi values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
929*22dc650dSSadaf Ebrahimi the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
930*22dc650dSSadaf Ebrahimi we pass a value of 16 rather than 32 as the final argument. (See comments in
931*22dc650dSSadaf Ebrahimi those functions for the reason.)
932*22dc650dSSadaf Ebrahimi 
933*22dc650dSSadaf Ebrahimi The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
934*22dc650dSSadaf Ebrahimi (a*)b where the group provides some optional starting code units but scanning
935*22dc650dSSadaf Ebrahimi must continue at the outer level to find at least one mandatory code unit. At
936*22dc650dSSadaf Ebrahimi the outermost level, this function fails unless the result is SSB_DONE.
937*22dc650dSSadaf Ebrahimi 
938*22dc650dSSadaf Ebrahimi We restrict recursion (for nested groups) to 1000 to avoid stack overflow
939*22dc650dSSadaf Ebrahimi issues.
940*22dc650dSSadaf Ebrahimi 
941*22dc650dSSadaf Ebrahimi Arguments:
942*22dc650dSSadaf Ebrahimi   re           points to the compiled regex block
943*22dc650dSSadaf Ebrahimi   code         points to an expression
944*22dc650dSSadaf Ebrahimi   utf          TRUE if in UTF mode
945*22dc650dSSadaf Ebrahimi   ucp          TRUE if in UCP mode
946*22dc650dSSadaf Ebrahimi   depthptr     pointer to recurse depth
947*22dc650dSSadaf Ebrahimi 
948*22dc650dSSadaf Ebrahimi Returns:       SSB_FAIL     => Failed to find any starting code units
949*22dc650dSSadaf Ebrahimi                SSB_DONE     => Found mandatory starting code units
950*22dc650dSSadaf Ebrahimi                SSB_CONTINUE => Found optional starting code units
951*22dc650dSSadaf Ebrahimi                SSB_UNKNOWN  => Hit an unrecognized opcode
952*22dc650dSSadaf Ebrahimi                SSB_TOODEEP  => Recursion is too deep
953*22dc650dSSadaf Ebrahimi */
954*22dc650dSSadaf Ebrahimi 
955*22dc650dSSadaf Ebrahimi static int
set_start_bits(pcre2_real_code * re,PCRE2_SPTR code,BOOL utf,BOOL ucp,int * depthptr)956*22dc650dSSadaf Ebrahimi set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
957*22dc650dSSadaf Ebrahimi   int *depthptr)
958*22dc650dSSadaf Ebrahimi {
959*22dc650dSSadaf Ebrahimi uint32_t c;
960*22dc650dSSadaf Ebrahimi int yield = SSB_DONE;
961*22dc650dSSadaf Ebrahimi 
962*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
963*22dc650dSSadaf Ebrahimi int table_limit = utf? 16:32;
964*22dc650dSSadaf Ebrahimi #else
965*22dc650dSSadaf Ebrahimi int table_limit = 32;
966*22dc650dSSadaf Ebrahimi #endif
967*22dc650dSSadaf Ebrahimi 
968*22dc650dSSadaf Ebrahimi *depthptr += 1;
969*22dc650dSSadaf Ebrahimi if (*depthptr > 1000) return SSB_TOODEEP;
970*22dc650dSSadaf Ebrahimi 
971*22dc650dSSadaf Ebrahimi do
972*22dc650dSSadaf Ebrahimi   {
973*22dc650dSSadaf Ebrahimi   BOOL try_next = TRUE;
974*22dc650dSSadaf Ebrahimi   PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
975*22dc650dSSadaf Ebrahimi 
976*22dc650dSSadaf Ebrahimi   if (*code == OP_CBRA || *code == OP_SCBRA ||
977*22dc650dSSadaf Ebrahimi       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
978*22dc650dSSadaf Ebrahimi 
979*22dc650dSSadaf Ebrahimi   while (try_next)    /* Loop for items in this branch */
980*22dc650dSSadaf Ebrahimi     {
981*22dc650dSSadaf Ebrahimi     int rc;
982*22dc650dSSadaf Ebrahimi     PCRE2_SPTR ncode;
983*22dc650dSSadaf Ebrahimi     uint8_t *classmap = NULL;
984*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
985*22dc650dSSadaf Ebrahimi     PCRE2_UCHAR xclassflags;
986*22dc650dSSadaf Ebrahimi #endif
987*22dc650dSSadaf Ebrahimi 
988*22dc650dSSadaf Ebrahimi     switch(*tcode)
989*22dc650dSSadaf Ebrahimi       {
990*22dc650dSSadaf Ebrahimi       /* If we reach something we don't understand, it means a new opcode has
991*22dc650dSSadaf Ebrahimi       been created that hasn't been added to this function. Hopefully this
992*22dc650dSSadaf Ebrahimi       problem will be discovered during testing. */
993*22dc650dSSadaf Ebrahimi 
994*22dc650dSSadaf Ebrahimi       default:
995*22dc650dSSadaf Ebrahimi       return SSB_UNKNOWN;
996*22dc650dSSadaf Ebrahimi 
997*22dc650dSSadaf Ebrahimi       /* Fail for a valid opcode that implies no starting bits. */
998*22dc650dSSadaf Ebrahimi 
999*22dc650dSSadaf Ebrahimi       case OP_ACCEPT:
1000*22dc650dSSadaf Ebrahimi       case OP_ASSERT_ACCEPT:
1001*22dc650dSSadaf Ebrahimi       case OP_ALLANY:
1002*22dc650dSSadaf Ebrahimi       case OP_ANY:
1003*22dc650dSSadaf Ebrahimi       case OP_ANYBYTE:
1004*22dc650dSSadaf Ebrahimi       case OP_CIRCM:
1005*22dc650dSSadaf Ebrahimi       case OP_CLOSE:
1006*22dc650dSSadaf Ebrahimi       case OP_COMMIT:
1007*22dc650dSSadaf Ebrahimi       case OP_COMMIT_ARG:
1008*22dc650dSSadaf Ebrahimi       case OP_COND:
1009*22dc650dSSadaf Ebrahimi       case OP_CREF:
1010*22dc650dSSadaf Ebrahimi       case OP_FALSE:
1011*22dc650dSSadaf Ebrahimi       case OP_TRUE:
1012*22dc650dSSadaf Ebrahimi       case OP_DNCREF:
1013*22dc650dSSadaf Ebrahimi       case OP_DNREF:
1014*22dc650dSSadaf Ebrahimi       case OP_DNREFI:
1015*22dc650dSSadaf Ebrahimi       case OP_DNRREF:
1016*22dc650dSSadaf Ebrahimi       case OP_DOLL:
1017*22dc650dSSadaf Ebrahimi       case OP_DOLLM:
1018*22dc650dSSadaf Ebrahimi       case OP_END:
1019*22dc650dSSadaf Ebrahimi       case OP_EOD:
1020*22dc650dSSadaf Ebrahimi       case OP_EODN:
1021*22dc650dSSadaf Ebrahimi       case OP_EXTUNI:
1022*22dc650dSSadaf Ebrahimi       case OP_FAIL:
1023*22dc650dSSadaf Ebrahimi       case OP_MARK:
1024*22dc650dSSadaf Ebrahimi       case OP_NOT:
1025*22dc650dSSadaf Ebrahimi       case OP_NOTEXACT:
1026*22dc650dSSadaf Ebrahimi       case OP_NOTEXACTI:
1027*22dc650dSSadaf Ebrahimi       case OP_NOTI:
1028*22dc650dSSadaf Ebrahimi       case OP_NOTMINPLUS:
1029*22dc650dSSadaf Ebrahimi       case OP_NOTMINPLUSI:
1030*22dc650dSSadaf Ebrahimi       case OP_NOTMINQUERY:
1031*22dc650dSSadaf Ebrahimi       case OP_NOTMINQUERYI:
1032*22dc650dSSadaf Ebrahimi       case OP_NOTMINSTAR:
1033*22dc650dSSadaf Ebrahimi       case OP_NOTMINSTARI:
1034*22dc650dSSadaf Ebrahimi       case OP_NOTMINUPTO:
1035*22dc650dSSadaf Ebrahimi       case OP_NOTMINUPTOI:
1036*22dc650dSSadaf Ebrahimi       case OP_NOTPLUS:
1037*22dc650dSSadaf Ebrahimi       case OP_NOTPLUSI:
1038*22dc650dSSadaf Ebrahimi       case OP_NOTPOSPLUS:
1039*22dc650dSSadaf Ebrahimi       case OP_NOTPOSPLUSI:
1040*22dc650dSSadaf Ebrahimi       case OP_NOTPOSQUERY:
1041*22dc650dSSadaf Ebrahimi       case OP_NOTPOSQUERYI:
1042*22dc650dSSadaf Ebrahimi       case OP_NOTPOSSTAR:
1043*22dc650dSSadaf Ebrahimi       case OP_NOTPOSSTARI:
1044*22dc650dSSadaf Ebrahimi       case OP_NOTPOSUPTO:
1045*22dc650dSSadaf Ebrahimi       case OP_NOTPOSUPTOI:
1046*22dc650dSSadaf Ebrahimi       case OP_NOTPROP:
1047*22dc650dSSadaf Ebrahimi       case OP_NOTQUERY:
1048*22dc650dSSadaf Ebrahimi       case OP_NOTQUERYI:
1049*22dc650dSSadaf Ebrahimi       case OP_NOTSTAR:
1050*22dc650dSSadaf Ebrahimi       case OP_NOTSTARI:
1051*22dc650dSSadaf Ebrahimi       case OP_NOTUPTO:
1052*22dc650dSSadaf Ebrahimi       case OP_NOTUPTOI:
1053*22dc650dSSadaf Ebrahimi       case OP_NOT_HSPACE:
1054*22dc650dSSadaf Ebrahimi       case OP_NOT_VSPACE:
1055*22dc650dSSadaf Ebrahimi       case OP_PRUNE:
1056*22dc650dSSadaf Ebrahimi       case OP_PRUNE_ARG:
1057*22dc650dSSadaf Ebrahimi       case OP_RECURSE:
1058*22dc650dSSadaf Ebrahimi       case OP_REF:
1059*22dc650dSSadaf Ebrahimi       case OP_REFI:
1060*22dc650dSSadaf Ebrahimi       case OP_REVERSE:
1061*22dc650dSSadaf Ebrahimi       case OP_VREVERSE:
1062*22dc650dSSadaf Ebrahimi       case OP_RREF:
1063*22dc650dSSadaf Ebrahimi       case OP_SCOND:
1064*22dc650dSSadaf Ebrahimi       case OP_SET_SOM:
1065*22dc650dSSadaf Ebrahimi       case OP_SKIP:
1066*22dc650dSSadaf Ebrahimi       case OP_SKIP_ARG:
1067*22dc650dSSadaf Ebrahimi       case OP_SOD:
1068*22dc650dSSadaf Ebrahimi       case OP_SOM:
1069*22dc650dSSadaf Ebrahimi       case OP_THEN:
1070*22dc650dSSadaf Ebrahimi       case OP_THEN_ARG:
1071*22dc650dSSadaf Ebrahimi       return SSB_FAIL;
1072*22dc650dSSadaf Ebrahimi 
1073*22dc650dSSadaf Ebrahimi       /* OP_CIRC happens only at the start of an anchored branch (multiline ^
1074*22dc650dSSadaf Ebrahimi       uses OP_CIRCM). Skip over it. */
1075*22dc650dSSadaf Ebrahimi 
1076*22dc650dSSadaf Ebrahimi       case OP_CIRC:
1077*22dc650dSSadaf Ebrahimi       tcode += PRIV(OP_lengths)[OP_CIRC];
1078*22dc650dSSadaf Ebrahimi       break;
1079*22dc650dSSadaf Ebrahimi 
1080*22dc650dSSadaf Ebrahimi       /* A "real" property test implies no starting bits, but the fake property
1081*22dc650dSSadaf Ebrahimi       PT_CLIST identifies a list of characters. These lists are short, as they
1082*22dc650dSSadaf Ebrahimi       are used for characters with more than one "other case", so there is no
1083*22dc650dSSadaf Ebrahimi       point in recognizing them for OP_NOTPROP. */
1084*22dc650dSSadaf Ebrahimi 
1085*22dc650dSSadaf Ebrahimi       case OP_PROP:
1086*22dc650dSSadaf Ebrahimi       if (tcode[1] != PT_CLIST) return SSB_FAIL;
1087*22dc650dSSadaf Ebrahimi         {
1088*22dc650dSSadaf Ebrahimi         const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1089*22dc650dSSadaf Ebrahimi         while ((c = *p++) < NOTACHAR)
1090*22dc650dSSadaf Ebrahimi           {
1091*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1092*22dc650dSSadaf Ebrahimi           if (utf)
1093*22dc650dSSadaf Ebrahimi             {
1094*22dc650dSSadaf Ebrahimi             PCRE2_UCHAR buff[6];
1095*22dc650dSSadaf Ebrahimi             (void)PRIV(ord2utf)(c, buff);
1096*22dc650dSSadaf Ebrahimi             c = buff[0];
1097*22dc650dSSadaf Ebrahimi             }
1098*22dc650dSSadaf Ebrahimi #endif
1099*22dc650dSSadaf Ebrahimi           if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1100*22dc650dSSadaf Ebrahimi           }
1101*22dc650dSSadaf Ebrahimi         }
1102*22dc650dSSadaf Ebrahimi       try_next = FALSE;
1103*22dc650dSSadaf Ebrahimi       break;
1104*22dc650dSSadaf Ebrahimi 
1105*22dc650dSSadaf Ebrahimi       /* We can ignore word boundary tests. */
1106*22dc650dSSadaf Ebrahimi 
1107*22dc650dSSadaf Ebrahimi       case OP_WORD_BOUNDARY:
1108*22dc650dSSadaf Ebrahimi       case OP_NOT_WORD_BOUNDARY:
1109*22dc650dSSadaf Ebrahimi       case OP_UCP_WORD_BOUNDARY:
1110*22dc650dSSadaf Ebrahimi       case OP_NOT_UCP_WORD_BOUNDARY:
1111*22dc650dSSadaf Ebrahimi       tcode++;
1112*22dc650dSSadaf Ebrahimi       break;
1113*22dc650dSSadaf Ebrahimi 
1114*22dc650dSSadaf Ebrahimi       /* For a positive lookahead assertion, inspect what immediately follows,
1115*22dc650dSSadaf Ebrahimi       ignoring intermediate assertions and callouts. If the next item is one
1116*22dc650dSSadaf Ebrahimi       that sets a mandatory character, skip this assertion. Otherwise, treat it
1117*22dc650dSSadaf Ebrahimi       the same as other bracket groups. */
1118*22dc650dSSadaf Ebrahimi 
1119*22dc650dSSadaf Ebrahimi       case OP_ASSERT:
1120*22dc650dSSadaf Ebrahimi       case OP_ASSERT_NA:
1121*22dc650dSSadaf Ebrahimi       ncode = tcode + GET(tcode, 1);
1122*22dc650dSSadaf Ebrahimi       while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1123*22dc650dSSadaf Ebrahimi       ncode += 1 + LINK_SIZE;
1124*22dc650dSSadaf Ebrahimi 
1125*22dc650dSSadaf Ebrahimi       /* Skip irrelevant items */
1126*22dc650dSSadaf Ebrahimi 
1127*22dc650dSSadaf Ebrahimi       for (BOOL done = FALSE; !done;)
1128*22dc650dSSadaf Ebrahimi         {
1129*22dc650dSSadaf Ebrahimi         switch (*ncode)
1130*22dc650dSSadaf Ebrahimi           {
1131*22dc650dSSadaf Ebrahimi           case OP_ASSERT:
1132*22dc650dSSadaf Ebrahimi           case OP_ASSERT_NOT:
1133*22dc650dSSadaf Ebrahimi           case OP_ASSERTBACK:
1134*22dc650dSSadaf Ebrahimi           case OP_ASSERTBACK_NOT:
1135*22dc650dSSadaf Ebrahimi           case OP_ASSERT_NA:
1136*22dc650dSSadaf Ebrahimi           case OP_ASSERTBACK_NA:
1137*22dc650dSSadaf Ebrahimi           ncode += GET(ncode, 1);
1138*22dc650dSSadaf Ebrahimi           while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1139*22dc650dSSadaf Ebrahimi           ncode += 1 + LINK_SIZE;
1140*22dc650dSSadaf Ebrahimi           break;
1141*22dc650dSSadaf Ebrahimi 
1142*22dc650dSSadaf Ebrahimi           case OP_WORD_BOUNDARY:
1143*22dc650dSSadaf Ebrahimi           case OP_NOT_WORD_BOUNDARY:
1144*22dc650dSSadaf Ebrahimi           case OP_UCP_WORD_BOUNDARY:
1145*22dc650dSSadaf Ebrahimi           case OP_NOT_UCP_WORD_BOUNDARY:
1146*22dc650dSSadaf Ebrahimi           ncode++;
1147*22dc650dSSadaf Ebrahimi           break;
1148*22dc650dSSadaf Ebrahimi 
1149*22dc650dSSadaf Ebrahimi           case OP_CALLOUT:
1150*22dc650dSSadaf Ebrahimi           ncode += PRIV(OP_lengths)[OP_CALLOUT];
1151*22dc650dSSadaf Ebrahimi           break;
1152*22dc650dSSadaf Ebrahimi 
1153*22dc650dSSadaf Ebrahimi           case OP_CALLOUT_STR:
1154*22dc650dSSadaf Ebrahimi           ncode += GET(ncode, 1 + 2*LINK_SIZE);
1155*22dc650dSSadaf Ebrahimi           break;
1156*22dc650dSSadaf Ebrahimi 
1157*22dc650dSSadaf Ebrahimi           default:
1158*22dc650dSSadaf Ebrahimi           done = TRUE;
1159*22dc650dSSadaf Ebrahimi           break;
1160*22dc650dSSadaf Ebrahimi           }
1161*22dc650dSSadaf Ebrahimi         }
1162*22dc650dSSadaf Ebrahimi 
1163*22dc650dSSadaf Ebrahimi       /* Now check the next significant item. */
1164*22dc650dSSadaf Ebrahimi 
1165*22dc650dSSadaf Ebrahimi       switch(*ncode)
1166*22dc650dSSadaf Ebrahimi         {
1167*22dc650dSSadaf Ebrahimi         default:
1168*22dc650dSSadaf Ebrahimi         break;
1169*22dc650dSSadaf Ebrahimi 
1170*22dc650dSSadaf Ebrahimi         case OP_PROP:
1171*22dc650dSSadaf Ebrahimi         if (ncode[1] != PT_CLIST) break;
1172*22dc650dSSadaf Ebrahimi         /* Fall through */
1173*22dc650dSSadaf Ebrahimi         case OP_ANYNL:
1174*22dc650dSSadaf Ebrahimi         case OP_CHAR:
1175*22dc650dSSadaf Ebrahimi         case OP_CHARI:
1176*22dc650dSSadaf Ebrahimi         case OP_EXACT:
1177*22dc650dSSadaf Ebrahimi         case OP_EXACTI:
1178*22dc650dSSadaf Ebrahimi         case OP_HSPACE:
1179*22dc650dSSadaf Ebrahimi         case OP_MINPLUS:
1180*22dc650dSSadaf Ebrahimi         case OP_MINPLUSI:
1181*22dc650dSSadaf Ebrahimi         case OP_PLUS:
1182*22dc650dSSadaf Ebrahimi         case OP_PLUSI:
1183*22dc650dSSadaf Ebrahimi         case OP_POSPLUS:
1184*22dc650dSSadaf Ebrahimi         case OP_POSPLUSI:
1185*22dc650dSSadaf Ebrahimi         case OP_VSPACE:
1186*22dc650dSSadaf Ebrahimi         /* Note that these types will only be present in non-UCP mode. */
1187*22dc650dSSadaf Ebrahimi         case OP_DIGIT:
1188*22dc650dSSadaf Ebrahimi         case OP_NOT_DIGIT:
1189*22dc650dSSadaf Ebrahimi         case OP_WORDCHAR:
1190*22dc650dSSadaf Ebrahimi         case OP_NOT_WORDCHAR:
1191*22dc650dSSadaf Ebrahimi         case OP_WHITESPACE:
1192*22dc650dSSadaf Ebrahimi         case OP_NOT_WHITESPACE:
1193*22dc650dSSadaf Ebrahimi         tcode = ncode;
1194*22dc650dSSadaf Ebrahimi         continue;   /* With the following significant opcode */
1195*22dc650dSSadaf Ebrahimi         }
1196*22dc650dSSadaf Ebrahimi       /* Fall through */
1197*22dc650dSSadaf Ebrahimi 
1198*22dc650dSSadaf Ebrahimi       /* For a group bracket or a positive assertion without an immediately
1199*22dc650dSSadaf Ebrahimi       following mandatory setting, recurse to set bits from within the
1200*22dc650dSSadaf Ebrahimi       subpattern. If it can't find anything, we have to give up. If it finds
1201*22dc650dSSadaf Ebrahimi       some mandatory character(s), we are done for this branch. Otherwise,
1202*22dc650dSSadaf Ebrahimi       carry on scanning after the subpattern. */
1203*22dc650dSSadaf Ebrahimi 
1204*22dc650dSSadaf Ebrahimi       case OP_BRA:
1205*22dc650dSSadaf Ebrahimi       case OP_SBRA:
1206*22dc650dSSadaf Ebrahimi       case OP_CBRA:
1207*22dc650dSSadaf Ebrahimi       case OP_SCBRA:
1208*22dc650dSSadaf Ebrahimi       case OP_BRAPOS:
1209*22dc650dSSadaf Ebrahimi       case OP_SBRAPOS:
1210*22dc650dSSadaf Ebrahimi       case OP_CBRAPOS:
1211*22dc650dSSadaf Ebrahimi       case OP_SCBRAPOS:
1212*22dc650dSSadaf Ebrahimi       case OP_ONCE:
1213*22dc650dSSadaf Ebrahimi       case OP_SCRIPT_RUN:
1214*22dc650dSSadaf Ebrahimi       rc = set_start_bits(re, tcode, utf, ucp, depthptr);
1215*22dc650dSSadaf Ebrahimi       if (rc == SSB_DONE)
1216*22dc650dSSadaf Ebrahimi         {
1217*22dc650dSSadaf Ebrahimi         try_next = FALSE;
1218*22dc650dSSadaf Ebrahimi         }
1219*22dc650dSSadaf Ebrahimi       else if (rc == SSB_CONTINUE)
1220*22dc650dSSadaf Ebrahimi         {
1221*22dc650dSSadaf Ebrahimi         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1222*22dc650dSSadaf Ebrahimi         tcode += 1 + LINK_SIZE;
1223*22dc650dSSadaf Ebrahimi         }
1224*22dc650dSSadaf Ebrahimi       else return rc;   /* FAIL, UNKNOWN, or TOODEEP */
1225*22dc650dSSadaf Ebrahimi       break;
1226*22dc650dSSadaf Ebrahimi 
1227*22dc650dSSadaf Ebrahimi       /* If we hit ALT or KET, it means we haven't found anything mandatory in
1228*22dc650dSSadaf Ebrahimi       this branch, though we might have found something optional. For ALT, we
1229*22dc650dSSadaf Ebrahimi       continue with the next alternative, but we have to arrange that the final
1230*22dc650dSSadaf Ebrahimi       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1231*22dc650dSSadaf Ebrahimi       return SSB_CONTINUE: if this is the top level, that indicates failure,
1232*22dc650dSSadaf Ebrahimi       but after a nested subpattern, it causes scanning to continue. */
1233*22dc650dSSadaf Ebrahimi 
1234*22dc650dSSadaf Ebrahimi       case OP_ALT:
1235*22dc650dSSadaf Ebrahimi       yield = SSB_CONTINUE;
1236*22dc650dSSadaf Ebrahimi       try_next = FALSE;
1237*22dc650dSSadaf Ebrahimi       break;
1238*22dc650dSSadaf Ebrahimi 
1239*22dc650dSSadaf Ebrahimi       case OP_KET:
1240*22dc650dSSadaf Ebrahimi       case OP_KETRMAX:
1241*22dc650dSSadaf Ebrahimi       case OP_KETRMIN:
1242*22dc650dSSadaf Ebrahimi       case OP_KETRPOS:
1243*22dc650dSSadaf Ebrahimi       return SSB_CONTINUE;
1244*22dc650dSSadaf Ebrahimi 
1245*22dc650dSSadaf Ebrahimi       /* Skip over callout */
1246*22dc650dSSadaf Ebrahimi 
1247*22dc650dSSadaf Ebrahimi       case OP_CALLOUT:
1248*22dc650dSSadaf Ebrahimi       tcode += PRIV(OP_lengths)[OP_CALLOUT];
1249*22dc650dSSadaf Ebrahimi       break;
1250*22dc650dSSadaf Ebrahimi 
1251*22dc650dSSadaf Ebrahimi       case OP_CALLOUT_STR:
1252*22dc650dSSadaf Ebrahimi       tcode += GET(tcode, 1 + 2*LINK_SIZE);
1253*22dc650dSSadaf Ebrahimi       break;
1254*22dc650dSSadaf Ebrahimi 
1255*22dc650dSSadaf Ebrahimi       /* Skip over lookbehind and negative lookahead assertions */
1256*22dc650dSSadaf Ebrahimi 
1257*22dc650dSSadaf Ebrahimi       case OP_ASSERT_NOT:
1258*22dc650dSSadaf Ebrahimi       case OP_ASSERTBACK:
1259*22dc650dSSadaf Ebrahimi       case OP_ASSERTBACK_NOT:
1260*22dc650dSSadaf Ebrahimi       case OP_ASSERTBACK_NA:
1261*22dc650dSSadaf Ebrahimi       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1262*22dc650dSSadaf Ebrahimi       tcode += 1 + LINK_SIZE;
1263*22dc650dSSadaf Ebrahimi       break;
1264*22dc650dSSadaf Ebrahimi 
1265*22dc650dSSadaf Ebrahimi       /* BRAZERO does the bracket, but carries on. */
1266*22dc650dSSadaf Ebrahimi 
1267*22dc650dSSadaf Ebrahimi       case OP_BRAZERO:
1268*22dc650dSSadaf Ebrahimi       case OP_BRAMINZERO:
1269*22dc650dSSadaf Ebrahimi       case OP_BRAPOSZERO:
1270*22dc650dSSadaf Ebrahimi       rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
1271*22dc650dSSadaf Ebrahimi       if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
1272*22dc650dSSadaf Ebrahimi       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1273*22dc650dSSadaf Ebrahimi       tcode += 1 + LINK_SIZE;
1274*22dc650dSSadaf Ebrahimi       break;
1275*22dc650dSSadaf Ebrahimi 
1276*22dc650dSSadaf Ebrahimi       /* SKIPZERO skips the bracket. */
1277*22dc650dSSadaf Ebrahimi 
1278*22dc650dSSadaf Ebrahimi       case OP_SKIPZERO:
1279*22dc650dSSadaf Ebrahimi       tcode++;
1280*22dc650dSSadaf Ebrahimi       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1281*22dc650dSSadaf Ebrahimi       tcode += 1 + LINK_SIZE;
1282*22dc650dSSadaf Ebrahimi       break;
1283*22dc650dSSadaf Ebrahimi 
1284*22dc650dSSadaf Ebrahimi       /* Single-char * or ? sets the bit and tries the next item */
1285*22dc650dSSadaf Ebrahimi 
1286*22dc650dSSadaf Ebrahimi       case OP_STAR:
1287*22dc650dSSadaf Ebrahimi       case OP_MINSTAR:
1288*22dc650dSSadaf Ebrahimi       case OP_POSSTAR:
1289*22dc650dSSadaf Ebrahimi       case OP_QUERY:
1290*22dc650dSSadaf Ebrahimi       case OP_MINQUERY:
1291*22dc650dSSadaf Ebrahimi       case OP_POSQUERY:
1292*22dc650dSSadaf Ebrahimi       tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1293*22dc650dSSadaf Ebrahimi       break;
1294*22dc650dSSadaf Ebrahimi 
1295*22dc650dSSadaf Ebrahimi       case OP_STARI:
1296*22dc650dSSadaf Ebrahimi       case OP_MINSTARI:
1297*22dc650dSSadaf Ebrahimi       case OP_POSSTARI:
1298*22dc650dSSadaf Ebrahimi       case OP_QUERYI:
1299*22dc650dSSadaf Ebrahimi       case OP_MINQUERYI:
1300*22dc650dSSadaf Ebrahimi       case OP_POSQUERYI:
1301*22dc650dSSadaf Ebrahimi       tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1302*22dc650dSSadaf Ebrahimi       break;
1303*22dc650dSSadaf Ebrahimi 
1304*22dc650dSSadaf Ebrahimi       /* Single-char upto sets the bit and tries the next */
1305*22dc650dSSadaf Ebrahimi 
1306*22dc650dSSadaf Ebrahimi       case OP_UPTO:
1307*22dc650dSSadaf Ebrahimi       case OP_MINUPTO:
1308*22dc650dSSadaf Ebrahimi       case OP_POSUPTO:
1309*22dc650dSSadaf Ebrahimi       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
1310*22dc650dSSadaf Ebrahimi       break;
1311*22dc650dSSadaf Ebrahimi 
1312*22dc650dSSadaf Ebrahimi       case OP_UPTOI:
1313*22dc650dSSadaf Ebrahimi       case OP_MINUPTOI:
1314*22dc650dSSadaf Ebrahimi       case OP_POSUPTOI:
1315*22dc650dSSadaf Ebrahimi       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
1316*22dc650dSSadaf Ebrahimi       break;
1317*22dc650dSSadaf Ebrahimi 
1318*22dc650dSSadaf Ebrahimi       /* At least one single char sets the bit and stops */
1319*22dc650dSSadaf Ebrahimi 
1320*22dc650dSSadaf Ebrahimi       case OP_EXACT:
1321*22dc650dSSadaf Ebrahimi       tcode += IMM2_SIZE;
1322*22dc650dSSadaf Ebrahimi       /* Fall through */
1323*22dc650dSSadaf Ebrahimi       case OP_CHAR:
1324*22dc650dSSadaf Ebrahimi       case OP_PLUS:
1325*22dc650dSSadaf Ebrahimi       case OP_MINPLUS:
1326*22dc650dSSadaf Ebrahimi       case OP_POSPLUS:
1327*22dc650dSSadaf Ebrahimi       (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1328*22dc650dSSadaf Ebrahimi       try_next = FALSE;
1329*22dc650dSSadaf Ebrahimi       break;
1330*22dc650dSSadaf Ebrahimi 
1331*22dc650dSSadaf Ebrahimi       case OP_EXACTI:
1332*22dc650dSSadaf Ebrahimi       tcode += IMM2_SIZE;
1333*22dc650dSSadaf Ebrahimi       /* Fall through */
1334*22dc650dSSadaf Ebrahimi       case OP_CHARI:
1335*22dc650dSSadaf Ebrahimi       case OP_PLUSI:
1336*22dc650dSSadaf Ebrahimi       case OP_MINPLUSI:
1337*22dc650dSSadaf Ebrahimi       case OP_POSPLUSI:
1338*22dc650dSSadaf Ebrahimi       (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1339*22dc650dSSadaf Ebrahimi       try_next = FALSE;
1340*22dc650dSSadaf Ebrahimi       break;
1341*22dc650dSSadaf Ebrahimi 
1342*22dc650dSSadaf Ebrahimi       /* Special spacing and line-terminating items. These recognize specific
1343*22dc650dSSadaf Ebrahimi       lists of characters. The difference between VSPACE and ANYNL is that the
1344*22dc650dSSadaf Ebrahimi       latter can match the two-character CRLF sequence, but that is not
1345*22dc650dSSadaf Ebrahimi       relevant for finding the first character, so their code here is
1346*22dc650dSSadaf Ebrahimi       identical. */
1347*22dc650dSSadaf Ebrahimi 
1348*22dc650dSSadaf Ebrahimi       case OP_HSPACE:
1349*22dc650dSSadaf Ebrahimi       SET_BIT(CHAR_HT);
1350*22dc650dSSadaf Ebrahimi       SET_BIT(CHAR_SPACE);
1351*22dc650dSSadaf Ebrahimi 
1352*22dc650dSSadaf Ebrahimi       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1353*22dc650dSSadaf Ebrahimi       the bits for 0xA0 and for code units >= 255, independently of UTF. */
1354*22dc650dSSadaf Ebrahimi 
1355*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
1356*22dc650dSSadaf Ebrahimi       SET_BIT(0xA0);
1357*22dc650dSSadaf Ebrahimi       SET_BIT(0xFF);
1358*22dc650dSSadaf Ebrahimi #else
1359*22dc650dSSadaf Ebrahimi       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1360*22dc650dSSadaf Ebrahimi       units of horizontal space characters. */
1361*22dc650dSSadaf Ebrahimi 
1362*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
1363*22dc650dSSadaf Ebrahimi       if (utf)
1364*22dc650dSSadaf Ebrahimi         {
1365*22dc650dSSadaf Ebrahimi         SET_BIT(0xC2);  /* For U+00A0 */
1366*22dc650dSSadaf Ebrahimi         SET_BIT(0xE1);  /* For U+1680, U+180E */
1367*22dc650dSSadaf Ebrahimi         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1368*22dc650dSSadaf Ebrahimi         SET_BIT(0xE3);  /* For U+3000 */
1369*22dc650dSSadaf Ebrahimi         }
1370*22dc650dSSadaf Ebrahimi       else
1371*22dc650dSSadaf Ebrahimi #endif
1372*22dc650dSSadaf Ebrahimi       /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1373*22dc650dSSadaf Ebrahimi       the code is EBCDIC. */
1374*22dc650dSSadaf Ebrahimi         {
1375*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
1376*22dc650dSSadaf Ebrahimi         SET_BIT(0xA0);
1377*22dc650dSSadaf Ebrahimi #endif  /* Not EBCDIC */
1378*22dc650dSSadaf Ebrahimi         }
1379*22dc650dSSadaf Ebrahimi #endif  /* 8-bit support */
1380*22dc650dSSadaf Ebrahimi 
1381*22dc650dSSadaf Ebrahimi       try_next = FALSE;
1382*22dc650dSSadaf Ebrahimi       break;
1383*22dc650dSSadaf Ebrahimi 
1384*22dc650dSSadaf Ebrahimi       case OP_ANYNL:
1385*22dc650dSSadaf Ebrahimi       case OP_VSPACE:
1386*22dc650dSSadaf Ebrahimi       SET_BIT(CHAR_LF);
1387*22dc650dSSadaf Ebrahimi       SET_BIT(CHAR_VT);
1388*22dc650dSSadaf Ebrahimi       SET_BIT(CHAR_FF);
1389*22dc650dSSadaf Ebrahimi       SET_BIT(CHAR_CR);
1390*22dc650dSSadaf Ebrahimi 
1391*22dc650dSSadaf Ebrahimi       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1392*22dc650dSSadaf Ebrahimi       the bits for NEL and for code units >= 255, independently of UTF. */
1393*22dc650dSSadaf Ebrahimi 
1394*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
1395*22dc650dSSadaf Ebrahimi       SET_BIT(CHAR_NEL);
1396*22dc650dSSadaf Ebrahimi       SET_BIT(0xFF);
1397*22dc650dSSadaf Ebrahimi #else
1398*22dc650dSSadaf Ebrahimi       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1399*22dc650dSSadaf Ebrahimi       units of vertical space characters. */
1400*22dc650dSSadaf Ebrahimi 
1401*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
1402*22dc650dSSadaf Ebrahimi       if (utf)
1403*22dc650dSSadaf Ebrahimi         {
1404*22dc650dSSadaf Ebrahimi         SET_BIT(0xC2);  /* For U+0085 (NEL) */
1405*22dc650dSSadaf Ebrahimi         SET_BIT(0xE2);  /* For U+2028, U+2029 */
1406*22dc650dSSadaf Ebrahimi         }
1407*22dc650dSSadaf Ebrahimi       else
1408*22dc650dSSadaf Ebrahimi #endif
1409*22dc650dSSadaf Ebrahimi       /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1410*22dc650dSSadaf Ebrahimi         {
1411*22dc650dSSadaf Ebrahimi         SET_BIT(CHAR_NEL);
1412*22dc650dSSadaf Ebrahimi         }
1413*22dc650dSSadaf Ebrahimi #endif  /* 8-bit support */
1414*22dc650dSSadaf Ebrahimi 
1415*22dc650dSSadaf Ebrahimi       try_next = FALSE;
1416*22dc650dSSadaf Ebrahimi       break;
1417*22dc650dSSadaf Ebrahimi 
1418*22dc650dSSadaf Ebrahimi       /* Single character types set the bits and stop. Note that if PCRE2_UCP
1419*22dc650dSSadaf Ebrahimi       is set, we do not see these opcodes because \d etc are converted to
1420*22dc650dSSadaf Ebrahimi       properties. Therefore, these apply in the case when only characters less
1421*22dc650dSSadaf Ebrahimi       than 256 are recognized to match the types. */
1422*22dc650dSSadaf Ebrahimi 
1423*22dc650dSSadaf Ebrahimi       case OP_NOT_DIGIT:
1424*22dc650dSSadaf Ebrahimi       set_nottype_bits(re, cbit_digit, table_limit);
1425*22dc650dSSadaf Ebrahimi       try_next = FALSE;
1426*22dc650dSSadaf Ebrahimi       break;
1427*22dc650dSSadaf Ebrahimi 
1428*22dc650dSSadaf Ebrahimi       case OP_DIGIT:
1429*22dc650dSSadaf Ebrahimi       set_type_bits(re, cbit_digit, table_limit);
1430*22dc650dSSadaf Ebrahimi       try_next = FALSE;
1431*22dc650dSSadaf Ebrahimi       break;
1432*22dc650dSSadaf Ebrahimi 
1433*22dc650dSSadaf Ebrahimi       case OP_NOT_WHITESPACE:
1434*22dc650dSSadaf Ebrahimi       set_nottype_bits(re, cbit_space, table_limit);
1435*22dc650dSSadaf Ebrahimi       try_next = FALSE;
1436*22dc650dSSadaf Ebrahimi       break;
1437*22dc650dSSadaf Ebrahimi 
1438*22dc650dSSadaf Ebrahimi       case OP_WHITESPACE:
1439*22dc650dSSadaf Ebrahimi       set_type_bits(re, cbit_space, table_limit);
1440*22dc650dSSadaf Ebrahimi       try_next = FALSE;
1441*22dc650dSSadaf Ebrahimi       break;
1442*22dc650dSSadaf Ebrahimi 
1443*22dc650dSSadaf Ebrahimi       case OP_NOT_WORDCHAR:
1444*22dc650dSSadaf Ebrahimi       set_nottype_bits(re, cbit_word, table_limit);
1445*22dc650dSSadaf Ebrahimi       try_next = FALSE;
1446*22dc650dSSadaf Ebrahimi       break;
1447*22dc650dSSadaf Ebrahimi 
1448*22dc650dSSadaf Ebrahimi       case OP_WORDCHAR:
1449*22dc650dSSadaf Ebrahimi       set_type_bits(re, cbit_word, table_limit);
1450*22dc650dSSadaf Ebrahimi       try_next = FALSE;
1451*22dc650dSSadaf Ebrahimi       break;
1452*22dc650dSSadaf Ebrahimi 
1453*22dc650dSSadaf Ebrahimi       /* One or more character type fudges the pointer and restarts, knowing
1454*22dc650dSSadaf Ebrahimi       it will hit a single character type and stop there. */
1455*22dc650dSSadaf Ebrahimi 
1456*22dc650dSSadaf Ebrahimi       case OP_TYPEPLUS:
1457*22dc650dSSadaf Ebrahimi       case OP_TYPEMINPLUS:
1458*22dc650dSSadaf Ebrahimi       case OP_TYPEPOSPLUS:
1459*22dc650dSSadaf Ebrahimi       tcode++;
1460*22dc650dSSadaf Ebrahimi       break;
1461*22dc650dSSadaf Ebrahimi 
1462*22dc650dSSadaf Ebrahimi       case OP_TYPEEXACT:
1463*22dc650dSSadaf Ebrahimi       tcode += 1 + IMM2_SIZE;
1464*22dc650dSSadaf Ebrahimi       break;
1465*22dc650dSSadaf Ebrahimi 
1466*22dc650dSSadaf Ebrahimi       /* Zero or more repeats of character types set the bits and then
1467*22dc650dSSadaf Ebrahimi       try again. */
1468*22dc650dSSadaf Ebrahimi 
1469*22dc650dSSadaf Ebrahimi       case OP_TYPEUPTO:
1470*22dc650dSSadaf Ebrahimi       case OP_TYPEMINUPTO:
1471*22dc650dSSadaf Ebrahimi       case OP_TYPEPOSUPTO:
1472*22dc650dSSadaf Ebrahimi       tcode += IMM2_SIZE;  /* Fall through */
1473*22dc650dSSadaf Ebrahimi 
1474*22dc650dSSadaf Ebrahimi       case OP_TYPESTAR:
1475*22dc650dSSadaf Ebrahimi       case OP_TYPEMINSTAR:
1476*22dc650dSSadaf Ebrahimi       case OP_TYPEPOSSTAR:
1477*22dc650dSSadaf Ebrahimi       case OP_TYPEQUERY:
1478*22dc650dSSadaf Ebrahimi       case OP_TYPEMINQUERY:
1479*22dc650dSSadaf Ebrahimi       case OP_TYPEPOSQUERY:
1480*22dc650dSSadaf Ebrahimi       switch(tcode[1])
1481*22dc650dSSadaf Ebrahimi         {
1482*22dc650dSSadaf Ebrahimi         default:
1483*22dc650dSSadaf Ebrahimi         case OP_ANY:
1484*22dc650dSSadaf Ebrahimi         case OP_ALLANY:
1485*22dc650dSSadaf Ebrahimi         return SSB_FAIL;
1486*22dc650dSSadaf Ebrahimi 
1487*22dc650dSSadaf Ebrahimi         case OP_HSPACE:
1488*22dc650dSSadaf Ebrahimi         SET_BIT(CHAR_HT);
1489*22dc650dSSadaf Ebrahimi         SET_BIT(CHAR_SPACE);
1490*22dc650dSSadaf Ebrahimi 
1491*22dc650dSSadaf Ebrahimi         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1492*22dc650dSSadaf Ebrahimi         the bits for 0xA0 and for code units >= 255, independently of UTF. */
1493*22dc650dSSadaf Ebrahimi 
1494*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
1495*22dc650dSSadaf Ebrahimi         SET_BIT(0xA0);
1496*22dc650dSSadaf Ebrahimi         SET_BIT(0xFF);
1497*22dc650dSSadaf Ebrahimi #else
1498*22dc650dSSadaf Ebrahimi         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1499*22dc650dSSadaf Ebrahimi         units of horizontal space characters. */
1500*22dc650dSSadaf Ebrahimi 
1501*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
1502*22dc650dSSadaf Ebrahimi         if (utf)
1503*22dc650dSSadaf Ebrahimi           {
1504*22dc650dSSadaf Ebrahimi           SET_BIT(0xC2);  /* For U+00A0 */
1505*22dc650dSSadaf Ebrahimi           SET_BIT(0xE1);  /* For U+1680, U+180E */
1506*22dc650dSSadaf Ebrahimi           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1507*22dc650dSSadaf Ebrahimi           SET_BIT(0xE3);  /* For U+3000 */
1508*22dc650dSSadaf Ebrahimi           }
1509*22dc650dSSadaf Ebrahimi         else
1510*22dc650dSSadaf Ebrahimi #endif
1511*22dc650dSSadaf Ebrahimi         /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1512*22dc650dSSadaf Ebrahimi         the code is EBCDIC. */
1513*22dc650dSSadaf Ebrahimi           {
1514*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
1515*22dc650dSSadaf Ebrahimi           SET_BIT(0xA0);
1516*22dc650dSSadaf Ebrahimi #endif  /* Not EBCDIC */
1517*22dc650dSSadaf Ebrahimi           }
1518*22dc650dSSadaf Ebrahimi #endif  /* 8-bit support */
1519*22dc650dSSadaf Ebrahimi         break;
1520*22dc650dSSadaf Ebrahimi 
1521*22dc650dSSadaf Ebrahimi         case OP_ANYNL:
1522*22dc650dSSadaf Ebrahimi         case OP_VSPACE:
1523*22dc650dSSadaf Ebrahimi         SET_BIT(CHAR_LF);
1524*22dc650dSSadaf Ebrahimi         SET_BIT(CHAR_VT);
1525*22dc650dSSadaf Ebrahimi         SET_BIT(CHAR_FF);
1526*22dc650dSSadaf Ebrahimi         SET_BIT(CHAR_CR);
1527*22dc650dSSadaf Ebrahimi 
1528*22dc650dSSadaf Ebrahimi         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1529*22dc650dSSadaf Ebrahimi         the bits for NEL and for code units >= 255, independently of UTF. */
1530*22dc650dSSadaf Ebrahimi 
1531*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
1532*22dc650dSSadaf Ebrahimi         SET_BIT(CHAR_NEL);
1533*22dc650dSSadaf Ebrahimi         SET_BIT(0xFF);
1534*22dc650dSSadaf Ebrahimi #else
1535*22dc650dSSadaf Ebrahimi         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1536*22dc650dSSadaf Ebrahimi         units of vertical space characters. */
1537*22dc650dSSadaf Ebrahimi 
1538*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
1539*22dc650dSSadaf Ebrahimi         if (utf)
1540*22dc650dSSadaf Ebrahimi           {
1541*22dc650dSSadaf Ebrahimi           SET_BIT(0xC2);  /* For U+0085 (NEL) */
1542*22dc650dSSadaf Ebrahimi           SET_BIT(0xE2);  /* For U+2028, U+2029 */
1543*22dc650dSSadaf Ebrahimi           }
1544*22dc650dSSadaf Ebrahimi         else
1545*22dc650dSSadaf Ebrahimi #endif
1546*22dc650dSSadaf Ebrahimi         /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1547*22dc650dSSadaf Ebrahimi           {
1548*22dc650dSSadaf Ebrahimi           SET_BIT(CHAR_NEL);
1549*22dc650dSSadaf Ebrahimi           }
1550*22dc650dSSadaf Ebrahimi #endif  /* 8-bit support */
1551*22dc650dSSadaf Ebrahimi         break;
1552*22dc650dSSadaf Ebrahimi 
1553*22dc650dSSadaf Ebrahimi         case OP_NOT_DIGIT:
1554*22dc650dSSadaf Ebrahimi         set_nottype_bits(re, cbit_digit, table_limit);
1555*22dc650dSSadaf Ebrahimi         break;
1556*22dc650dSSadaf Ebrahimi 
1557*22dc650dSSadaf Ebrahimi         case OP_DIGIT:
1558*22dc650dSSadaf Ebrahimi         set_type_bits(re, cbit_digit, table_limit);
1559*22dc650dSSadaf Ebrahimi         break;
1560*22dc650dSSadaf Ebrahimi 
1561*22dc650dSSadaf Ebrahimi         case OP_NOT_WHITESPACE:
1562*22dc650dSSadaf Ebrahimi         set_nottype_bits(re, cbit_space, table_limit);
1563*22dc650dSSadaf Ebrahimi         break;
1564*22dc650dSSadaf Ebrahimi 
1565*22dc650dSSadaf Ebrahimi         case OP_WHITESPACE:
1566*22dc650dSSadaf Ebrahimi         set_type_bits(re, cbit_space, table_limit);
1567*22dc650dSSadaf Ebrahimi         break;
1568*22dc650dSSadaf Ebrahimi 
1569*22dc650dSSadaf Ebrahimi         case OP_NOT_WORDCHAR:
1570*22dc650dSSadaf Ebrahimi         set_nottype_bits(re, cbit_word, table_limit);
1571*22dc650dSSadaf Ebrahimi         break;
1572*22dc650dSSadaf Ebrahimi 
1573*22dc650dSSadaf Ebrahimi         case OP_WORDCHAR:
1574*22dc650dSSadaf Ebrahimi         set_type_bits(re, cbit_word, table_limit);
1575*22dc650dSSadaf Ebrahimi         break;
1576*22dc650dSSadaf Ebrahimi         }
1577*22dc650dSSadaf Ebrahimi 
1578*22dc650dSSadaf Ebrahimi       tcode += 2;
1579*22dc650dSSadaf Ebrahimi       break;
1580*22dc650dSSadaf Ebrahimi 
1581*22dc650dSSadaf Ebrahimi       /* Extended class: if there are any property checks, or if this is a
1582*22dc650dSSadaf Ebrahimi       negative XCLASS without a map, give up. If there are no property checks,
1583*22dc650dSSadaf Ebrahimi       there must be wide characters on the XCLASS list, because otherwise an
1584*22dc650dSSadaf Ebrahimi       XCLASS would not have been created. This means that code points >= 255
1585*22dc650dSSadaf Ebrahimi       are potential starters. In the UTF-8 case we can scan them and set bits
1586*22dc650dSSadaf Ebrahimi       for the relevant leading bytes. */
1587*22dc650dSSadaf Ebrahimi 
1588*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
1589*22dc650dSSadaf Ebrahimi       case OP_XCLASS:
1590*22dc650dSSadaf Ebrahimi       xclassflags = tcode[1 + LINK_SIZE];
1591*22dc650dSSadaf Ebrahimi       if ((xclassflags & XCL_HASPROP) != 0 ||
1592*22dc650dSSadaf Ebrahimi           (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1593*22dc650dSSadaf Ebrahimi         return SSB_FAIL;
1594*22dc650dSSadaf Ebrahimi 
1595*22dc650dSSadaf Ebrahimi       /* We have a positive XCLASS or a negative one without a map. Set up the
1596*22dc650dSSadaf Ebrahimi       map pointer if there is one, and fall through. */
1597*22dc650dSSadaf Ebrahimi 
1598*22dc650dSSadaf Ebrahimi       classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
1599*22dc650dSSadaf Ebrahimi         (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1600*22dc650dSSadaf Ebrahimi 
1601*22dc650dSSadaf Ebrahimi       /* In UTF-8 mode, scan the character list and set bits for leading bytes,
1602*22dc650dSSadaf Ebrahimi       then jump to handle the map. */
1603*22dc650dSSadaf Ebrahimi 
1604*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
1605*22dc650dSSadaf Ebrahimi       if (utf && (xclassflags & XCL_NOT) == 0)
1606*22dc650dSSadaf Ebrahimi         {
1607*22dc650dSSadaf Ebrahimi         PCRE2_UCHAR b, e;
1608*22dc650dSSadaf Ebrahimi         PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
1609*22dc650dSSadaf Ebrahimi         tcode += GET(tcode, 1);
1610*22dc650dSSadaf Ebrahimi 
1611*22dc650dSSadaf Ebrahimi         for (;;) switch (*p++)
1612*22dc650dSSadaf Ebrahimi           {
1613*22dc650dSSadaf Ebrahimi           case XCL_SINGLE:
1614*22dc650dSSadaf Ebrahimi           b = *p++;
1615*22dc650dSSadaf Ebrahimi           while ((*p & 0xc0) == 0x80) p++;
1616*22dc650dSSadaf Ebrahimi           re->start_bitmap[b/8] |= (1u << (b&7));
1617*22dc650dSSadaf Ebrahimi           break;
1618*22dc650dSSadaf Ebrahimi 
1619*22dc650dSSadaf Ebrahimi           case XCL_RANGE:
1620*22dc650dSSadaf Ebrahimi           b = *p++;
1621*22dc650dSSadaf Ebrahimi           while ((*p & 0xc0) == 0x80) p++;
1622*22dc650dSSadaf Ebrahimi           e = *p++;
1623*22dc650dSSadaf Ebrahimi           while ((*p & 0xc0) == 0x80) p++;
1624*22dc650dSSadaf Ebrahimi           for (; b <= e; b++)
1625*22dc650dSSadaf Ebrahimi             re->start_bitmap[b/8] |= (1u << (b&7));
1626*22dc650dSSadaf Ebrahimi           break;
1627*22dc650dSSadaf Ebrahimi 
1628*22dc650dSSadaf Ebrahimi           case XCL_END:
1629*22dc650dSSadaf Ebrahimi           goto HANDLE_CLASSMAP;
1630*22dc650dSSadaf Ebrahimi 
1631*22dc650dSSadaf Ebrahimi           default:
1632*22dc650dSSadaf Ebrahimi           return SSB_UNKNOWN;   /* Internal error, should not occur */
1633*22dc650dSSadaf Ebrahimi           }
1634*22dc650dSSadaf Ebrahimi         }
1635*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
1636*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_WIDE_CHARS */
1637*22dc650dSSadaf Ebrahimi 
1638*22dc650dSSadaf Ebrahimi       /* It seems that the fall through comment must be outside the #ifdef if
1639*22dc650dSSadaf Ebrahimi       it is to avoid the gcc compiler warning. */
1640*22dc650dSSadaf Ebrahimi 
1641*22dc650dSSadaf Ebrahimi       /* Fall through */
1642*22dc650dSSadaf Ebrahimi 
1643*22dc650dSSadaf Ebrahimi       /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1644*22dc650dSSadaf Ebrahimi       in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1645*22dc650dSSadaf Ebrahimi       because it starts a character with a value > 255. In 8-bit non-UTF mode,
1646*22dc650dSSadaf Ebrahimi       there is no difference between CLASS and NCLASS. In all other wide
1647*22dc650dSSadaf Ebrahimi       character modes, set the 0xFF bit to indicate code units >= 255. */
1648*22dc650dSSadaf Ebrahimi 
1649*22dc650dSSadaf Ebrahimi       case OP_NCLASS:
1650*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1651*22dc650dSSadaf Ebrahimi       if (utf)
1652*22dc650dSSadaf Ebrahimi         {
1653*22dc650dSSadaf Ebrahimi         re->start_bitmap[24] |= 0xf0;            /* Bits for 0xc4 - 0xc8 */
1654*22dc650dSSadaf Ebrahimi         memset(re->start_bitmap+25, 0xff, 7);    /* Bits for 0xc9 - 0xff */
1655*22dc650dSSadaf Ebrahimi         }
1656*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH != 8
1657*22dc650dSSadaf Ebrahimi       SET_BIT(0xFF);                             /* For characters >= 255 */
1658*22dc650dSSadaf Ebrahimi #endif
1659*22dc650dSSadaf Ebrahimi       /* Fall through */
1660*22dc650dSSadaf Ebrahimi 
1661*22dc650dSSadaf Ebrahimi       /* Enter here for a positive non-XCLASS. If we have fallen through from
1662*22dc650dSSadaf Ebrahimi       an XCLASS, classmap will already be set; just advance the code pointer.
1663*22dc650dSSadaf Ebrahimi       Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1664*22dc650dSSadaf Ebrahimi 
1665*22dc650dSSadaf Ebrahimi       case OP_CLASS:
1666*22dc650dSSadaf Ebrahimi       if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1667*22dc650dSSadaf Ebrahimi         {
1668*22dc650dSSadaf Ebrahimi         classmap = (uint8_t *)(++tcode);
1669*22dc650dSSadaf Ebrahimi         tcode += 32 / sizeof(PCRE2_UCHAR);
1670*22dc650dSSadaf Ebrahimi         }
1671*22dc650dSSadaf Ebrahimi 
1672*22dc650dSSadaf Ebrahimi       /* When wide characters are supported, classmap may be NULL. In UTF-8
1673*22dc650dSSadaf Ebrahimi       (sic) mode, the bits in a class bit map correspond to character values,
1674*22dc650dSSadaf Ebrahimi       not to byte values. However, the bit map we are constructing is for byte
1675*22dc650dSSadaf Ebrahimi       values. So we have to do a conversion for characters whose code point is
1676*22dc650dSSadaf Ebrahimi       greater than 127. In fact, there are only two possible starting bytes for
1677*22dc650dSSadaf Ebrahimi       characters in the range 128 - 255. */
1678*22dc650dSSadaf Ebrahimi 
1679*22dc650dSSadaf Ebrahimi #if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
1680*22dc650dSSadaf Ebrahimi       HANDLE_CLASSMAP:
1681*22dc650dSSadaf Ebrahimi #endif
1682*22dc650dSSadaf Ebrahimi       if (classmap != NULL)
1683*22dc650dSSadaf Ebrahimi         {
1684*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1685*22dc650dSSadaf Ebrahimi         if (utf)
1686*22dc650dSSadaf Ebrahimi           {
1687*22dc650dSSadaf Ebrahimi           for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1688*22dc650dSSadaf Ebrahimi           for (c = 128; c < 256; c++)
1689*22dc650dSSadaf Ebrahimi             {
1690*22dc650dSSadaf Ebrahimi             if ((classmap[c/8] & (1u << (c&7))) != 0)
1691*22dc650dSSadaf Ebrahimi               {
1692*22dc650dSSadaf Ebrahimi               int d = (c >> 6) | 0xc0;                 /* Set bit for this starter */
1693*22dc650dSSadaf Ebrahimi               re->start_bitmap[d/8] |= (1u << (d&7));  /* and then skip on to the */
1694*22dc650dSSadaf Ebrahimi               c = (c & 0xc0) + 0x40 - 1;               /* next relevant character. */
1695*22dc650dSSadaf Ebrahimi               }
1696*22dc650dSSadaf Ebrahimi             }
1697*22dc650dSSadaf Ebrahimi           }
1698*22dc650dSSadaf Ebrahimi         else
1699*22dc650dSSadaf Ebrahimi #endif
1700*22dc650dSSadaf Ebrahimi         /* In all modes except UTF-8, the two bit maps are compatible. */
1701*22dc650dSSadaf Ebrahimi 
1702*22dc650dSSadaf Ebrahimi           {
1703*22dc650dSSadaf Ebrahimi           for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1704*22dc650dSSadaf Ebrahimi           }
1705*22dc650dSSadaf Ebrahimi         }
1706*22dc650dSSadaf Ebrahimi 
1707*22dc650dSSadaf Ebrahimi       /* Act on what follows the class. For a zero minimum repeat, continue;
1708*22dc650dSSadaf Ebrahimi       otherwise stop processing. */
1709*22dc650dSSadaf Ebrahimi 
1710*22dc650dSSadaf Ebrahimi       switch (*tcode)
1711*22dc650dSSadaf Ebrahimi         {
1712*22dc650dSSadaf Ebrahimi         case OP_CRSTAR:
1713*22dc650dSSadaf Ebrahimi         case OP_CRMINSTAR:
1714*22dc650dSSadaf Ebrahimi         case OP_CRQUERY:
1715*22dc650dSSadaf Ebrahimi         case OP_CRMINQUERY:
1716*22dc650dSSadaf Ebrahimi         case OP_CRPOSSTAR:
1717*22dc650dSSadaf Ebrahimi         case OP_CRPOSQUERY:
1718*22dc650dSSadaf Ebrahimi         tcode++;
1719*22dc650dSSadaf Ebrahimi         break;
1720*22dc650dSSadaf Ebrahimi 
1721*22dc650dSSadaf Ebrahimi         case OP_CRRANGE:
1722*22dc650dSSadaf Ebrahimi         case OP_CRMINRANGE:
1723*22dc650dSSadaf Ebrahimi         case OP_CRPOSRANGE:
1724*22dc650dSSadaf Ebrahimi         if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1725*22dc650dSSadaf Ebrahimi           else try_next = FALSE;
1726*22dc650dSSadaf Ebrahimi         break;
1727*22dc650dSSadaf Ebrahimi 
1728*22dc650dSSadaf Ebrahimi         default:
1729*22dc650dSSadaf Ebrahimi         try_next = FALSE;
1730*22dc650dSSadaf Ebrahimi         break;
1731*22dc650dSSadaf Ebrahimi         }
1732*22dc650dSSadaf Ebrahimi       break; /* End of class handling case */
1733*22dc650dSSadaf Ebrahimi       }      /* End of switch for opcodes */
1734*22dc650dSSadaf Ebrahimi     }        /* End of try_next loop */
1735*22dc650dSSadaf Ebrahimi 
1736*22dc650dSSadaf Ebrahimi   code += GET(code, 1);   /* Advance to next branch */
1737*22dc650dSSadaf Ebrahimi   }
1738*22dc650dSSadaf Ebrahimi while (*code == OP_ALT);
1739*22dc650dSSadaf Ebrahimi 
1740*22dc650dSSadaf Ebrahimi return yield;
1741*22dc650dSSadaf Ebrahimi }
1742*22dc650dSSadaf Ebrahimi 
1743*22dc650dSSadaf Ebrahimi 
1744*22dc650dSSadaf Ebrahimi 
1745*22dc650dSSadaf Ebrahimi /*************************************************
1746*22dc650dSSadaf Ebrahimi *          Study a compiled expression           *
1747*22dc650dSSadaf Ebrahimi *************************************************/
1748*22dc650dSSadaf Ebrahimi 
1749*22dc650dSSadaf Ebrahimi /* This function is handed a compiled expression that it must study to produce
1750*22dc650dSSadaf Ebrahimi information that will speed up the matching.
1751*22dc650dSSadaf Ebrahimi 
1752*22dc650dSSadaf Ebrahimi Argument:
1753*22dc650dSSadaf Ebrahimi   re       points to the compiled expression
1754*22dc650dSSadaf Ebrahimi 
1755*22dc650dSSadaf Ebrahimi Returns:   0 normally; non-zero should never normally occur
1756*22dc650dSSadaf Ebrahimi            1 unknown opcode in set_start_bits
1757*22dc650dSSadaf Ebrahimi            2 missing capturing bracket
1758*22dc650dSSadaf Ebrahimi            3 unknown opcode in find_minlength
1759*22dc650dSSadaf Ebrahimi */
1760*22dc650dSSadaf Ebrahimi 
1761*22dc650dSSadaf Ebrahimi int
PRIV(study)1762*22dc650dSSadaf Ebrahimi PRIV(study)(pcre2_real_code *re)
1763*22dc650dSSadaf Ebrahimi {
1764*22dc650dSSadaf Ebrahimi int count = 0;
1765*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *code;
1766*22dc650dSSadaf Ebrahimi BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1767*22dc650dSSadaf Ebrahimi BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
1768*22dc650dSSadaf Ebrahimi 
1769*22dc650dSSadaf Ebrahimi /* Find start of compiled code */
1770*22dc650dSSadaf Ebrahimi 
1771*22dc650dSSadaf Ebrahimi code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1772*22dc650dSSadaf Ebrahimi   re->name_entry_size * re->name_count;
1773*22dc650dSSadaf Ebrahimi 
1774*22dc650dSSadaf Ebrahimi /* For a pattern that has a first code unit, or a multiline pattern that
1775*22dc650dSSadaf Ebrahimi matches only at "line start", there is no point in seeking a list of starting
1776*22dc650dSSadaf Ebrahimi code units. */
1777*22dc650dSSadaf Ebrahimi 
1778*22dc650dSSadaf Ebrahimi if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1779*22dc650dSSadaf Ebrahimi   {
1780*22dc650dSSadaf Ebrahimi   int depth = 0;
1781*22dc650dSSadaf Ebrahimi   int rc = set_start_bits(re, code, utf, ucp, &depth);
1782*22dc650dSSadaf Ebrahimi   if (rc == SSB_UNKNOWN) return 1;
1783*22dc650dSSadaf Ebrahimi 
1784*22dc650dSSadaf Ebrahimi   /* If a list of starting code units was set up, scan the list to see if only
1785*22dc650dSSadaf Ebrahimi   one or two were listed. Having only one listed is rare because usually a
1786*22dc650dSSadaf Ebrahimi   single starting code unit will have been recognized and PCRE2_FIRSTSET set.
1787*22dc650dSSadaf Ebrahimi   If two are listed, see if they are caseless versions of the same character;
1788*22dc650dSSadaf Ebrahimi   if so we can replace the list with a caseless first code unit. This gives
1789*22dc650dSSadaf Ebrahimi   better performance and is plausibly worth doing for patterns such as [Ww]ord
1790*22dc650dSSadaf Ebrahimi   or (word|WORD). */
1791*22dc650dSSadaf Ebrahimi 
1792*22dc650dSSadaf Ebrahimi   if (rc == SSB_DONE)
1793*22dc650dSSadaf Ebrahimi     {
1794*22dc650dSSadaf Ebrahimi     int i;
1795*22dc650dSSadaf Ebrahimi     int a = -1;
1796*22dc650dSSadaf Ebrahimi     int b = -1;
1797*22dc650dSSadaf Ebrahimi     uint8_t *p = re->start_bitmap;
1798*22dc650dSSadaf Ebrahimi     uint32_t flags = PCRE2_FIRSTMAPSET;
1799*22dc650dSSadaf Ebrahimi 
1800*22dc650dSSadaf Ebrahimi     for (i = 0; i < 256; p++, i += 8)
1801*22dc650dSSadaf Ebrahimi       {
1802*22dc650dSSadaf Ebrahimi       uint8_t x = *p;
1803*22dc650dSSadaf Ebrahimi       if (x != 0)
1804*22dc650dSSadaf Ebrahimi         {
1805*22dc650dSSadaf Ebrahimi         int c;
1806*22dc650dSSadaf Ebrahimi         uint8_t y = x & (~x + 1);   /* Least significant bit */
1807*22dc650dSSadaf Ebrahimi         if (y != x) goto DONE;      /* More than one bit set */
1808*22dc650dSSadaf Ebrahimi 
1809*22dc650dSSadaf Ebrahimi         /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
1810*22dc650dSSadaf Ebrahimi         all wide characters", so we cannot use it here. */
1811*22dc650dSSadaf Ebrahimi 
1812*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
1813*22dc650dSSadaf Ebrahimi         if (i == 248 && x == 0x80) goto DONE;
1814*22dc650dSSadaf Ebrahimi #endif
1815*22dc650dSSadaf Ebrahimi 
1816*22dc650dSSadaf Ebrahimi         /* Compute the character value */
1817*22dc650dSSadaf Ebrahimi 
1818*22dc650dSSadaf Ebrahimi         c = i;
1819*22dc650dSSadaf Ebrahimi         switch (x)
1820*22dc650dSSadaf Ebrahimi           {
1821*22dc650dSSadaf Ebrahimi           case 1:   break;
1822*22dc650dSSadaf Ebrahimi           case 2:   c += 1; break;  case 4:  c += 2; break;
1823*22dc650dSSadaf Ebrahimi           case 8:   c += 3; break;  case 16: c += 4; break;
1824*22dc650dSSadaf Ebrahimi           case 32:  c += 5; break;  case 64: c += 6; break;
1825*22dc650dSSadaf Ebrahimi           case 128: c += 7; break;
1826*22dc650dSSadaf Ebrahimi           }
1827*22dc650dSSadaf Ebrahimi 
1828*22dc650dSSadaf Ebrahimi         /* c contains the code unit value, in the range 0-255. In 8-bit UTF
1829*22dc650dSSadaf Ebrahimi         mode, only values < 128 can be used. In all the other cases, c is a
1830*22dc650dSSadaf Ebrahimi         character value. */
1831*22dc650dSSadaf Ebrahimi 
1832*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
1833*22dc650dSSadaf Ebrahimi         if (utf && c > 127) goto DONE;
1834*22dc650dSSadaf Ebrahimi #endif
1835*22dc650dSSadaf Ebrahimi         if (a < 0) a = c;   /* First one found, save in a */
1836*22dc650dSSadaf Ebrahimi         else if (b < 0)     /* Second one found */
1837*22dc650dSSadaf Ebrahimi           {
1838*22dc650dSSadaf Ebrahimi           int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
1839*22dc650dSSadaf Ebrahimi 
1840*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
1841*22dc650dSSadaf Ebrahimi           if (utf || ucp)
1842*22dc650dSSadaf Ebrahimi             {
1843*22dc650dSSadaf Ebrahimi             if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
1844*22dc650dSSadaf Ebrahimi             if (c > 127) d = UCD_OTHERCASE(c);
1845*22dc650dSSadaf Ebrahimi             }
1846*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
1847*22dc650dSSadaf Ebrahimi 
1848*22dc650dSSadaf Ebrahimi           if (d != a) goto DONE;   /* Not the other case of a */
1849*22dc650dSSadaf Ebrahimi           b = c;                   /* Save second in b */
1850*22dc650dSSadaf Ebrahimi           }
1851*22dc650dSSadaf Ebrahimi         else goto DONE;   /* More than two characters found */
1852*22dc650dSSadaf Ebrahimi         }
1853*22dc650dSSadaf Ebrahimi       }
1854*22dc650dSSadaf Ebrahimi 
1855*22dc650dSSadaf Ebrahimi     /* Replace the start code unit bits with a first code unit, but only if it
1856*22dc650dSSadaf Ebrahimi     is not the same as a required later code unit. This is because a search for
1857*22dc650dSSadaf Ebrahimi     a required code unit starts after an explicit first code unit, but at a
1858*22dc650dSSadaf Ebrahimi     code unit found from the bitmap. Patterns such as /a*a/ don't work
1859*22dc650dSSadaf Ebrahimi     if both the start unit and required unit are the same. */
1860*22dc650dSSadaf Ebrahimi 
1861*22dc650dSSadaf Ebrahimi     if (a >= 0 &&
1862*22dc650dSSadaf Ebrahimi         (
1863*22dc650dSSadaf Ebrahimi         (re->flags & PCRE2_LASTSET) == 0 ||
1864*22dc650dSSadaf Ebrahimi           (
1865*22dc650dSSadaf Ebrahimi           re->last_codeunit != (uint32_t)a &&
1866*22dc650dSSadaf Ebrahimi           (b < 0 || re->last_codeunit != (uint32_t)b)
1867*22dc650dSSadaf Ebrahimi           )
1868*22dc650dSSadaf Ebrahimi         ))
1869*22dc650dSSadaf Ebrahimi       {
1870*22dc650dSSadaf Ebrahimi       re->first_codeunit = a;
1871*22dc650dSSadaf Ebrahimi       flags = PCRE2_FIRSTSET;
1872*22dc650dSSadaf Ebrahimi       if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
1873*22dc650dSSadaf Ebrahimi       }
1874*22dc650dSSadaf Ebrahimi 
1875*22dc650dSSadaf Ebrahimi     DONE:
1876*22dc650dSSadaf Ebrahimi     re->flags |= flags;
1877*22dc650dSSadaf Ebrahimi     }
1878*22dc650dSSadaf Ebrahimi   }
1879*22dc650dSSadaf Ebrahimi 
1880*22dc650dSSadaf Ebrahimi /* Find the minimum length of subject string. If the pattern can match an empty
1881*22dc650dSSadaf Ebrahimi string, the minimum length is already known. If the pattern contains (*ACCEPT)
1882*22dc650dSSadaf Ebrahimi all bets are off, and we don't even try to find a minimum length. If there are
1883*22dc650dSSadaf Ebrahimi more back references than the size of the vector we are going to cache them in,
1884*22dc650dSSadaf Ebrahimi do nothing. A pattern that complicated will probably take a long time to
1885*22dc650dSSadaf Ebrahimi analyze and may in any case turn out to be too complicated. Note that back
1886*22dc650dSSadaf Ebrahimi reference minima are held as 16-bit numbers. */
1887*22dc650dSSadaf Ebrahimi 
1888*22dc650dSSadaf Ebrahimi if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&
1889*22dc650dSSadaf Ebrahimi      re->top_backref <= MAX_CACHE_BACKREF)
1890*22dc650dSSadaf Ebrahimi   {
1891*22dc650dSSadaf Ebrahimi   int min;
1892*22dc650dSSadaf Ebrahimi   int backref_cache[MAX_CACHE_BACKREF+1];
1893*22dc650dSSadaf Ebrahimi   backref_cache[0] = 0;    /* Highest one that is set */
1894*22dc650dSSadaf Ebrahimi   min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
1895*22dc650dSSadaf Ebrahimi   switch(min)
1896*22dc650dSSadaf Ebrahimi     {
1897*22dc650dSSadaf Ebrahimi     case -1:  /* \C in UTF mode or over-complex regex */
1898*22dc650dSSadaf Ebrahimi     break;    /* Leave minlength unchanged (will be zero) */
1899*22dc650dSSadaf Ebrahimi 
1900*22dc650dSSadaf Ebrahimi     case -2:
1901*22dc650dSSadaf Ebrahimi     return 2; /* missing capturing bracket */
1902*22dc650dSSadaf Ebrahimi 
1903*22dc650dSSadaf Ebrahimi     case -3:
1904*22dc650dSSadaf Ebrahimi     return 3; /* unrecognized opcode */
1905*22dc650dSSadaf Ebrahimi 
1906*22dc650dSSadaf Ebrahimi     default:
1907*22dc650dSSadaf Ebrahimi     re->minlength = (min > UINT16_MAX)? UINT16_MAX : min;
1908*22dc650dSSadaf Ebrahimi     break;
1909*22dc650dSSadaf Ebrahimi     }
1910*22dc650dSSadaf Ebrahimi   }
1911*22dc650dSSadaf Ebrahimi 
1912*22dc650dSSadaf Ebrahimi return 0;
1913*22dc650dSSadaf Ebrahimi }
1914*22dc650dSSadaf Ebrahimi 
1915*22dc650dSSadaf Ebrahimi /* End of pcre2_study.c */
1916