xref: /aosp_15_r20/external/pcre/src/pcre2_find_bracket.c (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi *      Perl-Compatible Regular Expressions       *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi 
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi 
8*22dc650dSSadaf Ebrahimi                        Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi      Original API code Copyright (c) 1997-2012 University of Cambridge
10*22dc650dSSadaf Ebrahimi           New API code Copyright (c) 2016-2023 University of Cambridge
11*22dc650dSSadaf Ebrahimi 
12*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
15*22dc650dSSadaf Ebrahimi 
16*22dc650dSSadaf Ebrahimi     * Redistributions of source code must retain the above copyright notice,
17*22dc650dSSadaf Ebrahimi       this list of conditions and the following disclaimer.
18*22dc650dSSadaf Ebrahimi 
19*22dc650dSSadaf Ebrahimi     * Redistributions in binary form must reproduce the above copyright
20*22dc650dSSadaf Ebrahimi       notice, this list of conditions and the following disclaimer in the
21*22dc650dSSadaf Ebrahimi       documentation and/or other materials provided with the distribution.
22*22dc650dSSadaf Ebrahimi 
23*22dc650dSSadaf Ebrahimi     * Neither the name of the University of Cambridge nor the names of its
24*22dc650dSSadaf Ebrahimi       contributors may be used to endorse or promote products derived from
25*22dc650dSSadaf Ebrahimi       this software without specific prior written permission.
26*22dc650dSSadaf Ebrahimi 
27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
38*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
39*22dc650dSSadaf Ebrahimi */
40*22dc650dSSadaf Ebrahimi 
41*22dc650dSSadaf Ebrahimi 
42*22dc650dSSadaf Ebrahimi /* This module contains a single function that scans through a compiled pattern
43*22dc650dSSadaf Ebrahimi until it finds a capturing bracket with the given number, or, if the number is
44*22dc650dSSadaf Ebrahimi negative, an instance of OP_REVERSE or OP_VREVERSE for a lookbehind. The
45*22dc650dSSadaf Ebrahimi function is called from pcre2_compile.c and also from pcre2_study.c when
46*22dc650dSSadaf Ebrahimi finding the minimum matching length. */
47*22dc650dSSadaf Ebrahimi 
48*22dc650dSSadaf Ebrahimi 
49*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
50*22dc650dSSadaf Ebrahimi #include "config.h"
51*22dc650dSSadaf Ebrahimi #endif
52*22dc650dSSadaf Ebrahimi 
53*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"
54*22dc650dSSadaf Ebrahimi 
55*22dc650dSSadaf Ebrahimi 
56*22dc650dSSadaf Ebrahimi /*************************************************
57*22dc650dSSadaf Ebrahimi *    Scan compiled regex for specific bracket    *
58*22dc650dSSadaf Ebrahimi *************************************************/
59*22dc650dSSadaf Ebrahimi 
60*22dc650dSSadaf Ebrahimi /*
61*22dc650dSSadaf Ebrahimi Arguments:
62*22dc650dSSadaf Ebrahimi   code        points to start of expression
63*22dc650dSSadaf Ebrahimi   utf         TRUE in UTF mode
64*22dc650dSSadaf Ebrahimi   number      the required bracket number or negative to find a lookbehind
65*22dc650dSSadaf Ebrahimi 
66*22dc650dSSadaf Ebrahimi Returns:      pointer to the opcode for the bracket, or NULL if not found
67*22dc650dSSadaf Ebrahimi */
68*22dc650dSSadaf Ebrahimi 
69*22dc650dSSadaf Ebrahimi PCRE2_SPTR
PRIV(find_bracket)70*22dc650dSSadaf Ebrahimi PRIV(find_bracket)(PCRE2_SPTR code, BOOL utf, int number)
71*22dc650dSSadaf Ebrahimi {
72*22dc650dSSadaf Ebrahimi for (;;)
73*22dc650dSSadaf Ebrahimi   {
74*22dc650dSSadaf Ebrahimi   PCRE2_UCHAR c = *code;
75*22dc650dSSadaf Ebrahimi 
76*22dc650dSSadaf Ebrahimi   if (c == OP_END) return NULL;
77*22dc650dSSadaf Ebrahimi 
78*22dc650dSSadaf Ebrahimi   /* XCLASS is used for classes that cannot be represented just by a bit map.
79*22dc650dSSadaf Ebrahimi   This includes negated single high-valued characters. CALLOUT_STR is used for
80*22dc650dSSadaf Ebrahimi   callouts with string arguments. In both cases the length in the table is
81*22dc650dSSadaf Ebrahimi   zero; the actual length is stored in the compiled code. */
82*22dc650dSSadaf Ebrahimi 
83*22dc650dSSadaf Ebrahimi   if (c == OP_XCLASS) code += GET(code, 1);
84*22dc650dSSadaf Ebrahimi     else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
85*22dc650dSSadaf Ebrahimi 
86*22dc650dSSadaf Ebrahimi   /* Handle lookbehind */
87*22dc650dSSadaf Ebrahimi 
88*22dc650dSSadaf Ebrahimi   else if (c == OP_REVERSE || c == OP_VREVERSE)
89*22dc650dSSadaf Ebrahimi     {
90*22dc650dSSadaf Ebrahimi     if (number < 0) return (PCRE2_UCHAR *)code;
91*22dc650dSSadaf Ebrahimi     code += PRIV(OP_lengths)[c];
92*22dc650dSSadaf Ebrahimi     }
93*22dc650dSSadaf Ebrahimi 
94*22dc650dSSadaf Ebrahimi   /* Handle capturing bracket */
95*22dc650dSSadaf Ebrahimi 
96*22dc650dSSadaf Ebrahimi   else if (c == OP_CBRA || c == OP_SCBRA ||
97*22dc650dSSadaf Ebrahimi            c == OP_CBRAPOS || c == OP_SCBRAPOS)
98*22dc650dSSadaf Ebrahimi     {
99*22dc650dSSadaf Ebrahimi     int n = (int)GET2(code, 1+LINK_SIZE);
100*22dc650dSSadaf Ebrahimi     if (n == number) return (PCRE2_UCHAR *)code;
101*22dc650dSSadaf Ebrahimi     code += PRIV(OP_lengths)[c];
102*22dc650dSSadaf Ebrahimi     }
103*22dc650dSSadaf Ebrahimi 
104*22dc650dSSadaf Ebrahimi   /* Otherwise, we can get the item's length from the table, except that for
105*22dc650dSSadaf Ebrahimi   repeated character types, we have to test for \p and \P, which have an extra
106*22dc650dSSadaf Ebrahimi   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
107*22dc650dSSadaf Ebrahimi   must add in its length. */
108*22dc650dSSadaf Ebrahimi 
109*22dc650dSSadaf Ebrahimi   else
110*22dc650dSSadaf Ebrahimi     {
111*22dc650dSSadaf Ebrahimi     switch(c)
112*22dc650dSSadaf Ebrahimi       {
113*22dc650dSSadaf Ebrahimi       case OP_TYPESTAR:
114*22dc650dSSadaf Ebrahimi       case OP_TYPEMINSTAR:
115*22dc650dSSadaf Ebrahimi       case OP_TYPEPLUS:
116*22dc650dSSadaf Ebrahimi       case OP_TYPEMINPLUS:
117*22dc650dSSadaf Ebrahimi       case OP_TYPEQUERY:
118*22dc650dSSadaf Ebrahimi       case OP_TYPEMINQUERY:
119*22dc650dSSadaf Ebrahimi       case OP_TYPEPOSSTAR:
120*22dc650dSSadaf Ebrahimi       case OP_TYPEPOSPLUS:
121*22dc650dSSadaf Ebrahimi       case OP_TYPEPOSQUERY:
122*22dc650dSSadaf Ebrahimi       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
123*22dc650dSSadaf Ebrahimi       break;
124*22dc650dSSadaf Ebrahimi 
125*22dc650dSSadaf Ebrahimi       case OP_TYPEUPTO:
126*22dc650dSSadaf Ebrahimi       case OP_TYPEMINUPTO:
127*22dc650dSSadaf Ebrahimi       case OP_TYPEEXACT:
128*22dc650dSSadaf Ebrahimi       case OP_TYPEPOSUPTO:
129*22dc650dSSadaf Ebrahimi       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
130*22dc650dSSadaf Ebrahimi         code += 2;
131*22dc650dSSadaf Ebrahimi       break;
132*22dc650dSSadaf Ebrahimi 
133*22dc650dSSadaf Ebrahimi       case OP_MARK:
134*22dc650dSSadaf Ebrahimi       case OP_COMMIT_ARG:
135*22dc650dSSadaf Ebrahimi       case OP_PRUNE_ARG:
136*22dc650dSSadaf Ebrahimi       case OP_SKIP_ARG:
137*22dc650dSSadaf Ebrahimi       case OP_THEN_ARG:
138*22dc650dSSadaf Ebrahimi       code += code[1];
139*22dc650dSSadaf Ebrahimi       break;
140*22dc650dSSadaf Ebrahimi       }
141*22dc650dSSadaf Ebrahimi 
142*22dc650dSSadaf Ebrahimi     /* Add in the fixed length from the table */
143*22dc650dSSadaf Ebrahimi 
144*22dc650dSSadaf Ebrahimi     code += PRIV(OP_lengths)[c];
145*22dc650dSSadaf Ebrahimi 
146*22dc650dSSadaf Ebrahimi   /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be
147*22dc650dSSadaf Ebrahimi   followed by a multi-byte character. The length in the table is a minimum, so
148*22dc650dSSadaf Ebrahimi   we have to arrange to skip the extra bytes. */
149*22dc650dSSadaf Ebrahimi 
150*22dc650dSSadaf Ebrahimi #ifdef MAYBE_UTF_MULTI
151*22dc650dSSadaf Ebrahimi     if (utf) switch(c)
152*22dc650dSSadaf Ebrahimi       {
153*22dc650dSSadaf Ebrahimi       case OP_CHAR:
154*22dc650dSSadaf Ebrahimi       case OP_CHARI:
155*22dc650dSSadaf Ebrahimi       case OP_NOT:
156*22dc650dSSadaf Ebrahimi       case OP_NOTI:
157*22dc650dSSadaf Ebrahimi       case OP_EXACT:
158*22dc650dSSadaf Ebrahimi       case OP_EXACTI:
159*22dc650dSSadaf Ebrahimi       case OP_NOTEXACT:
160*22dc650dSSadaf Ebrahimi       case OP_NOTEXACTI:
161*22dc650dSSadaf Ebrahimi       case OP_UPTO:
162*22dc650dSSadaf Ebrahimi       case OP_UPTOI:
163*22dc650dSSadaf Ebrahimi       case OP_NOTUPTO:
164*22dc650dSSadaf Ebrahimi       case OP_NOTUPTOI:
165*22dc650dSSadaf Ebrahimi       case OP_MINUPTO:
166*22dc650dSSadaf Ebrahimi       case OP_MINUPTOI:
167*22dc650dSSadaf Ebrahimi       case OP_NOTMINUPTO:
168*22dc650dSSadaf Ebrahimi       case OP_NOTMINUPTOI:
169*22dc650dSSadaf Ebrahimi       case OP_POSUPTO:
170*22dc650dSSadaf Ebrahimi       case OP_POSUPTOI:
171*22dc650dSSadaf Ebrahimi       case OP_NOTPOSUPTO:
172*22dc650dSSadaf Ebrahimi       case OP_NOTPOSUPTOI:
173*22dc650dSSadaf Ebrahimi       case OP_STAR:
174*22dc650dSSadaf Ebrahimi       case OP_STARI:
175*22dc650dSSadaf Ebrahimi       case OP_NOTSTAR:
176*22dc650dSSadaf Ebrahimi       case OP_NOTSTARI:
177*22dc650dSSadaf Ebrahimi       case OP_MINSTAR:
178*22dc650dSSadaf Ebrahimi       case OP_MINSTARI:
179*22dc650dSSadaf Ebrahimi       case OP_NOTMINSTAR:
180*22dc650dSSadaf Ebrahimi       case OP_NOTMINSTARI:
181*22dc650dSSadaf Ebrahimi       case OP_POSSTAR:
182*22dc650dSSadaf Ebrahimi       case OP_POSSTARI:
183*22dc650dSSadaf Ebrahimi       case OP_NOTPOSSTAR:
184*22dc650dSSadaf Ebrahimi       case OP_NOTPOSSTARI:
185*22dc650dSSadaf Ebrahimi       case OP_PLUS:
186*22dc650dSSadaf Ebrahimi       case OP_PLUSI:
187*22dc650dSSadaf Ebrahimi       case OP_NOTPLUS:
188*22dc650dSSadaf Ebrahimi       case OP_NOTPLUSI:
189*22dc650dSSadaf Ebrahimi       case OP_MINPLUS:
190*22dc650dSSadaf Ebrahimi       case OP_MINPLUSI:
191*22dc650dSSadaf Ebrahimi       case OP_NOTMINPLUS:
192*22dc650dSSadaf Ebrahimi       case OP_NOTMINPLUSI:
193*22dc650dSSadaf Ebrahimi       case OP_POSPLUS:
194*22dc650dSSadaf Ebrahimi       case OP_POSPLUSI:
195*22dc650dSSadaf Ebrahimi       case OP_NOTPOSPLUS:
196*22dc650dSSadaf Ebrahimi       case OP_NOTPOSPLUSI:
197*22dc650dSSadaf Ebrahimi       case OP_QUERY:
198*22dc650dSSadaf Ebrahimi       case OP_QUERYI:
199*22dc650dSSadaf Ebrahimi       case OP_NOTQUERY:
200*22dc650dSSadaf Ebrahimi       case OP_NOTQUERYI:
201*22dc650dSSadaf Ebrahimi       case OP_MINQUERY:
202*22dc650dSSadaf Ebrahimi       case OP_MINQUERYI:
203*22dc650dSSadaf Ebrahimi       case OP_NOTMINQUERY:
204*22dc650dSSadaf Ebrahimi       case OP_NOTMINQUERYI:
205*22dc650dSSadaf Ebrahimi       case OP_POSQUERY:
206*22dc650dSSadaf Ebrahimi       case OP_POSQUERYI:
207*22dc650dSSadaf Ebrahimi       case OP_NOTPOSQUERY:
208*22dc650dSSadaf Ebrahimi       case OP_NOTPOSQUERYI:
209*22dc650dSSadaf Ebrahimi       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
210*22dc650dSSadaf Ebrahimi       break;
211*22dc650dSSadaf Ebrahimi       }
212*22dc650dSSadaf Ebrahimi #else
213*22dc650dSSadaf Ebrahimi     (void)(utf);  /* Keep compiler happy by referencing function argument */
214*22dc650dSSadaf Ebrahimi #endif  /* MAYBE_UTF_MULTI */
215*22dc650dSSadaf Ebrahimi     }
216*22dc650dSSadaf Ebrahimi   }
217*22dc650dSSadaf Ebrahimi }
218*22dc650dSSadaf Ebrahimi 
219*22dc650dSSadaf Ebrahimi /* End of pcre2_find_bracket.c */
220