1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi * Perl-Compatible Regular Expressions *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi
8*22dc650dSSadaf Ebrahimi Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi Original API code Copyright (c) 1997-2012 University of Cambridge
10*22dc650dSSadaf Ebrahimi New API code Copyright (c) 2016-2023 University of Cambridge
11*22dc650dSSadaf Ebrahimi
12*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
15*22dc650dSSadaf Ebrahimi
16*22dc650dSSadaf Ebrahimi * Redistributions of source code must retain the above copyright notice,
17*22dc650dSSadaf Ebrahimi this list of conditions and the following disclaimer.
18*22dc650dSSadaf Ebrahimi
19*22dc650dSSadaf Ebrahimi * Redistributions in binary form must reproduce the above copyright
20*22dc650dSSadaf Ebrahimi notice, this list of conditions and the following disclaimer in the
21*22dc650dSSadaf Ebrahimi documentation and/or other materials provided with the distribution.
22*22dc650dSSadaf Ebrahimi
23*22dc650dSSadaf Ebrahimi * Neither the name of the University of Cambridge nor the names of its
24*22dc650dSSadaf Ebrahimi contributors may be used to endorse or promote products derived from
25*22dc650dSSadaf Ebrahimi this software without specific prior written permission.
26*22dc650dSSadaf Ebrahimi
27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
38*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
39*22dc650dSSadaf Ebrahimi */
40*22dc650dSSadaf Ebrahimi
41*22dc650dSSadaf Ebrahimi
/* This module contains a single function that scans through a compiled pattern
until it finds a capturing bracket with the given number or, if the number is
negative, an instance of OP_REVERSE or OP_VREVERSE for a lookbehind. The
function is called from pcre2_compile.c and also from pcre2_study.c when
finding the minimum matching length. */
47*22dc650dSSadaf Ebrahimi
48*22dc650dSSadaf Ebrahimi
49*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
50*22dc650dSSadaf Ebrahimi #include "config.h"
51*22dc650dSSadaf Ebrahimi #endif
52*22dc650dSSadaf Ebrahimi
53*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"
54*22dc650dSSadaf Ebrahimi
55*22dc650dSSadaf Ebrahimi
56*22dc650dSSadaf Ebrahimi /*************************************************
57*22dc650dSSadaf Ebrahimi * Scan compiled regex for specific bracket *
58*22dc650dSSadaf Ebrahimi *************************************************/
59*22dc650dSSadaf Ebrahimi
60*22dc650dSSadaf Ebrahimi /*
61*22dc650dSSadaf Ebrahimi Arguments:
62*22dc650dSSadaf Ebrahimi code points to start of expression
63*22dc650dSSadaf Ebrahimi utf TRUE in UTF mode
64*22dc650dSSadaf Ebrahimi number the required bracket number or negative to find a lookbehind
65*22dc650dSSadaf Ebrahimi
66*22dc650dSSadaf Ebrahimi Returns: pointer to the opcode for the bracket, or NULL if not found
67*22dc650dSSadaf Ebrahimi */
68*22dc650dSSadaf Ebrahimi
69*22dc650dSSadaf Ebrahimi PCRE2_SPTR
PRIV(find_bracket)70*22dc650dSSadaf Ebrahimi PRIV(find_bracket)(PCRE2_SPTR code, BOOL utf, int number)
71*22dc650dSSadaf Ebrahimi {
72*22dc650dSSadaf Ebrahimi for (;;)
73*22dc650dSSadaf Ebrahimi {
74*22dc650dSSadaf Ebrahimi PCRE2_UCHAR c = *code;
75*22dc650dSSadaf Ebrahimi
76*22dc650dSSadaf Ebrahimi if (c == OP_END) return NULL;
77*22dc650dSSadaf Ebrahimi
78*22dc650dSSadaf Ebrahimi /* XCLASS is used for classes that cannot be represented just by a bit map.
79*22dc650dSSadaf Ebrahimi This includes negated single high-valued characters. CALLOUT_STR is used for
80*22dc650dSSadaf Ebrahimi callouts with string arguments. In both cases the length in the table is
81*22dc650dSSadaf Ebrahimi zero; the actual length is stored in the compiled code. */
82*22dc650dSSadaf Ebrahimi
83*22dc650dSSadaf Ebrahimi if (c == OP_XCLASS) code += GET(code, 1);
84*22dc650dSSadaf Ebrahimi else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
85*22dc650dSSadaf Ebrahimi
86*22dc650dSSadaf Ebrahimi /* Handle lookbehind */
87*22dc650dSSadaf Ebrahimi
88*22dc650dSSadaf Ebrahimi else if (c == OP_REVERSE || c == OP_VREVERSE)
89*22dc650dSSadaf Ebrahimi {
90*22dc650dSSadaf Ebrahimi if (number < 0) return (PCRE2_UCHAR *)code;
91*22dc650dSSadaf Ebrahimi code += PRIV(OP_lengths)[c];
92*22dc650dSSadaf Ebrahimi }
93*22dc650dSSadaf Ebrahimi
94*22dc650dSSadaf Ebrahimi /* Handle capturing bracket */
95*22dc650dSSadaf Ebrahimi
96*22dc650dSSadaf Ebrahimi else if (c == OP_CBRA || c == OP_SCBRA ||
97*22dc650dSSadaf Ebrahimi c == OP_CBRAPOS || c == OP_SCBRAPOS)
98*22dc650dSSadaf Ebrahimi {
99*22dc650dSSadaf Ebrahimi int n = (int)GET2(code, 1+LINK_SIZE);
100*22dc650dSSadaf Ebrahimi if (n == number) return (PCRE2_UCHAR *)code;
101*22dc650dSSadaf Ebrahimi code += PRIV(OP_lengths)[c];
102*22dc650dSSadaf Ebrahimi }
103*22dc650dSSadaf Ebrahimi
104*22dc650dSSadaf Ebrahimi /* Otherwise, we can get the item's length from the table, except that for
105*22dc650dSSadaf Ebrahimi repeated character types, we have to test for \p and \P, which have an extra
106*22dc650dSSadaf Ebrahimi two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
107*22dc650dSSadaf Ebrahimi must add in its length. */
108*22dc650dSSadaf Ebrahimi
109*22dc650dSSadaf Ebrahimi else
110*22dc650dSSadaf Ebrahimi {
111*22dc650dSSadaf Ebrahimi switch(c)
112*22dc650dSSadaf Ebrahimi {
113*22dc650dSSadaf Ebrahimi case OP_TYPESTAR:
114*22dc650dSSadaf Ebrahimi case OP_TYPEMINSTAR:
115*22dc650dSSadaf Ebrahimi case OP_TYPEPLUS:
116*22dc650dSSadaf Ebrahimi case OP_TYPEMINPLUS:
117*22dc650dSSadaf Ebrahimi case OP_TYPEQUERY:
118*22dc650dSSadaf Ebrahimi case OP_TYPEMINQUERY:
119*22dc650dSSadaf Ebrahimi case OP_TYPEPOSSTAR:
120*22dc650dSSadaf Ebrahimi case OP_TYPEPOSPLUS:
121*22dc650dSSadaf Ebrahimi case OP_TYPEPOSQUERY:
122*22dc650dSSadaf Ebrahimi if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
123*22dc650dSSadaf Ebrahimi break;
124*22dc650dSSadaf Ebrahimi
125*22dc650dSSadaf Ebrahimi case OP_TYPEUPTO:
126*22dc650dSSadaf Ebrahimi case OP_TYPEMINUPTO:
127*22dc650dSSadaf Ebrahimi case OP_TYPEEXACT:
128*22dc650dSSadaf Ebrahimi case OP_TYPEPOSUPTO:
129*22dc650dSSadaf Ebrahimi if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
130*22dc650dSSadaf Ebrahimi code += 2;
131*22dc650dSSadaf Ebrahimi break;
132*22dc650dSSadaf Ebrahimi
133*22dc650dSSadaf Ebrahimi case OP_MARK:
134*22dc650dSSadaf Ebrahimi case OP_COMMIT_ARG:
135*22dc650dSSadaf Ebrahimi case OP_PRUNE_ARG:
136*22dc650dSSadaf Ebrahimi case OP_SKIP_ARG:
137*22dc650dSSadaf Ebrahimi case OP_THEN_ARG:
138*22dc650dSSadaf Ebrahimi code += code[1];
139*22dc650dSSadaf Ebrahimi break;
140*22dc650dSSadaf Ebrahimi }
141*22dc650dSSadaf Ebrahimi
142*22dc650dSSadaf Ebrahimi /* Add in the fixed length from the table */
143*22dc650dSSadaf Ebrahimi
144*22dc650dSSadaf Ebrahimi code += PRIV(OP_lengths)[c];
145*22dc650dSSadaf Ebrahimi
146*22dc650dSSadaf Ebrahimi /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be
147*22dc650dSSadaf Ebrahimi followed by a multi-byte character. The length in the table is a minimum, so
148*22dc650dSSadaf Ebrahimi we have to arrange to skip the extra bytes. */
149*22dc650dSSadaf Ebrahimi
150*22dc650dSSadaf Ebrahimi #ifdef MAYBE_UTF_MULTI
151*22dc650dSSadaf Ebrahimi if (utf) switch(c)
152*22dc650dSSadaf Ebrahimi {
153*22dc650dSSadaf Ebrahimi case OP_CHAR:
154*22dc650dSSadaf Ebrahimi case OP_CHARI:
155*22dc650dSSadaf Ebrahimi case OP_NOT:
156*22dc650dSSadaf Ebrahimi case OP_NOTI:
157*22dc650dSSadaf Ebrahimi case OP_EXACT:
158*22dc650dSSadaf Ebrahimi case OP_EXACTI:
159*22dc650dSSadaf Ebrahimi case OP_NOTEXACT:
160*22dc650dSSadaf Ebrahimi case OP_NOTEXACTI:
161*22dc650dSSadaf Ebrahimi case OP_UPTO:
162*22dc650dSSadaf Ebrahimi case OP_UPTOI:
163*22dc650dSSadaf Ebrahimi case OP_NOTUPTO:
164*22dc650dSSadaf Ebrahimi case OP_NOTUPTOI:
165*22dc650dSSadaf Ebrahimi case OP_MINUPTO:
166*22dc650dSSadaf Ebrahimi case OP_MINUPTOI:
167*22dc650dSSadaf Ebrahimi case OP_NOTMINUPTO:
168*22dc650dSSadaf Ebrahimi case OP_NOTMINUPTOI:
169*22dc650dSSadaf Ebrahimi case OP_POSUPTO:
170*22dc650dSSadaf Ebrahimi case OP_POSUPTOI:
171*22dc650dSSadaf Ebrahimi case OP_NOTPOSUPTO:
172*22dc650dSSadaf Ebrahimi case OP_NOTPOSUPTOI:
173*22dc650dSSadaf Ebrahimi case OP_STAR:
174*22dc650dSSadaf Ebrahimi case OP_STARI:
175*22dc650dSSadaf Ebrahimi case OP_NOTSTAR:
176*22dc650dSSadaf Ebrahimi case OP_NOTSTARI:
177*22dc650dSSadaf Ebrahimi case OP_MINSTAR:
178*22dc650dSSadaf Ebrahimi case OP_MINSTARI:
179*22dc650dSSadaf Ebrahimi case OP_NOTMINSTAR:
180*22dc650dSSadaf Ebrahimi case OP_NOTMINSTARI:
181*22dc650dSSadaf Ebrahimi case OP_POSSTAR:
182*22dc650dSSadaf Ebrahimi case OP_POSSTARI:
183*22dc650dSSadaf Ebrahimi case OP_NOTPOSSTAR:
184*22dc650dSSadaf Ebrahimi case OP_NOTPOSSTARI:
185*22dc650dSSadaf Ebrahimi case OP_PLUS:
186*22dc650dSSadaf Ebrahimi case OP_PLUSI:
187*22dc650dSSadaf Ebrahimi case OP_NOTPLUS:
188*22dc650dSSadaf Ebrahimi case OP_NOTPLUSI:
189*22dc650dSSadaf Ebrahimi case OP_MINPLUS:
190*22dc650dSSadaf Ebrahimi case OP_MINPLUSI:
191*22dc650dSSadaf Ebrahimi case OP_NOTMINPLUS:
192*22dc650dSSadaf Ebrahimi case OP_NOTMINPLUSI:
193*22dc650dSSadaf Ebrahimi case OP_POSPLUS:
194*22dc650dSSadaf Ebrahimi case OP_POSPLUSI:
195*22dc650dSSadaf Ebrahimi case OP_NOTPOSPLUS:
196*22dc650dSSadaf Ebrahimi case OP_NOTPOSPLUSI:
197*22dc650dSSadaf Ebrahimi case OP_QUERY:
198*22dc650dSSadaf Ebrahimi case OP_QUERYI:
199*22dc650dSSadaf Ebrahimi case OP_NOTQUERY:
200*22dc650dSSadaf Ebrahimi case OP_NOTQUERYI:
201*22dc650dSSadaf Ebrahimi case OP_MINQUERY:
202*22dc650dSSadaf Ebrahimi case OP_MINQUERYI:
203*22dc650dSSadaf Ebrahimi case OP_NOTMINQUERY:
204*22dc650dSSadaf Ebrahimi case OP_NOTMINQUERYI:
205*22dc650dSSadaf Ebrahimi case OP_POSQUERY:
206*22dc650dSSadaf Ebrahimi case OP_POSQUERYI:
207*22dc650dSSadaf Ebrahimi case OP_NOTPOSQUERY:
208*22dc650dSSadaf Ebrahimi case OP_NOTPOSQUERYI:
209*22dc650dSSadaf Ebrahimi if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
210*22dc650dSSadaf Ebrahimi break;
211*22dc650dSSadaf Ebrahimi }
212*22dc650dSSadaf Ebrahimi #else
213*22dc650dSSadaf Ebrahimi (void)(utf); /* Keep compiler happy by referencing function argument */
214*22dc650dSSadaf Ebrahimi #endif /* MAYBE_UTF_MULTI */
215*22dc650dSSadaf Ebrahimi }
216*22dc650dSSadaf Ebrahimi }
217*22dc650dSSadaf Ebrahimi }
218*22dc650dSSadaf Ebrahimi
219*22dc650dSSadaf Ebrahimi /* End of pcre2_find_bracket.c */
220