xref: /aosp_15_r20/external/pcre/src/pcre2_compile.c (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi *      Perl-Compatible Regular Expressions       *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi 
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi 
8*22dc650dSSadaf Ebrahimi                        Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi      Original API code Copyright (c) 1997-2012 University of Cambridge
10*22dc650dSSadaf Ebrahimi           New API code Copyright (c) 2016-2024 University of Cambridge
11*22dc650dSSadaf Ebrahimi 
12*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
15*22dc650dSSadaf Ebrahimi 
16*22dc650dSSadaf Ebrahimi     * Redistributions of source code must retain the above copyright notice,
17*22dc650dSSadaf Ebrahimi       this list of conditions and the following disclaimer.
18*22dc650dSSadaf Ebrahimi 
19*22dc650dSSadaf Ebrahimi     * Redistributions in binary form must reproduce the above copyright
20*22dc650dSSadaf Ebrahimi       notice, this list of conditions and the following disclaimer in the
21*22dc650dSSadaf Ebrahimi       documentation and/or other materials provided with the distribution.
22*22dc650dSSadaf Ebrahimi 
23*22dc650dSSadaf Ebrahimi     * Neither the name of the University of Cambridge nor the names of its
24*22dc650dSSadaf Ebrahimi       contributors may be used to endorse or promote products derived from
25*22dc650dSSadaf Ebrahimi       this software without specific prior written permission.
26*22dc650dSSadaf Ebrahimi 
27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
38*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
39*22dc650dSSadaf Ebrahimi */
40*22dc650dSSadaf Ebrahimi 
41*22dc650dSSadaf Ebrahimi 
42*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
43*22dc650dSSadaf Ebrahimi #include "config.h"
44*22dc650dSSadaf Ebrahimi #endif
45*22dc650dSSadaf Ebrahimi 
46*22dc650dSSadaf Ebrahimi #define NLBLOCK cb             /* Block containing newline information */
47*22dc650dSSadaf Ebrahimi #define PSSTART start_pattern  /* Field containing processed string start */
48*22dc650dSSadaf Ebrahimi #define PSEND   end_pattern    /* Field containing processed string end */
49*22dc650dSSadaf Ebrahimi 
50*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"
51*22dc650dSSadaf Ebrahimi 
52*22dc650dSSadaf Ebrahimi /* In rare error cases debugging might require calling pcre2_printint(). */
53*22dc650dSSadaf Ebrahimi 
54*22dc650dSSadaf Ebrahimi #if 0
55*22dc650dSSadaf Ebrahimi #ifdef EBCDIC
56*22dc650dSSadaf Ebrahimi #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57*22dc650dSSadaf Ebrahimi #else
58*22dc650dSSadaf Ebrahimi #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59*22dc650dSSadaf Ebrahimi #endif
60*22dc650dSSadaf Ebrahimi #include "pcre2_printint.c"
61*22dc650dSSadaf Ebrahimi #define DEBUG_CALL_PRINTINT
62*22dc650dSSadaf Ebrahimi #endif
63*22dc650dSSadaf Ebrahimi 
64*22dc650dSSadaf Ebrahimi /* Other debugging code can be enabled by these defines. */
65*22dc650dSSadaf Ebrahimi 
66*22dc650dSSadaf Ebrahimi /* #define DEBUG_SHOW_CAPTURES */
67*22dc650dSSadaf Ebrahimi /* #define DEBUG_SHOW_PARSED */
68*22dc650dSSadaf Ebrahimi 
69*22dc650dSSadaf Ebrahimi /* There are a few things that vary with different code unit sizes. Handle them
70*22dc650dSSadaf Ebrahimi by defining macros in order to minimize #if usage. */
71*22dc650dSSadaf Ebrahimi 
72*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
73*22dc650dSSadaf Ebrahimi #define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5  /* Two items: a string and a count (presumably its length) */
74*22dc650dSSadaf Ebrahimi #define XDIGIT(c)                xdigitab[c]  /* No range check on c in the 8-bit case */
75*22dc650dSSadaf Ebrahimi 
76*22dc650dSSadaf Ebrahimi #else  /* Either 16-bit or 32-bit */
77*22dc650dSSadaf Ebrahimi #define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)  /* 0xff (the non-hex marker) when MAX_255(c) is false */
78*22dc650dSSadaf Ebrahimi 
79*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 16
80*22dc650dSSadaf Ebrahimi #define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6  /* Two items: string, count */
81*22dc650dSSadaf Ebrahimi 
82*22dc650dSSadaf Ebrahimi #else  /* 32-bit */
83*22dc650dSSadaf Ebrahimi #define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6  /* Two items: string, count */
84*22dc650dSSadaf Ebrahimi #endif  /* PCRE2_CODE_UNIT_WIDTH == 16 */
85*22dc650dSSadaf Ebrahimi #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
86*22dc650dSSadaf Ebrahimi 
87*22dc650dSSadaf Ebrahimi /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88*22dc650dSSadaf Ebrahimi consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89*22dc650dSSadaf Ebrahimi them will be able to (i.e. assume a 64-bit world). */
90*22dc650dSSadaf Ebrahimi 
91*22dc650dSSadaf Ebrahimi #if PCRE2_SIZE_MAX <= UINT32_MAX
92*22dc650dSSadaf Ebrahimi #define PUTOFFSET(s,p) *p++ = s  /* Store s at p, then advance p by one element */
93*22dc650dSSadaf Ebrahimi #define GETOFFSET(s,p) s = *p++  /* Load s from p, then advance p by one element */
94*22dc650dSSadaf Ebrahimi #define GETPLUSOFFSET(s,p) s = *(++p)  /* Advance p by one element, then load s from it */
95*22dc650dSSadaf Ebrahimi #define READPLUSOFFSET(s,p) s = p[1]  /* Load s from the element after p; p is unchanged */
96*22dc650dSSadaf Ebrahimi #define SKIPOFFSET(p) p++  /* Step p over one stored offset */
97*22dc650dSSadaf Ebrahimi #define SIZEOFFSET 1  /* Number of uint32_t elements per stored offset */
98*22dc650dSSadaf Ebrahimi #else  /* Two uint32_t elements per offset; these versions expand to brace blocks, not expressions */
99*22dc650dSSadaf Ebrahimi #define PUTOFFSET(s,p) \
100*22dc650dSSadaf Ebrahimi   { *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); }  /* High 32 bits first, then low; p advances by 2 */
101*22dc650dSSadaf Ebrahimi #define GETOFFSET(s,p) \
102*22dc650dSSadaf Ebrahimi   { s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; }  /* Reads p[0],p[1]; p advances by 2 */
103*22dc650dSSadaf Ebrahimi #define GETPLUSOFFSET(s,p) \
104*22dc650dSSadaf Ebrahimi   { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; }  /* Reads p[1],p[2]; p ends on the low word */
105*22dc650dSSadaf Ebrahimi #define READPLUSOFFSET(s,p) \
106*22dc650dSSadaf Ebrahimi   { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; }  /* Reads p[1],p[2]; p is unchanged */
107*22dc650dSSadaf Ebrahimi #define SKIPOFFSET(p) p += 2  /* Step p over one stored offset */
108*22dc650dSSadaf Ebrahimi #define SIZEOFFSET 2  /* Number of uint32_t elements per stored offset */
109*22dc650dSSadaf Ebrahimi #endif  /* PCRE2_SIZE_MAX <= UINT32_MAX */
110*22dc650dSSadaf Ebrahimi 
111*22dc650dSSadaf Ebrahimi /* Macros for manipulating elements of the parsed pattern vector. NOTE(review): the arguments are expanded without parentheses, so pass only simple operands. */
112*22dc650dSSadaf Ebrahimi 
113*22dc650dSSadaf Ebrahimi #define META_CODE(x)   (x & 0xffff0000u)  /* Top 16 bits of an element */
114*22dc650dSSadaf Ebrahimi #define META_DATA(x)   (x & 0x0000ffffu)  /* Low 16 bits of an element */
115*22dc650dSSadaf Ebrahimi #define META_DIFF(x,y) ((x-y)>>16)        /* Difference of two elements, shifted down 16 bits */
116*22dc650dSSadaf Ebrahimi 
117*22dc650dSSadaf Ebrahimi /* Forward declarations of static functions, to allow mutual recursion */
118*22dc650dSSadaf Ebrahimi 
119*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
120*22dc650dSSadaf Ebrahimi static unsigned int
121*22dc650dSSadaf Ebrahimi   add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, uint32_t,
122*22dc650dSSadaf Ebrahimi     compile_block *, const uint32_t *, unsigned int);
123*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
124*22dc650dSSadaf Ebrahimi 
125*22dc650dSSadaf Ebrahimi static int
126*22dc650dSSadaf Ebrahimi   compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
127*22dc650dSSadaf Ebrahimi     uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
128*22dc650dSSadaf Ebrahimi     open_capitem *, compile_block *, PCRE2_SIZE *);
129*22dc650dSSadaf Ebrahimi 
130*22dc650dSSadaf Ebrahimi static int
131*22dc650dSSadaf Ebrahimi   get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
132*22dc650dSSadaf Ebrahimi     compile_block *);
133*22dc650dSSadaf Ebrahimi 
134*22dc650dSSadaf Ebrahimi static BOOL
135*22dc650dSSadaf Ebrahimi   set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136*22dc650dSSadaf Ebrahimi     compile_block *);
137*22dc650dSSadaf Ebrahimi 
138*22dc650dSSadaf Ebrahimi static int
139*22dc650dSSadaf Ebrahimi   check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140*22dc650dSSadaf Ebrahimi     compile_block *, int *);
141*22dc650dSSadaf Ebrahimi 
142*22dc650dSSadaf Ebrahimi 
143*22dc650dSSadaf Ebrahimi /*************************************************
144*22dc650dSSadaf Ebrahimi *      Code parameters and static tables         *
145*22dc650dSSadaf Ebrahimi *************************************************/
146*22dc650dSSadaf Ebrahimi 
147*22dc650dSSadaf Ebrahimi #define MAX_GROUP_NUMBER   65535u  /* 0xffff */
148*22dc650dSSadaf Ebrahimi #define MAX_REPEAT_COUNT   65535u  /* 0xffff */
149*22dc650dSSadaf Ebrahimi #define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)  /* 65536: one above the largest explicit count */
150*22dc650dSSadaf Ebrahimi 
151*22dc650dSSadaf Ebrahimi /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
152*22dc650dSSadaf Ebrahimi different ways in the different pattern scans. The parsing and group-
153*22dc650dSSadaf Ebrahimi identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
154*22dc650dSSadaf Ebrahimi aligned for this. Having defined the size in code units, we set up
155*22dc650dSSadaf Ebrahimi C16_WORK_SIZE as the number of elements in the 16-bit vector.
156*22dc650dSSadaf Ebrahimi 
157*22dc650dSSadaf Ebrahimi During the first compiling phase, when determining how much memory is required,
158*22dc650dSSadaf Ebrahimi the regex is partly compiled into this space, but the compiled parts are
159*22dc650dSSadaf Ebrahimi discarded as soon as they can be, so that hopefully there will never be an
160*22dc650dSSadaf Ebrahimi overrun. The code does, however, check for an overrun, which can occur for
161*22dc650dSSadaf Ebrahimi pathological patterns. The size of the workspace depends on LINK_SIZE because
162*22dc650dSSadaf Ebrahimi the length of compiled items varies with this.
163*22dc650dSSadaf Ebrahimi 
164*22dc650dSSadaf Ebrahimi In the real compile phase, this workspace is not currently used. */
165*22dc650dSSadaf Ebrahimi 
166*22dc650dSSadaf Ebrahimi #define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
167*22dc650dSSadaf Ebrahimi 
168*22dc650dSSadaf Ebrahimi #define C16_WORK_SIZE \
169*22dc650dSSadaf Ebrahimi   ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))  /* Same byte size, counted in uint16_t elements */
170*22dc650dSSadaf Ebrahimi 
171*22dc650dSSadaf Ebrahimi /* A uint32_t vector is used for caching information about the size of
172*22dc650dSSadaf Ebrahimi capturing groups, to improve performance. A default is created on the stack of
173*22dc650dSSadaf Ebrahimi this size. */
174*22dc650dSSadaf Ebrahimi 
175*22dc650dSSadaf Ebrahimi #define GROUPINFO_DEFAULT_SIZE 256  /* Presumably a count of uint32_t elements -- confirm at the declaration */
176*22dc650dSSadaf Ebrahimi 
177*22dc650dSSadaf Ebrahimi /* The overrun tests check for a slightly smaller size so that they detect the
178*22dc650dSSadaf Ebrahimi overrun before it actually does run off the end of the data block. */
179*22dc650dSSadaf Ebrahimi 
180*22dc650dSSadaf Ebrahimi #define WORK_SIZE_SAFETY_MARGIN (100)  /* Presumably in code units -- confirm where the overrun test is made */
181*22dc650dSSadaf Ebrahimi 
182*22dc650dSSadaf Ebrahimi /* This value determines the size of the initial vector that is used for
183*22dc650dSSadaf Ebrahimi remembering named groups during the pre-compile. It is allocated on the stack,
184*22dc650dSSadaf Ebrahimi but if it is too small, it is expanded, in a similar way to the workspace. The
185*22dc650dSSadaf Ebrahimi value is the number of slots in the list. */
186*22dc650dSSadaf Ebrahimi 
187*22dc650dSSadaf Ebrahimi #define NAMED_GROUP_LIST_SIZE  20  /* Slots, not bytes */
188*22dc650dSSadaf Ebrahimi 
189*22dc650dSSadaf Ebrahimi /* The pre-compiling pass over the pattern creates a parsed pattern in a vector
190*22dc650dSSadaf Ebrahimi of uint32_t. For short patterns this lives on the stack, with this size. Heap
191*22dc650dSSadaf Ebrahimi memory is used for longer patterns. */
192*22dc650dSSadaf Ebrahimi 
193*22dc650dSSadaf Ebrahimi #define PARSED_PATTERN_DEFAULT_SIZE 1024  /* Presumably a count of uint32_t elements -- confirm at the declaration */
194*22dc650dSSadaf Ebrahimi 
195*22dc650dSSadaf Ebrahimi /* Maximum length value to check against when making sure that the variable
196*22dc650dSSadaf Ebrahimi that holds the compiled pattern length does not overflow. We make it a bit less
197*22dc650dSSadaf Ebrahimi than INT_MAX to allow for adding in group terminating code units, so that we
198*22dc650dSSadaf Ebrahimi don't have to check them every time. */
199*22dc650dSSadaf Ebrahimi 
200*22dc650dSSadaf Ebrahimi #define OFLOW_MAX (INT_MAX - 20)  /* Leaves 20 of headroom below INT_MAX */
201*22dc650dSSadaf Ebrahimi 
202*22dc650dSSadaf Ebrahimi /* Code values for parsed patterns, which are stored in a vector of 32-bit
203*22dc650dSSadaf Ebrahimi unsigned ints. Values less than META_END are literal data values. The coding
204*22dc650dSSadaf Ebrahimi for identifying the item is in the top 16-bits, leaving 16 bits for the
205*22dc650dSSadaf Ebrahimi additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
206*22dc650dSSadaf Ebrahimi macros are used to manipulate parsed pattern elements.
207*22dc650dSSadaf Ebrahimi 
208*22dc650dSSadaf Ebrahimi NOTE: When these definitions are changed, the table of extra lengths for each
209*22dc650dSSadaf Ebrahimi code (meta_extra_lengths, just below) must be updated to remain in step. */
210*22dc650dSSadaf Ebrahimi 
211*22dc650dSSadaf Ebrahimi #define META_END              0x80000000u  /* End of pattern */
212*22dc650dSSadaf Ebrahimi 
213*22dc650dSSadaf Ebrahimi #define META_ALT              0x80010000u  /* alternation */
214*22dc650dSSadaf Ebrahimi #define META_ATOMIC           0x80020000u  /* atomic group */
215*22dc650dSSadaf Ebrahimi #define META_BACKREF          0x80030000u  /* Back ref */
216*22dc650dSSadaf Ebrahimi #define META_BACKREF_BYNAME   0x80040000u  /* \k'name' */
217*22dc650dSSadaf Ebrahimi #define META_BIGVALUE         0x80050000u  /* Next is a literal >= META_END */
218*22dc650dSSadaf Ebrahimi #define META_CALLOUT_NUMBER   0x80060000u  /* (?C with numerical argument */
219*22dc650dSSadaf Ebrahimi #define META_CALLOUT_STRING   0x80070000u  /* (?C with string argument */
220*22dc650dSSadaf Ebrahimi #define META_CAPTURE          0x80080000u  /* Capturing parenthesis */
221*22dc650dSSadaf Ebrahimi #define META_CIRCUMFLEX       0x80090000u  /* ^ metacharacter */
222*22dc650dSSadaf Ebrahimi #define META_CLASS            0x800a0000u  /* start non-empty class */
223*22dc650dSSadaf Ebrahimi #define META_CLASS_EMPTY      0x800b0000u  /* empty class */
224*22dc650dSSadaf Ebrahimi #define META_CLASS_EMPTY_NOT  0x800c0000u  /* negative empty class */
225*22dc650dSSadaf Ebrahimi #define META_CLASS_END        0x800d0000u  /* end of non-empty class */
226*22dc650dSSadaf Ebrahimi #define META_CLASS_NOT        0x800e0000u  /* start non-empty negative class */
227*22dc650dSSadaf Ebrahimi #define META_COND_ASSERT      0x800f0000u  /* (?(?assertion)... */
228*22dc650dSSadaf Ebrahimi #define META_COND_DEFINE      0x80100000u  /* (?(DEFINE)... */
229*22dc650dSSadaf Ebrahimi #define META_COND_NAME        0x80110000u  /* (?(<name>)... */
230*22dc650dSSadaf Ebrahimi #define META_COND_NUMBER      0x80120000u  /* (?(digits)... */
231*22dc650dSSadaf Ebrahimi #define META_COND_RNAME       0x80130000u  /* (?(R&name)... */
232*22dc650dSSadaf Ebrahimi #define META_COND_RNUMBER     0x80140000u  /* (?(Rdigits)... */
233*22dc650dSSadaf Ebrahimi #define META_COND_VERSION     0x80150000u  /* (?(VERSION<op>x.y)... */
234*22dc650dSSadaf Ebrahimi #define META_DOLLAR           0x80160000u  /* $ metacharacter */
235*22dc650dSSadaf Ebrahimi #define META_DOT              0x80170000u  /* . metacharacter */
236*22dc650dSSadaf Ebrahimi #define META_ESCAPE           0x80180000u  /* \d and friends */
237*22dc650dSSadaf Ebrahimi #define META_KET              0x80190000u  /* closing parenthesis */
238*22dc650dSSadaf Ebrahimi #define META_NOCAPTURE        0x801a0000u  /* no capture parens */
239*22dc650dSSadaf Ebrahimi #define META_OPTIONS          0x801b0000u  /* (?i) and friends */
240*22dc650dSSadaf Ebrahimi #define META_POSIX            0x801c0000u  /* POSIX class item */
241*22dc650dSSadaf Ebrahimi #define META_POSIX_NEG        0x801d0000u  /* negative POSIX class item */
242*22dc650dSSadaf Ebrahimi #define META_RANGE_ESCAPED    0x801e0000u  /* range with at least one escape */
243*22dc650dSSadaf Ebrahimi #define META_RANGE_LITERAL    0x801f0000u  /* range defined literally */
244*22dc650dSSadaf Ebrahimi #define META_RECURSE          0x80200000u  /* Recursion */
245*22dc650dSSadaf Ebrahimi #define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
246*22dc650dSSadaf Ebrahimi #define META_SCRIPT_RUN       0x80220000u  /* (*script_run:...) */
247*22dc650dSSadaf Ebrahimi 
248*22dc650dSSadaf Ebrahimi /* These must be kept together to make it easy to check that an assertion
249*22dc650dSSadaf Ebrahimi is present where expected in a conditional group. */
250*22dc650dSSadaf Ebrahimi 
251*22dc650dSSadaf Ebrahimi #define META_LOOKAHEAD        0x80230000u  /* (?= */
252*22dc650dSSadaf Ebrahimi #define META_LOOKAHEADNOT     0x80240000u  /* (?! */
253*22dc650dSSadaf Ebrahimi #define META_LOOKBEHIND       0x80250000u  /* (?<= */
254*22dc650dSSadaf Ebrahimi #define META_LOOKBEHINDNOT    0x80260000u  /* (?<! */
255*22dc650dSSadaf Ebrahimi 
256*22dc650dSSadaf Ebrahimi /* These cannot be conditions */
257*22dc650dSSadaf Ebrahimi 
258*22dc650dSSadaf Ebrahimi #define META_LOOKAHEAD_NA     0x80270000u  /* (*napla: */
259*22dc650dSSadaf Ebrahimi #define META_LOOKBEHIND_NA    0x80280000u  /* (*naplb: */
260*22dc650dSSadaf Ebrahimi 
261*22dc650dSSadaf Ebrahimi /* These must be kept in this order, with consecutive values, and the _ARG
262*22dc650dSSadaf Ebrahimi versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
263*22dc650dSSadaf Ebrahimi versions. */
264*22dc650dSSadaf Ebrahimi 
265*22dc650dSSadaf Ebrahimi #define META_MARK             0x80290000u  /* (*MARK) */
266*22dc650dSSadaf Ebrahimi #define META_ACCEPT           0x802a0000u  /* (*ACCEPT) */
267*22dc650dSSadaf Ebrahimi #define META_FAIL             0x802b0000u  /* (*FAIL) */
268*22dc650dSSadaf Ebrahimi #define META_COMMIT           0x802c0000u  /* These               */
269*22dc650dSSadaf Ebrahimi #define META_COMMIT_ARG       0x802d0000u  /*   pairs             */
270*22dc650dSSadaf Ebrahimi #define META_PRUNE            0x802e0000u  /*     must            */
271*22dc650dSSadaf Ebrahimi #define META_PRUNE_ARG        0x802f0000u  /*       be            */
272*22dc650dSSadaf Ebrahimi #define META_SKIP             0x80300000u  /*         kept        */
273*22dc650dSSadaf Ebrahimi #define META_SKIP_ARG         0x80310000u  /*           in        */
274*22dc650dSSadaf Ebrahimi #define META_THEN             0x80320000u  /*             this    */
275*22dc650dSSadaf Ebrahimi #define META_THEN_ARG         0x80330000u  /*               order */
276*22dc650dSSadaf Ebrahimi 
277*22dc650dSSadaf Ebrahimi /* These must be kept in groups of adjacent 3 values, and all together. */
278*22dc650dSSadaf Ebrahimi 
279*22dc650dSSadaf Ebrahimi #define META_ASTERISK         0x80340000u  /* *  */
280*22dc650dSSadaf Ebrahimi #define META_ASTERISK_PLUS    0x80350000u  /* *+ */
281*22dc650dSSadaf Ebrahimi #define META_ASTERISK_QUERY   0x80360000u  /* *? */
282*22dc650dSSadaf Ebrahimi #define META_PLUS             0x80370000u  /* +  */
283*22dc650dSSadaf Ebrahimi #define META_PLUS_PLUS        0x80380000u  /* ++ */
284*22dc650dSSadaf Ebrahimi #define META_PLUS_QUERY       0x80390000u  /* +? */
285*22dc650dSSadaf Ebrahimi #define META_QUERY            0x803a0000u  /* ?  */
286*22dc650dSSadaf Ebrahimi #define META_QUERY_PLUS       0x803b0000u  /* ?+ */
287*22dc650dSSadaf Ebrahimi #define META_QUERY_QUERY      0x803c0000u  /* ?? */
288*22dc650dSSadaf Ebrahimi #define META_MINMAX           0x803d0000u  /* {n,m}  repeat */
289*22dc650dSSadaf Ebrahimi #define META_MINMAX_PLUS      0x803e0000u  /* {n,m}+ repeat */
290*22dc650dSSadaf Ebrahimi #define META_MINMAX_QUERY     0x803f0000u  /* {n,m}? repeat */
291*22dc650dSSadaf Ebrahimi 
292*22dc650dSSadaf Ebrahimi #define META_FIRST_QUANTIFIER META_ASTERISK
293*22dc650dSSadaf Ebrahimi #define META_LAST_QUANTIFIER  META_MINMAX_QUERY
294*22dc650dSSadaf Ebrahimi 
295*22dc650dSSadaf Ebrahimi /* This is a special "meta code" that is used only to distinguish (*asr: from
296*22dc650dSSadaf Ebrahimi (*sr: in the table of alphabetic assertions. It is never stored in the parsed
297*22dc650dSSadaf Ebrahimi pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
298*22dc650dSSadaf Ebrahimi therefore no need for it to have a length entry, so use a high value. */
299*22dc650dSSadaf Ebrahimi 
300*22dc650dSSadaf Ebrahimi #define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301*22dc650dSSadaf Ebrahimi 
302*22dc650dSSadaf Ebrahimi /* Table of extra lengths for each of the meta codes. Must be kept in step with
303*22dc650dSSadaf Ebrahimi the definitions above. For some items these values are a basic length to which
304*22dc650dSSadaf Ebrahimi a variable amount has to be added. */
305*22dc650dSSadaf Ebrahimi 
306*22dc650dSSadaf Ebrahimi static unsigned char meta_extra_lengths[] = {  /* 64 entries, in the order of the META_ values from META_END to META_MINMAX_QUERY */
307*22dc650dSSadaf Ebrahimi   0,             /* META_END */
308*22dc650dSSadaf Ebrahimi   0,             /* META_ALT */
309*22dc650dSSadaf Ebrahimi   0,             /* META_ATOMIC */
310*22dc650dSSadaf Ebrahimi   0,             /* META_BACKREF - more if group is >= 10 */
311*22dc650dSSadaf Ebrahimi   1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
312*22dc650dSSadaf Ebrahimi   1,             /* META_BIGVALUE */
313*22dc650dSSadaf Ebrahimi   3,             /* META_CALLOUT_NUMBER */
314*22dc650dSSadaf Ebrahimi   3+SIZEOFFSET,  /* META_CALLOUT_STRING */
315*22dc650dSSadaf Ebrahimi   0,             /* META_CAPTURE */
316*22dc650dSSadaf Ebrahimi   0,             /* META_CIRCUMFLEX */
317*22dc650dSSadaf Ebrahimi   0,             /* META_CLASS */
318*22dc650dSSadaf Ebrahimi   0,             /* META_CLASS_EMPTY */
319*22dc650dSSadaf Ebrahimi   0,             /* META_CLASS_EMPTY_NOT */
320*22dc650dSSadaf Ebrahimi   0,             /* META_CLASS_END */
321*22dc650dSSadaf Ebrahimi   0,             /* META_CLASS_NOT */
322*22dc650dSSadaf Ebrahimi   0,             /* META_COND_ASSERT */
323*22dc650dSSadaf Ebrahimi   SIZEOFFSET,    /* META_COND_DEFINE */
324*22dc650dSSadaf Ebrahimi   1+SIZEOFFSET,  /* META_COND_NAME */
325*22dc650dSSadaf Ebrahimi   1+SIZEOFFSET,  /* META_COND_NUMBER */
326*22dc650dSSadaf Ebrahimi   1+SIZEOFFSET,  /* META_COND_RNAME */
327*22dc650dSSadaf Ebrahimi   1+SIZEOFFSET,  /* META_COND_RNUMBER */
328*22dc650dSSadaf Ebrahimi   3,             /* META_COND_VERSION */
329*22dc650dSSadaf Ebrahimi   0,             /* META_DOLLAR */
330*22dc650dSSadaf Ebrahimi   0,             /* META_DOT */
331*22dc650dSSadaf Ebrahimi   0,             /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332*22dc650dSSadaf Ebrahimi   0,             /* META_KET */
333*22dc650dSSadaf Ebrahimi   0,             /* META_NOCAPTURE */
334*22dc650dSSadaf Ebrahimi   1,             /* META_OPTIONS */
335*22dc650dSSadaf Ebrahimi   1,             /* META_POSIX */
336*22dc650dSSadaf Ebrahimi   1,             /* META_POSIX_NEG */
337*22dc650dSSadaf Ebrahimi   0,             /* META_RANGE_ESCAPED */
338*22dc650dSSadaf Ebrahimi   0,             /* META_RANGE_LITERAL */
339*22dc650dSSadaf Ebrahimi   SIZEOFFSET,    /* META_RECURSE */
340*22dc650dSSadaf Ebrahimi   1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
341*22dc650dSSadaf Ebrahimi   0,             /* META_SCRIPT_RUN */
342*22dc650dSSadaf Ebrahimi   0,             /* META_LOOKAHEAD */
343*22dc650dSSadaf Ebrahimi   0,             /* META_LOOKAHEADNOT */
344*22dc650dSSadaf Ebrahimi   SIZEOFFSET,    /* META_LOOKBEHIND */
345*22dc650dSSadaf Ebrahimi   SIZEOFFSET,    /* META_LOOKBEHINDNOT */
346*22dc650dSSadaf Ebrahimi   0,             /* META_LOOKAHEAD_NA */
347*22dc650dSSadaf Ebrahimi   SIZEOFFSET,    /* META_LOOKBEHIND_NA */
348*22dc650dSSadaf Ebrahimi   1,             /* META_MARK - plus the string length */
349*22dc650dSSadaf Ebrahimi   0,             /* META_ACCEPT */
350*22dc650dSSadaf Ebrahimi   0,             /* META_FAIL */
351*22dc650dSSadaf Ebrahimi   0,             /* META_COMMIT */
352*22dc650dSSadaf Ebrahimi   1,             /* META_COMMIT_ARG - plus the string length */
353*22dc650dSSadaf Ebrahimi   0,             /* META_PRUNE */
354*22dc650dSSadaf Ebrahimi   1,             /* META_PRUNE_ARG - plus the string length */
355*22dc650dSSadaf Ebrahimi   0,             /* META_SKIP */
356*22dc650dSSadaf Ebrahimi   1,             /* META_SKIP_ARG - plus the string length */
357*22dc650dSSadaf Ebrahimi   0,             /* META_THEN */
358*22dc650dSSadaf Ebrahimi   1,             /* META_THEN_ARG - plus the string length */
359*22dc650dSSadaf Ebrahimi   0,             /* META_ASTERISK */
360*22dc650dSSadaf Ebrahimi   0,             /* META_ASTERISK_PLUS */
361*22dc650dSSadaf Ebrahimi   0,             /* META_ASTERISK_QUERY */
362*22dc650dSSadaf Ebrahimi   0,             /* META_PLUS */
363*22dc650dSSadaf Ebrahimi   0,             /* META_PLUS_PLUS */
364*22dc650dSSadaf Ebrahimi   0,             /* META_PLUS_QUERY */
365*22dc650dSSadaf Ebrahimi   0,             /* META_QUERY */
366*22dc650dSSadaf Ebrahimi   0,             /* META_QUERY_PLUS */
367*22dc650dSSadaf Ebrahimi   0,             /* META_QUERY_QUERY */
368*22dc650dSSadaf Ebrahimi   2,             /* META_MINMAX */
369*22dc650dSSadaf Ebrahimi   2,             /* META_MINMAX_PLUS */
370*22dc650dSSadaf Ebrahimi   2              /* META_MINMAX_QUERY */
371*22dc650dSSadaf Ebrahimi };  /* NOTE(review): looks read-only, so it could be declared const -- confirm nothing in the file writes to it */
372*22dc650dSSadaf Ebrahimi 
373*22dc650dSSadaf Ebrahimi /* Types for skipping parts of a parsed pattern. */
374*22dc650dSSadaf Ebrahimi 
375*22dc650dSSadaf Ebrahimi enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };  /* Values 0, 1, 2; presumably skip to the next alternative, the end of a class, or the closing ket -- confirm at the use site */
376*22dc650dSSadaf Ebrahimi 
377*22dc650dSSadaf Ebrahimi /* Macro for setting individual bits in class bitmaps. It took some
378*22dc650dSSadaf Ebrahimi experimenting to figure out how to stop gcc 5.3.0 from warning with
379*22dc650dSSadaf Ebrahimi -Wconversion. This version gets a warning:
380*22dc650dSSadaf Ebrahimi 
381*22dc650dSSadaf Ebrahimi   #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))
382*22dc650dSSadaf Ebrahimi 
383*22dc650dSSadaf Ebrahimi Let's hope the apparently less efficient version isn't actually so bad if the
384*22dc650dSSadaf Ebrahimi compiler is clever with identical subexpressions. */
385*22dc650dSSadaf Ebrahimi 
386*22dc650dSSadaf Ebrahimi #define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1u << ((b)&7)))  /* Sets bit (b&7) of byte a[b/8]; b is expanded three times, so it must have no side effects */
387*22dc650dSSadaf Ebrahimi 
388*22dc650dSSadaf Ebrahimi /* Values and flags for the unsigned xxcuflags variables that accompany xxcu
389*22dc650dSSadaf Ebrahimi variables, which are concerned with first and required code units. A value
390*22dc650dSSadaf Ebrahimi greater than or equal to REQ_NONE means "no code unit set"; otherwise the
391*22dc650dSSadaf Ebrahimi matching xxcu variable is set, and the low valued bits are relevant. */
392*22dc650dSSadaf Ebrahimi 
393*22dc650dSSadaf Ebrahimi #define REQ_UNSET     0xffffffffu  /* Not yet found anything */
394*22dc650dSSadaf Ebrahimi #define REQ_NONE      0xfffffffeu  /* Found something that is not a fixed character */
395*22dc650dSSadaf Ebrahimi #define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
396*22dc650dSSadaf Ebrahimi #define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */
397*22dc650dSSadaf Ebrahimi 
398*22dc650dSSadaf Ebrahimi /* These flags are used in the groupinfo vector. */
399*22dc650dSSadaf Ebrahimi 
400*22dc650dSSadaf Ebrahimi #define GI_SET_FIXED_LENGTH    0x80000000u  /* Bit 31; presumably "a fixed length has been recorded" -- confirm */
401*22dc650dSSadaf Ebrahimi #define GI_NOT_FIXED_LENGTH    0x40000000u  /* Bit 30; presumably "group is not of fixed length" -- confirm */
402*22dc650dSSadaf Ebrahimi #define GI_FIXED_LENGTH_MASK   0x0000ffffu  /* Low 16 bits */
403*22dc650dSSadaf Ebrahimi 
404*22dc650dSSadaf Ebrahimi /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
405*22dc650dSSadaf Ebrahimi and is fast (a good compiler can turn it into a subtraction and unsigned
406*22dc650dSSadaf Ebrahimi comparison). */
407*22dc650dSSadaf Ebrahimi 
408*22dc650dSSadaf Ebrahimi #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)  /* x is expanded twice, so it must have no side effects */
409*22dc650dSSadaf Ebrahimi 
410*22dc650dSSadaf Ebrahimi /* Table to identify hex digits. The tables in chartables are dependent on the
411*22dc650dSSadaf Ebrahimi locale, and may mark arbitrary characters as digits. We want to recognize only
412*22dc650dSSadaf Ebrahimi 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
413*22dc650dSSadaf Ebrahimi costs 256 bytes, but it is a lot faster than doing character value tests (at
414*22dc650dSSadaf Ebrahimi least in some simple cases I timed), and in some applications one wants PCRE2
415*22dc650dSSadaf Ebrahimi to compile efficiently as well as match efficiently. The value in the table is
416*22dc650dSSadaf Ebrahimi the binary hex digit value, or 0xff for non-hex digits. */
417*22dc650dSSadaf Ebrahimi 
418*22dc650dSSadaf Ebrahimi /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
419*22dc650dSSadaf Ebrahimi UTF-8 mode. */
420*22dc650dSSadaf Ebrahimi 
421*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
422*22dc650dSSadaf Ebrahimi static const uint8_t xdigitab[] =
423*22dc650dSSadaf Ebrahimi   {
424*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
425*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
426*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
427*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
428*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
429*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
430*22dc650dSSadaf Ebrahimi   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
431*22dc650dSSadaf Ebrahimi   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
432*22dc650dSSadaf Ebrahimi   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
433*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
434*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
435*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
436*22dc650dSSadaf Ebrahimi   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
437*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
438*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
439*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
440*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
441*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
442*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
443*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
444*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
445*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
446*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
447*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
448*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
449*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
450*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
451*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
452*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
453*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
454*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
455*22dc650dSSadaf Ebrahimi   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
456*22dc650dSSadaf Ebrahimi 
457*22dc650dSSadaf Ebrahimi #else
458*22dc650dSSadaf Ebrahimi 
459*22dc650dSSadaf Ebrahimi /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
460*22dc650dSSadaf Ebrahimi 
static const uint8_t xdigitab[] =
  {
  /* One entry per EBCDIC character code 0-255. A hexadecimal digit holds its
  numeric value (0x00-0x0f); every other code holds 0xff. */

  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x00-0x07 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x08-0x0f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x10-0x17 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x18-0x1f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x20-0x27 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x28-0x2f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x30-0x37 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x38-0x3f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x40-0x47: starts at space */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x48-0x4f: ends at |       */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x50-0x57: starts at &     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x58-0x5f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x60-0x67: starts at -     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x68-0x6f: ends at ?       */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x70-0x77 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x78-0x7f: ends at "       */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 0x80-0x87: a to g at 0x81  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x88-0x8f: starts at h     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x90-0x97: ends at p       */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0x98-0x9f: starts at q     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0xa0-0xa7: ends at x       */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0xa8-0xaf: starts at y     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0xb0-0xb7: starts at ^     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0xb8-0xbf */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 0xc0-0xc7: { then A to G   */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0xc8-0xcf: starts at H     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0xd0-0xd7: } ... P         */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0xd8-0xdf: starts at Q     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0xe0-0xe7: backslash ... X */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0xe8-0xef: starts at Y     */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0xf0-0xf7: 0 to 7          */
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff  /* 0xf8-0xff: 8, 9, then 255  */
  };
495*22dc650dSSadaf Ebrahimi #endif  /* EBCDIC */
496*22dc650dSSadaf Ebrahimi 
497*22dc650dSSadaf Ebrahimi 
498*22dc650dSSadaf Ebrahimi /* Table for handling alphanumeric escaped characters. Positive returns are
499*22dc650dSSadaf Ebrahimi simple data values; negative values are for special things like \d and so on.
500*22dc650dSSadaf Ebrahimi Zero means further processing is needed (for things like \x), or the escape is
501*22dc650dSSadaf Ebrahimi invalid. */
502*22dc650dSSadaf Ebrahimi 
503*22dc650dSSadaf Ebrahimi /* This is the "normal" table for ASCII systems or for EBCDIC systems running
504*22dc650dSSadaf Ebrahimi in UTF-8 mode. It runs from '0' to 'z'. */
505*22dc650dSSadaf Ebrahimi 
506*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
/* First and last characters covered by the escapes[] table below. */
#define ESCAPES_FIRST       CHAR_0
#define ESCAPES_LAST        CHAR_z

/* Subtract 32 from a character code. The argument is parenthesized so that an
expression argument (for example a conditional) is evaluated as a whole before
the offset is applied. */
#define UPPER_CASE(c)       ((c)-32)
510*22dc650dSSadaf Ebrahimi 
511*22dc650dSSadaf Ebrahimi static const short int escapes[] = {
512*22dc650dSSadaf Ebrahimi      0,                       0,
513*22dc650dSSadaf Ebrahimi      0,                       0,
514*22dc650dSSadaf Ebrahimi      0,                       0,
515*22dc650dSSadaf Ebrahimi      0,                       0,
516*22dc650dSSadaf Ebrahimi      0,                       0,
517*22dc650dSSadaf Ebrahimi      CHAR_COLON,              CHAR_SEMICOLON,
518*22dc650dSSadaf Ebrahimi      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
519*22dc650dSSadaf Ebrahimi      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
520*22dc650dSSadaf Ebrahimi      CHAR_COMMERCIAL_AT,      -ESC_A,
521*22dc650dSSadaf Ebrahimi      -ESC_B,                  -ESC_C,
522*22dc650dSSadaf Ebrahimi      -ESC_D,                  -ESC_E,
523*22dc650dSSadaf Ebrahimi      0,                       -ESC_G,
524*22dc650dSSadaf Ebrahimi      -ESC_H,                  0,
525*22dc650dSSadaf Ebrahimi      0,                       -ESC_K,
526*22dc650dSSadaf Ebrahimi      0,                       0,
527*22dc650dSSadaf Ebrahimi      -ESC_N,                  0,
528*22dc650dSSadaf Ebrahimi      -ESC_P,                  -ESC_Q,
529*22dc650dSSadaf Ebrahimi      -ESC_R,                  -ESC_S,
530*22dc650dSSadaf Ebrahimi      0,                       0,
531*22dc650dSSadaf Ebrahimi      -ESC_V,                  -ESC_W,
532*22dc650dSSadaf Ebrahimi      -ESC_X,                  0,
533*22dc650dSSadaf Ebrahimi      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
534*22dc650dSSadaf Ebrahimi      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
535*22dc650dSSadaf Ebrahimi      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
536*22dc650dSSadaf Ebrahimi      CHAR_GRAVE_ACCENT,       CHAR_BEL,
537*22dc650dSSadaf Ebrahimi      -ESC_b,                  0,
538*22dc650dSSadaf Ebrahimi      -ESC_d,                  CHAR_ESC,
539*22dc650dSSadaf Ebrahimi      CHAR_FF,                 0,
540*22dc650dSSadaf Ebrahimi      -ESC_h,                  0,
541*22dc650dSSadaf Ebrahimi      0,                       -ESC_k,
542*22dc650dSSadaf Ebrahimi      0,                       0,
543*22dc650dSSadaf Ebrahimi      CHAR_LF,                 0,
544*22dc650dSSadaf Ebrahimi      -ESC_p,                  0,
545*22dc650dSSadaf Ebrahimi      CHAR_CR,                 -ESC_s,
546*22dc650dSSadaf Ebrahimi      CHAR_HT,                 0,
547*22dc650dSSadaf Ebrahimi      -ESC_v,                  -ESC_w,
548*22dc650dSSadaf Ebrahimi      0,                       0,
549*22dc650dSSadaf Ebrahimi      -ESC_z
550*22dc650dSSadaf Ebrahimi };
551*22dc650dSSadaf Ebrahimi 
552*22dc650dSSadaf Ebrahimi #else
553*22dc650dSSadaf Ebrahimi 
554*22dc650dSSadaf Ebrahimi /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
555*22dc650dSSadaf Ebrahimi It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
556*22dc650dSSadaf Ebrahimi is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
557*22dc650dSSadaf Ebrahimi because it is defined as 'a', which of course picks up the ASCII value. */
558*22dc650dSSadaf Ebrahimi 
/* ESCAPES_FIRST and ESCAPES_LAST bound the range covered by the escapes[]
table below. In both variants of UPPER_CASE the argument is parenthesized so
that an expression argument is evaluated as a whole before the offset is
applied. */

#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
#define ESCAPES_FIRST       CHAR_a
#define ESCAPES_LAST        CHAR_9
#define UPPER_CASE(c)       ((c)+64)
#else                              /* Testing in an ASCII environment */
#define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
#define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
#define UPPER_CASE(c)  ((c)-32)
#endif
568*22dc650dSSadaf Ebrahimi 
569*22dc650dSSadaf Ebrahimi static const short int escapes[] = {
570*22dc650dSSadaf Ebrahimi /*  80 */         CHAR_BEL, -ESC_b,       0, -ESC_d, CHAR_ESC, CHAR_FF,      0,
571*22dc650dSSadaf Ebrahimi /*  88 */ -ESC_h,        0,      0,     '{',      0,        0,       0,      0,
572*22dc650dSSadaf Ebrahimi /*  90 */      0,        0, -ESC_k,       0,      0,  CHAR_LF,       0, -ESC_p,
573*22dc650dSSadaf Ebrahimi /*  98 */      0,  CHAR_CR,      0,     '}',      0,        0,       0,      0,
574*22dc650dSSadaf Ebrahimi /*  A0 */      0,      '~', -ESC_s, CHAR_HT,      0,   -ESC_v,  -ESC_w,      0,
575*22dc650dSSadaf Ebrahimi /*  A8 */      0,   -ESC_z,      0,       0,      0,      '[',       0,      0,
576*22dc650dSSadaf Ebrahimi /*  B0 */      0,        0,      0,       0,      0,        0,       0,      0,
577*22dc650dSSadaf Ebrahimi /*  B8 */      0,        0,      0,       0,      0,      ']',     '=',    '-',
578*22dc650dSSadaf Ebrahimi /*  C0 */    '{',   -ESC_A, -ESC_B,  -ESC_C, -ESC_D,   -ESC_E,       0, -ESC_G,
579*22dc650dSSadaf Ebrahimi /*  C8 */ -ESC_H,        0,      0,       0,      0,        0,       0,      0,
580*22dc650dSSadaf Ebrahimi /*  D0 */    '}',        0, -ESC_K,       0,      0,   -ESC_N,       0, -ESC_P,
581*22dc650dSSadaf Ebrahimi /*  D8 */ -ESC_Q,   -ESC_R,      0,       0,      0,        0,       0,      0,
582*22dc650dSSadaf Ebrahimi /*  E0 */   '\\',        0, -ESC_S,       0,      0,   -ESC_V,  -ESC_W, -ESC_X,
583*22dc650dSSadaf Ebrahimi /*  E8 */      0,   -ESC_Z,      0,       0,      0,        0,       0,      0,
584*22dc650dSSadaf Ebrahimi /*  F0 */      0,        0,      0,       0,      0,        0,       0,      0,
585*22dc650dSSadaf Ebrahimi /*  F8 */      0,        0
586*22dc650dSSadaf Ebrahimi };
587*22dc650dSSadaf Ebrahimi 
588*22dc650dSSadaf Ebrahimi /* We also need a table of characters that may follow \c in an EBCDIC
589*22dc650dSSadaf Ebrahimi environment for characters 0-31. */
590*22dc650dSSadaf Ebrahimi 
/* Read-only lookup string of 32 characters, '@' through '_', plus the
terminating NUL. Declared const so that it cannot be modified by accident. */
static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
592*22dc650dSSadaf Ebrahimi 
593*22dc650dSSadaf Ebrahimi #endif   /* EBCDIC */
594*22dc650dSSadaf Ebrahimi 
595*22dc650dSSadaf Ebrahimi 
596*22dc650dSSadaf Ebrahimi /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
597*22dc650dSSadaf Ebrahimi searched linearly. Put all the names into a single string, in order to reduce
598*22dc650dSSadaf Ebrahimi the number of relocations when a shared library is dynamically linked. The
599*22dc650dSSadaf Ebrahimi string is built from string macros so that it works in UTF-8 mode on EBCDIC
600*22dc650dSSadaf Ebrahimi platforms. */
601*22dc650dSSadaf Ebrahimi 
/* Describes one entry of the (*VERB) name table. */
typedef struct verbitem {
  /* Number of characters in the verb's name */
  unsigned int len;
  /* Base META_ code for the verb */
  uint32_t meta;
  /* Whether an argument is required, optional, or absent */
  int has_arg;
} verbitem;
607*22dc650dSSadaf Ebrahimi 
608*22dc650dSSadaf Ebrahimi static const char verbnames[] =
609*22dc650dSSadaf Ebrahimi   "\0"                       /* Empty name is a shorthand for MARK */
610*22dc650dSSadaf Ebrahimi   STRING_MARK0
611*22dc650dSSadaf Ebrahimi   STRING_ACCEPT0
612*22dc650dSSadaf Ebrahimi   STRING_F0
613*22dc650dSSadaf Ebrahimi   STRING_FAIL0
614*22dc650dSSadaf Ebrahimi   STRING_COMMIT0
615*22dc650dSSadaf Ebrahimi   STRING_PRUNE0
616*22dc650dSSadaf Ebrahimi   STRING_SKIP0
617*22dc650dSSadaf Ebrahimi   STRING_THEN;
618*22dc650dSSadaf Ebrahimi 
619*22dc650dSSadaf Ebrahimi static const verbitem verbs[] = {
620*22dc650dSSadaf Ebrahimi   { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
621*22dc650dSSadaf Ebrahimi   { 4, META_MARK,   +1 },
622*22dc650dSSadaf Ebrahimi   { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
623*22dc650dSSadaf Ebrahimi   { 1, META_FAIL,   -1 },
624*22dc650dSSadaf Ebrahimi   { 4, META_FAIL,   -1 },
625*22dc650dSSadaf Ebrahimi   { 6, META_COMMIT,  0 },
626*22dc650dSSadaf Ebrahimi   { 5, META_PRUNE,   0 },  /* Optional argument; bump META code if found */
627*22dc650dSSadaf Ebrahimi   { 4, META_SKIP,    0 },
628*22dc650dSSadaf Ebrahimi   { 4, META_THEN,    0 }
629*22dc650dSSadaf Ebrahimi };
630*22dc650dSSadaf Ebrahimi 
631*22dc650dSSadaf Ebrahimi static const int verbcount = sizeof(verbs)/sizeof(verbitem);
632*22dc650dSSadaf Ebrahimi 
633*22dc650dSSadaf Ebrahimi /* Verb opcodes, indexed by their META code offset from META_MARK. */
634*22dc650dSSadaf Ebrahimi 
635*22dc650dSSadaf Ebrahimi static const uint32_t verbops[] = {
636*22dc650dSSadaf Ebrahimi   OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
637*22dc650dSSadaf Ebrahimi   OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
638*22dc650dSSadaf Ebrahimi 
639*22dc650dSSadaf Ebrahimi /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
640*22dc650dSSadaf Ebrahimi 
/* Describes one entry of the alpha assertion name table. */
typedef struct alasitem {
  /* Number of characters in the name */
  unsigned int len;
  /* Base META_ code for the assertion */
  uint32_t meta;
} alasitem;
645*22dc650dSSadaf Ebrahimi 
646*22dc650dSSadaf Ebrahimi static const char alasnames[] =
647*22dc650dSSadaf Ebrahimi   STRING_pla0
648*22dc650dSSadaf Ebrahimi   STRING_plb0
649*22dc650dSSadaf Ebrahimi   STRING_napla0
650*22dc650dSSadaf Ebrahimi   STRING_naplb0
651*22dc650dSSadaf Ebrahimi   STRING_nla0
652*22dc650dSSadaf Ebrahimi   STRING_nlb0
653*22dc650dSSadaf Ebrahimi   STRING_positive_lookahead0
654*22dc650dSSadaf Ebrahimi   STRING_positive_lookbehind0
655*22dc650dSSadaf Ebrahimi   STRING_non_atomic_positive_lookahead0
656*22dc650dSSadaf Ebrahimi   STRING_non_atomic_positive_lookbehind0
657*22dc650dSSadaf Ebrahimi   STRING_negative_lookahead0
658*22dc650dSSadaf Ebrahimi   STRING_negative_lookbehind0
659*22dc650dSSadaf Ebrahimi   STRING_atomic0
660*22dc650dSSadaf Ebrahimi   STRING_sr0
661*22dc650dSSadaf Ebrahimi   STRING_asr0
662*22dc650dSSadaf Ebrahimi   STRING_script_run0
663*22dc650dSSadaf Ebrahimi   STRING_atomic_script_run;
664*22dc650dSSadaf Ebrahimi 
665*22dc650dSSadaf Ebrahimi static const alasitem alasmeta[] = {
666*22dc650dSSadaf Ebrahimi   {  3, META_LOOKAHEAD         },
667*22dc650dSSadaf Ebrahimi   {  3, META_LOOKBEHIND        },
668*22dc650dSSadaf Ebrahimi   {  5, META_LOOKAHEAD_NA      },
669*22dc650dSSadaf Ebrahimi   {  5, META_LOOKBEHIND_NA     },
670*22dc650dSSadaf Ebrahimi   {  3, META_LOOKAHEADNOT      },
671*22dc650dSSadaf Ebrahimi   {  3, META_LOOKBEHINDNOT     },
672*22dc650dSSadaf Ebrahimi   { 18, META_LOOKAHEAD         },
673*22dc650dSSadaf Ebrahimi   { 19, META_LOOKBEHIND        },
674*22dc650dSSadaf Ebrahimi   { 29, META_LOOKAHEAD_NA      },
675*22dc650dSSadaf Ebrahimi   { 30, META_LOOKBEHIND_NA     },
676*22dc650dSSadaf Ebrahimi   { 18, META_LOOKAHEADNOT      },
677*22dc650dSSadaf Ebrahimi   { 19, META_LOOKBEHINDNOT     },
678*22dc650dSSadaf Ebrahimi   {  6, META_ATOMIC            },
679*22dc650dSSadaf Ebrahimi   {  2, META_SCRIPT_RUN        }, /* sr = script run */
680*22dc650dSSadaf Ebrahimi   {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
681*22dc650dSSadaf Ebrahimi   { 10, META_SCRIPT_RUN        }, /* script run */
682*22dc650dSSadaf Ebrahimi   { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
683*22dc650dSSadaf Ebrahimi };
684*22dc650dSSadaf Ebrahimi 
685*22dc650dSSadaf Ebrahimi static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
686*22dc650dSSadaf Ebrahimi 
687*22dc650dSSadaf Ebrahimi /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
688*22dc650dSSadaf Ebrahimi 
689*22dc650dSSadaf Ebrahimi static uint32_t chartypeoffset[] = {
690*22dc650dSSadaf Ebrahimi   OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
691*22dc650dSSadaf Ebrahimi   OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
692*22dc650dSSadaf Ebrahimi 
693*22dc650dSSadaf Ebrahimi /* Tables of names of POSIX character classes and their lengths. The names are
694*22dc650dSSadaf Ebrahimi now all in a single string, to reduce the number of relocations when a shared
695*22dc650dSSadaf Ebrahimi library is dynamically loaded. The list of lengths is terminated by a zero
696*22dc650dSSadaf Ebrahimi length entry. The first three must be alpha, lower, upper, as this is assumed
697*22dc650dSSadaf Ebrahimi for handling case independence. The indices for several classes are needed, so
698*22dc650dSSadaf Ebrahimi identify them. */
699*22dc650dSSadaf Ebrahimi 
700*22dc650dSSadaf Ebrahimi static const char posix_names[] =
701*22dc650dSSadaf Ebrahimi   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
702*22dc650dSSadaf Ebrahimi   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
703*22dc650dSSadaf Ebrahimi   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
704*22dc650dSSadaf Ebrahimi   STRING_word0  STRING_xdigit;
705*22dc650dSSadaf Ebrahimi 
/* Length of each name in posix_names, in the same order, ending with a zero
terminator. */
static const uint8_t posix_name_lengths[] = {
  5, 5, 5,      /* alpha, lower, upper */
  5, 5, 5,      /* alnum, ascii, blank */
  5, 5, 5,      /* cntrl, digit, graph */
  5, 5, 5,      /* print, punct, space */
  4, 6,         /* word, xdigit */
  0             /* end of list */
};
708*22dc650dSSadaf Ebrahimi 
/* Positions of selected classes within the posix_names list. */
#define PC_DIGIT   7    /* digit */
#define PC_GRAPH   8    /* graph */
#define PC_PRINT   9    /* print */
#define PC_PUNCT  10    /* punct */
#define PC_XDIGIT 13    /* xdigit */
714*22dc650dSSadaf Ebrahimi 
715*22dc650dSSadaf Ebrahimi /* Table of class bit maps for each POSIX class. Each class is formed from a
716*22dc650dSSadaf Ebrahimi base map, with an optional addition or removal of another map. Then, for some
717*22dc650dSSadaf Ebrahimi classes, there is some additional tweaking: for [:blank:] the vertical space
718*22dc650dSSadaf Ebrahimi characters are removed, and for [:alpha:] and [:alnum:] the underscore
719*22dc650dSSadaf Ebrahimi character is removed. The triples in the table consist of the base map offset,
720*22dc650dSSadaf Ebrahimi second map offset or -1 if no second map, and a non-negative value for map
721*22dc650dSSadaf Ebrahimi addition or a negative value for map subtraction (if there are two maps). The
722*22dc650dSSadaf Ebrahimi absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
723*22dc650dSSadaf Ebrahimi remove vertical space characters, 2 => remove underscore. */
724*22dc650dSSadaf Ebrahimi 
725*22dc650dSSadaf Ebrahimi static const int posix_class_maps[] = {
726*22dc650dSSadaf Ebrahimi   cbit_word,   cbit_digit, -2,            /* alpha */
727*22dc650dSSadaf Ebrahimi   cbit_lower,  -1,          0,            /* lower */
728*22dc650dSSadaf Ebrahimi   cbit_upper,  -1,          0,            /* upper */
729*22dc650dSSadaf Ebrahimi   cbit_word,   -1,          2,            /* alnum - word without underscore */
730*22dc650dSSadaf Ebrahimi   cbit_print,  cbit_cntrl,  0,            /* ascii */
731*22dc650dSSadaf Ebrahimi   cbit_space,  -1,          1,            /* blank - a GNU extension */
732*22dc650dSSadaf Ebrahimi   cbit_cntrl,  -1,          0,            /* cntrl */
733*22dc650dSSadaf Ebrahimi   cbit_digit,  -1,          0,            /* digit */
734*22dc650dSSadaf Ebrahimi   cbit_graph,  -1,          0,            /* graph */
735*22dc650dSSadaf Ebrahimi   cbit_print,  -1,          0,            /* print */
736*22dc650dSSadaf Ebrahimi   cbit_punct,  -1,          0,            /* punct */
737*22dc650dSSadaf Ebrahimi   cbit_space,  -1,          0,            /* space */
738*22dc650dSSadaf Ebrahimi   cbit_word,   -1,          0,            /* word - a Perl extension */
739*22dc650dSSadaf Ebrahimi   cbit_xdigit, -1,          0             /* xdigit */
740*22dc650dSSadaf Ebrahimi };
741*22dc650dSSadaf Ebrahimi 
742*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
743*22dc650dSSadaf Ebrahimi 
744*22dc650dSSadaf Ebrahimi /* The POSIX class Unicode property substitutes that are used in UCP mode must
745*22dc650dSSadaf Ebrahimi be in the order of the POSIX class names, defined above. */
746*22dc650dSSadaf Ebrahimi 
747*22dc650dSSadaf Ebrahimi static int posix_substitutes[] = {
748*22dc650dSSadaf Ebrahimi   PT_GC, ucp_L,     /* alpha */
749*22dc650dSSadaf Ebrahimi   PT_PC, ucp_Ll,    /* lower */
750*22dc650dSSadaf Ebrahimi   PT_PC, ucp_Lu,    /* upper */
751*22dc650dSSadaf Ebrahimi   PT_ALNUM, 0,      /* alnum */
752*22dc650dSSadaf Ebrahimi   -1, 0,            /* ascii, treat as non-UCP */
753*22dc650dSSadaf Ebrahimi   -1, 1,            /* blank, treat as \h */
754*22dc650dSSadaf Ebrahimi   PT_PC, ucp_Cc,    /* cntrl */
755*22dc650dSSadaf Ebrahimi   PT_PC, ucp_Nd,    /* digit */
756*22dc650dSSadaf Ebrahimi   PT_PXGRAPH, 0,    /* graph */
757*22dc650dSSadaf Ebrahimi   PT_PXPRINT, 0,    /* print */
758*22dc650dSSadaf Ebrahimi   PT_PXPUNCT, 0,    /* punct */
759*22dc650dSSadaf Ebrahimi   PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
760*22dc650dSSadaf Ebrahimi   PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
761*22dc650dSSadaf Ebrahimi   PT_PXXDIGIT, 0    /* xdigit */  /* Perl has additional hex digits */
762*22dc650dSSadaf Ebrahimi };
763*22dc650dSSadaf Ebrahimi #define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t)))
764*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
765*22dc650dSSadaf Ebrahimi 
766*22dc650dSSadaf Ebrahimi /* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
767*22dc650dSSadaf Ebrahimi are allowed. */
768*22dc650dSSadaf Ebrahimi 
/* Options that remain valid when PCRE2_LITERAL is set. */
#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED| \
   PCRE2_AUTO_CALLOUT| \
   PCRE2_CASELESS| \
   PCRE2_ENDANCHORED| \
   PCRE2_FIRSTLINE| \
   PCRE2_LITERAL| \
   PCRE2_MATCH_INVALID_UTF| \
   PCRE2_NO_START_OPTIMIZE| \
   PCRE2_NO_UTF_CHECK| \
   PCRE2_USE_OFFSET_LIMIT| \
   PCRE2_UTF)

/* The literal-mode options plus the remaining options listed here. */
#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS| \
   PCRE2_ALT_BSUX| \
   PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES| \
   PCRE2_DOLLAR_ENDONLY| \
   PCRE2_DOTALL| \
   PCRE2_DUPNAMES| \
   PCRE2_EXTENDED| \
   PCRE2_EXTENDED_MORE| \
   PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE| \
   PCRE2_NEVER_BACKSLASH_C| \
   PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF| \
   PCRE2_NO_AUTO_CAPTURE| \
   PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR| \
   PCRE2_UCP| \
   PCRE2_UNGREEDY)
782*22dc650dSSadaf Ebrahimi 
/* The first group of extra-option bits. */
#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
   (PCRE2_EXTRA_MATCH_LINE| \
    PCRE2_EXTRA_MATCH_WORD| \
    PCRE2_EXTRA_CASELESS_RESTRICT)

/* The first group plus the remaining extra-option bits listed here. */
#define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES| \
    PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
    PCRE2_EXTRA_ESCAPED_CR_IS_LF| \
    PCRE2_EXTRA_ALT_BSUX| \
    PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK| \
    PCRE2_EXTRA_ASCII_BSD| \
    PCRE2_EXTRA_ASCII_BSS| \
    PCRE2_EXTRA_ASCII_BSW| \
    PCRE2_EXTRA_ASCII_POSIX| \
    PCRE2_EXTRA_ASCII_DIGIT)
793*22dc650dSSadaf Ebrahimi 
794*22dc650dSSadaf Ebrahimi /* Compile time error code numbers. They are given names so that they can more
795*22dc650dSSadaf Ebrahimi easily be tracked. When a new number is added, the tables called eint1 and
796*22dc650dSSadaf Ebrahimi eint2 in pcre2posix.c may need to be updated, and a new error text must be
797*22dc650dSSadaf Ebrahimi added to compile_error_texts in pcre2_error.c. Also, the error codes in
798*22dc650dSSadaf Ebrahimi pcre2.h.in must be updated - their values are exactly 100 greater than these
799*22dc650dSSadaf Ebrahimi values. */
800*22dc650dSSadaf Ebrahimi 
801*22dc650dSSadaf Ebrahimi enum { ERR0 = COMPILE_ERROR_BASE,
802*22dc650dSSadaf Ebrahimi        ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
803*22dc650dSSadaf Ebrahimi        ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
804*22dc650dSSadaf Ebrahimi        ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
805*22dc650dSSadaf Ebrahimi        ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
806*22dc650dSSadaf Ebrahimi        ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
807*22dc650dSSadaf Ebrahimi        ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
808*22dc650dSSadaf Ebrahimi        ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
809*22dc650dSSadaf Ebrahimi        ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
810*22dc650dSSadaf Ebrahimi        ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
811*22dc650dSSadaf Ebrahimi        ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99, ERR100,
812*22dc650dSSadaf Ebrahimi        ERR101 };
813*22dc650dSSadaf Ebrahimi 
814*22dc650dSSadaf Ebrahimi /* This is a table of start-of-pattern options such as (*UTF) and settings such
815*22dc650dSSadaf Ebrahimi as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
816*22dc650dSSadaf Ebrahimi compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
817*22dc650dSSadaf Ebrahimi generic and always supported. */
818*22dc650dSSadaf Ebrahimi 
/* Kinds of start-of-pattern item; each says how the item's value is used. */
enum {
  PSO_OPT,     /* The value is an option bit */
  PSO_FLG,     /* The value is a flag bit */
  PSO_NL,      /* The value is a newline type */
  PSO_BSR,     /* The value is a \R type */
  PSO_LIMH,    /* An integer follows: the heap limit */
  PSO_LIMM,    /* An integer follows: the match limit */
  PSO_LIMD     /* An integer follows: the depth limit */
};
827*22dc650dSSadaf Ebrahimi 
/* One start-of-pattern item. */
typedef struct pso {
  const uint8_t *name;    /* Text of the item */
  uint16_t length;        /* Number of characters in name */
  uint16_t type;          /* One of the PSO_xxx kinds */
  uint32_t value;         /* Value whose meaning depends on type */
} pso;
834*22dc650dSSadaf Ebrahimi 
835*22dc650dSSadaf Ebrahimi /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
836*22dc650dSSadaf Ebrahimi 
837*22dc650dSSadaf Ebrahimi static const pso pso_list[] = {
838*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
839*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
840*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
841*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
842*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
843*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
844*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
845*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
846*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
847*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
848*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
849*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
850*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
851*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
852*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
853*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
854*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
855*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
856*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
857*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
858*22dc650dSSadaf Ebrahimi   { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
859*22dc650dSSadaf Ebrahimi };
860*22dc650dSSadaf Ebrahimi 
861*22dc650dSSadaf Ebrahimi /* This table is used when converting repeating opcodes into possessified
862*22dc650dSSadaf Ebrahimi versions as a result of an explicit possessive quantifier such as ++. A zero
863*22dc650dSSadaf Ebrahimi value means there is no possessified version - in those cases the item in
864*22dc650dSSadaf Ebrahimi question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
865*22dc650dSSadaf Ebrahimi because all relevant opcodes are less than that. */
866*22dc650dSSadaf Ebrahimi 
867*22dc650dSSadaf Ebrahimi static const uint8_t opcode_possessify[] = {
868*22dc650dSSadaf Ebrahimi   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
869*22dc650dSSadaf Ebrahimi   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
870*22dc650dSSadaf Ebrahimi 
871*22dc650dSSadaf Ebrahimi   0,                       /* NOTI */
872*22dc650dSSadaf Ebrahimi   OP_POSSTAR, 0,           /* STAR, MINSTAR */
873*22dc650dSSadaf Ebrahimi   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
874*22dc650dSSadaf Ebrahimi   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
875*22dc650dSSadaf Ebrahimi   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
876*22dc650dSSadaf Ebrahimi   0,                       /* EXACT */
877*22dc650dSSadaf Ebrahimi   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
878*22dc650dSSadaf Ebrahimi 
879*22dc650dSSadaf Ebrahimi   OP_POSSTARI, 0,          /* STARI, MINSTARI */
880*22dc650dSSadaf Ebrahimi   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
881*22dc650dSSadaf Ebrahimi   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
882*22dc650dSSadaf Ebrahimi   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
883*22dc650dSSadaf Ebrahimi   0,                       /* EXACTI */
884*22dc650dSSadaf Ebrahimi   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
885*22dc650dSSadaf Ebrahimi 
886*22dc650dSSadaf Ebrahimi   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
887*22dc650dSSadaf Ebrahimi   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
888*22dc650dSSadaf Ebrahimi   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
889*22dc650dSSadaf Ebrahimi   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
890*22dc650dSSadaf Ebrahimi   0,                       /* NOTEXACT */
891*22dc650dSSadaf Ebrahimi   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
892*22dc650dSSadaf Ebrahimi 
893*22dc650dSSadaf Ebrahimi   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
894*22dc650dSSadaf Ebrahimi   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
895*22dc650dSSadaf Ebrahimi   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
896*22dc650dSSadaf Ebrahimi   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
897*22dc650dSSadaf Ebrahimi   0,                       /* NOTEXACTI */
898*22dc650dSSadaf Ebrahimi   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
899*22dc650dSSadaf Ebrahimi 
900*22dc650dSSadaf Ebrahimi   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
901*22dc650dSSadaf Ebrahimi   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
902*22dc650dSSadaf Ebrahimi   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
903*22dc650dSSadaf Ebrahimi   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
904*22dc650dSSadaf Ebrahimi   0,                       /* TYPEEXACT */
905*22dc650dSSadaf Ebrahimi   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
906*22dc650dSSadaf Ebrahimi 
907*22dc650dSSadaf Ebrahimi   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
908*22dc650dSSadaf Ebrahimi   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
909*22dc650dSSadaf Ebrahimi   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
910*22dc650dSSadaf Ebrahimi   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
911*22dc650dSSadaf Ebrahimi   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
912*22dc650dSSadaf Ebrahimi 
913*22dc650dSSadaf Ebrahimi   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
914*22dc650dSSadaf Ebrahimi   0, 0,                    /* REF, REFI */
915*22dc650dSSadaf Ebrahimi   0, 0,                    /* DNREF, DNREFI */
916*22dc650dSSadaf Ebrahimi   0, 0                     /* RECURSE, CALLOUT */
917*22dc650dSSadaf Ebrahimi };
918*22dc650dSSadaf Ebrahimi 
919*22dc650dSSadaf Ebrahimi 
#ifdef DEBUG_SHOW_PARSED
/*************************************************
*     Show the parsed pattern for debugging      *
*************************************************/

/* Debugging aid for the pre-scan, compiled only when DEBUG_SHOW_PARSED is
defined. It writes the parsed pattern vector to stderr, one line per item. Each
line starts with "+++", the index of the item in the vector, and the raw value
of its first word in hex; a readable rendering of the item and of any data
words that accompany it follows. Output stops after META_END, or at the first
META code that is not recognized.

The conversions used here match the argument types: %u (or %x) for uint32_t
values, which assumes that uint32_t is unsigned int, and %zu for PCRE2_SIZE
offsets.

Argument:
  cb         the compile block; cb->parsed_pattern is the vector to show, and
             cb->small_ref_offset supplies the offsets of back references
             whose number is less than 10

Returns:     nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Items below META_END are not META codes. Those in the range 33 - 127 are
  also shown as a character. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  /* From here on pptr has already been moved past the META word itself, so it
  points at the first data word (if any) of the item. */

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* Back references numbered below 10 carry no offset in the vector; their
    offset comes from the compile block instead. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      /* One data word: type in the top 16 bits, value in the bottom 16. */
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? 'P':'p',
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    /* The three counted-repeat items are each followed by two data words, the
    minimum and then the maximum. */

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
    pptr += 2;
    break;

    /* Each lookbehind item is followed by two data words, of which only the
    first is shown. */

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The number is read from pptr[SIZEOFFSET] before GETOFFSET() is applied
    to pptr, and is stepped over afterwards. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The verbs that take an argument share the code that prints it: a length
    word followed by that many words, one per character. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;
    }
  fprintf(stderr, "\n");
  }
}
#endif  /* DEBUG_SHOW_PARSED */
1189*22dc650dSSadaf Ebrahimi 
1190*22dc650dSSadaf Ebrahimi 
1191*22dc650dSSadaf Ebrahimi 
1192*22dc650dSSadaf Ebrahimi /*************************************************
1193*22dc650dSSadaf Ebrahimi *               Copy compiled code               *
1194*22dc650dSSadaf Ebrahimi *************************************************/
1195*22dc650dSSadaf Ebrahimi 
1196*22dc650dSSadaf Ebrahimi /* Compiled JIT code cannot be copied, so the new compiled block has no
1197*22dc650dSSadaf Ebrahimi associated JIT data. */
1198*22dc650dSSadaf Ebrahimi 
1199*22dc650dSSadaf Ebrahimi PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1200*22dc650dSSadaf Ebrahimi pcre2_code_copy(const pcre2_code *code)
1201*22dc650dSSadaf Ebrahimi {
1202*22dc650dSSadaf Ebrahimi PCRE2_SIZE* ref_count;
1203*22dc650dSSadaf Ebrahimi pcre2_code *newcode;
1204*22dc650dSSadaf Ebrahimi 
1205*22dc650dSSadaf Ebrahimi if (code == NULL) return NULL;
1206*22dc650dSSadaf Ebrahimi newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1207*22dc650dSSadaf Ebrahimi if (newcode == NULL) return NULL;
1208*22dc650dSSadaf Ebrahimi memcpy(newcode, code, code->blocksize);
1209*22dc650dSSadaf Ebrahimi newcode->executable_jit = NULL;
1210*22dc650dSSadaf Ebrahimi 
1211*22dc650dSSadaf Ebrahimi /* If the code is one that has been deserialized, increment the reference count
1212*22dc650dSSadaf Ebrahimi in the decoded tables. */
1213*22dc650dSSadaf Ebrahimi 
1214*22dc650dSSadaf Ebrahimi if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1215*22dc650dSSadaf Ebrahimi   {
1216*22dc650dSSadaf Ebrahimi   ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1217*22dc650dSSadaf Ebrahimi   (*ref_count)++;
1218*22dc650dSSadaf Ebrahimi   }
1219*22dc650dSSadaf Ebrahimi 
1220*22dc650dSSadaf Ebrahimi return newcode;
1221*22dc650dSSadaf Ebrahimi }
1222*22dc650dSSadaf Ebrahimi 
1223*22dc650dSSadaf Ebrahimi 
1224*22dc650dSSadaf Ebrahimi 
1225*22dc650dSSadaf Ebrahimi /*************************************************
1226*22dc650dSSadaf Ebrahimi *     Copy compiled code and character tables    *
1227*22dc650dSSadaf Ebrahimi *************************************************/
1228*22dc650dSSadaf Ebrahimi 
1229*22dc650dSSadaf Ebrahimi /* Compiled JIT code cannot be copied, so the new compiled block has no
1230*22dc650dSSadaf Ebrahimi associated JIT data. This version of code_copy also makes a separate copy of
1231*22dc650dSSadaf Ebrahimi the character tables. */
1232*22dc650dSSadaf Ebrahimi 
1233*22dc650dSSadaf Ebrahimi PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1234*22dc650dSSadaf Ebrahimi pcre2_code_copy_with_tables(const pcre2_code *code)
1235*22dc650dSSadaf Ebrahimi {
1236*22dc650dSSadaf Ebrahimi PCRE2_SIZE* ref_count;
1237*22dc650dSSadaf Ebrahimi pcre2_code *newcode;
1238*22dc650dSSadaf Ebrahimi uint8_t *newtables;
1239*22dc650dSSadaf Ebrahimi 
1240*22dc650dSSadaf Ebrahimi if (code == NULL) return NULL;
1241*22dc650dSSadaf Ebrahimi newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1242*22dc650dSSadaf Ebrahimi if (newcode == NULL) return NULL;
1243*22dc650dSSadaf Ebrahimi memcpy(newcode, code, code->blocksize);
1244*22dc650dSSadaf Ebrahimi newcode->executable_jit = NULL;
1245*22dc650dSSadaf Ebrahimi 
1246*22dc650dSSadaf Ebrahimi newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1247*22dc650dSSadaf Ebrahimi   code->memctl.memory_data);
1248*22dc650dSSadaf Ebrahimi if (newtables == NULL)
1249*22dc650dSSadaf Ebrahimi   {
1250*22dc650dSSadaf Ebrahimi   code->memctl.free((void *)newcode, code->memctl.memory_data);
1251*22dc650dSSadaf Ebrahimi   return NULL;
1252*22dc650dSSadaf Ebrahimi   }
1253*22dc650dSSadaf Ebrahimi memcpy(newtables, code->tables, TABLES_LENGTH);
1254*22dc650dSSadaf Ebrahimi ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1255*22dc650dSSadaf Ebrahimi *ref_count = 1;
1256*22dc650dSSadaf Ebrahimi 
1257*22dc650dSSadaf Ebrahimi newcode->tables = newtables;
1258*22dc650dSSadaf Ebrahimi newcode->flags |= PCRE2_DEREF_TABLES;
1259*22dc650dSSadaf Ebrahimi return newcode;
1260*22dc650dSSadaf Ebrahimi }
1261*22dc650dSSadaf Ebrahimi 
1262*22dc650dSSadaf Ebrahimi 
1263*22dc650dSSadaf Ebrahimi 
1264*22dc650dSSadaf Ebrahimi /*************************************************
1265*22dc650dSSadaf Ebrahimi *               Free compiled code               *
1266*22dc650dSSadaf Ebrahimi *************************************************/
1267*22dc650dSSadaf Ebrahimi 
1268*22dc650dSSadaf Ebrahimi PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1269*22dc650dSSadaf Ebrahimi pcre2_code_free(pcre2_code *code)
1270*22dc650dSSadaf Ebrahimi {
1271*22dc650dSSadaf Ebrahimi PCRE2_SIZE* ref_count;
1272*22dc650dSSadaf Ebrahimi 
1273*22dc650dSSadaf Ebrahimi if (code != NULL)
1274*22dc650dSSadaf Ebrahimi   {
1275*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_JIT
1276*22dc650dSSadaf Ebrahimi   if (code->executable_jit != NULL)
1277*22dc650dSSadaf Ebrahimi     PRIV(jit_free)(code->executable_jit, &code->memctl);
1278*22dc650dSSadaf Ebrahimi #endif
1279*22dc650dSSadaf Ebrahimi 
1280*22dc650dSSadaf Ebrahimi   if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1281*22dc650dSSadaf Ebrahimi     {
1282*22dc650dSSadaf Ebrahimi     /* Decoded tables belong to the codes after deserialization, and they must
1283*22dc650dSSadaf Ebrahimi     be freed when there are no more references to them. The *ref_count should
1284*22dc650dSSadaf Ebrahimi     always be > 0. */
1285*22dc650dSSadaf Ebrahimi 
1286*22dc650dSSadaf Ebrahimi     ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1287*22dc650dSSadaf Ebrahimi     if (*ref_count > 0)
1288*22dc650dSSadaf Ebrahimi       {
1289*22dc650dSSadaf Ebrahimi       (*ref_count)--;
1290*22dc650dSSadaf Ebrahimi       if (*ref_count == 0)
1291*22dc650dSSadaf Ebrahimi         code->memctl.free((void *)code->tables, code->memctl.memory_data);
1292*22dc650dSSadaf Ebrahimi       }
1293*22dc650dSSadaf Ebrahimi     }
1294*22dc650dSSadaf Ebrahimi 
1295*22dc650dSSadaf Ebrahimi   code->memctl.free(code, code->memctl.memory_data);
1296*22dc650dSSadaf Ebrahimi   }
1297*22dc650dSSadaf Ebrahimi }
1298*22dc650dSSadaf Ebrahimi 
1299*22dc650dSSadaf Ebrahimi 
1300*22dc650dSSadaf Ebrahimi 
1301*22dc650dSSadaf Ebrahimi /*************************************************
1302*22dc650dSSadaf Ebrahimi *         Read a number, possibly signed         *
1303*22dc650dSSadaf Ebrahimi *************************************************/
1304*22dc650dSSadaf Ebrahimi 
1305*22dc650dSSadaf Ebrahimi /* This function is used to read numbers in the pattern. The initial pointer
1306*22dc650dSSadaf Ebrahimi must be at the sign or first digit of the number. When relative values
1307*22dc650dSSadaf Ebrahimi (introduced by + or -) are allowed, they are relative group numbers, and the
1308*22dc650dSSadaf Ebrahimi result must be greater than zero.
1309*22dc650dSSadaf Ebrahimi 
1310*22dc650dSSadaf Ebrahimi Arguments:
1311*22dc650dSSadaf Ebrahimi   ptrptr      points to the character pointer variable
1312*22dc650dSSadaf Ebrahimi   ptrend      points to the end of the input string
1313*22dc650dSSadaf Ebrahimi   allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1314*22dc650dSSadaf Ebrahimi   max_value   the largest number allowed
1315*22dc650dSSadaf Ebrahimi   max_error   the error to give for an over-large number
1316*22dc650dSSadaf Ebrahimi   intptr      where to put the result
1317*22dc650dSSadaf Ebrahimi   errcodeptr  where to put an error code
1318*22dc650dSSadaf Ebrahimi 
1319*22dc650dSSadaf Ebrahimi Returns:      TRUE  - a number was read
1320*22dc650dSSadaf Ebrahimi               FALSE - errorcode == 0 => no number was found
1321*22dc650dSSadaf Ebrahimi                       errorcode != 0 => an error occurred
1322*22dc650dSSadaf Ebrahimi */
1323*22dc650dSSadaf Ebrahimi 
1324*22dc650dSSadaf Ebrahimi static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1325*22dc650dSSadaf Ebrahimi read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1326*22dc650dSSadaf Ebrahimi   uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1327*22dc650dSSadaf Ebrahimi {
1328*22dc650dSSadaf Ebrahimi int sign = 0;
1329*22dc650dSSadaf Ebrahimi uint32_t n = 0;
1330*22dc650dSSadaf Ebrahimi PCRE2_SPTR ptr = *ptrptr;
1331*22dc650dSSadaf Ebrahimi BOOL yield = FALSE;
1332*22dc650dSSadaf Ebrahimi 
1333*22dc650dSSadaf Ebrahimi *errorcodeptr = 0;
1334*22dc650dSSadaf Ebrahimi 
1335*22dc650dSSadaf Ebrahimi if (allow_sign >= 0 && ptr < ptrend)
1336*22dc650dSSadaf Ebrahimi   {
1337*22dc650dSSadaf Ebrahimi   if (*ptr == CHAR_PLUS)
1338*22dc650dSSadaf Ebrahimi     {
1339*22dc650dSSadaf Ebrahimi     sign = +1;
1340*22dc650dSSadaf Ebrahimi     max_value -= allow_sign;
1341*22dc650dSSadaf Ebrahimi     ptr++;
1342*22dc650dSSadaf Ebrahimi     }
1343*22dc650dSSadaf Ebrahimi   else if (*ptr == CHAR_MINUS)
1344*22dc650dSSadaf Ebrahimi     {
1345*22dc650dSSadaf Ebrahimi     sign = -1;
1346*22dc650dSSadaf Ebrahimi     ptr++;
1347*22dc650dSSadaf Ebrahimi     }
1348*22dc650dSSadaf Ebrahimi   }
1349*22dc650dSSadaf Ebrahimi 
1350*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1351*22dc650dSSadaf Ebrahimi while (ptr < ptrend && IS_DIGIT(*ptr))
1352*22dc650dSSadaf Ebrahimi   {
1353*22dc650dSSadaf Ebrahimi   n = n * 10 + *ptr++ - CHAR_0;
1354*22dc650dSSadaf Ebrahimi   if (n > max_value)
1355*22dc650dSSadaf Ebrahimi     {
1356*22dc650dSSadaf Ebrahimi     *errorcodeptr = max_error;
1357*22dc650dSSadaf Ebrahimi     goto EXIT;
1358*22dc650dSSadaf Ebrahimi     }
1359*22dc650dSSadaf Ebrahimi   }
1360*22dc650dSSadaf Ebrahimi 
1361*22dc650dSSadaf Ebrahimi if (allow_sign >= 0 && sign != 0)
1362*22dc650dSSadaf Ebrahimi   {
1363*22dc650dSSadaf Ebrahimi   if (n == 0)
1364*22dc650dSSadaf Ebrahimi     {
1365*22dc650dSSadaf Ebrahimi     *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1366*22dc650dSSadaf Ebrahimi     goto EXIT;
1367*22dc650dSSadaf Ebrahimi     }
1368*22dc650dSSadaf Ebrahimi 
1369*22dc650dSSadaf Ebrahimi   if (sign > 0) n += allow_sign;
1370*22dc650dSSadaf Ebrahimi   else if ((int)n > allow_sign)
1371*22dc650dSSadaf Ebrahimi     {
1372*22dc650dSSadaf Ebrahimi     *errorcodeptr = ERR15;  /* Non-existent subpattern */
1373*22dc650dSSadaf Ebrahimi     goto EXIT;
1374*22dc650dSSadaf Ebrahimi     }
1375*22dc650dSSadaf Ebrahimi   else n = allow_sign + 1 - n;
1376*22dc650dSSadaf Ebrahimi   }
1377*22dc650dSSadaf Ebrahimi 
1378*22dc650dSSadaf Ebrahimi yield = TRUE;
1379*22dc650dSSadaf Ebrahimi 
1380*22dc650dSSadaf Ebrahimi EXIT:
1381*22dc650dSSadaf Ebrahimi *intptr = n;
1382*22dc650dSSadaf Ebrahimi *ptrptr = ptr;
1383*22dc650dSSadaf Ebrahimi return yield;
1384*22dc650dSSadaf Ebrahimi }
1385*22dc650dSSadaf Ebrahimi 
1386*22dc650dSSadaf Ebrahimi 
1387*22dc650dSSadaf Ebrahimi 
1388*22dc650dSSadaf Ebrahimi /*************************************************
1389*22dc650dSSadaf Ebrahimi *         Read repeat counts                     *
1390*22dc650dSSadaf Ebrahimi *************************************************/
1391*22dc650dSSadaf Ebrahimi 
1392*22dc650dSSadaf Ebrahimi /* Read an item of the form {n,m} and return the values when non-NULL pointers
1393*22dc650dSSadaf Ebrahimi are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1394*22dc650dSSadaf Ebrahimi larger value is used for "unlimited". We have to use signed arguments for
1395*22dc650dSSadaf Ebrahimi read_number() because it is capable of returning a signed value. As of Perl
1396*22dc650dSSadaf Ebrahimi 5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1397*22dc650dSSadaf Ebrahimi tabs after { and before } and between the numbers and the comma, so we do too.
1398*22dc650dSSadaf Ebrahimi 
1399*22dc650dSSadaf Ebrahimi Arguments:
1400*22dc650dSSadaf Ebrahimi   ptrptr         points to pointer to character after '{'
1401*22dc650dSSadaf Ebrahimi   ptrend         pointer to end of input
1402*22dc650dSSadaf Ebrahimi   minp           if not NULL, pointer to int for min
1403*22dc650dSSadaf Ebrahimi   maxp           if not NULL, pointer to int for max
1404*22dc650dSSadaf Ebrahimi   errorcodeptr   points to error code variable
1405*22dc650dSSadaf Ebrahimi 
1406*22dc650dSSadaf Ebrahimi Returns:         FALSE if not a repeat quantifier, errorcode set zero
1407*22dc650dSSadaf Ebrahimi                  FALSE on error, with errorcode set non-zero
1408*22dc650dSSadaf Ebrahimi                  TRUE on success, with pointer updated to point after '}'
1409*22dc650dSSadaf Ebrahimi */
1410*22dc650dSSadaf Ebrahimi 
1411*22dc650dSSadaf Ebrahimi static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1412*22dc650dSSadaf Ebrahimi read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1413*22dc650dSSadaf Ebrahimi   uint32_t *maxp, int *errorcodeptr)
1414*22dc650dSSadaf Ebrahimi {
1415*22dc650dSSadaf Ebrahimi PCRE2_SPTR p = *ptrptr;
1416*22dc650dSSadaf Ebrahimi PCRE2_SPTR pp;
1417*22dc650dSSadaf Ebrahimi BOOL yield = FALSE;
1418*22dc650dSSadaf Ebrahimi BOOL had_minimum = FALSE;
1419*22dc650dSSadaf Ebrahimi int32_t min = 0;
1420*22dc650dSSadaf Ebrahimi int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1421*22dc650dSSadaf Ebrahimi 
1422*22dc650dSSadaf Ebrahimi *errorcodeptr = 0;
1423*22dc650dSSadaf Ebrahimi while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1424*22dc650dSSadaf Ebrahimi 
1425*22dc650dSSadaf Ebrahimi /* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1426*22dc650dSSadaf Ebrahimi such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1427*22dc650dSSadaf Ebrahimi error. */
1428*22dc650dSSadaf Ebrahimi 
1429*22dc650dSSadaf Ebrahimi pp = p;
1430*22dc650dSSadaf Ebrahimi if (pp < ptrend && IS_DIGIT(*pp))
1431*22dc650dSSadaf Ebrahimi   {
1432*22dc650dSSadaf Ebrahimi   had_minimum = TRUE;
1433*22dc650dSSadaf Ebrahimi   while (++pp < ptrend && IS_DIGIT(*pp)) {}
1434*22dc650dSSadaf Ebrahimi   }
1435*22dc650dSSadaf Ebrahimi 
1436*22dc650dSSadaf Ebrahimi while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1437*22dc650dSSadaf Ebrahimi if (pp >= ptrend) return FALSE;
1438*22dc650dSSadaf Ebrahimi 
1439*22dc650dSSadaf Ebrahimi if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1440*22dc650dSSadaf Ebrahimi   {
1441*22dc650dSSadaf Ebrahimi   if (!had_minimum) return FALSE;
1442*22dc650dSSadaf Ebrahimi   }
1443*22dc650dSSadaf Ebrahimi else
1444*22dc650dSSadaf Ebrahimi   {
1445*22dc650dSSadaf Ebrahimi   if (*pp++ != CHAR_COMMA) return FALSE;
1446*22dc650dSSadaf Ebrahimi   while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1447*22dc650dSSadaf Ebrahimi   if (pp >= ptrend) return FALSE;
1448*22dc650dSSadaf Ebrahimi   if (IS_DIGIT(*pp))
1449*22dc650dSSadaf Ebrahimi     {
1450*22dc650dSSadaf Ebrahimi     while (++pp < ptrend && IS_DIGIT(*pp)) {}
1451*22dc650dSSadaf Ebrahimi     }
1452*22dc650dSSadaf Ebrahimi   else if (!had_minimum) return FALSE;
1453*22dc650dSSadaf Ebrahimi   while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1454*22dc650dSSadaf Ebrahimi   if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1455*22dc650dSSadaf Ebrahimi   }
1456*22dc650dSSadaf Ebrahimi 
1457*22dc650dSSadaf Ebrahimi /* Now process the quantifier for real. We know it must be {n} or (n,} or {,m}
1458*22dc650dSSadaf Ebrahimi or {n,m}. The only error that read_number() can return is for a number that is
1459*22dc650dSSadaf Ebrahimi too big. If *errorcodeptr is returned as zero it means no number was found. */
1460*22dc650dSSadaf Ebrahimi 
1461*22dc650dSSadaf Ebrahimi /* Deal with {,m} or n too big. If we successfully read m there is no need to
1462*22dc650dSSadaf Ebrahimi check m >= n because n defaults to zero. */
1463*22dc650dSSadaf Ebrahimi 
1464*22dc650dSSadaf Ebrahimi if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1465*22dc650dSSadaf Ebrahimi   {
1466*22dc650dSSadaf Ebrahimi   if (*errorcodeptr != 0) goto EXIT;    /* n too big */
1467*22dc650dSSadaf Ebrahimi   p++;  /* Skip comma and subsequent spaces */
1468*22dc650dSSadaf Ebrahimi   while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1469*22dc650dSSadaf Ebrahimi   if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1470*22dc650dSSadaf Ebrahimi     {
1471*22dc650dSSadaf Ebrahimi     if (*errorcodeptr != 0) goto EXIT;  /* m too big */
1472*22dc650dSSadaf Ebrahimi     }
1473*22dc650dSSadaf Ebrahimi   }
1474*22dc650dSSadaf Ebrahimi 
1475*22dc650dSSadaf Ebrahimi /* Have read one number. Deal with {n} or {n,} or {n,m} */
1476*22dc650dSSadaf Ebrahimi 
1477*22dc650dSSadaf Ebrahimi else
1478*22dc650dSSadaf Ebrahimi   {
1479*22dc650dSSadaf Ebrahimi   while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1480*22dc650dSSadaf Ebrahimi   if (*p == CHAR_RIGHT_CURLY_BRACKET)
1481*22dc650dSSadaf Ebrahimi     {
1482*22dc650dSSadaf Ebrahimi     max = min;
1483*22dc650dSSadaf Ebrahimi     }
1484*22dc650dSSadaf Ebrahimi   else   /* Handle {n,} or {n,m} */
1485*22dc650dSSadaf Ebrahimi     {
1486*22dc650dSSadaf Ebrahimi     p++;    /* Skip comma and subsequent spaces */
1487*22dc650dSSadaf Ebrahimi     while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1488*22dc650dSSadaf Ebrahimi     if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1489*22dc650dSSadaf Ebrahimi       {
1490*22dc650dSSadaf Ebrahimi       if (*errorcodeptr != 0) goto EXIT;   /* m too big */
1491*22dc650dSSadaf Ebrahimi       }
1492*22dc650dSSadaf Ebrahimi 
1493*22dc650dSSadaf Ebrahimi     if (max < min)
1494*22dc650dSSadaf Ebrahimi       {
1495*22dc650dSSadaf Ebrahimi       *errorcodeptr = ERR4;
1496*22dc650dSSadaf Ebrahimi       goto EXIT;
1497*22dc650dSSadaf Ebrahimi       }
1498*22dc650dSSadaf Ebrahimi     }
1499*22dc650dSSadaf Ebrahimi   }
1500*22dc650dSSadaf Ebrahimi 
1501*22dc650dSSadaf Ebrahimi /* Valid quantifier exists */
1502*22dc650dSSadaf Ebrahimi 
1503*22dc650dSSadaf Ebrahimi while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1504*22dc650dSSadaf Ebrahimi p++;
1505*22dc650dSSadaf Ebrahimi yield = TRUE;
1506*22dc650dSSadaf Ebrahimi if (minp != NULL) *minp = (uint32_t)min;
1507*22dc650dSSadaf Ebrahimi if (maxp != NULL) *maxp = (uint32_t)max;
1508*22dc650dSSadaf Ebrahimi 
1509*22dc650dSSadaf Ebrahimi /* Update the pattern pointer */
1510*22dc650dSSadaf Ebrahimi 
1511*22dc650dSSadaf Ebrahimi EXIT:
1512*22dc650dSSadaf Ebrahimi *ptrptr = p;
1513*22dc650dSSadaf Ebrahimi return yield;
1514*22dc650dSSadaf Ebrahimi }
1515*22dc650dSSadaf Ebrahimi 
1516*22dc650dSSadaf Ebrahimi 
1517*22dc650dSSadaf Ebrahimi 
1518*22dc650dSSadaf Ebrahimi /*************************************************
1519*22dc650dSSadaf Ebrahimi *            Handle escapes                      *
1520*22dc650dSSadaf Ebrahimi *************************************************/
1521*22dc650dSSadaf Ebrahimi 
1522*22dc650dSSadaf Ebrahimi /* This function is called when a \ has been encountered. It either returns a
1523*22dc650dSSadaf Ebrahimi positive value for a simple escape such as \d, or 0 for a data character, which
1524*22dc650dSSadaf Ebrahimi is placed in chptr. A backreference to group n is returned as negative n. On
1525*22dc650dSSadaf Ebrahimi entry, ptr is pointing at the character after \. On exit, it points after the
1526*22dc650dSSadaf Ebrahimi final code unit of the escape sequence.
1527*22dc650dSSadaf Ebrahimi 
1528*22dc650dSSadaf Ebrahimi This function is also called from pcre2_substitute() to handle escape sequences
1529*22dc650dSSadaf Ebrahimi in replacement strings. In this case, the cb argument is NULL, and in the case
1530*22dc650dSSadaf Ebrahimi of escapes that have further processing, only sequences that define a data
1531*22dc650dSSadaf Ebrahimi character are recognised. The isclass argument is not relevant; the options
1532*22dc650dSSadaf Ebrahimi argument is the final value of the compiled pattern's options.
1533*22dc650dSSadaf Ebrahimi 
1534*22dc650dSSadaf Ebrahimi Arguments:
1535*22dc650dSSadaf Ebrahimi   ptrptr         points to the input position pointer
1536*22dc650dSSadaf Ebrahimi   ptrend         points to the end of the input
1537*22dc650dSSadaf Ebrahimi   chptr          points to a returned data character
1538*22dc650dSSadaf Ebrahimi   errorcodeptr   points to the errorcode variable (containing zero)
1539*22dc650dSSadaf Ebrahimi   options        the current options bits
1540*22dc650dSSadaf Ebrahimi   xoptions       the current extra options bits
1541*22dc650dSSadaf Ebrahimi   isclass        TRUE if inside a character class
1542*22dc650dSSadaf Ebrahimi   cb             compile data block or NULL when called from pcre2_substitute()
1543*22dc650dSSadaf Ebrahimi 
1544*22dc650dSSadaf Ebrahimi Returns:         zero => a data character
1545*22dc650dSSadaf Ebrahimi                  positive => a special escape sequence
1546*22dc650dSSadaf Ebrahimi                  negative => a numerical back reference
1547*22dc650dSSadaf Ebrahimi                  on error, errorcodeptr is set non-zero
1548*22dc650dSSadaf Ebrahimi */
1549*22dc650dSSadaf Ebrahimi 
1550*22dc650dSSadaf Ebrahimi int
PRIV(check_escape)1551*22dc650dSSadaf Ebrahimi PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1552*22dc650dSSadaf Ebrahimi   int *errorcodeptr, uint32_t options, uint32_t xoptions, BOOL isclass,
1553*22dc650dSSadaf Ebrahimi   compile_block *cb)
1554*22dc650dSSadaf Ebrahimi {
1555*22dc650dSSadaf Ebrahimi BOOL utf = (options & PCRE2_UTF) != 0;
1556*22dc650dSSadaf Ebrahimi BOOL alt_bsux =
1557*22dc650dSSadaf Ebrahimi   ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1558*22dc650dSSadaf Ebrahimi PCRE2_SPTR ptr = *ptrptr;
1559*22dc650dSSadaf Ebrahimi uint32_t c, cc;
1560*22dc650dSSadaf Ebrahimi int escape = 0;
1561*22dc650dSSadaf Ebrahimi int i;
1562*22dc650dSSadaf Ebrahimi 
1563*22dc650dSSadaf Ebrahimi /* If backslash is at the end of the string, it's an error. */
1564*22dc650dSSadaf Ebrahimi 
1565*22dc650dSSadaf Ebrahimi if (ptr >= ptrend)
1566*22dc650dSSadaf Ebrahimi   {
1567*22dc650dSSadaf Ebrahimi   *errorcodeptr = ERR1;
1568*22dc650dSSadaf Ebrahimi   return 0;
1569*22dc650dSSadaf Ebrahimi   }
1570*22dc650dSSadaf Ebrahimi 
1571*22dc650dSSadaf Ebrahimi GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1572*22dc650dSSadaf Ebrahimi *errorcodeptr = 0;              /* Be optimistic */
1573*22dc650dSSadaf Ebrahimi 
1574*22dc650dSSadaf Ebrahimi /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1575*22dc650dSSadaf Ebrahimi value test saves a memory lookup for code points outside the alphanumeric
1576*22dc650dSSadaf Ebrahimi range. */
1577*22dc650dSSadaf Ebrahimi 
1578*22dc650dSSadaf Ebrahimi if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1579*22dc650dSSadaf Ebrahimi 
1580*22dc650dSSadaf Ebrahimi /* Otherwise, do a table lookup. Non-zero values need little processing here. A
1581*22dc650dSSadaf Ebrahimi positive value is a literal value for something like \n. A negative value is
1582*22dc650dSSadaf Ebrahimi the negation of one of the ESC_ macros that is passed back for handling by the
1583*22dc650dSSadaf Ebrahimi calling function. Some extra checking is needed for \N because only \N{U+dddd}
1584*22dc650dSSadaf Ebrahimi is supported. If the value is zero, further processing is handled below. */
1585*22dc650dSSadaf Ebrahimi 
1586*22dc650dSSadaf Ebrahimi else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1587*22dc650dSSadaf Ebrahimi   {
1588*22dc650dSSadaf Ebrahimi   if (i > 0)
1589*22dc650dSSadaf Ebrahimi     {
1590*22dc650dSSadaf Ebrahimi     c = (uint32_t)i;
1591*22dc650dSSadaf Ebrahimi     if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1592*22dc650dSSadaf Ebrahimi       c = CHAR_LF;
1593*22dc650dSSadaf Ebrahimi     }
1594*22dc650dSSadaf Ebrahimi   else  /* Negative table entry */
1595*22dc650dSSadaf Ebrahimi     {
1596*22dc650dSSadaf Ebrahimi     escape = -i;                    /* Else return a special escape */
1597*22dc650dSSadaf Ebrahimi     if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1598*22dc650dSSadaf Ebrahimi       cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1599*22dc650dSSadaf Ebrahimi 
1600*22dc650dSSadaf Ebrahimi     /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1601*22dc650dSSadaf Ebrahimi     Unicode code points, as well as plain \N for "not newline". PCRE does not
1602*22dc650dSSadaf Ebrahimi     support \N{name}. However, it does support quantification such as \N{2,3},
1603*22dc650dSSadaf Ebrahimi     so if \N{ is not followed by U+dddd we check for a quantifier. */
1604*22dc650dSSadaf Ebrahimi 
1605*22dc650dSSadaf Ebrahimi     if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1606*22dc650dSSadaf Ebrahimi       {
1607*22dc650dSSadaf Ebrahimi       PCRE2_SPTR p = ptr + 1;
1608*22dc650dSSadaf Ebrahimi 
1609*22dc650dSSadaf Ebrahimi       /* Perl ignores spaces and tabs after { */
1610*22dc650dSSadaf Ebrahimi 
1611*22dc650dSSadaf Ebrahimi       while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1612*22dc650dSSadaf Ebrahimi 
1613*22dc650dSSadaf Ebrahimi       /* \N{U+ can be handled by the \x{ code. However, this construction is
1614*22dc650dSSadaf Ebrahimi       not valid in EBCDIC environments because it specifies a Unicode
1615*22dc650dSSadaf Ebrahimi       character, not a codepoint in the local code. For example \N{U+0041}
1616*22dc650dSSadaf Ebrahimi       must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1617*22dc650dSSadaf Ebrahimi       casing semantics for the entire pattern, so allow it only in UTF (i.e.
1618*22dc650dSSadaf Ebrahimi       Unicode) mode. */
1619*22dc650dSSadaf Ebrahimi 
1620*22dc650dSSadaf Ebrahimi       if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1621*22dc650dSSadaf Ebrahimi         {
1622*22dc650dSSadaf Ebrahimi #ifdef EBCDIC
1623*22dc650dSSadaf Ebrahimi         *errorcodeptr = ERR93;
1624*22dc650dSSadaf Ebrahimi #else
1625*22dc650dSSadaf Ebrahimi         if (utf)
1626*22dc650dSSadaf Ebrahimi           {
1627*22dc650dSSadaf Ebrahimi           ptr = p + 2;
1628*22dc650dSSadaf Ebrahimi           escape = 0;   /* Not a fancy escape after all */
1629*22dc650dSSadaf Ebrahimi           goto COME_FROM_NU;
1630*22dc650dSSadaf Ebrahimi           }
1631*22dc650dSSadaf Ebrahimi         else *errorcodeptr = ERR93;
1632*22dc650dSSadaf Ebrahimi #endif
1633*22dc650dSSadaf Ebrahimi         }
1634*22dc650dSSadaf Ebrahimi 
1635*22dc650dSSadaf Ebrahimi       /* Give an error if what follows is not a quantifier, but don't override
1636*22dc650dSSadaf Ebrahimi       an error set by the quantifier reader (e.g. number overflow). */
1637*22dc650dSSadaf Ebrahimi 
1638*22dc650dSSadaf Ebrahimi       else
1639*22dc650dSSadaf Ebrahimi         {
1640*22dc650dSSadaf Ebrahimi         if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1641*22dc650dSSadaf Ebrahimi              *errorcodeptr == 0)
1642*22dc650dSSadaf Ebrahimi           *errorcodeptr = ERR37;
1643*22dc650dSSadaf Ebrahimi         }
1644*22dc650dSSadaf Ebrahimi       }
1645*22dc650dSSadaf Ebrahimi     }
1646*22dc650dSSadaf Ebrahimi   }
1647*22dc650dSSadaf Ebrahimi 
1648*22dc650dSSadaf Ebrahimi /* Escapes that need further processing, including those that are unknown, have
1649*22dc650dSSadaf Ebrahimi a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1650*22dc650dSSadaf Ebrahimi \o, and \x are recognized (\u and \U can never appear as they are used for case
1651*22dc650dSSadaf Ebrahimi forcing). */
1652*22dc650dSSadaf Ebrahimi 
1653*22dc650dSSadaf Ebrahimi else
1654*22dc650dSSadaf Ebrahimi   {
1655*22dc650dSSadaf Ebrahimi   int s;
1656*22dc650dSSadaf Ebrahimi   PCRE2_SPTR oldptr;
1657*22dc650dSSadaf Ebrahimi   BOOL overflow;
1658*22dc650dSSadaf Ebrahimi 
1659*22dc650dSSadaf Ebrahimi   /* Filter calls from pcre2_substitute(). */
1660*22dc650dSSadaf Ebrahimi 
1661*22dc650dSSadaf Ebrahimi   if (cb == NULL)
1662*22dc650dSSadaf Ebrahimi     {
1663*22dc650dSSadaf Ebrahimi     if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1664*22dc650dSSadaf Ebrahimi       {
1665*22dc650dSSadaf Ebrahimi       *errorcodeptr = ERR3;
1666*22dc650dSSadaf Ebrahimi       return 0;
1667*22dc650dSSadaf Ebrahimi       }
1668*22dc650dSSadaf Ebrahimi     alt_bsux = FALSE;   /* Do not modify \x handling */
1669*22dc650dSSadaf Ebrahimi     }
1670*22dc650dSSadaf Ebrahimi 
1671*22dc650dSSadaf Ebrahimi   switch (c)
1672*22dc650dSSadaf Ebrahimi     {
1673*22dc650dSSadaf Ebrahimi     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1674*22dc650dSSadaf Ebrahimi     error. */
1675*22dc650dSSadaf Ebrahimi 
1676*22dc650dSSadaf Ebrahimi     case CHAR_F:
1677*22dc650dSSadaf Ebrahimi     case CHAR_l:
1678*22dc650dSSadaf Ebrahimi     case CHAR_L:
1679*22dc650dSSadaf Ebrahimi     *errorcodeptr = ERR37;
1680*22dc650dSSadaf Ebrahimi     break;
1681*22dc650dSSadaf Ebrahimi 
1682*22dc650dSSadaf Ebrahimi     /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1683*22dc650dSSadaf Ebrahimi     is set. Otherwise, \u must be followed by exactly four hex digits or, if
1684*22dc650dSSadaf Ebrahimi     PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1685*22dc650dSSadaf Ebrahimi     Otherwise it is a lowercase u letter. This gives some compatibility with
1686*22dc650dSSadaf Ebrahimi     ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1687*22dc650dSSadaf Ebrahimi     allowed. When \u{ is not followed by hex digits, a special return is given
1688*22dc650dSSadaf Ebrahimi     because otherwise \u{ 12} (for example) would be treated as u{12}. */
1689*22dc650dSSadaf Ebrahimi 
1690*22dc650dSSadaf Ebrahimi     case CHAR_u:
1691*22dc650dSSadaf Ebrahimi     if (!alt_bsux) *errorcodeptr = ERR37; else
1692*22dc650dSSadaf Ebrahimi       {
1693*22dc650dSSadaf Ebrahimi       uint32_t xc;
1694*22dc650dSSadaf Ebrahimi 
1695*22dc650dSSadaf Ebrahimi       if (ptr >= ptrend) break;
1696*22dc650dSSadaf Ebrahimi       if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1697*22dc650dSSadaf Ebrahimi           (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1698*22dc650dSSadaf Ebrahimi         {
1699*22dc650dSSadaf Ebrahimi         PCRE2_SPTR hptr = ptr + 1;
1700*22dc650dSSadaf Ebrahimi 
1701*22dc650dSSadaf Ebrahimi         cc = 0;
1702*22dc650dSSadaf Ebrahimi         while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1703*22dc650dSSadaf Ebrahimi           {
1704*22dc650dSSadaf Ebrahimi           if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1705*22dc650dSSadaf Ebrahimi             {
1706*22dc650dSSadaf Ebrahimi             *errorcodeptr = ERR77;
1707*22dc650dSSadaf Ebrahimi             ptr = hptr;   /* Show where */
1708*22dc650dSSadaf Ebrahimi             break;        /* *hptr != } will cause another break below */
1709*22dc650dSSadaf Ebrahimi             }
1710*22dc650dSSadaf Ebrahimi           cc = (cc << 4) | xc;
1711*22dc650dSSadaf Ebrahimi           hptr++;
1712*22dc650dSSadaf Ebrahimi           }
1713*22dc650dSSadaf Ebrahimi 
1714*22dc650dSSadaf Ebrahimi         if (hptr == ptr + 1 ||   /* No hex digits */
1715*22dc650dSSadaf Ebrahimi             hptr >= ptrend ||    /* Hit end of input */
1716*22dc650dSSadaf Ebrahimi             *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1717*22dc650dSSadaf Ebrahimi           {
1718*22dc650dSSadaf Ebrahimi           escape = ESC_ub;    /* Special return */
1719*22dc650dSSadaf Ebrahimi           ptr++;              /* Skip { */
1720*22dc650dSSadaf Ebrahimi           break;              /* Hex escape not recognized */
1721*22dc650dSSadaf Ebrahimi           }
1722*22dc650dSSadaf Ebrahimi 
1723*22dc650dSSadaf Ebrahimi         c = cc;          /* Accept the code point */
1724*22dc650dSSadaf Ebrahimi         ptr = hptr + 1;
1725*22dc650dSSadaf Ebrahimi         }
1726*22dc650dSSadaf Ebrahimi 
1727*22dc650dSSadaf Ebrahimi       else  /* Must be exactly 4 hex digits */
1728*22dc650dSSadaf Ebrahimi         {
1729*22dc650dSSadaf Ebrahimi         if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1730*22dc650dSSadaf Ebrahimi         if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1731*22dc650dSSadaf Ebrahimi         if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1732*22dc650dSSadaf Ebrahimi         cc = (cc << 4) | xc;
1733*22dc650dSSadaf Ebrahimi         if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1734*22dc650dSSadaf Ebrahimi         cc = (cc << 4) | xc;
1735*22dc650dSSadaf Ebrahimi         if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1736*22dc650dSSadaf Ebrahimi         c = (cc << 4) | xc;
1737*22dc650dSSadaf Ebrahimi         ptr += 4;
1738*22dc650dSSadaf Ebrahimi         }
1739*22dc650dSSadaf Ebrahimi 
1740*22dc650dSSadaf Ebrahimi       if (utf)
1741*22dc650dSSadaf Ebrahimi         {
1742*22dc650dSSadaf Ebrahimi         if (c > 0x10ffffU) *errorcodeptr = ERR77;
1743*22dc650dSSadaf Ebrahimi         else
1744*22dc650dSSadaf Ebrahimi           if (c >= 0xd800 && c <= 0xdfff &&
1745*22dc650dSSadaf Ebrahimi               (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1746*22dc650dSSadaf Ebrahimi                 *errorcodeptr = ERR73;
1747*22dc650dSSadaf Ebrahimi         }
1748*22dc650dSSadaf Ebrahimi       else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1749*22dc650dSSadaf Ebrahimi       }
1750*22dc650dSSadaf Ebrahimi     break;
1751*22dc650dSSadaf Ebrahimi 
1752*22dc650dSSadaf Ebrahimi     /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1753*22dc650dSSadaf Ebrahimi     in which case it is an upper case letter. */
1754*22dc650dSSadaf Ebrahimi 
1755*22dc650dSSadaf Ebrahimi     case CHAR_U:
1756*22dc650dSSadaf Ebrahimi     if (!alt_bsux) *errorcodeptr = ERR37;
1757*22dc650dSSadaf Ebrahimi     break;
1758*22dc650dSSadaf Ebrahimi 
1759*22dc650dSSadaf Ebrahimi     /* In a character class, \g is just a literal "g". Outside a character
1760*22dc650dSSadaf Ebrahimi     class, \g must be followed by one of a number of specific things:
1761*22dc650dSSadaf Ebrahimi 
1762*22dc650dSSadaf Ebrahimi     (1) A number, either plain or braced. If positive, it is an absolute
1763*22dc650dSSadaf Ebrahimi     backreference. If negative, it is a relative backreference. This is a Perl
1764*22dc650dSSadaf Ebrahimi     5.10 feature.
1765*22dc650dSSadaf Ebrahimi 
1766*22dc650dSSadaf Ebrahimi     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1767*22dc650dSSadaf Ebrahimi     is part of Perl's movement towards a unified syntax for back references. As
1768*22dc650dSSadaf Ebrahimi     this is synonymous with \k{name}, we fudge it up by pretending it really
1769*22dc650dSSadaf Ebrahimi     was \k{name}.
1770*22dc650dSSadaf Ebrahimi 
1771*22dc650dSSadaf Ebrahimi     (3) For Oniguruma compatibility we also support \g followed by a name or a
1772*22dc650dSSadaf Ebrahimi     number either in angle brackets or in single quotes. However, these are
1773*22dc650dSSadaf Ebrahimi     (possibly recursive) subroutine calls, _not_ backreferences. We return
1774*22dc650dSSadaf Ebrahimi     the ESC_g code.
1775*22dc650dSSadaf Ebrahimi 
1776*22dc650dSSadaf Ebrahimi     Summary: Return a negative number for a numerical back reference, ESC_k for
1777*22dc650dSSadaf Ebrahimi     a named back reference, and ESC_g for a named or numbered subroutine call.
1778*22dc650dSSadaf Ebrahimi     */
1779*22dc650dSSadaf Ebrahimi 
1780*22dc650dSSadaf Ebrahimi     case CHAR_g:
1781*22dc650dSSadaf Ebrahimi     if (isclass) break;
1782*22dc650dSSadaf Ebrahimi 
1783*22dc650dSSadaf Ebrahimi     if (ptr >= ptrend)
1784*22dc650dSSadaf Ebrahimi       {
1785*22dc650dSSadaf Ebrahimi       *errorcodeptr = ERR57;
1786*22dc650dSSadaf Ebrahimi       break;
1787*22dc650dSSadaf Ebrahimi       }
1788*22dc650dSSadaf Ebrahimi 
1789*22dc650dSSadaf Ebrahimi     if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1790*22dc650dSSadaf Ebrahimi       {
1791*22dc650dSSadaf Ebrahimi       escape = ESC_g;
1792*22dc650dSSadaf Ebrahimi       break;
1793*22dc650dSSadaf Ebrahimi       }
1794*22dc650dSSadaf Ebrahimi 
1795*22dc650dSSadaf Ebrahimi     /* If there is a brace delimiter, try to read a numerical reference. If
1796*22dc650dSSadaf Ebrahimi     there isn't one, assume we have a name and treat it as \k. */
1797*22dc650dSSadaf Ebrahimi 
1798*22dc650dSSadaf Ebrahimi     if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1799*22dc650dSSadaf Ebrahimi       {
1800*22dc650dSSadaf Ebrahimi       PCRE2_SPTR p = ptr + 1;
1801*22dc650dSSadaf Ebrahimi 
1802*22dc650dSSadaf Ebrahimi       while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1803*22dc650dSSadaf Ebrahimi       if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1804*22dc650dSSadaf Ebrahimi           errorcodeptr))
1805*22dc650dSSadaf Ebrahimi         {
1806*22dc650dSSadaf Ebrahimi         if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1807*22dc650dSSadaf Ebrahimi         break;
1808*22dc650dSSadaf Ebrahimi         }
1809*22dc650dSSadaf Ebrahimi       while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1810*22dc650dSSadaf Ebrahimi 
1811*22dc650dSSadaf Ebrahimi       if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1812*22dc650dSSadaf Ebrahimi         {
1813*22dc650dSSadaf Ebrahimi         *errorcodeptr = ERR57;
1814*22dc650dSSadaf Ebrahimi         break;
1815*22dc650dSSadaf Ebrahimi         }
1816*22dc650dSSadaf Ebrahimi       ptr = p + 1;
1817*22dc650dSSadaf Ebrahimi       }
1818*22dc650dSSadaf Ebrahimi 
1819*22dc650dSSadaf Ebrahimi     /* Read an undelimited number */
1820*22dc650dSSadaf Ebrahimi 
1821*22dc650dSSadaf Ebrahimi     else
1822*22dc650dSSadaf Ebrahimi       {
1823*22dc650dSSadaf Ebrahimi       if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1824*22dc650dSSadaf Ebrahimi           errorcodeptr))
1825*22dc650dSSadaf Ebrahimi         {
1826*22dc650dSSadaf Ebrahimi         if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1827*22dc650dSSadaf Ebrahimi         break;
1828*22dc650dSSadaf Ebrahimi         }
1829*22dc650dSSadaf Ebrahimi       }
1830*22dc650dSSadaf Ebrahimi 
1831*22dc650dSSadaf Ebrahimi     if (s <= 0)
1832*22dc650dSSadaf Ebrahimi       {
1833*22dc650dSSadaf Ebrahimi       *errorcodeptr = ERR15;
1834*22dc650dSSadaf Ebrahimi       break;
1835*22dc650dSSadaf Ebrahimi       }
1836*22dc650dSSadaf Ebrahimi 
1837*22dc650dSSadaf Ebrahimi     escape = -s;
1838*22dc650dSSadaf Ebrahimi     break;
1839*22dc650dSSadaf Ebrahimi 
1840*22dc650dSSadaf Ebrahimi     /* The handling of escape sequences consisting of a string of digits
1841*22dc650dSSadaf Ebrahimi     starting with one that is not zero is not straightforward. Perl has changed
1842*22dc650dSSadaf Ebrahimi     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1843*22dc650dSSadaf Ebrahimi     recommended to avoid the ambiguities in the old syntax.
1844*22dc650dSSadaf Ebrahimi 
1845*22dc650dSSadaf Ebrahimi     Outside a character class, the digits are read as a decimal number. If the
1846*22dc650dSSadaf Ebrahimi     number is less than 10, or if there are that many previous extracting left
1847*22dc650dSSadaf Ebrahimi     brackets, it is a back reference. Otherwise, up to three octal digits are
1848*22dc650dSSadaf Ebrahimi     read to form an escaped character code. Thus \123 is likely to be octal 123
1849*22dc650dSSadaf Ebrahimi     (cf \0123, which is octal 012 followed by the literal 3).
1850*22dc650dSSadaf Ebrahimi 
1851*22dc650dSSadaf Ebrahimi     Inside a character class, \ followed by a digit is always either a literal
1852*22dc650dSSadaf Ebrahimi     8 or 9 or an octal number. */
1853*22dc650dSSadaf Ebrahimi 
1854*22dc650dSSadaf Ebrahimi     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1855*22dc650dSSadaf Ebrahimi     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1856*22dc650dSSadaf Ebrahimi 
1857*22dc650dSSadaf Ebrahimi     if (!isclass)
1858*22dc650dSSadaf Ebrahimi       {
1859*22dc650dSSadaf Ebrahimi       oldptr = ptr;
1860*22dc650dSSadaf Ebrahimi       ptr--;   /* Back to the digit */
1861*22dc650dSSadaf Ebrahimi 
1862*22dc650dSSadaf Ebrahimi       /* As we know we are at a digit, the only possible error from
1863*22dc650dSSadaf Ebrahimi       read_number() is a number that is too large to be a group number. In this
1864*22dc650dSSadaf Ebrahimi       case we fall through and handle this as not a group reference. If we have
1865*22dc650dSSadaf Ebrahimi       read a small enough number, check for a back reference.
1866*22dc650dSSadaf Ebrahimi 
1867*22dc650dSSadaf Ebrahimi       \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1868*22dc650dSSadaf Ebrahimi       are octal escapes if there are not that many previous captures. */
1869*22dc650dSSadaf Ebrahimi 
1870*22dc650dSSadaf Ebrahimi       if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) &&
1871*22dc650dSSadaf Ebrahimi           (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount))
1872*22dc650dSSadaf Ebrahimi         {
1873*22dc650dSSadaf Ebrahimi         if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1874*22dc650dSSadaf Ebrahimi           else escape = -s;     /* Indicates a back reference */
1875*22dc650dSSadaf Ebrahimi         break;
1876*22dc650dSSadaf Ebrahimi         }
1877*22dc650dSSadaf Ebrahimi 
1878*22dc650dSSadaf Ebrahimi       ptr = oldptr;      /* Put the pointer back and fall through */
1879*22dc650dSSadaf Ebrahimi       }
1880*22dc650dSSadaf Ebrahimi 
1881*22dc650dSSadaf Ebrahimi     /* Handle a digit following \ when the number is not a back reference, or
1882*22dc650dSSadaf Ebrahimi     we are within a character class. If the first digit is 8 or 9, Perl used to
1883*22dc650dSSadaf Ebrahimi     generate a binary zero and then treat the digit as a following literal. At
1884*22dc650dSSadaf Ebrahimi     least by Perl 5.18 this changed so as not to insert the binary zero. */
1885*22dc650dSSadaf Ebrahimi 
1886*22dc650dSSadaf Ebrahimi     if (c >= CHAR_8) break;
1887*22dc650dSSadaf Ebrahimi 
1888*22dc650dSSadaf Ebrahimi     /* Fall through */
1889*22dc650dSSadaf Ebrahimi 
1890*22dc650dSSadaf Ebrahimi     /* \0 always starts an octal number, but we may drop through to here with a
1891*22dc650dSSadaf Ebrahimi     larger first octal digit. The original code used just to take the least
1892*22dc650dSSadaf Ebrahimi     significant 8 bits of octal numbers (I think this is what early Perls used
1893*22dc650dSSadaf Ebrahimi     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1894*22dc650dSSadaf Ebrahimi     but no more than 3 octal digits. */
1895*22dc650dSSadaf Ebrahimi 
1896*22dc650dSSadaf Ebrahimi     case CHAR_0:
1897*22dc650dSSadaf Ebrahimi     c -= CHAR_0;
1898*22dc650dSSadaf Ebrahimi     while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1899*22dc650dSSadaf Ebrahimi         c = c * 8 + *ptr++ - CHAR_0;
1900*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
1901*22dc650dSSadaf Ebrahimi     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1902*22dc650dSSadaf Ebrahimi #endif
1903*22dc650dSSadaf Ebrahimi     break;
1904*22dc650dSSadaf Ebrahimi 
1905*22dc650dSSadaf Ebrahimi     /* \o is a relatively new Perl feature, supporting a more general way of
1906*22dc650dSSadaf Ebrahimi     specifying character codes in octal. The only supported form is \o{ddd},
1907*22dc650dSSadaf Ebrahimi     with optional spaces or tabs after { and before }. */
1908*22dc650dSSadaf Ebrahimi 
1909*22dc650dSSadaf Ebrahimi     case CHAR_o:
1910*22dc650dSSadaf Ebrahimi     if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1911*22dc650dSSadaf Ebrahimi       {
1912*22dc650dSSadaf Ebrahimi       ptr--;
1913*22dc650dSSadaf Ebrahimi       *errorcodeptr = ERR55;
1914*22dc650dSSadaf Ebrahimi       break;
1915*22dc650dSSadaf Ebrahimi       }
1916*22dc650dSSadaf Ebrahimi 
1917*22dc650dSSadaf Ebrahimi     while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1918*22dc650dSSadaf Ebrahimi     if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1919*22dc650dSSadaf Ebrahimi       {
1920*22dc650dSSadaf Ebrahimi       *errorcodeptr = ERR78;
1921*22dc650dSSadaf Ebrahimi       break;
1922*22dc650dSSadaf Ebrahimi       }
1923*22dc650dSSadaf Ebrahimi 
1924*22dc650dSSadaf Ebrahimi     c = 0;
1925*22dc650dSSadaf Ebrahimi     overflow = FALSE;
1926*22dc650dSSadaf Ebrahimi     while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1927*22dc650dSSadaf Ebrahimi       {
1928*22dc650dSSadaf Ebrahimi       cc = *ptr++;
1929*22dc650dSSadaf Ebrahimi       if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1930*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32
1931*22dc650dSSadaf Ebrahimi       if (c >= 0x20000000l) { overflow = TRUE; break; }
1932*22dc650dSSadaf Ebrahimi #endif
1933*22dc650dSSadaf Ebrahimi       c = (c << 3) + (cc - CHAR_0);
1934*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
1935*22dc650dSSadaf Ebrahimi       if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1936*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 16
1937*22dc650dSSadaf Ebrahimi       if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1938*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 32
1939*22dc650dSSadaf Ebrahimi       if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1940*22dc650dSSadaf Ebrahimi #endif
1941*22dc650dSSadaf Ebrahimi       }
1942*22dc650dSSadaf Ebrahimi 
1943*22dc650dSSadaf Ebrahimi     while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1944*22dc650dSSadaf Ebrahimi 
1945*22dc650dSSadaf Ebrahimi     if (overflow)
1946*22dc650dSSadaf Ebrahimi       {
1947*22dc650dSSadaf Ebrahimi       while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1948*22dc650dSSadaf Ebrahimi       *errorcodeptr = ERR34;
1949*22dc650dSSadaf Ebrahimi       }
1950*22dc650dSSadaf Ebrahimi     else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1951*22dc650dSSadaf Ebrahimi       {
1952*22dc650dSSadaf Ebrahimi       if (utf && c >= 0xd800 && c <= 0xdfff &&
1953*22dc650dSSadaf Ebrahimi           (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1954*22dc650dSSadaf Ebrahimi         {
1955*22dc650dSSadaf Ebrahimi         ptr--;
1956*22dc650dSSadaf Ebrahimi         *errorcodeptr = ERR73;
1957*22dc650dSSadaf Ebrahimi         }
1958*22dc650dSSadaf Ebrahimi       }
1959*22dc650dSSadaf Ebrahimi     else
1960*22dc650dSSadaf Ebrahimi       {
1961*22dc650dSSadaf Ebrahimi       ptr--;
1962*22dc650dSSadaf Ebrahimi       *errorcodeptr = ERR64;
1963*22dc650dSSadaf Ebrahimi       }
1964*22dc650dSSadaf Ebrahimi     break;
1965*22dc650dSSadaf Ebrahimi 
1966*22dc650dSSadaf Ebrahimi     /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1967*22dc650dSSadaf Ebrahimi     by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1968*22dc650dSSadaf Ebrahimi 
1969*22dc650dSSadaf Ebrahimi     case CHAR_x:
1970*22dc650dSSadaf Ebrahimi     if (alt_bsux)
1971*22dc650dSSadaf Ebrahimi       {
1972*22dc650dSSadaf Ebrahimi       uint32_t xc;
1973*22dc650dSSadaf Ebrahimi       if (ptrend - ptr < 2) break;               /* Less than 2 characters */
1974*22dc650dSSadaf Ebrahimi       if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1975*22dc650dSSadaf Ebrahimi       if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1976*22dc650dSSadaf Ebrahimi       c = (cc << 4) | xc;
1977*22dc650dSSadaf Ebrahimi       ptr += 2;
1978*22dc650dSSadaf Ebrahimi       }
1979*22dc650dSSadaf Ebrahimi 
1980*22dc650dSSadaf Ebrahimi     /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1981*22dc650dSSadaf Ebrahimi     greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1982*22dc650dSSadaf Ebrahimi     digits. If not, { used to be treated as a data character. However, Perl
1983*22dc650dSSadaf Ebrahimi     seems to read hex digits up to the first non-such, and ignore the rest, so
1984*22dc650dSSadaf Ebrahimi     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1985*22dc650dSSadaf Ebrahimi     now gives an error. */
1986*22dc650dSSadaf Ebrahimi 
1987*22dc650dSSadaf Ebrahimi     else
1988*22dc650dSSadaf Ebrahimi       {
1989*22dc650dSSadaf Ebrahimi       if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1990*22dc650dSSadaf Ebrahimi         {
1991*22dc650dSSadaf Ebrahimi         ptr++;
1992*22dc650dSSadaf Ebrahimi         while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1993*22dc650dSSadaf Ebrahimi 
1994*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
1995*22dc650dSSadaf Ebrahimi         COME_FROM_NU:
1996*22dc650dSSadaf Ebrahimi #endif
1997*22dc650dSSadaf Ebrahimi         if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1998*22dc650dSSadaf Ebrahimi           {
1999*22dc650dSSadaf Ebrahimi           *errorcodeptr = ERR78;
2000*22dc650dSSadaf Ebrahimi           break;
2001*22dc650dSSadaf Ebrahimi           }
2002*22dc650dSSadaf Ebrahimi         c = 0;
2003*22dc650dSSadaf Ebrahimi         overflow = FALSE;
2004*22dc650dSSadaf Ebrahimi 
2005*22dc650dSSadaf Ebrahimi         while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2006*22dc650dSSadaf Ebrahimi           {
2007*22dc650dSSadaf Ebrahimi           ptr++;
2008*22dc650dSSadaf Ebrahimi           if (c == 0 && cc == 0) continue;   /* Leading zeroes */
2009*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32
2010*22dc650dSSadaf Ebrahimi           if (c >= 0x10000000l) { overflow = TRUE; break; }
2011*22dc650dSSadaf Ebrahimi #endif
2012*22dc650dSSadaf Ebrahimi           c = (c << 4) | cc;
2013*22dc650dSSadaf Ebrahimi           if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2014*22dc650dSSadaf Ebrahimi             {
2015*22dc650dSSadaf Ebrahimi             overflow = TRUE;
2016*22dc650dSSadaf Ebrahimi             break;
2017*22dc650dSSadaf Ebrahimi             }
2018*22dc650dSSadaf Ebrahimi           }
2019*22dc650dSSadaf Ebrahimi 
2020*22dc650dSSadaf Ebrahimi         /* Perl ignores spaces and tabs before } */
2021*22dc650dSSadaf Ebrahimi 
2022*22dc650dSSadaf Ebrahimi         while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2023*22dc650dSSadaf Ebrahimi 
2024*22dc650dSSadaf Ebrahimi         /* On overflow, skip remaining hex digits */
2025*22dc650dSSadaf Ebrahimi 
2026*22dc650dSSadaf Ebrahimi         if (overflow)
2027*22dc650dSSadaf Ebrahimi           {
2028*22dc650dSSadaf Ebrahimi           while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2029*22dc650dSSadaf Ebrahimi           *errorcodeptr = ERR34;
2030*22dc650dSSadaf Ebrahimi           }
2031*22dc650dSSadaf Ebrahimi         else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
2032*22dc650dSSadaf Ebrahimi           {
2033*22dc650dSSadaf Ebrahimi           if (utf && c >= 0xd800 && c <= 0xdfff &&
2034*22dc650dSSadaf Ebrahimi               (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2035*22dc650dSSadaf Ebrahimi             {
2036*22dc650dSSadaf Ebrahimi             ptr--;
2037*22dc650dSSadaf Ebrahimi             *errorcodeptr = ERR73;
2038*22dc650dSSadaf Ebrahimi             }
2039*22dc650dSSadaf Ebrahimi           }
2040*22dc650dSSadaf Ebrahimi 
2041*22dc650dSSadaf Ebrahimi         /* If the sequence of hex digits (followed by optional space) does not
2042*22dc650dSSadaf Ebrahimi         end with '}', give an error. We used just to recognize this construct
2043*22dc650dSSadaf Ebrahimi         and fall through to the normal \x handling, but nowadays Perl gives an
2044*22dc650dSSadaf Ebrahimi         error, which seems much more sensible, so we do too. */
2045*22dc650dSSadaf Ebrahimi 
2046*22dc650dSSadaf Ebrahimi         else
2047*22dc650dSSadaf Ebrahimi           {
2048*22dc650dSSadaf Ebrahimi           ptr--;
2049*22dc650dSSadaf Ebrahimi           *errorcodeptr = ERR67;
2050*22dc650dSSadaf Ebrahimi           }
2051*22dc650dSSadaf Ebrahimi         }   /* End of \x{} processing */
2052*22dc650dSSadaf Ebrahimi 
2053*22dc650dSSadaf Ebrahimi       /* Read up to two hex digits after \x */
2054*22dc650dSSadaf Ebrahimi 
2055*22dc650dSSadaf Ebrahimi       else
2056*22dc650dSSadaf Ebrahimi         {
2057*22dc650dSSadaf Ebrahimi         c = 0;
2058*22dc650dSSadaf Ebrahimi         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2059*22dc650dSSadaf Ebrahimi         ptr++;
2060*22dc650dSSadaf Ebrahimi         c = cc;
2061*22dc650dSSadaf Ebrahimi         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2062*22dc650dSSadaf Ebrahimi         ptr++;
2063*22dc650dSSadaf Ebrahimi         c = (c << 4) | cc;
2064*22dc650dSSadaf Ebrahimi         }     /* End of \xdd handling */
2065*22dc650dSSadaf Ebrahimi       }       /* End of Perl-style \x handling */
2066*22dc650dSSadaf Ebrahimi     break;
2067*22dc650dSSadaf Ebrahimi 
2068*22dc650dSSadaf Ebrahimi     /* The handling of \c is different in ASCII and EBCDIC environments. In an
2069*22dc650dSSadaf Ebrahimi     ASCII (or Unicode) environment, an error is given if the character
2070*22dc650dSSadaf Ebrahimi     following \c is not a printable ASCII character. Otherwise, the following
2071*22dc650dSSadaf Ebrahimi     character is upper-cased if it is a letter, and after that the 0x40 bit is
2072*22dc650dSSadaf Ebrahimi     flipped. The result is the value of the escape.
2073*22dc650dSSadaf Ebrahimi 
2074*22dc650dSSadaf Ebrahimi     In an EBCDIC environment the handling of \c is compatible with the
2075*22dc650dSSadaf Ebrahimi     specification in the perlebcdic document. The following character must be
2076*22dc650dSSadaf Ebrahimi     a letter or one of a small number of special characters. These provide a
2077*22dc650dSSadaf Ebrahimi     means of defining the character values 0-31.
2078*22dc650dSSadaf Ebrahimi 
2079*22dc650dSSadaf Ebrahimi     For testing the EBCDIC handling of \c in an ASCII environment, recognize
2080*22dc650dSSadaf Ebrahimi     the EBCDIC value of 'c' explicitly. */
2081*22dc650dSSadaf Ebrahimi 
2082*22dc650dSSadaf Ebrahimi #if defined EBCDIC && 'a' != 0x81
2083*22dc650dSSadaf Ebrahimi     case 0x83:
2084*22dc650dSSadaf Ebrahimi #else
2085*22dc650dSSadaf Ebrahimi     case CHAR_c:
2086*22dc650dSSadaf Ebrahimi #endif
2087*22dc650dSSadaf Ebrahimi     if (ptr >= ptrend)
2088*22dc650dSSadaf Ebrahimi       {
2089*22dc650dSSadaf Ebrahimi       *errorcodeptr = ERR2;
2090*22dc650dSSadaf Ebrahimi       break;
2091*22dc650dSSadaf Ebrahimi       }
2092*22dc650dSSadaf Ebrahimi     c = *ptr;
2093*22dc650dSSadaf Ebrahimi     if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2094*22dc650dSSadaf Ebrahimi 
2095*22dc650dSSadaf Ebrahimi     /* Handle \c in an ASCII/Unicode environment. */
2096*22dc650dSSadaf Ebrahimi 
2097*22dc650dSSadaf Ebrahimi #ifndef EBCDIC    /* ASCII/UTF-8 coding */
2098*22dc650dSSadaf Ebrahimi     if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2099*22dc650dSSadaf Ebrahimi       {
2100*22dc650dSSadaf Ebrahimi       *errorcodeptr = ERR68;
2101*22dc650dSSadaf Ebrahimi       break;
2102*22dc650dSSadaf Ebrahimi       }
2103*22dc650dSSadaf Ebrahimi     c ^= 0x40;
2104*22dc650dSSadaf Ebrahimi 
2105*22dc650dSSadaf Ebrahimi     /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2106*22dc650dSSadaf Ebrahimi     255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2107*22dc650dSSadaf Ebrahimi     POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2108*22dc650dSSadaf Ebrahimi     The other valid sequences correspond to a list of specific characters. */
2109*22dc650dSSadaf Ebrahimi 
2110*22dc650dSSadaf Ebrahimi #else
2111*22dc650dSSadaf Ebrahimi     if (c == CHAR_QUESTION_MARK)
2112*22dc650dSSadaf Ebrahimi       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2113*22dc650dSSadaf Ebrahimi     else
2114*22dc650dSSadaf Ebrahimi       {
2115*22dc650dSSadaf Ebrahimi       for (i = 0; i < 32; i++)
2116*22dc650dSSadaf Ebrahimi         {
2117*22dc650dSSadaf Ebrahimi         if (c == ebcdic_escape_c[i]) break;
2118*22dc650dSSadaf Ebrahimi         }
2119*22dc650dSSadaf Ebrahimi       if (i < 32) c = i; else *errorcodeptr = ERR68;
2120*22dc650dSSadaf Ebrahimi       }
2121*22dc650dSSadaf Ebrahimi #endif  /* EBCDIC */
2122*22dc650dSSadaf Ebrahimi 
2123*22dc650dSSadaf Ebrahimi     ptr++;
2124*22dc650dSSadaf Ebrahimi     break;
2125*22dc650dSSadaf Ebrahimi 
2126*22dc650dSSadaf Ebrahimi     /* Any other alphanumeric following \ is an error. Perl gives an error only
2127*22dc650dSSadaf Ebrahimi     if in warning mode, but PCRE doesn't have a warning mode. */
2128*22dc650dSSadaf Ebrahimi 
2129*22dc650dSSadaf Ebrahimi     default:
2130*22dc650dSSadaf Ebrahimi     *errorcodeptr = ERR3;
2131*22dc650dSSadaf Ebrahimi     *ptrptr = ptr - 1;     /* Point to the character at fault */
2132*22dc650dSSadaf Ebrahimi     return 0;
2133*22dc650dSSadaf Ebrahimi     }
2134*22dc650dSSadaf Ebrahimi   }
2135*22dc650dSSadaf Ebrahimi 
2136*22dc650dSSadaf Ebrahimi /* Set the pointer to the next character before returning. */
2137*22dc650dSSadaf Ebrahimi 
2138*22dc650dSSadaf Ebrahimi *ptrptr = ptr;
2139*22dc650dSSadaf Ebrahimi *chptr = c;
2140*22dc650dSSadaf Ebrahimi return escape;
2141*22dc650dSSadaf Ebrahimi }
2142*22dc650dSSadaf Ebrahimi 
2143*22dc650dSSadaf Ebrahimi 
2144*22dc650dSSadaf Ebrahimi 
2145*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2146*22dc650dSSadaf Ebrahimi /*************************************************
2147*22dc650dSSadaf Ebrahimi *               Handle \P and \p                 *
2148*22dc650dSSadaf Ebrahimi *************************************************/
2149*22dc650dSSadaf Ebrahimi 
2150*22dc650dSSadaf Ebrahimi /* This function is called after \P or \p has been encountered, provided that
2151*22dc650dSSadaf Ebrahimi PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2152*22dc650dSSadaf Ebrahimi contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2153*22dc650dSSadaf Ebrahimi after the final code unit of the escape sequence.
2154*22dc650dSSadaf Ebrahimi 
2155*22dc650dSSadaf Ebrahimi Arguments:
2156*22dc650dSSadaf Ebrahimi   ptrptr         the pattern position pointer
2157*22dc650dSSadaf Ebrahimi   negptr         a boolean that is set TRUE for negation else FALSE
2158*22dc650dSSadaf Ebrahimi   ptypeptr       an unsigned int that is set to the type value
2159*22dc650dSSadaf Ebrahimi   pdataptr       an unsigned int that is set to the detailed property value
2160*22dc650dSSadaf Ebrahimi   errorcodeptr   the error code variable
2161*22dc650dSSadaf Ebrahimi   cb             the compile data
2162*22dc650dSSadaf Ebrahimi 
2163*22dc650dSSadaf Ebrahimi Returns:         TRUE if the type value was found, or FALSE for an invalid type
2164*22dc650dSSadaf Ebrahimi */
2165*22dc650dSSadaf Ebrahimi 
2166*22dc650dSSadaf Ebrahimi static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,uint16_t * ptypeptr,uint16_t * pdataptr,int * errorcodeptr,compile_block * cb)2167*22dc650dSSadaf Ebrahimi get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2168*22dc650dSSadaf Ebrahimi   uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2169*22dc650dSSadaf Ebrahimi {
2170*22dc650dSSadaf Ebrahimi PCRE2_UCHAR c;
2171*22dc650dSSadaf Ebrahimi PCRE2_SIZE i, bot, top;
2172*22dc650dSSadaf Ebrahimi PCRE2_SPTR ptr = *ptrptr;
2173*22dc650dSSadaf Ebrahimi PCRE2_UCHAR name[50];
2174*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *vptr = NULL;
2175*22dc650dSSadaf Ebrahimi uint16_t ptscript = PT_NOTSCRIPT;
2176*22dc650dSSadaf Ebrahimi 
2177*22dc650dSSadaf Ebrahimi if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2178*22dc650dSSadaf Ebrahimi c = *ptr++;
2179*22dc650dSSadaf Ebrahimi *negptr = FALSE;
2180*22dc650dSSadaf Ebrahimi 
2181*22dc650dSSadaf Ebrahimi /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2182*22dc650dSSadaf Ebrahimi negation. */
2183*22dc650dSSadaf Ebrahimi 
2184*22dc650dSSadaf Ebrahimi if (c == CHAR_LEFT_CURLY_BRACKET)
2185*22dc650dSSadaf Ebrahimi   {
2186*22dc650dSSadaf Ebrahimi   if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2187*22dc650dSSadaf Ebrahimi 
2188*22dc650dSSadaf Ebrahimi   if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2189*22dc650dSSadaf Ebrahimi     {
2190*22dc650dSSadaf Ebrahimi     *negptr = TRUE;
2191*22dc650dSSadaf Ebrahimi     ptr++;
2192*22dc650dSSadaf Ebrahimi     }
2193*22dc650dSSadaf Ebrahimi 
2194*22dc650dSSadaf Ebrahimi   for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2195*22dc650dSSadaf Ebrahimi     {
2196*22dc650dSSadaf Ebrahimi     if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2197*22dc650dSSadaf Ebrahimi     c = *ptr++;
2198*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
2199*22dc650dSSadaf Ebrahimi     while (c == '_' || c == '-' || (c <= 0xff && isspace(c)))
2200*22dc650dSSadaf Ebrahimi #else
2201*22dc650dSSadaf Ebrahimi     while (c == '_' || c == '-' || isspace(c))
2202*22dc650dSSadaf Ebrahimi #endif
2203*22dc650dSSadaf Ebrahimi       {
2204*22dc650dSSadaf Ebrahimi       if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2205*22dc650dSSadaf Ebrahimi       c = *ptr++;
2206*22dc650dSSadaf Ebrahimi       }
2207*22dc650dSSadaf Ebrahimi     if (c == CHAR_NUL) goto ERROR_RETURN;
2208*22dc650dSSadaf Ebrahimi     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2209*22dc650dSSadaf Ebrahimi     name[i] = tolower(c);
2210*22dc650dSSadaf Ebrahimi     if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i;
2211*22dc650dSSadaf Ebrahimi     }
2212*22dc650dSSadaf Ebrahimi 
2213*22dc650dSSadaf Ebrahimi   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2214*22dc650dSSadaf Ebrahimi   name[i] = 0;
2215*22dc650dSSadaf Ebrahimi   }
2216*22dc650dSSadaf Ebrahimi 
2217*22dc650dSSadaf Ebrahimi /* If { doesn't follow \p or \P there is just one following character, which
2218*22dc650dSSadaf Ebrahimi must be an ASCII letter. */
2219*22dc650dSSadaf Ebrahimi 
2220*22dc650dSSadaf Ebrahimi else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2221*22dc650dSSadaf Ebrahimi   {
2222*22dc650dSSadaf Ebrahimi   name[0] = tolower(c);
2223*22dc650dSSadaf Ebrahimi   name[1] = 0;
2224*22dc650dSSadaf Ebrahimi   }
2225*22dc650dSSadaf Ebrahimi else goto ERROR_RETURN;
2226*22dc650dSSadaf Ebrahimi 
2227*22dc650dSSadaf Ebrahimi *ptrptr = ptr;
2228*22dc650dSSadaf Ebrahimi 
2229*22dc650dSSadaf Ebrahimi /* If the property contains ':' or '=' we have class name and value separately
2230*22dc650dSSadaf Ebrahimi specified. The following are supported:
2231*22dc650dSSadaf Ebrahimi 
2232*22dc650dSSadaf Ebrahimi   . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2233*22dc650dSSadaf Ebrahimi   . Script (synonym sc) for which the property name is the script name
2234*22dc650dSSadaf Ebrahimi   . Script_Extensions (synonym scx), ditto
2235*22dc650dSSadaf Ebrahimi 
2236*22dc650dSSadaf Ebrahimi As this is a small number, we currently just check the names directly. If this
2237*22dc650dSSadaf Ebrahimi grows, a sorted table and a switch will be neater.
2238*22dc650dSSadaf Ebrahimi 
2239*22dc650dSSadaf Ebrahimi For both the script properties, set a PT_xxx value so that (1) they can be
2240*22dc650dSSadaf Ebrahimi distinguished and (2) invalid script names that happen to be the name of
2241*22dc650dSSadaf Ebrahimi another property can be diagnosed. */
2242*22dc650dSSadaf Ebrahimi 
2243*22dc650dSSadaf Ebrahimi if (vptr != NULL)
2244*22dc650dSSadaf Ebrahimi   {
2245*22dc650dSSadaf Ebrahimi   int offset = 0;
2246*22dc650dSSadaf Ebrahimi   PCRE2_UCHAR sname[8];
2247*22dc650dSSadaf Ebrahimi 
2248*22dc650dSSadaf Ebrahimi   *vptr = 0;   /* Terminate property name */
2249*22dc650dSSadaf Ebrahimi   if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2250*22dc650dSSadaf Ebrahimi       PRIV(strcmp_c8)(name, STRING_bc) == 0)
2251*22dc650dSSadaf Ebrahimi     {
2252*22dc650dSSadaf Ebrahimi     offset = 4;
2253*22dc650dSSadaf Ebrahimi     sname[0] = CHAR_b;
2254*22dc650dSSadaf Ebrahimi     sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
2255*22dc650dSSadaf Ebrahimi     sname[2] = CHAR_d;
2256*22dc650dSSadaf Ebrahimi     sname[3] = CHAR_i;
2257*22dc650dSSadaf Ebrahimi     }
2258*22dc650dSSadaf Ebrahimi 
2259*22dc650dSSadaf Ebrahimi   else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2260*22dc650dSSadaf Ebrahimi            PRIV(strcmp_c8)(name, STRING_sc) == 0)
2261*22dc650dSSadaf Ebrahimi     ptscript = PT_SC;
2262*22dc650dSSadaf Ebrahimi 
2263*22dc650dSSadaf Ebrahimi   else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2264*22dc650dSSadaf Ebrahimi            PRIV(strcmp_c8)(name, STRING_scx) == 0)
2265*22dc650dSSadaf Ebrahimi     ptscript = PT_SCX;
2266*22dc650dSSadaf Ebrahimi 
2267*22dc650dSSadaf Ebrahimi   else
2268*22dc650dSSadaf Ebrahimi     {
2269*22dc650dSSadaf Ebrahimi     *errorcodeptr = ERR47;
2270*22dc650dSSadaf Ebrahimi     return FALSE;
2271*22dc650dSSadaf Ebrahimi     }
2272*22dc650dSSadaf Ebrahimi 
2273*22dc650dSSadaf Ebrahimi   /* Adjust the string in name[] as needed */
2274*22dc650dSSadaf Ebrahimi 
2275*22dc650dSSadaf Ebrahimi   memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2276*22dc650dSSadaf Ebrahimi   if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2277*22dc650dSSadaf Ebrahimi   }
2278*22dc650dSSadaf Ebrahimi 
2279*22dc650dSSadaf Ebrahimi /* Search for a recognized property using binary chop. */
2280*22dc650dSSadaf Ebrahimi 
2281*22dc650dSSadaf Ebrahimi bot = 0;
2282*22dc650dSSadaf Ebrahimi top = PRIV(utt_size);
2283*22dc650dSSadaf Ebrahimi 
2284*22dc650dSSadaf Ebrahimi while (bot < top)
2285*22dc650dSSadaf Ebrahimi   {
2286*22dc650dSSadaf Ebrahimi   int r;
2287*22dc650dSSadaf Ebrahimi   i = (bot + top) >> 1;
2288*22dc650dSSadaf Ebrahimi   r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2289*22dc650dSSadaf Ebrahimi 
2290*22dc650dSSadaf Ebrahimi   /* When a matching property is found, some extra checking is needed when the
2291*22dc650dSSadaf Ebrahimi   \p{xx:yy} syntax is used and xx is either sc or scx. */
2292*22dc650dSSadaf Ebrahimi 
2293*22dc650dSSadaf Ebrahimi   if (r == 0)
2294*22dc650dSSadaf Ebrahimi     {
2295*22dc650dSSadaf Ebrahimi     *pdataptr = PRIV(utt)[i].value;
2296*22dc650dSSadaf Ebrahimi     if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2297*22dc650dSSadaf Ebrahimi       {
2298*22dc650dSSadaf Ebrahimi       *ptypeptr = PRIV(utt)[i].type;
2299*22dc650dSSadaf Ebrahimi       return TRUE;
2300*22dc650dSSadaf Ebrahimi       }
2301*22dc650dSSadaf Ebrahimi 
2302*22dc650dSSadaf Ebrahimi     switch (PRIV(utt)[i].type)
2303*22dc650dSSadaf Ebrahimi       {
2304*22dc650dSSadaf Ebrahimi       case PT_SC:
2305*22dc650dSSadaf Ebrahimi       *ptypeptr = PT_SC;
2306*22dc650dSSadaf Ebrahimi       return TRUE;
2307*22dc650dSSadaf Ebrahimi 
2308*22dc650dSSadaf Ebrahimi       case PT_SCX:
2309*22dc650dSSadaf Ebrahimi       *ptypeptr = ptscript;
2310*22dc650dSSadaf Ebrahimi       return TRUE;
2311*22dc650dSSadaf Ebrahimi       }
2312*22dc650dSSadaf Ebrahimi 
2313*22dc650dSSadaf Ebrahimi     break;  /* Non-script found */
2314*22dc650dSSadaf Ebrahimi     }
2315*22dc650dSSadaf Ebrahimi 
2316*22dc650dSSadaf Ebrahimi   if (r > 0) bot = i + 1; else top = i;
2317*22dc650dSSadaf Ebrahimi   }
2318*22dc650dSSadaf Ebrahimi 
2319*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR47;   /* Unrecognized property */
2320*22dc650dSSadaf Ebrahimi return FALSE;
2321*22dc650dSSadaf Ebrahimi 
2322*22dc650dSSadaf Ebrahimi ERROR_RETURN:            /* Malformed \P or \p */
2323*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR46;
2324*22dc650dSSadaf Ebrahimi *ptrptr = ptr;
2325*22dc650dSSadaf Ebrahimi return FALSE;
2326*22dc650dSSadaf Ebrahimi }
2327*22dc650dSSadaf Ebrahimi #endif
2328*22dc650dSSadaf Ebrahimi 
2329*22dc650dSSadaf Ebrahimi 
2330*22dc650dSSadaf Ebrahimi 
2331*22dc650dSSadaf Ebrahimi /*************************************************
2332*22dc650dSSadaf Ebrahimi *           Check for POSIX class syntax         *
2333*22dc650dSSadaf Ebrahimi *************************************************/
2334*22dc650dSSadaf Ebrahimi 
2335*22dc650dSSadaf Ebrahimi /* This function is called when the sequence "[:" or "[." or "[=" is
2336*22dc650dSSadaf Ebrahimi encountered in a character class. It checks whether this is followed by a
2337*22dc650dSSadaf Ebrahimi sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2338*22dc650dSSadaf Ebrahimi reach an unescaped ']' without the special preceding character, return FALSE.
2339*22dc650dSSadaf Ebrahimi 
2340*22dc650dSSadaf Ebrahimi Originally, this function only recognized a sequence of letters between the
2341*22dc650dSSadaf Ebrahimi terminators, but it seems that Perl recognizes any sequence of characters,
2342*22dc650dSSadaf Ebrahimi though of course unknown POSIX names are subsequently rejected. Perl gives an
2343*22dc650dSSadaf Ebrahimi "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2344*22dc650dSSadaf Ebrahimi didn't consider this to be a POSIX class. Likewise for [:1234:].
2345*22dc650dSSadaf Ebrahimi 
2346*22dc650dSSadaf Ebrahimi The problem in trying to be exactly like Perl is in the handling of escapes. We
2347*22dc650dSSadaf Ebrahimi have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2348*22dc650dSSadaf Ebrahimi class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2349*22dc650dSSadaf Ebrahimi below handles the special cases \\ and \], but does not try to do any other
2350*22dc650dSSadaf Ebrahimi escape processing. This makes it different from Perl for cases such as
2351*22dc650dSSadaf Ebrahimi [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2352*22dc650dSSadaf Ebrahimi not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2353*22dc650dSSadaf Ebrahimi when Perl does, I think.
2354*22dc650dSSadaf Ebrahimi 
2355*22dc650dSSadaf Ebrahimi A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2356*22dc650dSSadaf Ebrahimi It seems that the appearance of a nested POSIX class supersedes an apparent
2357*22dc650dSSadaf Ebrahimi external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2358*22dc650dSSadaf Ebrahimi a digit. This is handled by returning FALSE if the start of a new group with
2359*22dc650dSSadaf Ebrahimi the same terminator is encountered, since the next closing sequence must close
2360*22dc650dSSadaf Ebrahimi the nested group, not the outer one.
2361*22dc650dSSadaf Ebrahimi 
2362*22dc650dSSadaf Ebrahimi In Perl, unescaped square brackets may also appear as part of class names. For
2363*22dc650dSSadaf Ebrahimi example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2364*22dc650dSSadaf Ebrahimi [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2365*22dc650dSSadaf Ebrahimi seem right at all. PCRE does not allow closing square brackets in POSIX class
2366*22dc650dSSadaf Ebrahimi names.
2367*22dc650dSSadaf Ebrahimi 
2368*22dc650dSSadaf Ebrahimi Arguments:
2369*22dc650dSSadaf Ebrahimi   ptr      pointer to the character after the initial [ (colon, dot, equals)
2370*22dc650dSSadaf Ebrahimi   ptrend   pointer to the end of the pattern
2371*22dc650dSSadaf Ebrahimi   endptr   where to return a pointer to the terminating ':', '.', or '='
2372*22dc650dSSadaf Ebrahimi 
2373*22dc650dSSadaf Ebrahimi Returns:   TRUE or FALSE
2374*22dc650dSSadaf Ebrahimi */
2375*22dc650dSSadaf Ebrahimi 
2376*22dc650dSSadaf Ebrahimi static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2377*22dc650dSSadaf Ebrahimi check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2378*22dc650dSSadaf Ebrahimi {
2379*22dc650dSSadaf Ebrahimi PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2380*22dc650dSSadaf Ebrahimi terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2381*22dc650dSSadaf Ebrahimi 
2382*22dc650dSSadaf Ebrahimi for (; ptrend - ptr >= 2; ptr++)
2383*22dc650dSSadaf Ebrahimi   {
2384*22dc650dSSadaf Ebrahimi   if (*ptr == CHAR_BACKSLASH &&
2385*22dc650dSSadaf Ebrahimi       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2386*22dc650dSSadaf Ebrahimi     ptr++;
2387*22dc650dSSadaf Ebrahimi 
2388*22dc650dSSadaf Ebrahimi   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2389*22dc650dSSadaf Ebrahimi             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2390*22dc650dSSadaf Ebrahimi 
2391*22dc650dSSadaf Ebrahimi   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2392*22dc650dSSadaf Ebrahimi     {
2393*22dc650dSSadaf Ebrahimi     *endptr = ptr;
2394*22dc650dSSadaf Ebrahimi     return TRUE;
2395*22dc650dSSadaf Ebrahimi     }
2396*22dc650dSSadaf Ebrahimi   }
2397*22dc650dSSadaf Ebrahimi 
2398*22dc650dSSadaf Ebrahimi return FALSE;
2399*22dc650dSSadaf Ebrahimi }
2400*22dc650dSSadaf Ebrahimi 
2401*22dc650dSSadaf Ebrahimi 
2402*22dc650dSSadaf Ebrahimi 
2403*22dc650dSSadaf Ebrahimi /*************************************************
2404*22dc650dSSadaf Ebrahimi *          Check POSIX class name                *
2405*22dc650dSSadaf Ebrahimi *************************************************/
2406*22dc650dSSadaf Ebrahimi 
2407*22dc650dSSadaf Ebrahimi /* This function is called to check the name given in a POSIX-style class entry
2408*22dc650dSSadaf Ebrahimi such as [:alnum:].
2409*22dc650dSSadaf Ebrahimi 
2410*22dc650dSSadaf Ebrahimi Arguments:
2411*22dc650dSSadaf Ebrahimi   ptr        points to the first letter
2412*22dc650dSSadaf Ebrahimi   len        the length of the name
2413*22dc650dSSadaf Ebrahimi 
2414*22dc650dSSadaf Ebrahimi Returns:     a value representing the name, or -1 if unknown
2415*22dc650dSSadaf Ebrahimi */
2416*22dc650dSSadaf Ebrahimi 
2417*22dc650dSSadaf Ebrahimi static int
check_posix_name(PCRE2_SPTR ptr,int len)2418*22dc650dSSadaf Ebrahimi check_posix_name(PCRE2_SPTR ptr, int len)
2419*22dc650dSSadaf Ebrahimi {
2420*22dc650dSSadaf Ebrahimi const char *pn = posix_names;
2421*22dc650dSSadaf Ebrahimi int yield = 0;
2422*22dc650dSSadaf Ebrahimi while (posix_name_lengths[yield] != 0)
2423*22dc650dSSadaf Ebrahimi   {
2424*22dc650dSSadaf Ebrahimi   if (len == posix_name_lengths[yield] &&
2425*22dc650dSSadaf Ebrahimi     PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2426*22dc650dSSadaf Ebrahimi   pn += posix_name_lengths[yield] + 1;
2427*22dc650dSSadaf Ebrahimi   yield++;
2428*22dc650dSSadaf Ebrahimi   }
2429*22dc650dSSadaf Ebrahimi return -1;
2430*22dc650dSSadaf Ebrahimi }
2431*22dc650dSSadaf Ebrahimi 
2432*22dc650dSSadaf Ebrahimi 
2433*22dc650dSSadaf Ebrahimi 
2434*22dc650dSSadaf Ebrahimi /*************************************************
2435*22dc650dSSadaf Ebrahimi *       Read a subpattern or VERB name           *
2436*22dc650dSSadaf Ebrahimi *************************************************/
2437*22dc650dSSadaf Ebrahimi 
2438*22dc650dSSadaf Ebrahimi /* This function is called from parse_regex() below whenever it needs to read
2439*22dc650dSSadaf Ebrahimi the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2440*22dc650dSSadaf Ebrahimi pointer must be to the preceding character. If that character is '*' we are
2441*22dc650dSSadaf Ebrahimi reading a verb or alpha assertion name. The pointer is updated to point after
2442*22dc650dSSadaf Ebrahimi the name, for a VERB or alpha assertion name, or after the name's terminator
2443*22dc650dSSadaf Ebrahimi for a subpattern name. Returning both the offset and the name pointer is
2444*22dc650dSSadaf Ebrahimi redundant information, but some callers use one and some the other, so it is
2445*22dc650dSSadaf Ebrahimi simplest just to return both. When the name is in braces, spaces and tabs are
2446*22dc650dSSadaf Ebrahimi allowed (and ignored) at either end.
2447*22dc650dSSadaf Ebrahimi 
2448*22dc650dSSadaf Ebrahimi Arguments:
2449*22dc650dSSadaf Ebrahimi   ptrptr      points to the character pointer variable
2450*22dc650dSSadaf Ebrahimi   ptrend      points to the end of the input string
2451*22dc650dSSadaf Ebrahimi   utf         true if the input is UTF-encoded
2452*22dc650dSSadaf Ebrahimi   terminator  the terminator of a subpattern name must be this
2453*22dc650dSSadaf Ebrahimi   offsetptr   where to put the offset from the start of the pattern
2454*22dc650dSSadaf Ebrahimi   nameptr     where to put a pointer to the name in the input
2455*22dc650dSSadaf Ebrahimi   namelenptr  where to put the length of the name
2456*22dc650dSSadaf Ebrahimi   errcodeptr  where to put an error code
2457*22dc650dSSadaf Ebrahimi   cb          pointer to the compile data block
2458*22dc650dSSadaf Ebrahimi 
2459*22dc650dSSadaf Ebrahimi Returns:    TRUE if a name was read
2460*22dc650dSSadaf Ebrahimi             FALSE otherwise, with error code set
2461*22dc650dSSadaf Ebrahimi */
2462*22dc650dSSadaf Ebrahimi 
2463*22dc650dSSadaf Ebrahimi static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL utf,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2464*22dc650dSSadaf Ebrahimi read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2465*22dc650dSSadaf Ebrahimi   PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2466*22dc650dSSadaf Ebrahimi   int *errorcodeptr, compile_block *cb)
2467*22dc650dSSadaf Ebrahimi {
2468*22dc650dSSadaf Ebrahimi PCRE2_SPTR ptr = *ptrptr;
2469*22dc650dSSadaf Ebrahimi BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2470*22dc650dSSadaf Ebrahimi BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2471*22dc650dSSadaf Ebrahimi 
2472*22dc650dSSadaf Ebrahimi if (is_braced)
2473*22dc650dSSadaf Ebrahimi   while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2474*22dc650dSSadaf Ebrahimi 
2475*22dc650dSSadaf Ebrahimi if (ptr >= ptrend)                 /* No characters in name */
2476*22dc650dSSadaf Ebrahimi   {
2477*22dc650dSSadaf Ebrahimi   *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2478*22dc650dSSadaf Ebrahimi                             ERR60; /* Verb not recognized or malformed */
2479*22dc650dSSadaf Ebrahimi   goto FAILED;
2480*22dc650dSSadaf Ebrahimi   }
2481*22dc650dSSadaf Ebrahimi 
2482*22dc650dSSadaf Ebrahimi *nameptr = ptr;
2483*22dc650dSSadaf Ebrahimi *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2484*22dc650dSSadaf Ebrahimi 
2485*22dc650dSSadaf Ebrahimi /* In UTF mode, a group name may contain letters and decimal digits as defined
2486*22dc650dSSadaf Ebrahimi by Unicode properties, and underscores, but must not start with a digit. */
2487*22dc650dSSadaf Ebrahimi 
2488*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2489*22dc650dSSadaf Ebrahimi if (utf && is_group)
2490*22dc650dSSadaf Ebrahimi   {
2491*22dc650dSSadaf Ebrahimi   uint32_t c, type;
2492*22dc650dSSadaf Ebrahimi 
2493*22dc650dSSadaf Ebrahimi   GETCHAR(c, ptr);
2494*22dc650dSSadaf Ebrahimi   type = UCD_CHARTYPE(c);
2495*22dc650dSSadaf Ebrahimi 
2496*22dc650dSSadaf Ebrahimi   if (type == ucp_Nd)
2497*22dc650dSSadaf Ebrahimi     {
2498*22dc650dSSadaf Ebrahimi     *errorcodeptr = ERR44;
2499*22dc650dSSadaf Ebrahimi     goto FAILED;
2500*22dc650dSSadaf Ebrahimi     }
2501*22dc650dSSadaf Ebrahimi 
2502*22dc650dSSadaf Ebrahimi   for(;;)
2503*22dc650dSSadaf Ebrahimi     {
2504*22dc650dSSadaf Ebrahimi     if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2505*22dc650dSSadaf Ebrahimi         c != CHAR_UNDERSCORE) break;
2506*22dc650dSSadaf Ebrahimi     ptr++;
2507*22dc650dSSadaf Ebrahimi     FORWARDCHARTEST(ptr, ptrend);
2508*22dc650dSSadaf Ebrahimi     if (ptr >= ptrend) break;
2509*22dc650dSSadaf Ebrahimi     GETCHAR(c, ptr);
2510*22dc650dSSadaf Ebrahimi     type = UCD_CHARTYPE(c);
2511*22dc650dSSadaf Ebrahimi     }
2512*22dc650dSSadaf Ebrahimi   }
2513*22dc650dSSadaf Ebrahimi else
2514*22dc650dSSadaf Ebrahimi #else
2515*22dc650dSSadaf Ebrahimi (void)utf;  /* Avoid compiler warning */
2516*22dc650dSSadaf Ebrahimi #endif      /* SUPPORT_UNICODE */
2517*22dc650dSSadaf Ebrahimi 
2518*22dc650dSSadaf Ebrahimi /* Handle non-group names and group names in non-UTF modes. A group name must
2519*22dc650dSSadaf Ebrahimi not start with a digit. If either of the others start with a digit it just
2520*22dc650dSSadaf Ebrahimi won't be recognized. */
2521*22dc650dSSadaf Ebrahimi 
2522*22dc650dSSadaf Ebrahimi   {
2523*22dc650dSSadaf Ebrahimi   if (is_group && IS_DIGIT(*ptr))
2524*22dc650dSSadaf Ebrahimi     {
2525*22dc650dSSadaf Ebrahimi     *errorcodeptr = ERR44;
2526*22dc650dSSadaf Ebrahimi     goto FAILED;
2527*22dc650dSSadaf Ebrahimi     }
2528*22dc650dSSadaf Ebrahimi 
2529*22dc650dSSadaf Ebrahimi   while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2530*22dc650dSSadaf Ebrahimi     {
2531*22dc650dSSadaf Ebrahimi     ptr++;
2532*22dc650dSSadaf Ebrahimi     }
2533*22dc650dSSadaf Ebrahimi   }
2534*22dc650dSSadaf Ebrahimi 
2535*22dc650dSSadaf Ebrahimi /* Check name length */
2536*22dc650dSSadaf Ebrahimi 
2537*22dc650dSSadaf Ebrahimi if (ptr > *nameptr + MAX_NAME_SIZE)
2538*22dc650dSSadaf Ebrahimi   {
2539*22dc650dSSadaf Ebrahimi   *errorcodeptr = ERR48;
2540*22dc650dSSadaf Ebrahimi   goto FAILED;
2541*22dc650dSSadaf Ebrahimi   }
2542*22dc650dSSadaf Ebrahimi *namelenptr = (uint32_t)(ptr - *nameptr);
2543*22dc650dSSadaf Ebrahimi 
2544*22dc650dSSadaf Ebrahimi /* Subpattern names must not be empty, and their terminator is checked here.
2545*22dc650dSSadaf Ebrahimi (What follows a verb or alpha assertion name is checked separately.) */
2546*22dc650dSSadaf Ebrahimi 
2547*22dc650dSSadaf Ebrahimi if (is_group)
2548*22dc650dSSadaf Ebrahimi   {
2549*22dc650dSSadaf Ebrahimi   if (ptr == *nameptr)
2550*22dc650dSSadaf Ebrahimi     {
2551*22dc650dSSadaf Ebrahimi     *errorcodeptr = ERR62;   /* Subpattern name expected */
2552*22dc650dSSadaf Ebrahimi     goto FAILED;
2553*22dc650dSSadaf Ebrahimi     }
2554*22dc650dSSadaf Ebrahimi   if (is_braced)
2555*22dc650dSSadaf Ebrahimi     while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2556*22dc650dSSadaf Ebrahimi   if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2557*22dc650dSSadaf Ebrahimi     {
2558*22dc650dSSadaf Ebrahimi     *errorcodeptr = ERR42;
2559*22dc650dSSadaf Ebrahimi     goto FAILED;
2560*22dc650dSSadaf Ebrahimi     }
2561*22dc650dSSadaf Ebrahimi   ptr++;
2562*22dc650dSSadaf Ebrahimi   }
2563*22dc650dSSadaf Ebrahimi 
2564*22dc650dSSadaf Ebrahimi *ptrptr = ptr;
2565*22dc650dSSadaf Ebrahimi return TRUE;
2566*22dc650dSSadaf Ebrahimi 
2567*22dc650dSSadaf Ebrahimi FAILED:
2568*22dc650dSSadaf Ebrahimi *ptrptr = ptr;
2569*22dc650dSSadaf Ebrahimi return FALSE;
2570*22dc650dSSadaf Ebrahimi }
2571*22dc650dSSadaf Ebrahimi 
2572*22dc650dSSadaf Ebrahimi 
2573*22dc650dSSadaf Ebrahimi 
2574*22dc650dSSadaf Ebrahimi /*************************************************
2575*22dc650dSSadaf Ebrahimi *          Manage callouts at start of cycle     *
2576*22dc650dSSadaf Ebrahimi *************************************************/
2577*22dc650dSSadaf Ebrahimi 
2578*22dc650dSSadaf Ebrahimi /* At the start of a new item in parse_regex() we are able to record the
2579*22dc650dSSadaf Ebrahimi details of the previous item in a prior callout, and also to set up an
2580*22dc650dSSadaf Ebrahimi automatic callout if enabled. Avoid having two adjacent automatic callouts,
2581*22dc650dSSadaf Ebrahimi which would otherwise happen for items such as \Q that contribute nothing to
2582*22dc650dSSadaf Ebrahimi the parsed pattern.
2583*22dc650dSSadaf Ebrahimi 
2584*22dc650dSSadaf Ebrahimi Arguments:
2585*22dc650dSSadaf Ebrahimi   ptr              current pattern pointer
2586*22dc650dSSadaf Ebrahimi   pcalloutptr      points to a pointer to previous callout, or NULL
2587*22dc650dSSadaf Ebrahimi   auto_callout     TRUE if auto_callouts are enabled
2588*22dc650dSSadaf Ebrahimi   parsed_pattern   the parsed pattern pointer
2589*22dc650dSSadaf Ebrahimi   cb               compile block
2590*22dc650dSSadaf Ebrahimi 
2591*22dc650dSSadaf Ebrahimi Returns: possibly updated parsed_pattern pointer.
2592*22dc650dSSadaf Ebrahimi */
2593*22dc650dSSadaf Ebrahimi 
2594*22dc650dSSadaf Ebrahimi static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2595*22dc650dSSadaf Ebrahimi manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2596*22dc650dSSadaf Ebrahimi   uint32_t *parsed_pattern, compile_block *cb)
2597*22dc650dSSadaf Ebrahimi {
2598*22dc650dSSadaf Ebrahimi uint32_t *previous_callout = *pcalloutptr;
2599*22dc650dSSadaf Ebrahimi 
2600*22dc650dSSadaf Ebrahimi if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2601*22dc650dSSadaf Ebrahimi   cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2602*22dc650dSSadaf Ebrahimi 
2603*22dc650dSSadaf Ebrahimi if (!auto_callout) previous_callout = NULL; else
2604*22dc650dSSadaf Ebrahimi   {
2605*22dc650dSSadaf Ebrahimi   if (previous_callout == NULL ||
2606*22dc650dSSadaf Ebrahimi       previous_callout != parsed_pattern - 4 ||
2607*22dc650dSSadaf Ebrahimi       previous_callout[3] != 255)
2608*22dc650dSSadaf Ebrahimi     {
2609*22dc650dSSadaf Ebrahimi     previous_callout = parsed_pattern;  /* Set up new automatic callout */
2610*22dc650dSSadaf Ebrahimi     parsed_pattern += 4;
2611*22dc650dSSadaf Ebrahimi     previous_callout[0] = META_CALLOUT_NUMBER;
2612*22dc650dSSadaf Ebrahimi     previous_callout[2] = 0;
2613*22dc650dSSadaf Ebrahimi     previous_callout[3] = 255;
2614*22dc650dSSadaf Ebrahimi     }
2615*22dc650dSSadaf Ebrahimi   previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2616*22dc650dSSadaf Ebrahimi   }
2617*22dc650dSSadaf Ebrahimi 
2618*22dc650dSSadaf Ebrahimi *pcalloutptr = previous_callout;
2619*22dc650dSSadaf Ebrahimi return parsed_pattern;
2620*22dc650dSSadaf Ebrahimi }
2621*22dc650dSSadaf Ebrahimi 
2622*22dc650dSSadaf Ebrahimi 
2623*22dc650dSSadaf Ebrahimi 
2624*22dc650dSSadaf Ebrahimi /*************************************************
2625*22dc650dSSadaf Ebrahimi *          Handle \d, \D, \s, \S, \w, \W         *
2626*22dc650dSSadaf Ebrahimi *************************************************/
2627*22dc650dSSadaf Ebrahimi 
2628*22dc650dSSadaf Ebrahimi /* This function is called from parse_regex() below, both for freestanding
2629*22dc650dSSadaf Ebrahimi escapes, and those within classes, to handle those escapes that may change when
2630*22dc650dSSadaf Ebrahimi Unicode property support is requested. Note that PCRE2_UCP will never be set
2631*22dc650dSSadaf Ebrahimi without Unicode support because that is checked when pcre2_compile() is called.
2632*22dc650dSSadaf Ebrahimi 
2633*22dc650dSSadaf Ebrahimi Arguments:
2634*22dc650dSSadaf Ebrahimi   escape          the ESC_... value
2635*22dc650dSSadaf Ebrahimi   parsed_pattern  where to add the code
2636*22dc650dSSadaf Ebrahimi   options         options bits
2637*22dc650dSSadaf Ebrahimi   xoptions        extra options bits
2638*22dc650dSSadaf Ebrahimi 
2639*22dc650dSSadaf Ebrahimi Returns:          updated value of parsed_pattern
2640*22dc650dSSadaf Ebrahimi */
2641*22dc650dSSadaf Ebrahimi static uint32_t *
handle_escdsw(int escape,uint32_t * parsed_pattern,uint32_t options,uint32_t xoptions)2642*22dc650dSSadaf Ebrahimi handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2643*22dc650dSSadaf Ebrahimi   uint32_t xoptions)
2644*22dc650dSSadaf Ebrahimi {
2645*22dc650dSSadaf Ebrahimi uint32_t ascii_option = 0;
2646*22dc650dSSadaf Ebrahimi uint32_t prop = ESC_p;
2647*22dc650dSSadaf Ebrahimi 
2648*22dc650dSSadaf Ebrahimi switch(escape)
2649*22dc650dSSadaf Ebrahimi   {
2650*22dc650dSSadaf Ebrahimi   case ESC_D:
2651*22dc650dSSadaf Ebrahimi   prop = ESC_P;
2652*22dc650dSSadaf Ebrahimi   /* Fall through */
2653*22dc650dSSadaf Ebrahimi   case ESC_d:
2654*22dc650dSSadaf Ebrahimi   ascii_option = PCRE2_EXTRA_ASCII_BSD;
2655*22dc650dSSadaf Ebrahimi   break;
2656*22dc650dSSadaf Ebrahimi 
2657*22dc650dSSadaf Ebrahimi   case ESC_S:
2658*22dc650dSSadaf Ebrahimi   prop = ESC_P;
2659*22dc650dSSadaf Ebrahimi   /* Fall through */
2660*22dc650dSSadaf Ebrahimi   case ESC_s:
2661*22dc650dSSadaf Ebrahimi   ascii_option = PCRE2_EXTRA_ASCII_BSS;
2662*22dc650dSSadaf Ebrahimi   break;
2663*22dc650dSSadaf Ebrahimi 
2664*22dc650dSSadaf Ebrahimi   case ESC_W:
2665*22dc650dSSadaf Ebrahimi   prop = ESC_P;
2666*22dc650dSSadaf Ebrahimi   /* Fall through */
2667*22dc650dSSadaf Ebrahimi   case ESC_w:
2668*22dc650dSSadaf Ebrahimi   ascii_option = PCRE2_EXTRA_ASCII_BSW;
2669*22dc650dSSadaf Ebrahimi   break;
2670*22dc650dSSadaf Ebrahimi   }
2671*22dc650dSSadaf Ebrahimi 
2672*22dc650dSSadaf Ebrahimi if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2673*22dc650dSSadaf Ebrahimi   {
2674*22dc650dSSadaf Ebrahimi   *parsed_pattern++ = META_ESCAPE + escape;
2675*22dc650dSSadaf Ebrahimi   }
2676*22dc650dSSadaf Ebrahimi else
2677*22dc650dSSadaf Ebrahimi   {
2678*22dc650dSSadaf Ebrahimi   *parsed_pattern++ = META_ESCAPE + prop;
2679*22dc650dSSadaf Ebrahimi   switch(escape)
2680*22dc650dSSadaf Ebrahimi     {
2681*22dc650dSSadaf Ebrahimi     case ESC_d:
2682*22dc650dSSadaf Ebrahimi     case ESC_D:
2683*22dc650dSSadaf Ebrahimi     *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2684*22dc650dSSadaf Ebrahimi     break;
2685*22dc650dSSadaf Ebrahimi 
2686*22dc650dSSadaf Ebrahimi     case ESC_s:
2687*22dc650dSSadaf Ebrahimi     case ESC_S:
2688*22dc650dSSadaf Ebrahimi     *parsed_pattern++ = PT_SPACE << 16;
2689*22dc650dSSadaf Ebrahimi     break;
2690*22dc650dSSadaf Ebrahimi 
2691*22dc650dSSadaf Ebrahimi     case ESC_w:
2692*22dc650dSSadaf Ebrahimi     case ESC_W:
2693*22dc650dSSadaf Ebrahimi     *parsed_pattern++ = PT_WORD << 16;
2694*22dc650dSSadaf Ebrahimi     break;
2695*22dc650dSSadaf Ebrahimi     }
2696*22dc650dSSadaf Ebrahimi   }
2697*22dc650dSSadaf Ebrahimi 
2698*22dc650dSSadaf Ebrahimi return parsed_pattern;
2699*22dc650dSSadaf Ebrahimi }
2700*22dc650dSSadaf Ebrahimi 
2701*22dc650dSSadaf Ebrahimi 
2702*22dc650dSSadaf Ebrahimi 
2703*22dc650dSSadaf Ebrahimi /*************************************************
2704*22dc650dSSadaf Ebrahimi *      Parse regex and identify named groups     *
2705*22dc650dSSadaf Ebrahimi *************************************************/
2706*22dc650dSSadaf Ebrahimi 
2707*22dc650dSSadaf Ebrahimi /* This function is called first of all. It scans the pattern and does two
2708*22dc650dSSadaf Ebrahimi things: (1) It identifies capturing groups and makes a table of named capturing
2709*22dc650dSSadaf Ebrahimi groups so that information about them is fully available to both the compiling
2710*22dc650dSSadaf Ebrahimi scans. (2) It writes a parsed version of the pattern with comments omitted and
2711*22dc650dSSadaf Ebrahimi escapes processed into the parsed_pattern vector.
2712*22dc650dSSadaf Ebrahimi 
2713*22dc650dSSadaf Ebrahimi Arguments:
2714*22dc650dSSadaf Ebrahimi   ptr             points to the start of the pattern
2715*22dc650dSSadaf Ebrahimi   options         compiling dynamic options (may change during the scan)
2716*22dc650dSSadaf Ebrahimi   has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
2717*22dc650dSSadaf Ebrahimi   cb              pointer to the compile data block
2718*22dc650dSSadaf Ebrahimi 
2719*22dc650dSSadaf Ebrahimi Returns:   zero on success or a non-zero error code, with the
2720*22dc650dSSadaf Ebrahimi              error offset placed in the cb field
2721*22dc650dSSadaf Ebrahimi */
2722*22dc650dSSadaf Ebrahimi 
2723*22dc650dSSadaf Ebrahimi /* A structure and some flags for dealing with nested groups. */
2724*22dc650dSSadaf Ebrahimi 
/* One saved record per nesting level that needs one. The field order and
widths are part of the layout and must not be changed. */

typedef struct nest_save {
  uint16_t nest_depth;     /* NOTE(review): presumably the depth this record belongs to */
  uint16_t reset_group;    /* NOTE(review): usage not visible here - confirm in parse_regex() */
  uint16_t max_group;      /* NOTE(review): usage not visible here - confirm in parse_regex() */
  uint16_t flags;          /* NOTE(review): presumably a combination of NSF_xxx bits */
  uint32_t options;        /* NOTE(review): presumably saved option bits */
  uint32_t xoptions;       /* NOTE(review): presumably saved extra option bits */
} nest_save;

/* Flag bits; each is a distinct single bit. */

#define NSF_RESET       0x0001u
#define NSF_CONDASSERT  0x0002u
#define NSF_ATOMICSR    0x0004u
2737*22dc650dSSadaf Ebrahimi 
2738*22dc650dSSadaf Ebrahimi /* Options that are changeable within the pattern must be tracked during
2739*22dc650dSSadaf Ebrahimi parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
2740*22dc650dSSadaf Ebrahimi but all must be tracked so that META_OPTIONS items set the correct values for
2741*22dc650dSSadaf Ebrahimi the main compiling phase. */
2742*22dc650dSSadaf Ebrahimi 
/* Main option bits that are tracked while parsing. */

#define PARSE_TRACKED_OPTIONS ( \
  PCRE2_CASELESS | \
  PCRE2_DOTALL | \
  PCRE2_DUPNAMES | \
  PCRE2_EXTENDED | \
  PCRE2_EXTENDED_MORE | \
  PCRE2_MULTILINE | \
  PCRE2_NO_AUTO_CAPTURE | \
  PCRE2_UNGREEDY)

/* Extra option bits that are tracked while parsing. */

#define PARSE_TRACKED_EXTRA_OPTIONS ( \
  PCRE2_EXTRA_CASELESS_RESTRICT | \
  PCRE2_EXTRA_ASCII_BSD | \
  PCRE2_EXTRA_ASCII_BSS | \
  PCRE2_EXTRA_ASCII_BSW | \
  PCRE2_EXTRA_ASCII_DIGIT | \
  PCRE2_EXTRA_ASCII_POSIX)
2750*22dc650dSSadaf Ebrahimi 
2751*22dc650dSSadaf Ebrahimi /* States used for analyzing ranges in character classes. The two OK values
2752*22dc650dSSadaf Ebrahimi must be last. */
2753*22dc650dSSadaf Ebrahimi 
enum {
  RANGE_NO,            /* Enumerators keep their original order and values: */
  RANGE_STARTED,       /* the two RANGE_OK_xxx values have to stay the last */
  RANGE_OK_ESCAPED,    /* (and therefore highest) ones. */
  RANGE_OK_LITERAL
};
2755*22dc650dSSadaf Ebrahimi 
2756*22dc650dSSadaf Ebrahimi /* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
2757*22dc650dSSadaf Ebrahimi the storing of literal values in the main parsed pattern, where they can always
2758*22dc650dSSadaf Ebrahimi be quantified. */
2759*22dc650dSSadaf Ebrahimi 
/* PARSED_LITERAL(c, p) stores the literal c at *p, advances p, and sets the
local variable okquantifier to TRUE; an okquantifier variable must therefore be
in scope wherever the macro is used. In 32-bit mode a value that is not less
than META_END is preceded by a META_BIGVALUE item.

Both variants expand to a single brace-enclosed compound statement, so that the
whole expansion is governed by an enclosing unbraced "if" or loop, and so that
the macro behaves the same way for every code unit width. The arguments are
parenthesized in the expansion. In the 32-bit variant c is evaluated more than
once, so it must not have side effects. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) \
  { \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#endif
2770*22dc650dSSadaf Ebrahimi 
2771*22dc650dSSadaf Ebrahimi /* Here's the actual function. */
2772*22dc650dSSadaf Ebrahimi 
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2773*22dc650dSSadaf Ebrahimi static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2774*22dc650dSSadaf Ebrahimi   compile_block *cb)
2775*22dc650dSSadaf Ebrahimi {
2776*22dc650dSSadaf Ebrahimi uint32_t c;
2777*22dc650dSSadaf Ebrahimi uint32_t delimiter;
2778*22dc650dSSadaf Ebrahimi uint32_t namelen;
2779*22dc650dSSadaf Ebrahimi uint32_t class_range_state;
2780*22dc650dSSadaf Ebrahimi uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
2781*22dc650dSSadaf Ebrahimi uint32_t *verbstartptr = NULL;
2782*22dc650dSSadaf Ebrahimi uint32_t *previous_callout = NULL;
2783*22dc650dSSadaf Ebrahimi uint32_t *parsed_pattern = cb->parsed_pattern;
2784*22dc650dSSadaf Ebrahimi uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2785*22dc650dSSadaf Ebrahimi uint32_t *this_parsed_item = NULL;
2786*22dc650dSSadaf Ebrahimi uint32_t *prev_parsed_item = NULL;
2787*22dc650dSSadaf Ebrahimi uint32_t meta_quantifier = 0;
2788*22dc650dSSadaf Ebrahimi uint32_t add_after_mark = 0;
2789*22dc650dSSadaf Ebrahimi uint32_t xoptions = cb->cx->extra_options;
2790*22dc650dSSadaf Ebrahimi uint16_t nest_depth = 0;
2791*22dc650dSSadaf Ebrahimi int after_manual_callout = 0;
2792*22dc650dSSadaf Ebrahimi int expect_cond_assert = 0;
2793*22dc650dSSadaf Ebrahimi int errorcode = 0;
2794*22dc650dSSadaf Ebrahimi int escape;
2795*22dc650dSSadaf Ebrahimi int i;
2796*22dc650dSSadaf Ebrahimi BOOL inescq = FALSE;
2797*22dc650dSSadaf Ebrahimi BOOL inverbname = FALSE;
2798*22dc650dSSadaf Ebrahimi BOOL utf = (options & PCRE2_UTF) != 0;
2799*22dc650dSSadaf Ebrahimi BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2800*22dc650dSSadaf Ebrahimi BOOL isdupname;
2801*22dc650dSSadaf Ebrahimi BOOL negate_class;
2802*22dc650dSSadaf Ebrahimi BOOL okquantifier = FALSE;
2803*22dc650dSSadaf Ebrahimi PCRE2_SPTR thisptr;
2804*22dc650dSSadaf Ebrahimi PCRE2_SPTR name;
2805*22dc650dSSadaf Ebrahimi PCRE2_SPTR ptrend = cb->end_pattern;
2806*22dc650dSSadaf Ebrahimi PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
2807*22dc650dSSadaf Ebrahimi named_group *ng;
2808*22dc650dSSadaf Ebrahimi nest_save *top_nest, *end_nests;
2809*22dc650dSSadaf Ebrahimi 
2810*22dc650dSSadaf Ebrahimi /* Insert leading items for word and line matching (features provided for the
2811*22dc650dSSadaf Ebrahimi benefit of pcre2grep). */
2812*22dc650dSSadaf Ebrahimi 
2813*22dc650dSSadaf Ebrahimi if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
2814*22dc650dSSadaf Ebrahimi   {
2815*22dc650dSSadaf Ebrahimi   *parsed_pattern++ = META_CIRCUMFLEX;
2816*22dc650dSSadaf Ebrahimi   *parsed_pattern++ = META_NOCAPTURE;
2817*22dc650dSSadaf Ebrahimi   }
2818*22dc650dSSadaf Ebrahimi else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
2819*22dc650dSSadaf Ebrahimi   {
2820*22dc650dSSadaf Ebrahimi   *parsed_pattern++ = META_ESCAPE + ESC_b;
2821*22dc650dSSadaf Ebrahimi   *parsed_pattern++ = META_NOCAPTURE;
2822*22dc650dSSadaf Ebrahimi   }
2823*22dc650dSSadaf Ebrahimi 
2824*22dc650dSSadaf Ebrahimi /* If the pattern is actually a literal string, process it separately to avoid
2825*22dc650dSSadaf Ebrahimi cluttering up the main loop. */
2826*22dc650dSSadaf Ebrahimi 
2827*22dc650dSSadaf Ebrahimi if ((options & PCRE2_LITERAL) != 0)
2828*22dc650dSSadaf Ebrahimi   {
2829*22dc650dSSadaf Ebrahimi   while (ptr < ptrend)
2830*22dc650dSSadaf Ebrahimi     {
2831*22dc650dSSadaf Ebrahimi     if (parsed_pattern >= parsed_pattern_end)
2832*22dc650dSSadaf Ebrahimi       {
2833*22dc650dSSadaf Ebrahimi       errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2834*22dc650dSSadaf Ebrahimi       goto FAILED;
2835*22dc650dSSadaf Ebrahimi       }
2836*22dc650dSSadaf Ebrahimi     thisptr = ptr;
2837*22dc650dSSadaf Ebrahimi     GETCHARINCTEST(c, ptr);
2838*22dc650dSSadaf Ebrahimi     if (auto_callout)
2839*22dc650dSSadaf Ebrahimi       parsed_pattern = manage_callouts(thisptr, &previous_callout,
2840*22dc650dSSadaf Ebrahimi         auto_callout, parsed_pattern, cb);
2841*22dc650dSSadaf Ebrahimi     PARSED_LITERAL(c, parsed_pattern);
2842*22dc650dSSadaf Ebrahimi     }
2843*22dc650dSSadaf Ebrahimi   goto PARSED_END;
2844*22dc650dSSadaf Ebrahimi   }
2845*22dc650dSSadaf Ebrahimi 
2846*22dc650dSSadaf Ebrahimi /* Process a real regex which may contain meta-characters. */
2847*22dc650dSSadaf Ebrahimi 
2848*22dc650dSSadaf Ebrahimi top_nest = NULL;
2849*22dc650dSSadaf Ebrahimi end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2850*22dc650dSSadaf Ebrahimi 
2851*22dc650dSSadaf Ebrahimi /* The size of the nest_save structure might not be a factor of the size of the
2852*22dc650dSSadaf Ebrahimi workspace. Therefore we must round down end_nests so as to correctly avoid
2853*22dc650dSSadaf Ebrahimi creating a nest_save that spans the end of the workspace. */
2854*22dc650dSSadaf Ebrahimi 
2855*22dc650dSSadaf Ebrahimi end_nests = (nest_save *)((char *)end_nests -
2856*22dc650dSSadaf Ebrahimi   ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2857*22dc650dSSadaf Ebrahimi 
2858*22dc650dSSadaf Ebrahimi /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2859*22dc650dSSadaf Ebrahimi 
2860*22dc650dSSadaf Ebrahimi if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2861*22dc650dSSadaf Ebrahimi 
2862*22dc650dSSadaf Ebrahimi /* Now scan the pattern */
2863*22dc650dSSadaf Ebrahimi 
2864*22dc650dSSadaf Ebrahimi while (ptr < ptrend)
2865*22dc650dSSadaf Ebrahimi   {
2866*22dc650dSSadaf Ebrahimi   int prev_expect_cond_assert;
2867*22dc650dSSadaf Ebrahimi   uint32_t min_repeat = 0, max_repeat = 0;
2868*22dc650dSSadaf Ebrahimi   uint32_t set, unset, *optset;
2869*22dc650dSSadaf Ebrahimi   uint32_t xset, xunset, *xoptset;
2870*22dc650dSSadaf Ebrahimi   uint32_t terminator;
2871*22dc650dSSadaf Ebrahimi   uint32_t prev_meta_quantifier;
2872*22dc650dSSadaf Ebrahimi   BOOL prev_okquantifier;
2873*22dc650dSSadaf Ebrahimi   PCRE2_SPTR tempptr;
2874*22dc650dSSadaf Ebrahimi   PCRE2_SIZE offset;
2875*22dc650dSSadaf Ebrahimi 
2876*22dc650dSSadaf Ebrahimi   if (parsed_pattern >= parsed_pattern_end)
2877*22dc650dSSadaf Ebrahimi     {
2878*22dc650dSSadaf Ebrahimi     errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2879*22dc650dSSadaf Ebrahimi     goto FAILED;
2880*22dc650dSSadaf Ebrahimi     }
2881*22dc650dSSadaf Ebrahimi 
2882*22dc650dSSadaf Ebrahimi   if (nest_depth > cb->cx->parens_nest_limit)
2883*22dc650dSSadaf Ebrahimi     {
2884*22dc650dSSadaf Ebrahimi     errorcode = ERR19;
2885*22dc650dSSadaf Ebrahimi     goto FAILED;        /* Parentheses too deeply nested */
2886*22dc650dSSadaf Ebrahimi     }
2887*22dc650dSSadaf Ebrahimi 
2888*22dc650dSSadaf Ebrahimi   /* If the last time round this loop something was added, parsed_pattern will
2889*22dc650dSSadaf Ebrahimi   no longer be equal to this_parsed_item. Remember where the previous item
2890*22dc650dSSadaf Ebrahimi   started and reset for the next item. Note that sometimes round the loop,
2891*22dc650dSSadaf Ebrahimi   nothing gets added (e.g. for ignored white space). */
2892*22dc650dSSadaf Ebrahimi 
2893*22dc650dSSadaf Ebrahimi   if (this_parsed_item != parsed_pattern)
2894*22dc650dSSadaf Ebrahimi     {
2895*22dc650dSSadaf Ebrahimi     prev_parsed_item = this_parsed_item;
2896*22dc650dSSadaf Ebrahimi     this_parsed_item = parsed_pattern;
2897*22dc650dSSadaf Ebrahimi     }
2898*22dc650dSSadaf Ebrahimi 
2899*22dc650dSSadaf Ebrahimi   /* Get next input character, save its position for callout handling. */
2900*22dc650dSSadaf Ebrahimi 
2901*22dc650dSSadaf Ebrahimi   thisptr = ptr;
2902*22dc650dSSadaf Ebrahimi   GETCHARINCTEST(c, ptr);
2903*22dc650dSSadaf Ebrahimi 
2904*22dc650dSSadaf Ebrahimi   /* Copy quoted literals until \E, allowing for the possibility of automatic
2905*22dc650dSSadaf Ebrahimi   callouts, except when processing a (*VERB) "name".  */
2906*22dc650dSSadaf Ebrahimi 
2907*22dc650dSSadaf Ebrahimi   if (inescq)
2908*22dc650dSSadaf Ebrahimi     {
2909*22dc650dSSadaf Ebrahimi     if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2910*22dc650dSSadaf Ebrahimi       {
2911*22dc650dSSadaf Ebrahimi       inescq = FALSE;
2912*22dc650dSSadaf Ebrahimi       ptr++;   /* Skip E */
2913*22dc650dSSadaf Ebrahimi       }
2914*22dc650dSSadaf Ebrahimi     else
2915*22dc650dSSadaf Ebrahimi       {
2916*22dc650dSSadaf Ebrahimi       if (expect_cond_assert > 0)   /* A literal is not allowed if we are */
2917*22dc650dSSadaf Ebrahimi         {                           /* expecting a conditional assertion, */
2918*22dc650dSSadaf Ebrahimi         ptr--;                      /* but an empty \Q\E sequence is OK.  */
2919*22dc650dSSadaf Ebrahimi         errorcode = ERR28;
2920*22dc650dSSadaf Ebrahimi         goto FAILED;
2921*22dc650dSSadaf Ebrahimi         }
2922*22dc650dSSadaf Ebrahimi       if (inverbname)
2923*22dc650dSSadaf Ebrahimi         {                          /* Don't use PARSED_LITERAL() because it */
2924*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2925*22dc650dSSadaf Ebrahimi         if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2926*22dc650dSSadaf Ebrahimi #endif
2927*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = c;
2928*22dc650dSSadaf Ebrahimi         }
2929*22dc650dSSadaf Ebrahimi       else
2930*22dc650dSSadaf Ebrahimi         {
2931*22dc650dSSadaf Ebrahimi         if (after_manual_callout-- <= 0)
2932*22dc650dSSadaf Ebrahimi           parsed_pattern = manage_callouts(thisptr, &previous_callout,
2933*22dc650dSSadaf Ebrahimi             auto_callout, parsed_pattern, cb);
2934*22dc650dSSadaf Ebrahimi         PARSED_LITERAL(c, parsed_pattern);
2935*22dc650dSSadaf Ebrahimi         }
2936*22dc650dSSadaf Ebrahimi       meta_quantifier = 0;
2937*22dc650dSSadaf Ebrahimi       }
2938*22dc650dSSadaf Ebrahimi     continue;  /* Next character */
2939*22dc650dSSadaf Ebrahimi     }
2940*22dc650dSSadaf Ebrahimi 
2941*22dc650dSSadaf Ebrahimi   /* If we are processing the "name" part of a (*VERB:NAME) item, all
2942*22dc650dSSadaf Ebrahimi   characters up to the closing parenthesis are literals except when
2943*22dc650dSSadaf Ebrahimi   PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2944*22dc650dSSadaf Ebrahimi   and \E and escaped characters are allowed (no character types such as \d). If
2945*22dc650dSSadaf Ebrahimi   PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2946*22dc650dSSadaf Ebrahimi   this by not entering the special (*VERB:NAME) processing - they are then
2947*22dc650dSSadaf Ebrahimi   picked up below. Note that c is a character, not a code unit, so we must not
2948*22dc650dSSadaf Ebrahimi   use MAX_255 to test its size because MAX_255 tests code units and is assumed
2949*22dc650dSSadaf Ebrahimi   TRUE in 8-bit mode. */
2950*22dc650dSSadaf Ebrahimi 
2951*22dc650dSSadaf Ebrahimi   if (inverbname &&
2952*22dc650dSSadaf Ebrahimi        (
2953*22dc650dSSadaf Ebrahimi         /* EITHER: not both options set */
2954*22dc650dSSadaf Ebrahimi         ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2955*22dc650dSSadaf Ebrahimi                     (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2956*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2957*22dc650dSSadaf Ebrahimi         /* OR: character > 255 AND not Unicode Pattern White Space */
2958*22dc650dSSadaf Ebrahimi         (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2959*22dc650dSSadaf Ebrahimi #endif
2960*22dc650dSSadaf Ebrahimi         /* OR: not a # comment or isspace() white space */
2961*22dc650dSSadaf Ebrahimi         (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2962*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2963*22dc650dSSadaf Ebrahimi         /* and not CHAR_NEL when Unicode is supported */
2964*22dc650dSSadaf Ebrahimi           && c != CHAR_NEL
2965*22dc650dSSadaf Ebrahimi #endif
2966*22dc650dSSadaf Ebrahimi        )))
2967*22dc650dSSadaf Ebrahimi     {
2968*22dc650dSSadaf Ebrahimi     PCRE2_SIZE verbnamelength;
2969*22dc650dSSadaf Ebrahimi 
2970*22dc650dSSadaf Ebrahimi     switch(c)
2971*22dc650dSSadaf Ebrahimi       {
2972*22dc650dSSadaf Ebrahimi       default:                     /* Don't use PARSED_LITERAL() because it */
2973*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2974*22dc650dSSadaf Ebrahimi       if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2975*22dc650dSSadaf Ebrahimi #endif
2976*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = c;
2977*22dc650dSSadaf Ebrahimi       break;
2978*22dc650dSSadaf Ebrahimi 
2979*22dc650dSSadaf Ebrahimi       case CHAR_RIGHT_PARENTHESIS:
2980*22dc650dSSadaf Ebrahimi       inverbname = FALSE;
2981*22dc650dSSadaf Ebrahimi       /* This is the length in characters */
2982*22dc650dSSadaf Ebrahimi       verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2983*22dc650dSSadaf Ebrahimi       /* But the limit on the length is in code units */
2984*22dc650dSSadaf Ebrahimi       if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2985*22dc650dSSadaf Ebrahimi         {
2986*22dc650dSSadaf Ebrahimi         ptr--;
2987*22dc650dSSadaf Ebrahimi         errorcode = ERR76;
2988*22dc650dSSadaf Ebrahimi         goto FAILED;
2989*22dc650dSSadaf Ebrahimi         }
2990*22dc650dSSadaf Ebrahimi       *verblengthptr = (uint32_t)verbnamelength;
2991*22dc650dSSadaf Ebrahimi 
2992*22dc650dSSadaf Ebrahimi       /* If this name was on a verb such as (*ACCEPT) which does not continue,
2993*22dc650dSSadaf Ebrahimi       a (*MARK) was generated for the name. We now add the original verb as the
2994*22dc650dSSadaf Ebrahimi       next item. */
2995*22dc650dSSadaf Ebrahimi 
2996*22dc650dSSadaf Ebrahimi       if (add_after_mark != 0)
2997*22dc650dSSadaf Ebrahimi         {
2998*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = add_after_mark;
2999*22dc650dSSadaf Ebrahimi         add_after_mark = 0;
3000*22dc650dSSadaf Ebrahimi         }
3001*22dc650dSSadaf Ebrahimi       break;
3002*22dc650dSSadaf Ebrahimi 
3003*22dc650dSSadaf Ebrahimi       case CHAR_BACKSLASH:
3004*22dc650dSSadaf Ebrahimi       if ((options & PCRE2_ALT_VERBNAMES) != 0)
3005*22dc650dSSadaf Ebrahimi         {
3006*22dc650dSSadaf Ebrahimi         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3007*22dc650dSSadaf Ebrahimi           xoptions, FALSE, cb);
3008*22dc650dSSadaf Ebrahimi         if (errorcode != 0) goto FAILED;
3009*22dc650dSSadaf Ebrahimi         }
3010*22dc650dSSadaf Ebrahimi       else escape = 0;   /* Treat all as literal */
3011*22dc650dSSadaf Ebrahimi 
3012*22dc650dSSadaf Ebrahimi       switch(escape)
3013*22dc650dSSadaf Ebrahimi         {
3014*22dc650dSSadaf Ebrahimi         case 0:                    /* Don't use PARSED_LITERAL() because it */
3015*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3016*22dc650dSSadaf Ebrahimi         if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3017*22dc650dSSadaf Ebrahimi #endif
3018*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = c;
3019*22dc650dSSadaf Ebrahimi         break;
3020*22dc650dSSadaf Ebrahimi 
3021*22dc650dSSadaf Ebrahimi         case ESC_ub:
3022*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = CHAR_u;
3023*22dc650dSSadaf Ebrahimi         PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3024*22dc650dSSadaf Ebrahimi         break;
3025*22dc650dSSadaf Ebrahimi 
3026*22dc650dSSadaf Ebrahimi         case ESC_Q:
3027*22dc650dSSadaf Ebrahimi         inescq = TRUE;
3028*22dc650dSSadaf Ebrahimi         break;
3029*22dc650dSSadaf Ebrahimi 
3030*22dc650dSSadaf Ebrahimi         case ESC_E:           /* Ignore */
3031*22dc650dSSadaf Ebrahimi         break;
3032*22dc650dSSadaf Ebrahimi 
3033*22dc650dSSadaf Ebrahimi         default:
3034*22dc650dSSadaf Ebrahimi         errorcode = ERR40;    /* Invalid in verb name */
3035*22dc650dSSadaf Ebrahimi         goto FAILED;
3036*22dc650dSSadaf Ebrahimi         }
3037*22dc650dSSadaf Ebrahimi       }
3038*22dc650dSSadaf Ebrahimi     continue;   /* Next character in pattern */
3039*22dc650dSSadaf Ebrahimi     }
3040*22dc650dSSadaf Ebrahimi 
3041*22dc650dSSadaf Ebrahimi   /* Not a verb name character. At this point we must process everything that
3042*22dc650dSSadaf Ebrahimi   must not change the quantification state. This is mainly comments, but we
3043*22dc650dSSadaf Ebrahimi   handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3044*22dc650dSSadaf Ebrahimi   A+, as in Perl. An isolated \E is ignored. */
3045*22dc650dSSadaf Ebrahimi 
3046*22dc650dSSadaf Ebrahimi   if (c == CHAR_BACKSLASH && ptr < ptrend)
3047*22dc650dSSadaf Ebrahimi     {
3048*22dc650dSSadaf Ebrahimi     if (*ptr == CHAR_Q || *ptr == CHAR_E)
3049*22dc650dSSadaf Ebrahimi       {
3050*22dc650dSSadaf Ebrahimi       inescq = *ptr == CHAR_Q;
3051*22dc650dSSadaf Ebrahimi       ptr++;
3052*22dc650dSSadaf Ebrahimi       continue;
3053*22dc650dSSadaf Ebrahimi       }
3054*22dc650dSSadaf Ebrahimi     }
3055*22dc650dSSadaf Ebrahimi 
3056*22dc650dSSadaf Ebrahimi   /* Skip over whitespace and # comments in extended mode. Note that c is a
3057*22dc650dSSadaf Ebrahimi   character, not a code unit, so we must not use MAX_255 to test its size
3058*22dc650dSSadaf Ebrahimi   because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3059*22dc650dSSadaf Ebrahimi   whitespace characters are those designated as "Pattern White Space" by
3060*22dc650dSSadaf Ebrahimi   Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3061*22dc650dSSadaf Ebrahimi   U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3062*22dc650dSSadaf Ebrahimi   subset of space characters that match \h and \v. */
3063*22dc650dSSadaf Ebrahimi 
3064*22dc650dSSadaf Ebrahimi   if ((options & PCRE2_EXTENDED) != 0)
3065*22dc650dSSadaf Ebrahimi     {
3066*22dc650dSSadaf Ebrahimi     if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3067*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
3068*22dc650dSSadaf Ebrahimi     if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3069*22dc650dSSadaf Ebrahimi #endif
3070*22dc650dSSadaf Ebrahimi     if (c == CHAR_NUMBER_SIGN)
3071*22dc650dSSadaf Ebrahimi       {
3072*22dc650dSSadaf Ebrahimi       while (ptr < ptrend)
3073*22dc650dSSadaf Ebrahimi         {
3074*22dc650dSSadaf Ebrahimi         if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
3075*22dc650dSSadaf Ebrahimi           {                       /* IS_NEWLINE sets cb->nllen. */
3076*22dc650dSSadaf Ebrahimi           ptr += cb->nllen;
3077*22dc650dSSadaf Ebrahimi           break;
3078*22dc650dSSadaf Ebrahimi           }
3079*22dc650dSSadaf Ebrahimi         ptr++;
3080*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
3081*22dc650dSSadaf Ebrahimi         if (utf) FORWARDCHARTEST(ptr, ptrend);
3082*22dc650dSSadaf Ebrahimi #endif
3083*22dc650dSSadaf Ebrahimi         }
3084*22dc650dSSadaf Ebrahimi       continue;  /* Next character in pattern */
3085*22dc650dSSadaf Ebrahimi       }
3086*22dc650dSSadaf Ebrahimi     }
3087*22dc650dSSadaf Ebrahimi 
3088*22dc650dSSadaf Ebrahimi   /* Skip over bracketed comments */
3089*22dc650dSSadaf Ebrahimi 
3090*22dc650dSSadaf Ebrahimi   if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3091*22dc650dSSadaf Ebrahimi       ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3092*22dc650dSSadaf Ebrahimi     {
3093*22dc650dSSadaf Ebrahimi     while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3094*22dc650dSSadaf Ebrahimi     if (ptr >= ptrend)
3095*22dc650dSSadaf Ebrahimi       {
3096*22dc650dSSadaf Ebrahimi       errorcode = ERR18;  /* A special error for missing ) in a comment */
3097*22dc650dSSadaf Ebrahimi       goto FAILED;        /* to make it easier to debug. */
3098*22dc650dSSadaf Ebrahimi       }
3099*22dc650dSSadaf Ebrahimi     ptr++;
3100*22dc650dSSadaf Ebrahimi     continue;  /* Next character in pattern */
3101*22dc650dSSadaf Ebrahimi     }
3102*22dc650dSSadaf Ebrahimi 
3103*22dc650dSSadaf Ebrahimi   /* If the next item is not a quantifier, fill in length of any previous
3104*22dc650dSSadaf Ebrahimi   callout and create an auto callout if required. */
3105*22dc650dSSadaf Ebrahimi 
3106*22dc650dSSadaf Ebrahimi   if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3107*22dc650dSSadaf Ebrahimi        (c != CHAR_LEFT_CURLY_BRACKET ||
3108*22dc650dSSadaf Ebrahimi          (tempptr = ptr,
3109*22dc650dSSadaf Ebrahimi          !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3110*22dc650dSSadaf Ebrahimi     {
3111*22dc650dSSadaf Ebrahimi     if (after_manual_callout-- <= 0)
3112*22dc650dSSadaf Ebrahimi       {
3113*22dc650dSSadaf Ebrahimi       parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3114*22dc650dSSadaf Ebrahimi         parsed_pattern, cb);
3115*22dc650dSSadaf Ebrahimi       this_parsed_item = parsed_pattern;  /* New start for current item */
3116*22dc650dSSadaf Ebrahimi       }
3117*22dc650dSSadaf Ebrahimi     }
3118*22dc650dSSadaf Ebrahimi 
3119*22dc650dSSadaf Ebrahimi   /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3120*22dc650dSSadaf Ebrahimi   assertion, possibly preceded by a callout. If the value is 1, we have just
3121*22dc650dSSadaf Ebrahimi   had the callout and expect an assertion. There must be at least 3 more
3122*22dc650dSSadaf Ebrahimi   characters in all cases. When expect_cond_assert is 2, we know that the
3123*22dc650dSSadaf Ebrahimi   current character is an opening parenthesis, as otherwise we wouldn't be
3124*22dc650dSSadaf Ebrahimi   here. However, when it is 1, we need to check, and it's easiest just to check
3125*22dc650dSSadaf Ebrahimi   always. Note that expect_cond_assert may be negative, since all callouts just
3126*22dc650dSSadaf Ebrahimi   decrement it. */
3127*22dc650dSSadaf Ebrahimi 
3128*22dc650dSSadaf Ebrahimi   if (expect_cond_assert > 0)
3129*22dc650dSSadaf Ebrahimi     {
3130*22dc650dSSadaf Ebrahimi     BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3131*22dc650dSSadaf Ebrahimi               (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3132*22dc650dSSadaf Ebrahimi     if (ok)
3133*22dc650dSSadaf Ebrahimi       {
3134*22dc650dSSadaf Ebrahimi       if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
3135*22dc650dSSadaf Ebrahimi         {
3136*22dc650dSSadaf Ebrahimi         ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3137*22dc650dSSadaf Ebrahimi         }
3138*22dc650dSSadaf Ebrahimi       else switch(ptr[1])  /* Traditional symbolic format */
3139*22dc650dSSadaf Ebrahimi         {
3140*22dc650dSSadaf Ebrahimi         case CHAR_C:
3141*22dc650dSSadaf Ebrahimi         ok = expect_cond_assert == 2;
3142*22dc650dSSadaf Ebrahimi         break;
3143*22dc650dSSadaf Ebrahimi 
3144*22dc650dSSadaf Ebrahimi         case CHAR_EQUALS_SIGN:
3145*22dc650dSSadaf Ebrahimi         case CHAR_EXCLAMATION_MARK:
3146*22dc650dSSadaf Ebrahimi         break;
3147*22dc650dSSadaf Ebrahimi 
3148*22dc650dSSadaf Ebrahimi         case CHAR_LESS_THAN_SIGN:
3149*22dc650dSSadaf Ebrahimi         ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3150*22dc650dSSadaf Ebrahimi         break;
3151*22dc650dSSadaf Ebrahimi 
3152*22dc650dSSadaf Ebrahimi         default:
3153*22dc650dSSadaf Ebrahimi         ok = FALSE;
3154*22dc650dSSadaf Ebrahimi         }
3155*22dc650dSSadaf Ebrahimi       }
3156*22dc650dSSadaf Ebrahimi 
3157*22dc650dSSadaf Ebrahimi     if (!ok)
3158*22dc650dSSadaf Ebrahimi       {
3159*22dc650dSSadaf Ebrahimi       ptr--;   /* Adjust error offset */
3160*22dc650dSSadaf Ebrahimi       errorcode = ERR28;
3161*22dc650dSSadaf Ebrahimi       goto FAILED;
3162*22dc650dSSadaf Ebrahimi       }
3163*22dc650dSSadaf Ebrahimi     }
3164*22dc650dSSadaf Ebrahimi 
3165*22dc650dSSadaf Ebrahimi   /* Remember whether we are expecting a conditional assertion, and set the
3166*22dc650dSSadaf Ebrahimi   default for this item. */
3167*22dc650dSSadaf Ebrahimi 
3168*22dc650dSSadaf Ebrahimi   prev_expect_cond_assert = expect_cond_assert;
3169*22dc650dSSadaf Ebrahimi   expect_cond_assert = 0;
3170*22dc650dSSadaf Ebrahimi 
3171*22dc650dSSadaf Ebrahimi   /* Remember quantification status for the previous significant item, then set
3172*22dc650dSSadaf Ebrahimi   default for this item. */
3173*22dc650dSSadaf Ebrahimi 
3174*22dc650dSSadaf Ebrahimi   prev_okquantifier = okquantifier;
3175*22dc650dSSadaf Ebrahimi   prev_meta_quantifier = meta_quantifier;
3176*22dc650dSSadaf Ebrahimi   okquantifier = FALSE;
3177*22dc650dSSadaf Ebrahimi   meta_quantifier = 0;
3178*22dc650dSSadaf Ebrahimi 
3179*22dc650dSSadaf Ebrahimi   /* If the previous significant item was a quantifier, adjust the parsed code
3180*22dc650dSSadaf Ebrahimi   if there is a following modifier. The base meta value is always followed by
3181*22dc650dSSadaf Ebrahimi   the PLUS and QUERY values, in that order. We do this here rather than after
3182*22dc650dSSadaf Ebrahimi   reading a quantifier so that intervening comments and /x whitespace can be
3183*22dc650dSSadaf Ebrahimi   ignored without having to replicate code. */
3184*22dc650dSSadaf Ebrahimi 
3185*22dc650dSSadaf Ebrahimi   if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3186*22dc650dSSadaf Ebrahimi     {
3187*22dc650dSSadaf Ebrahimi     parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3188*22dc650dSSadaf Ebrahimi       prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3189*22dc650dSSadaf Ebrahimi         0x00020000u : 0x00010000u);
3190*22dc650dSSadaf Ebrahimi     continue;  /* Next character in pattern */
3191*22dc650dSSadaf Ebrahimi     }
3192*22dc650dSSadaf Ebrahimi 
3193*22dc650dSSadaf Ebrahimi   /* Process the next item in the main part of a pattern. */
3194*22dc650dSSadaf Ebrahimi 
3195*22dc650dSSadaf Ebrahimi   switch(c)
3196*22dc650dSSadaf Ebrahimi     {
3197*22dc650dSSadaf Ebrahimi     default:              /* Non-special character */
3198*22dc650dSSadaf Ebrahimi     PARSED_LITERAL(c, parsed_pattern);
3199*22dc650dSSadaf Ebrahimi     break;
3200*22dc650dSSadaf Ebrahimi 
3201*22dc650dSSadaf Ebrahimi 
3202*22dc650dSSadaf Ebrahimi     /* ---- Escape sequence ---- */
3203*22dc650dSSadaf Ebrahimi 
3204*22dc650dSSadaf Ebrahimi     case CHAR_BACKSLASH:
3205*22dc650dSSadaf Ebrahimi     tempptr = ptr;
3206*22dc650dSSadaf Ebrahimi     escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3207*22dc650dSSadaf Ebrahimi       xoptions, FALSE, cb);
3208*22dc650dSSadaf Ebrahimi     if (errorcode != 0)
3209*22dc650dSSadaf Ebrahimi       {
3210*22dc650dSSadaf Ebrahimi       ESCAPE_FAILED:
3211*22dc650dSSadaf Ebrahimi       if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3212*22dc650dSSadaf Ebrahimi         goto FAILED;
3213*22dc650dSSadaf Ebrahimi       ptr = tempptr;
3214*22dc650dSSadaf Ebrahimi       if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3215*22dc650dSSadaf Ebrahimi         {
3216*22dc650dSSadaf Ebrahimi         GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3217*22dc650dSSadaf Ebrahimi         }
3218*22dc650dSSadaf Ebrahimi       escape = 0;                 /* Treat as literal character */
3219*22dc650dSSadaf Ebrahimi       }
3220*22dc650dSSadaf Ebrahimi 
3221*22dc650dSSadaf Ebrahimi     /* The escape was a data escape or literal character. */
3222*22dc650dSSadaf Ebrahimi 
3223*22dc650dSSadaf Ebrahimi     if (escape == 0)
3224*22dc650dSSadaf Ebrahimi       {
3225*22dc650dSSadaf Ebrahimi       PARSED_LITERAL(c, parsed_pattern);
3226*22dc650dSSadaf Ebrahimi       }
3227*22dc650dSSadaf Ebrahimi 
3228*22dc650dSSadaf Ebrahimi     /* The escape was a back (or forward) reference. We keep the offset in
3229*22dc650dSSadaf Ebrahimi     order to give a more useful diagnostic for a bad forward reference. For
3230*22dc650dSSadaf Ebrahimi     references to groups numbered less than 10 we can't use more than two items
3231*22dc650dSSadaf Ebrahimi     in parsed_pattern because they may be just two characters in the input (and
3232*22dc650dSSadaf Ebrahimi     in a 64-bit world an offset may need two elements). So for them, the offset
3233*22dc650dSSadaf Ebrahimi     of the first occurrence is held in a special vector. */
3234*22dc650dSSadaf Ebrahimi 
3235*22dc650dSSadaf Ebrahimi     else if (escape < 0)
3236*22dc650dSSadaf Ebrahimi       {
3237*22dc650dSSadaf Ebrahimi       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
3238*22dc650dSSadaf Ebrahimi       escape = -escape;
3239*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3240*22dc650dSSadaf Ebrahimi       if (escape < 10)
3241*22dc650dSSadaf Ebrahimi         {
3242*22dc650dSSadaf Ebrahimi         if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3243*22dc650dSSadaf Ebrahimi           cb->small_ref_offset[escape] = offset;
3244*22dc650dSSadaf Ebrahimi         }
3245*22dc650dSSadaf Ebrahimi       else
3246*22dc650dSSadaf Ebrahimi         {
3247*22dc650dSSadaf Ebrahimi         PUTOFFSET(offset, parsed_pattern);
3248*22dc650dSSadaf Ebrahimi         }
3249*22dc650dSSadaf Ebrahimi       okquantifier = TRUE;
3250*22dc650dSSadaf Ebrahimi       }
3251*22dc650dSSadaf Ebrahimi 
3252*22dc650dSSadaf Ebrahimi     /* The escape was a character class such as \d etc. or other special
3253*22dc650dSSadaf Ebrahimi     escape indicator such as \A or \X. Most of them generate just a single
3254*22dc650dSSadaf Ebrahimi     parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3255*22dc650dSSadaf Ebrahimi     value. They are supported only when Unicode is available. The type and
3256*22dc650dSSadaf Ebrahimi     value are packed into a single 32-bit value so that the whole sequence
3257*22dc650dSSadaf Ebrahimi     uses only two elements in the parsed_vector. This is because the same
3258*22dc650dSSadaf Ebrahimi     coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3259*22dc650dSSadaf Ebrahimi     set.
3260*22dc650dSSadaf Ebrahimi 
3261*22dc650dSSadaf Ebrahimi     There are also some cases where the escape sequence is followed by a name:
3262*22dc650dSSadaf Ebrahimi     \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3263*22dc650dSSadaf Ebrahimi     and \g'name' are subroutine calls by name; \g{name} is a synonym for
3264*22dc650dSSadaf Ebrahimi     \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3265*22dc650dSSadaf Ebrahimi     and returned as a negative value (handled above). A name is coded as an
3266*22dc650dSSadaf Ebrahimi     offset into the pattern and a length. */
3267*22dc650dSSadaf Ebrahimi 
3268*22dc650dSSadaf Ebrahimi     else switch (escape)
3269*22dc650dSSadaf Ebrahimi       {
3270*22dc650dSSadaf Ebrahimi       case ESC_C:
3271*22dc650dSSadaf Ebrahimi #ifdef NEVER_BACKSLASH_C
3272*22dc650dSSadaf Ebrahimi       errorcode = ERR85;
3273*22dc650dSSadaf Ebrahimi       goto ESCAPE_FAILED;
3274*22dc650dSSadaf Ebrahimi #else
3275*22dc650dSSadaf Ebrahimi       if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3276*22dc650dSSadaf Ebrahimi         {
3277*22dc650dSSadaf Ebrahimi         errorcode = ERR83;
3278*22dc650dSSadaf Ebrahimi         goto ESCAPE_FAILED;
3279*22dc650dSSadaf Ebrahimi         }
3280*22dc650dSSadaf Ebrahimi #endif
3281*22dc650dSSadaf Ebrahimi       okquantifier = TRUE;
3282*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = META_ESCAPE + escape;
3283*22dc650dSSadaf Ebrahimi       break;
3284*22dc650dSSadaf Ebrahimi 
3285*22dc650dSSadaf Ebrahimi       /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3286*22dc650dSSadaf Ebrahimi       when \u{ is not followed by hex digits and }. It requests two literal
3287*22dc650dSSadaf Ebrahimi       characters, u and { and we need this, as otherwise \u{ 12} (for example)
3288*22dc650dSSadaf Ebrahimi       would be treated as u{12} now that spaces are allowed in quantifiers. */
3289*22dc650dSSadaf Ebrahimi 
3290*22dc650dSSadaf Ebrahimi       case ESC_ub:
3291*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = CHAR_u;
3292*22dc650dSSadaf Ebrahimi       PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3293*22dc650dSSadaf Ebrahimi       break;
3294*22dc650dSSadaf Ebrahimi 
3295*22dc650dSSadaf Ebrahimi       case ESC_X:
3296*22dc650dSSadaf Ebrahimi #ifndef SUPPORT_UNICODE
3297*22dc650dSSadaf Ebrahimi       errorcode = ERR45;   /* Supported only with Unicode support */
3298*22dc650dSSadaf Ebrahimi       goto ESCAPE_FAILED;
3299*22dc650dSSadaf Ebrahimi #endif
3300*22dc650dSSadaf Ebrahimi       case ESC_H:
3301*22dc650dSSadaf Ebrahimi       case ESC_h:
3302*22dc650dSSadaf Ebrahimi       case ESC_N:
3303*22dc650dSSadaf Ebrahimi       case ESC_R:
3304*22dc650dSSadaf Ebrahimi       case ESC_V:
3305*22dc650dSSadaf Ebrahimi       case ESC_v:
3306*22dc650dSSadaf Ebrahimi       okquantifier = TRUE;
3307*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = META_ESCAPE + escape;
3308*22dc650dSSadaf Ebrahimi       break;
3309*22dc650dSSadaf Ebrahimi 
3310*22dc650dSSadaf Ebrahimi       default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3311*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = META_ESCAPE + escape;
3312*22dc650dSSadaf Ebrahimi       break;
3313*22dc650dSSadaf Ebrahimi 
3314*22dc650dSSadaf Ebrahimi       /* Escapes that may change in UCP mode. */
3315*22dc650dSSadaf Ebrahimi 
3316*22dc650dSSadaf Ebrahimi       case ESC_d:
3317*22dc650dSSadaf Ebrahimi       case ESC_D:
3318*22dc650dSSadaf Ebrahimi       case ESC_s:
3319*22dc650dSSadaf Ebrahimi       case ESC_S:
3320*22dc650dSSadaf Ebrahimi       case ESC_w:
3321*22dc650dSSadaf Ebrahimi       case ESC_W:
3322*22dc650dSSadaf Ebrahimi       okquantifier = TRUE;
3323*22dc650dSSadaf Ebrahimi       parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3324*22dc650dSSadaf Ebrahimi         xoptions);
3325*22dc650dSSadaf Ebrahimi       break;
3326*22dc650dSSadaf Ebrahimi 
3327*22dc650dSSadaf Ebrahimi       /* Unicode property matching */
3328*22dc650dSSadaf Ebrahimi 
3329*22dc650dSSadaf Ebrahimi       case ESC_P:
3330*22dc650dSSadaf Ebrahimi       case ESC_p:
3331*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
3332*22dc650dSSadaf Ebrahimi         {
3333*22dc650dSSadaf Ebrahimi         BOOL negated;
3334*22dc650dSSadaf Ebrahimi         uint16_t ptype = 0, pdata = 0;
3335*22dc650dSSadaf Ebrahimi         if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3336*22dc650dSSadaf Ebrahimi           goto ESCAPE_FAILED;
3337*22dc650dSSadaf Ebrahimi         if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3338*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = META_ESCAPE + escape;
3339*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = (ptype << 16) | pdata;
3340*22dc650dSSadaf Ebrahimi         okquantifier = TRUE;
3341*22dc650dSSadaf Ebrahimi         }
3342*22dc650dSSadaf Ebrahimi #else
3343*22dc650dSSadaf Ebrahimi       errorcode = ERR45;
3344*22dc650dSSadaf Ebrahimi       goto ESCAPE_FAILED;
3345*22dc650dSSadaf Ebrahimi #endif
3346*22dc650dSSadaf Ebrahimi       break;  /* End \P and \p */
3347*22dc650dSSadaf Ebrahimi 
3348*22dc650dSSadaf Ebrahimi       /* When \g is used with quotes or angle brackets as delimiters, it is a
3349*22dc650dSSadaf Ebrahimi       numerical or named subroutine call, and control comes here. When used
3350*22dc650dSSadaf Ebrahimi       with brace delimiters it is a numerical back reference and does not come
3351*22dc650dSSadaf Ebrahimi       here because check_escape() returns it directly as a reference. \k is
3352*22dc650dSSadaf Ebrahimi       always a named back reference. */
3353*22dc650dSSadaf Ebrahimi 
3354*22dc650dSSadaf Ebrahimi       case ESC_g:
3355*22dc650dSSadaf Ebrahimi       case ESC_k:
3356*22dc650dSSadaf Ebrahimi       if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3357*22dc650dSSadaf Ebrahimi           *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3358*22dc650dSSadaf Ebrahimi         {
3359*22dc650dSSadaf Ebrahimi         errorcode = (escape == ESC_g)? ERR57 : ERR69;
3360*22dc650dSSadaf Ebrahimi         goto ESCAPE_FAILED;
3361*22dc650dSSadaf Ebrahimi         }
3362*22dc650dSSadaf Ebrahimi       terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3363*22dc650dSSadaf Ebrahimi         CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3364*22dc650dSSadaf Ebrahimi         CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3365*22dc650dSSadaf Ebrahimi 
3366*22dc650dSSadaf Ebrahimi       /* For a non-braced \g, check for a numerical recursion. */
3367*22dc650dSSadaf Ebrahimi 
3368*22dc650dSSadaf Ebrahimi       if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3369*22dc650dSSadaf Ebrahimi         {
3370*22dc650dSSadaf Ebrahimi         PCRE2_SPTR p = ptr + 1;
3371*22dc650dSSadaf Ebrahimi 
3372*22dc650dSSadaf Ebrahimi         if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3373*22dc650dSSadaf Ebrahimi             &errorcode))
3374*22dc650dSSadaf Ebrahimi           {
3375*22dc650dSSadaf Ebrahimi           if (p >= ptrend || *p != terminator)
3376*22dc650dSSadaf Ebrahimi             {
3377*22dc650dSSadaf Ebrahimi             errorcode = ERR57;
3378*22dc650dSSadaf Ebrahimi             goto ESCAPE_FAILED;
3379*22dc650dSSadaf Ebrahimi             }
3380*22dc650dSSadaf Ebrahimi           ptr = p;
3381*22dc650dSSadaf Ebrahimi           goto SET_RECURSION;
3382*22dc650dSSadaf Ebrahimi           }
3383*22dc650dSSadaf Ebrahimi         if (errorcode != 0) goto ESCAPE_FAILED;
3384*22dc650dSSadaf Ebrahimi         }
3385*22dc650dSSadaf Ebrahimi 
3386*22dc650dSSadaf Ebrahimi       /* Not a numerical recursion. Perl allows spaces and tabs after { and
3387*22dc650dSSadaf Ebrahimi       before } but not for other delimiters. */
3388*22dc650dSSadaf Ebrahimi 
3389*22dc650dSSadaf Ebrahimi       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3390*22dc650dSSadaf Ebrahimi           &errorcode, cb)) goto ESCAPE_FAILED;
3391*22dc650dSSadaf Ebrahimi 
3392*22dc650dSSadaf Ebrahimi       /* \k and \g when used with braces are back references, whereas \g used
3393*22dc650dSSadaf Ebrahimi       with quotes or angle brackets is a recursion */
3394*22dc650dSSadaf Ebrahimi 
3395*22dc650dSSadaf Ebrahimi       *parsed_pattern++ =
3396*22dc650dSSadaf Ebrahimi         (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3397*22dc650dSSadaf Ebrahimi           META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3398*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = namelen;
3399*22dc650dSSadaf Ebrahimi 
3400*22dc650dSSadaf Ebrahimi       PUTOFFSET(offset, parsed_pattern);
3401*22dc650dSSadaf Ebrahimi       okquantifier = TRUE;
3402*22dc650dSSadaf Ebrahimi       break;  /* End special escape processing */
3403*22dc650dSSadaf Ebrahimi       }
3404*22dc650dSSadaf Ebrahimi     break;    /* End escape sequence processing */
3405*22dc650dSSadaf Ebrahimi 
3406*22dc650dSSadaf Ebrahimi 
3407*22dc650dSSadaf Ebrahimi     /* ---- Single-character special items ---- */
3408*22dc650dSSadaf Ebrahimi 
3409*22dc650dSSadaf Ebrahimi     case CHAR_CIRCUMFLEX_ACCENT:
3410*22dc650dSSadaf Ebrahimi     *parsed_pattern++ = META_CIRCUMFLEX;
3411*22dc650dSSadaf Ebrahimi     break;
3412*22dc650dSSadaf Ebrahimi 
3413*22dc650dSSadaf Ebrahimi     case CHAR_DOLLAR_SIGN:
3414*22dc650dSSadaf Ebrahimi     *parsed_pattern++ = META_DOLLAR;
3415*22dc650dSSadaf Ebrahimi     break;
3416*22dc650dSSadaf Ebrahimi 
3417*22dc650dSSadaf Ebrahimi     case CHAR_DOT:
3418*22dc650dSSadaf Ebrahimi     *parsed_pattern++ = META_DOT;
3419*22dc650dSSadaf Ebrahimi     okquantifier = TRUE;
3420*22dc650dSSadaf Ebrahimi     break;
3421*22dc650dSSadaf Ebrahimi 
3422*22dc650dSSadaf Ebrahimi 
3423*22dc650dSSadaf Ebrahimi     /* ---- Single-character quantifiers ---- */
3424*22dc650dSSadaf Ebrahimi 
3425*22dc650dSSadaf Ebrahimi     case CHAR_ASTERISK:
3426*22dc650dSSadaf Ebrahimi     meta_quantifier = META_ASTERISK;
3427*22dc650dSSadaf Ebrahimi     goto CHECK_QUANTIFIER;
3428*22dc650dSSadaf Ebrahimi 
3429*22dc650dSSadaf Ebrahimi     case CHAR_PLUS:
3430*22dc650dSSadaf Ebrahimi     meta_quantifier = META_PLUS;
3431*22dc650dSSadaf Ebrahimi     goto CHECK_QUANTIFIER;
3432*22dc650dSSadaf Ebrahimi 
3433*22dc650dSSadaf Ebrahimi     case CHAR_QUESTION_MARK:
3434*22dc650dSSadaf Ebrahimi     meta_quantifier = META_QUERY;
3435*22dc650dSSadaf Ebrahimi     goto CHECK_QUANTIFIER;
3436*22dc650dSSadaf Ebrahimi 
3437*22dc650dSSadaf Ebrahimi 
3438*22dc650dSSadaf Ebrahimi     /* ---- Potential {n,m} quantifier ---- */
3439*22dc650dSSadaf Ebrahimi 
3440*22dc650dSSadaf Ebrahimi     case CHAR_LEFT_CURLY_BRACKET:
3441*22dc650dSSadaf Ebrahimi     if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3442*22dc650dSSadaf Ebrahimi         &errorcode))
3443*22dc650dSSadaf Ebrahimi       {
3444*22dc650dSSadaf Ebrahimi       if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3445*22dc650dSSadaf Ebrahimi       PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3446*22dc650dSSadaf Ebrahimi       break;                               /* No more quantifier processing */
3447*22dc650dSSadaf Ebrahimi       }
3448*22dc650dSSadaf Ebrahimi     meta_quantifier = META_MINMAX;
3449*22dc650dSSadaf Ebrahimi     /* Fall through */
3450*22dc650dSSadaf Ebrahimi 
3451*22dc650dSSadaf Ebrahimi 
3452*22dc650dSSadaf Ebrahimi     /* ---- Quantifier post-processing ---- */
3453*22dc650dSSadaf Ebrahimi 
3454*22dc650dSSadaf Ebrahimi     /* Check that a quantifier is allowed after the previous item. This
3455*22dc650dSSadaf Ebrahimi     guarantees that there is a previous item. */
3456*22dc650dSSadaf Ebrahimi 
3457*22dc650dSSadaf Ebrahimi     CHECK_QUANTIFIER:
3458*22dc650dSSadaf Ebrahimi     if (!prev_okquantifier)
3459*22dc650dSSadaf Ebrahimi       {
3460*22dc650dSSadaf Ebrahimi       errorcode = ERR9;
3461*22dc650dSSadaf Ebrahimi       goto FAILED_BACK;
3462*22dc650dSSadaf Ebrahimi       }
3463*22dc650dSSadaf Ebrahimi 
3464*22dc650dSSadaf Ebrahimi     /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3465*22dc650dSSadaf Ebrahimi     quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3466*22dc650dSSadaf Ebrahimi     sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3467*22dc650dSSadaf Ebrahimi     wrapping it in non-capturing brackets, but we have to allow for a preceding
3468*22dc650dSSadaf Ebrahimi     (*MARK) for when (*ACCEPT) has an argument. */
3469*22dc650dSSadaf Ebrahimi 
3470*22dc650dSSadaf Ebrahimi     if (*prev_parsed_item == META_ACCEPT)
3471*22dc650dSSadaf Ebrahimi       {
3472*22dc650dSSadaf Ebrahimi       uint32_t *p;
3473*22dc650dSSadaf Ebrahimi       for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3474*22dc650dSSadaf Ebrahimi       *verbstartptr = META_NOCAPTURE;
3475*22dc650dSSadaf Ebrahimi       parsed_pattern[1] = META_KET;
3476*22dc650dSSadaf Ebrahimi       parsed_pattern += 2;
3477*22dc650dSSadaf Ebrahimi       }
3478*22dc650dSSadaf Ebrahimi 
3479*22dc650dSSadaf Ebrahimi     /* Now we can put the quantifier into the parsed pattern vector. At this
3480*22dc650dSSadaf Ebrahimi     stage, we have only the basic quantifier. The check for a following + or ?
3481*22dc650dSSadaf Ebrahimi     modifier happens at the top of the loop, after any intervening comments
3482*22dc650dSSadaf Ebrahimi     have been removed. */
3483*22dc650dSSadaf Ebrahimi 
3484*22dc650dSSadaf Ebrahimi     *parsed_pattern++ = meta_quantifier;
3485*22dc650dSSadaf Ebrahimi     if (c == CHAR_LEFT_CURLY_BRACKET)
3486*22dc650dSSadaf Ebrahimi       {
3487*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = min_repeat;
3488*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = max_repeat;
3489*22dc650dSSadaf Ebrahimi       }
3490*22dc650dSSadaf Ebrahimi     break;
3491*22dc650dSSadaf Ebrahimi 
3492*22dc650dSSadaf Ebrahimi 
3493*22dc650dSSadaf Ebrahimi     /* ---- Character class ---- */
3494*22dc650dSSadaf Ebrahimi 
3495*22dc650dSSadaf Ebrahimi     case CHAR_LEFT_SQUARE_BRACKET:
3496*22dc650dSSadaf Ebrahimi     okquantifier = TRUE;
3497*22dc650dSSadaf Ebrahimi 
3498*22dc650dSSadaf Ebrahimi     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3499*22dc650dSSadaf Ebrahimi     used for "start of word" and "end of word". As these are otherwise illegal
3500*22dc650dSSadaf Ebrahimi     sequences, we don't break anything by recognizing them. They are replaced
3501*22dc650dSSadaf Ebrahimi     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3502*22dc650dSSadaf Ebrahimi     erroneous and are handled by the normal code below. */
3503*22dc650dSSadaf Ebrahimi 
3504*22dc650dSSadaf Ebrahimi     if (ptrend - ptr >= 6 &&
3505*22dc650dSSadaf Ebrahimi          (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3506*22dc650dSSadaf Ebrahimi           PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3507*22dc650dSSadaf Ebrahimi       {
3508*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = META_ESCAPE + ESC_b;
3509*22dc650dSSadaf Ebrahimi 
3510*22dc650dSSadaf Ebrahimi       if (ptr[2] == CHAR_LESS_THAN_SIGN)
3511*22dc650dSSadaf Ebrahimi         {
3512*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = META_LOOKAHEAD;
3513*22dc650dSSadaf Ebrahimi         }
3514*22dc650dSSadaf Ebrahimi       else
3515*22dc650dSSadaf Ebrahimi         {
3516*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = META_LOOKBEHIND;
3517*22dc650dSSadaf Ebrahimi         *has_lookbehind = TRUE;
3518*22dc650dSSadaf Ebrahimi 
3519*22dc650dSSadaf Ebrahimi         /* The offset is used only for the "non-fixed length" error; this won't
3520*22dc650dSSadaf Ebrahimi         occur here, so just store zero. */
3521*22dc650dSSadaf Ebrahimi 
3522*22dc650dSSadaf Ebrahimi         PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3523*22dc650dSSadaf Ebrahimi         }
3524*22dc650dSSadaf Ebrahimi 
3525*22dc650dSSadaf Ebrahimi       if ((options & PCRE2_UCP) == 0)
3526*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = META_ESCAPE + ESC_w;
3527*22dc650dSSadaf Ebrahimi       else
3528*22dc650dSSadaf Ebrahimi         {
3529*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = META_ESCAPE + ESC_p;
3530*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = PT_WORD << 16;
3531*22dc650dSSadaf Ebrahimi         }
3532*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = META_KET;
3533*22dc650dSSadaf Ebrahimi       ptr += 6;
3534*22dc650dSSadaf Ebrahimi       break;
3535*22dc650dSSadaf Ebrahimi       }
3536*22dc650dSSadaf Ebrahimi 
3537*22dc650dSSadaf Ebrahimi     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3538*22dc650dSSadaf Ebrahimi     they are encountered at the top level, so we'll do that too. */
3539*22dc650dSSadaf Ebrahimi 
3540*22dc650dSSadaf Ebrahimi     if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3541*22dc650dSSadaf Ebrahimi          *ptr == CHAR_EQUALS_SIGN) &&
3542*22dc650dSSadaf Ebrahimi         check_posix_syntax(ptr, ptrend, &tempptr))
3543*22dc650dSSadaf Ebrahimi       {
3544*22dc650dSSadaf Ebrahimi       errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3545*22dc650dSSadaf Ebrahimi       goto FAILED;
3546*22dc650dSSadaf Ebrahimi       }
3547*22dc650dSSadaf Ebrahimi 
3548*22dc650dSSadaf Ebrahimi     /* Process a regular character class. If the first character is '^', set
3549*22dc650dSSadaf Ebrahimi     the negation flag. If the first few characters (either before or after ^)
3550*22dc650dSSadaf Ebrahimi     are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3551*22dc650dSSadaf Ebrahimi     This makes for compatibility with Perl. */
3552*22dc650dSSadaf Ebrahimi 
3553*22dc650dSSadaf Ebrahimi     negate_class = FALSE;
3554*22dc650dSSadaf Ebrahimi     while (ptr < ptrend)
3555*22dc650dSSadaf Ebrahimi       {
3556*22dc650dSSadaf Ebrahimi       GETCHARINCTEST(c, ptr);
3557*22dc650dSSadaf Ebrahimi       if (c == CHAR_BACKSLASH)
3558*22dc650dSSadaf Ebrahimi         {
3559*22dc650dSSadaf Ebrahimi         if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3560*22dc650dSSadaf Ebrahimi         else if (ptrend - ptr >= 3 &&
3561*22dc650dSSadaf Ebrahimi              PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3562*22dc650dSSadaf Ebrahimi           ptr += 3;
3563*22dc650dSSadaf Ebrahimi         else
3564*22dc650dSSadaf Ebrahimi           break;
3565*22dc650dSSadaf Ebrahimi         }
3566*22dc650dSSadaf Ebrahimi       else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3567*22dc650dSSadaf Ebrahimi                (c == CHAR_SPACE || c == CHAR_HT))  /* Note: just these two */
3568*22dc650dSSadaf Ebrahimi         continue;
3569*22dc650dSSadaf Ebrahimi       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3570*22dc650dSSadaf Ebrahimi         negate_class = TRUE;
3571*22dc650dSSadaf Ebrahimi       else break;
3572*22dc650dSSadaf Ebrahimi       }
3573*22dc650dSSadaf Ebrahimi 
3574*22dc650dSSadaf Ebrahimi     /* Now the real contents of the class; c has the first "real" character.
3575*22dc650dSSadaf Ebrahimi     Empty classes are permitted only if the option is set. */
3576*22dc650dSSadaf Ebrahimi 
3577*22dc650dSSadaf Ebrahimi     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3578*22dc650dSSadaf Ebrahimi         (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3579*22dc650dSSadaf Ebrahimi       {
3580*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3581*22dc650dSSadaf Ebrahimi       break;  /* End of class processing */
3582*22dc650dSSadaf Ebrahimi       }
3583*22dc650dSSadaf Ebrahimi 
3584*22dc650dSSadaf Ebrahimi     /* Process a non-empty class. */
3585*22dc650dSSadaf Ebrahimi 
3586*22dc650dSSadaf Ebrahimi     *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3587*22dc650dSSadaf Ebrahimi     class_range_state = RANGE_NO;
3588*22dc650dSSadaf Ebrahimi 
3589*22dc650dSSadaf Ebrahimi     /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3590*22dc650dSSadaf Ebrahimi     because there are holes in the encoding, and simply using the range A-Z
3591*22dc650dSSadaf Ebrahimi     (for example) would include the characters in the holes. This applies only
3592*22dc650dSSadaf Ebrahimi     to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3593*22dc650dSSadaf Ebrahimi     in this respect. In order to accommodate this, we keep track of whether
3594*22dc650dSSadaf Ebrahimi     character values are literal or not, and a state variable for handling
3595*22dc650dSSadaf Ebrahimi     ranges. */
3596*22dc650dSSadaf Ebrahimi 
3597*22dc650dSSadaf Ebrahimi     /* Loop for the contents of the class */
3598*22dc650dSSadaf Ebrahimi 
3599*22dc650dSSadaf Ebrahimi     for (;;)
3600*22dc650dSSadaf Ebrahimi       {
3601*22dc650dSSadaf Ebrahimi       BOOL char_is_literal = TRUE;
3602*22dc650dSSadaf Ebrahimi 
3603*22dc650dSSadaf Ebrahimi       /* Inside \Q...\E everything is literal except \E */
3604*22dc650dSSadaf Ebrahimi 
3605*22dc650dSSadaf Ebrahimi       if (inescq)
3606*22dc650dSSadaf Ebrahimi         {
3607*22dc650dSSadaf Ebrahimi         if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3608*22dc650dSSadaf Ebrahimi           {
3609*22dc650dSSadaf Ebrahimi           inescq = FALSE;                   /* Reset literal state */
3610*22dc650dSSadaf Ebrahimi           ptr++;                            /* Skip the 'E' */
3611*22dc650dSSadaf Ebrahimi           goto CLASS_CONTINUE;
3612*22dc650dSSadaf Ebrahimi           }
3613*22dc650dSSadaf Ebrahimi         goto CLASS_LITERAL;
3614*22dc650dSSadaf Ebrahimi         }
3615*22dc650dSSadaf Ebrahimi 
3616*22dc650dSSadaf Ebrahimi       /* Skip over space and tab (only) in extended-more mode. */
3617*22dc650dSSadaf Ebrahimi 
3618*22dc650dSSadaf Ebrahimi       if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3619*22dc650dSSadaf Ebrahimi           (c == CHAR_SPACE || c == CHAR_HT))
3620*22dc650dSSadaf Ebrahimi         goto CLASS_CONTINUE;
3621*22dc650dSSadaf Ebrahimi 
3622*22dc650dSSadaf Ebrahimi       /* Handle POSIX class names. Perl allows a negation extension of the
3623*22dc650dSSadaf Ebrahimi       form [:^name:]. A square bracket that doesn't match the syntax is
3624*22dc650dSSadaf Ebrahimi       treated as a literal. We also recognize the POSIX constructions
3625*22dc650dSSadaf Ebrahimi       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3626*22dc650dSSadaf Ebrahimi       5.6 and 5.8 do. */
3627*22dc650dSSadaf Ebrahimi 
3628*22dc650dSSadaf Ebrahimi       if (c == CHAR_LEFT_SQUARE_BRACKET &&
3629*22dc650dSSadaf Ebrahimi           ptrend - ptr >= 3 &&
3630*22dc650dSSadaf Ebrahimi           (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3631*22dc650dSSadaf Ebrahimi            *ptr == CHAR_EQUALS_SIGN) &&
3632*22dc650dSSadaf Ebrahimi           check_posix_syntax(ptr, ptrend, &tempptr))
3633*22dc650dSSadaf Ebrahimi         {
3634*22dc650dSSadaf Ebrahimi         BOOL posix_negate = FALSE;
3635*22dc650dSSadaf Ebrahimi         int posix_class;
3636*22dc650dSSadaf Ebrahimi 
3637*22dc650dSSadaf Ebrahimi         /* Perl treats a hyphen before a POSIX class as a literal, not the
3638*22dc650dSSadaf Ebrahimi         start of a range. However, it gives a warning in its warning mode. PCRE
3639*22dc650dSSadaf Ebrahimi         does not have a warning mode, so we give an error, because this is
3640*22dc650dSSadaf Ebrahimi         likely an error on the user's part. */
3641*22dc650dSSadaf Ebrahimi 
3642*22dc650dSSadaf Ebrahimi         if (class_range_state == RANGE_STARTED)
3643*22dc650dSSadaf Ebrahimi           {
3644*22dc650dSSadaf Ebrahimi           errorcode = ERR50;
3645*22dc650dSSadaf Ebrahimi           goto FAILED;
3646*22dc650dSSadaf Ebrahimi           }
3647*22dc650dSSadaf Ebrahimi 
3648*22dc650dSSadaf Ebrahimi         if (*ptr != CHAR_COLON)
3649*22dc650dSSadaf Ebrahimi           {
3650*22dc650dSSadaf Ebrahimi           errorcode = ERR13;
3651*22dc650dSSadaf Ebrahimi           goto FAILED_BACK;
3652*22dc650dSSadaf Ebrahimi           }
3653*22dc650dSSadaf Ebrahimi 
3654*22dc650dSSadaf Ebrahimi         if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3655*22dc650dSSadaf Ebrahimi           {
3656*22dc650dSSadaf Ebrahimi           posix_negate = TRUE;
3657*22dc650dSSadaf Ebrahimi           ptr++;
3658*22dc650dSSadaf Ebrahimi           }
3659*22dc650dSSadaf Ebrahimi 
3660*22dc650dSSadaf Ebrahimi         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3661*22dc650dSSadaf Ebrahimi         if (posix_class < 0)
3662*22dc650dSSadaf Ebrahimi           {
3663*22dc650dSSadaf Ebrahimi           errorcode = ERR30;
3664*22dc650dSSadaf Ebrahimi           goto FAILED;
3665*22dc650dSSadaf Ebrahimi           }
3666*22dc650dSSadaf Ebrahimi         ptr = tempptr + 2;
3667*22dc650dSSadaf Ebrahimi 
3668*22dc650dSSadaf Ebrahimi         /* Perl treats a hyphen after a POSIX class as a literal, not the
3669*22dc650dSSadaf Ebrahimi         start of a range. However, it gives a warning in its warning mode
3670*22dc650dSSadaf Ebrahimi         unless the hyphen is the last character in the class. PCRE does not
3671*22dc650dSSadaf Ebrahimi         have a warning mode, so we give an error, because this is likely an
3672*22dc650dSSadaf Ebrahimi         error on the user's part. */
3673*22dc650dSSadaf Ebrahimi 
3674*22dc650dSSadaf Ebrahimi         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3675*22dc650dSSadaf Ebrahimi             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3676*22dc650dSSadaf Ebrahimi           {
3677*22dc650dSSadaf Ebrahimi           errorcode = ERR50;
3678*22dc650dSSadaf Ebrahimi           goto FAILED;
3679*22dc650dSSadaf Ebrahimi           }
3680*22dc650dSSadaf Ebrahimi 
3681*22dc650dSSadaf Ebrahimi         /* Set "a hyphen is not the start of a range" for the -] case, and also
3682*22dc650dSSadaf Ebrahimi         in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3683*22dc650dSSadaf Ebrahimi         fuzzers do that kind of thing) and *then* a hyphen. This causes that
3684*22dc650dSSadaf Ebrahimi         hyphen to be treated as a literal. I don't think it's worth setting up
3685*22dc650dSSadaf Ebrahimi         special apparatus to do otherwise. */
3686*22dc650dSSadaf Ebrahimi 
3687*22dc650dSSadaf Ebrahimi         class_range_state = RANGE_NO;
3688*22dc650dSSadaf Ebrahimi 
3689*22dc650dSSadaf Ebrahimi         /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
3690*22dc650dSSadaf Ebrahimi         of the POSIX classes are converted to use Unicode properties \p or \P
3691*22dc650dSSadaf Ebrahimi         or, in one case, \h or \H. The substitutes table has two values per
3692*22dc650dSSadaf Ebrahimi         class, containing the type and value of a \p or \P item. The special
3693*22dc650dSSadaf Ebrahimi         cases are specified with a negative type: a non-zero value causes \h or
3694*22dc650dSSadaf Ebrahimi         \H to be used, and a zero value falls through to behave like a non-UCP
3695*22dc650dSSadaf Ebrahimi         POSIX class. There are now also some extra options that force ASCII for
3696*22dc650dSSadaf Ebrahimi         some classes. */
3697*22dc650dSSadaf Ebrahimi 
3698*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
3699*22dc650dSSadaf Ebrahimi         if ((options & PCRE2_UCP) != 0 &&
3700*22dc650dSSadaf Ebrahimi             (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
3701*22dc650dSSadaf Ebrahimi             !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
3702*22dc650dSSadaf Ebrahimi               (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
3703*22dc650dSSadaf Ebrahimi           {
3704*22dc650dSSadaf Ebrahimi           int ptype = posix_substitutes[2*posix_class];
3705*22dc650dSSadaf Ebrahimi           int pvalue = posix_substitutes[2*posix_class + 1];
3706*22dc650dSSadaf Ebrahimi 
3707*22dc650dSSadaf Ebrahimi           if (ptype >= 0)
3708*22dc650dSSadaf Ebrahimi             {
3709*22dc650dSSadaf Ebrahimi             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3710*22dc650dSSadaf Ebrahimi             *parsed_pattern++ = (ptype << 16) | pvalue;
3711*22dc650dSSadaf Ebrahimi             goto CLASS_CONTINUE;
3712*22dc650dSSadaf Ebrahimi             }
3713*22dc650dSSadaf Ebrahimi 
3714*22dc650dSSadaf Ebrahimi           if (pvalue != 0)
3715*22dc650dSSadaf Ebrahimi             {
3716*22dc650dSSadaf Ebrahimi             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3717*22dc650dSSadaf Ebrahimi             goto CLASS_CONTINUE;
3718*22dc650dSSadaf Ebrahimi             }
3719*22dc650dSSadaf Ebrahimi 
3720*22dc650dSSadaf Ebrahimi           /* Fall through */
3721*22dc650dSSadaf Ebrahimi           }
3722*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
3723*22dc650dSSadaf Ebrahimi 
3724*22dc650dSSadaf Ebrahimi         /* Non-UCP POSIX class */
3725*22dc650dSSadaf Ebrahimi 
3726*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3727*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = posix_class;
3728*22dc650dSSadaf Ebrahimi         }
3729*22dc650dSSadaf Ebrahimi 
3730*22dc650dSSadaf Ebrahimi       /* Handle potential start of range */
3731*22dc650dSSadaf Ebrahimi 
3732*22dc650dSSadaf Ebrahimi       else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3733*22dc650dSSadaf Ebrahimi         {
3734*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3735*22dc650dSSadaf Ebrahimi           META_RANGE_LITERAL : META_RANGE_ESCAPED;
3736*22dc650dSSadaf Ebrahimi         class_range_state = RANGE_STARTED;
3737*22dc650dSSadaf Ebrahimi         }
3738*22dc650dSSadaf Ebrahimi 
3739*22dc650dSSadaf Ebrahimi       /* Handle a literal character */
3740*22dc650dSSadaf Ebrahimi 
3741*22dc650dSSadaf Ebrahimi       else if (c != CHAR_BACKSLASH)
3742*22dc650dSSadaf Ebrahimi         {
3743*22dc650dSSadaf Ebrahimi         CLASS_LITERAL:
3744*22dc650dSSadaf Ebrahimi         if (class_range_state == RANGE_STARTED)
3745*22dc650dSSadaf Ebrahimi           {
3746*22dc650dSSadaf Ebrahimi           if (c == parsed_pattern[-2])       /* Optimize one-char range */
3747*22dc650dSSadaf Ebrahimi             parsed_pattern--;
3748*22dc650dSSadaf Ebrahimi           else if (parsed_pattern[-2] > c)   /* Check range is in order */
3749*22dc650dSSadaf Ebrahimi             {
3750*22dc650dSSadaf Ebrahimi             errorcode = ERR8;
3751*22dc650dSSadaf Ebrahimi             goto FAILED_BACK;
3752*22dc650dSSadaf Ebrahimi             }
3753*22dc650dSSadaf Ebrahimi           else
3754*22dc650dSSadaf Ebrahimi             {
3755*22dc650dSSadaf Ebrahimi             if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3756*22dc650dSSadaf Ebrahimi               parsed_pattern[-1] = META_RANGE_ESCAPED;
3757*22dc650dSSadaf Ebrahimi             PARSED_LITERAL(c, parsed_pattern);
3758*22dc650dSSadaf Ebrahimi             }
3759*22dc650dSSadaf Ebrahimi           class_range_state = RANGE_NO;
3760*22dc650dSSadaf Ebrahimi           }
3761*22dc650dSSadaf Ebrahimi         else  /* Potential start of range */
3762*22dc650dSSadaf Ebrahimi           {
3763*22dc650dSSadaf Ebrahimi           class_range_state = char_is_literal?
3764*22dc650dSSadaf Ebrahimi             RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3765*22dc650dSSadaf Ebrahimi           PARSED_LITERAL(c, parsed_pattern);
3766*22dc650dSSadaf Ebrahimi           }
3767*22dc650dSSadaf Ebrahimi         }
3768*22dc650dSSadaf Ebrahimi 
3769*22dc650dSSadaf Ebrahimi       /* Handle escapes in a class */
3770*22dc650dSSadaf Ebrahimi 
3771*22dc650dSSadaf Ebrahimi       else
3772*22dc650dSSadaf Ebrahimi         {
3773*22dc650dSSadaf Ebrahimi         tempptr = ptr;
3774*22dc650dSSadaf Ebrahimi         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3775*22dc650dSSadaf Ebrahimi           xoptions, TRUE, cb);
3776*22dc650dSSadaf Ebrahimi 
3777*22dc650dSSadaf Ebrahimi         if (errorcode != 0)
3778*22dc650dSSadaf Ebrahimi           {
3779*22dc650dSSadaf Ebrahimi           if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3780*22dc650dSSadaf Ebrahimi             goto FAILED;
3781*22dc650dSSadaf Ebrahimi           ptr = tempptr;
3782*22dc650dSSadaf Ebrahimi           if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3783*22dc650dSSadaf Ebrahimi             {
3784*22dc650dSSadaf Ebrahimi             GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3785*22dc650dSSadaf Ebrahimi             }
3786*22dc650dSSadaf Ebrahimi           escape = 0;                 /* Treat as literal character */
3787*22dc650dSSadaf Ebrahimi           }
3788*22dc650dSSadaf Ebrahimi 
3789*22dc650dSSadaf Ebrahimi         switch(escape)
3790*22dc650dSSadaf Ebrahimi           {
3791*22dc650dSSadaf Ebrahimi           case 0:  /* Escaped character code point is in c */
3792*22dc650dSSadaf Ebrahimi           char_is_literal = FALSE;
3793*22dc650dSSadaf Ebrahimi           goto CLASS_LITERAL;      /* (a few lines above) */
3794*22dc650dSSadaf Ebrahimi 
3795*22dc650dSSadaf Ebrahimi           case ESC_b:
3796*22dc650dSSadaf Ebrahimi           c = CHAR_BS;    /* \b is backspace in a class */
3797*22dc650dSSadaf Ebrahimi           char_is_literal = FALSE;
3798*22dc650dSSadaf Ebrahimi           goto CLASS_LITERAL;
3799*22dc650dSSadaf Ebrahimi 
3800*22dc650dSSadaf Ebrahimi           case ESC_Q:
3801*22dc650dSSadaf Ebrahimi           inescq = TRUE;  /* Enter literal mode */
3802*22dc650dSSadaf Ebrahimi           goto CLASS_CONTINUE;
3803*22dc650dSSadaf Ebrahimi 
3804*22dc650dSSadaf Ebrahimi           case ESC_E:     /* Ignore orphan \E */
3805*22dc650dSSadaf Ebrahimi           goto CLASS_CONTINUE;
3806*22dc650dSSadaf Ebrahimi 
3807*22dc650dSSadaf Ebrahimi           case ESC_B:     /* Always an error in a class */
3808*22dc650dSSadaf Ebrahimi           case ESC_R:
3809*22dc650dSSadaf Ebrahimi           case ESC_X:
3810*22dc650dSSadaf Ebrahimi           errorcode = ERR7;
3811*22dc650dSSadaf Ebrahimi           ptr--;
3812*22dc650dSSadaf Ebrahimi           goto FAILED;
3813*22dc650dSSadaf Ebrahimi           }
3814*22dc650dSSadaf Ebrahimi 
3815*22dc650dSSadaf Ebrahimi         /* The second part of a range can be a single-character escape
3816*22dc650dSSadaf Ebrahimi         sequence (detected above), but not any of the other escapes. Perl
3817*22dc650dSSadaf Ebrahimi         treats a hyphen as a literal in such circumstances. However, in Perl's
3818*22dc650dSSadaf Ebrahimi         warning mode, a warning is given, so PCRE now faults it, as it is
3819*22dc650dSSadaf Ebrahimi         almost certainly a mistake on the user's part. */
3820*22dc650dSSadaf Ebrahimi 
3821*22dc650dSSadaf Ebrahimi         if (class_range_state == RANGE_STARTED)
3822*22dc650dSSadaf Ebrahimi           {
3823*22dc650dSSadaf Ebrahimi           errorcode = ERR50;
3824*22dc650dSSadaf Ebrahimi           goto FAILED;  /* Not CLASS_ESCAPE_FAILED; always an error */
3825*22dc650dSSadaf Ebrahimi           }
3826*22dc650dSSadaf Ebrahimi 
3827*22dc650dSSadaf Ebrahimi         /* Of the remaining escapes, only those that define characters are
3828*22dc650dSSadaf Ebrahimi         allowed in a class. None may start a range. */
3829*22dc650dSSadaf Ebrahimi 
3830*22dc650dSSadaf Ebrahimi         class_range_state = RANGE_NO;
3831*22dc650dSSadaf Ebrahimi         switch(escape)
3832*22dc650dSSadaf Ebrahimi           {
3833*22dc650dSSadaf Ebrahimi           case ESC_N:
3834*22dc650dSSadaf Ebrahimi           errorcode = ERR71;
3835*22dc650dSSadaf Ebrahimi           goto FAILED;
3836*22dc650dSSadaf Ebrahimi 
3837*22dc650dSSadaf Ebrahimi           case ESC_H:
3838*22dc650dSSadaf Ebrahimi           case ESC_h:
3839*22dc650dSSadaf Ebrahimi           case ESC_V:
3840*22dc650dSSadaf Ebrahimi           case ESC_v:
3841*22dc650dSSadaf Ebrahimi           *parsed_pattern++ = META_ESCAPE + escape;
3842*22dc650dSSadaf Ebrahimi           break;
3843*22dc650dSSadaf Ebrahimi 
3844*22dc650dSSadaf Ebrahimi           /* These escapes may be converted to Unicode property tests when
3845*22dc650dSSadaf Ebrahimi           PCRE2_UCP is set. */
3846*22dc650dSSadaf Ebrahimi 
3847*22dc650dSSadaf Ebrahimi           case ESC_d:
3848*22dc650dSSadaf Ebrahimi           case ESC_D:
3849*22dc650dSSadaf Ebrahimi           case ESC_s:
3850*22dc650dSSadaf Ebrahimi           case ESC_S:
3851*22dc650dSSadaf Ebrahimi           case ESC_w:
3852*22dc650dSSadaf Ebrahimi           case ESC_W:
3853*22dc650dSSadaf Ebrahimi           parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3854*22dc650dSSadaf Ebrahimi             xoptions);
3855*22dc650dSSadaf Ebrahimi           break;
3856*22dc650dSSadaf Ebrahimi 
3857*22dc650dSSadaf Ebrahimi           /* Explicit Unicode property matching */
3858*22dc650dSSadaf Ebrahimi 
3859*22dc650dSSadaf Ebrahimi           case ESC_P:
3860*22dc650dSSadaf Ebrahimi           case ESC_p:
3861*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
3862*22dc650dSSadaf Ebrahimi             {
3863*22dc650dSSadaf Ebrahimi             BOOL negated;
3864*22dc650dSSadaf Ebrahimi             uint16_t ptype = 0, pdata = 0;
3865*22dc650dSSadaf Ebrahimi             if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3866*22dc650dSSadaf Ebrahimi               goto FAILED;
3867*22dc650dSSadaf Ebrahimi             if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3868*22dc650dSSadaf Ebrahimi             *parsed_pattern++ = META_ESCAPE + escape;
3869*22dc650dSSadaf Ebrahimi             *parsed_pattern++ = (ptype << 16) | pdata;
3870*22dc650dSSadaf Ebrahimi             }
3871*22dc650dSSadaf Ebrahimi #else
3872*22dc650dSSadaf Ebrahimi           errorcode = ERR45;
3873*22dc650dSSadaf Ebrahimi           goto FAILED;
3874*22dc650dSSadaf Ebrahimi #endif
3875*22dc650dSSadaf Ebrahimi           break;  /* End \P and \p */
3876*22dc650dSSadaf Ebrahimi 
3877*22dc650dSSadaf Ebrahimi           default:    /* All others are not allowed in a class */
3878*22dc650dSSadaf Ebrahimi           errorcode = ERR7;
3879*22dc650dSSadaf Ebrahimi           ptr--;
3880*22dc650dSSadaf Ebrahimi           goto FAILED;
3881*22dc650dSSadaf Ebrahimi           }
3882*22dc650dSSadaf Ebrahimi 
3883*22dc650dSSadaf Ebrahimi         /* Perl gives a warning unless a following hyphen is the last character
3884*22dc650dSSadaf Ebrahimi         in the class. PCRE throws an error. */
3885*22dc650dSSadaf Ebrahimi 
3886*22dc650dSSadaf Ebrahimi         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3887*22dc650dSSadaf Ebrahimi             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3888*22dc650dSSadaf Ebrahimi           {
3889*22dc650dSSadaf Ebrahimi           errorcode = ERR50;
3890*22dc650dSSadaf Ebrahimi           goto FAILED;
3891*22dc650dSSadaf Ebrahimi           }
3892*22dc650dSSadaf Ebrahimi         }
3893*22dc650dSSadaf Ebrahimi 
3894*22dc650dSSadaf Ebrahimi       /* Proceed to next thing in the class. */
3895*22dc650dSSadaf Ebrahimi 
3896*22dc650dSSadaf Ebrahimi       CLASS_CONTINUE:
3897*22dc650dSSadaf Ebrahimi       if (ptr >= ptrend)
3898*22dc650dSSadaf Ebrahimi         {
3899*22dc650dSSadaf Ebrahimi         errorcode = ERR6;  /* Missing terminating ']' */
3900*22dc650dSSadaf Ebrahimi         goto FAILED;
3901*22dc650dSSadaf Ebrahimi         }
3902*22dc650dSSadaf Ebrahimi       GETCHARINCTEST(c, ptr);
3903*22dc650dSSadaf Ebrahimi       if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3904*22dc650dSSadaf Ebrahimi       }     /* End of class-processing loop */
3905*22dc650dSSadaf Ebrahimi 
3906*22dc650dSSadaf Ebrahimi     /* -] at the end of a class is a literal '-' */
3907*22dc650dSSadaf Ebrahimi 
3908*22dc650dSSadaf Ebrahimi     if (class_range_state == RANGE_STARTED)
3909*22dc650dSSadaf Ebrahimi       {
3910*22dc650dSSadaf Ebrahimi       parsed_pattern[-1] = CHAR_MINUS;
3911*22dc650dSSadaf Ebrahimi       class_range_state = RANGE_NO;
3912*22dc650dSSadaf Ebrahimi       }
3913*22dc650dSSadaf Ebrahimi 
3914*22dc650dSSadaf Ebrahimi     *parsed_pattern++ = META_CLASS_END;
3915*22dc650dSSadaf Ebrahimi     break;  /* End of character class */
3916*22dc650dSSadaf Ebrahimi 
3917*22dc650dSSadaf Ebrahimi 
3918*22dc650dSSadaf Ebrahimi     /* ---- Opening parenthesis ---- */
3919*22dc650dSSadaf Ebrahimi 
3920*22dc650dSSadaf Ebrahimi     case CHAR_LEFT_PARENTHESIS:
3921*22dc650dSSadaf Ebrahimi     if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3922*22dc650dSSadaf Ebrahimi 
3923*22dc650dSSadaf Ebrahimi     /* If ( is not followed by ? it is either a capture or a special verb or an
3924*22dc650dSSadaf Ebrahimi     alpha assertion or a positive non-atomic lookahead. */
3925*22dc650dSSadaf Ebrahimi 
3926*22dc650dSSadaf Ebrahimi     if (*ptr != CHAR_QUESTION_MARK)
3927*22dc650dSSadaf Ebrahimi       {
3928*22dc650dSSadaf Ebrahimi       const char *vn;
3929*22dc650dSSadaf Ebrahimi 
3930*22dc650dSSadaf Ebrahimi       /* Handle capturing brackets (or non-capturing if auto-capture is turned
3931*22dc650dSSadaf Ebrahimi       off). */
3932*22dc650dSSadaf Ebrahimi 
3933*22dc650dSSadaf Ebrahimi       if (*ptr != CHAR_ASTERISK)
3934*22dc650dSSadaf Ebrahimi         {
3935*22dc650dSSadaf Ebrahimi         nest_depth++;
3936*22dc650dSSadaf Ebrahimi         if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3937*22dc650dSSadaf Ebrahimi           {
3938*22dc650dSSadaf Ebrahimi           if (cb->bracount >= MAX_GROUP_NUMBER)
3939*22dc650dSSadaf Ebrahimi             {
3940*22dc650dSSadaf Ebrahimi             errorcode = ERR97;
3941*22dc650dSSadaf Ebrahimi             goto FAILED;
3942*22dc650dSSadaf Ebrahimi             }
3943*22dc650dSSadaf Ebrahimi           cb->bracount++;
3944*22dc650dSSadaf Ebrahimi           *parsed_pattern++ = META_CAPTURE | cb->bracount;
3945*22dc650dSSadaf Ebrahimi           }
3946*22dc650dSSadaf Ebrahimi         else *parsed_pattern++ = META_NOCAPTURE;
3947*22dc650dSSadaf Ebrahimi         }
3948*22dc650dSSadaf Ebrahimi 
3949*22dc650dSSadaf Ebrahimi       /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3950*22dc650dSSadaf Ebrahimi       quantifier" error rather than "(*MARK) must have an argument". */
3951*22dc650dSSadaf Ebrahimi 
3952*22dc650dSSadaf Ebrahimi       else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3953*22dc650dSSadaf Ebrahimi         break;
3954*22dc650dSSadaf Ebrahimi 
3955*22dc650dSSadaf Ebrahimi       /* Handle "alpha assertions" such as (*pla:...). Most of these are
3956*22dc650dSSadaf Ebrahimi       synonyms for the historical symbolic assertions, but the script run and
3957*22dc650dSSadaf Ebrahimi       non-atomic lookaround ones are new. They are distinguished by starting
3958*22dc650dSSadaf Ebrahimi       with a lower case letter. Checking both ends of the alphabet makes this
3959*22dc650dSSadaf Ebrahimi       work in all character codes. */
3960*22dc650dSSadaf Ebrahimi 
3961*22dc650dSSadaf Ebrahimi       else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3962*22dc650dSSadaf Ebrahimi         {
3963*22dc650dSSadaf Ebrahimi         uint32_t meta;
3964*22dc650dSSadaf Ebrahimi 
3965*22dc650dSSadaf Ebrahimi         vn = alasnames;
3966*22dc650dSSadaf Ebrahimi         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3967*22dc650dSSadaf Ebrahimi           &errorcode, cb)) goto FAILED;
3968*22dc650dSSadaf Ebrahimi         if (ptr >= ptrend || *ptr != CHAR_COLON)
3969*22dc650dSSadaf Ebrahimi           {
3970*22dc650dSSadaf Ebrahimi           errorcode = ERR95;  /* Malformed */
3971*22dc650dSSadaf Ebrahimi           goto FAILED;
3972*22dc650dSSadaf Ebrahimi           }
3973*22dc650dSSadaf Ebrahimi 
3974*22dc650dSSadaf Ebrahimi         /* Scan the table of alpha assertion names */
3975*22dc650dSSadaf Ebrahimi 
3976*22dc650dSSadaf Ebrahimi         for (i = 0; i < alascount; i++)
3977*22dc650dSSadaf Ebrahimi           {
3978*22dc650dSSadaf Ebrahimi           if (namelen == alasmeta[i].len &&
3979*22dc650dSSadaf Ebrahimi               PRIV(strncmp_c8)(name, vn, namelen) == 0)
3980*22dc650dSSadaf Ebrahimi             break;
3981*22dc650dSSadaf Ebrahimi           vn += alasmeta[i].len + 1;
3982*22dc650dSSadaf Ebrahimi           }
3983*22dc650dSSadaf Ebrahimi 
3984*22dc650dSSadaf Ebrahimi         if (i >= alascount)
3985*22dc650dSSadaf Ebrahimi           {
3986*22dc650dSSadaf Ebrahimi           errorcode = ERR95;  /* Alpha assertion not recognized */
3987*22dc650dSSadaf Ebrahimi           goto FAILED;
3988*22dc650dSSadaf Ebrahimi           }
3989*22dc650dSSadaf Ebrahimi 
3990*22dc650dSSadaf Ebrahimi         /* Check for expecting an assertion condition. If so, only atomic
3991*22dc650dSSadaf Ebrahimi         lookaround assertions are valid. */
3992*22dc650dSSadaf Ebrahimi 
3993*22dc650dSSadaf Ebrahimi         meta = alasmeta[i].meta;
3994*22dc650dSSadaf Ebrahimi         if (prev_expect_cond_assert > 0 &&
3995*22dc650dSSadaf Ebrahimi             (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3996*22dc650dSSadaf Ebrahimi           {
3997*22dc650dSSadaf Ebrahimi           errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3998*22dc650dSSadaf Ebrahimi             ERR98 : ERR28;  /* (Atomic) assertion expected */
3999*22dc650dSSadaf Ebrahimi           goto FAILED;
4000*22dc650dSSadaf Ebrahimi           }
4001*22dc650dSSadaf Ebrahimi 
4002*22dc650dSSadaf Ebrahimi         /* The lookaround alphabetic synonyms can mostly be handled by jumping
4003*22dc650dSSadaf Ebrahimi         to the code that handles the traditional symbolic forms. */
4004*22dc650dSSadaf Ebrahimi 
4005*22dc650dSSadaf Ebrahimi         switch(meta)
4006*22dc650dSSadaf Ebrahimi           {
4007*22dc650dSSadaf Ebrahimi           default:
4008*22dc650dSSadaf Ebrahimi           errorcode = ERR89;  /* Unknown code; should never occur because */
4009*22dc650dSSadaf Ebrahimi           goto FAILED;        /* the meta values come from a table above. */
4010*22dc650dSSadaf Ebrahimi 
4011*22dc650dSSadaf Ebrahimi           case META_ATOMIC:
4012*22dc650dSSadaf Ebrahimi           goto ATOMIC_GROUP;
4013*22dc650dSSadaf Ebrahimi 
4014*22dc650dSSadaf Ebrahimi           case META_LOOKAHEAD:
4015*22dc650dSSadaf Ebrahimi           goto POSITIVE_LOOK_AHEAD;
4016*22dc650dSSadaf Ebrahimi 
4017*22dc650dSSadaf Ebrahimi           case META_LOOKAHEAD_NA:
4018*22dc650dSSadaf Ebrahimi           goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4019*22dc650dSSadaf Ebrahimi 
4020*22dc650dSSadaf Ebrahimi           case META_LOOKAHEADNOT:
4021*22dc650dSSadaf Ebrahimi           goto NEGATIVE_LOOK_AHEAD;
4022*22dc650dSSadaf Ebrahimi 
4023*22dc650dSSadaf Ebrahimi           case META_LOOKBEHIND:
4024*22dc650dSSadaf Ebrahimi           case META_LOOKBEHINDNOT:
4025*22dc650dSSadaf Ebrahimi           case META_LOOKBEHIND_NA:
4026*22dc650dSSadaf Ebrahimi           *parsed_pattern++ = meta;
4027*22dc650dSSadaf Ebrahimi           ptr--;
4028*22dc650dSSadaf Ebrahimi           goto POST_LOOKBEHIND;
4029*22dc650dSSadaf Ebrahimi 
4030*22dc650dSSadaf Ebrahimi           /* The script run facilities are handled here. Unicode support is
4031*22dc650dSSadaf Ebrahimi           required (give an error if not, as this is a security issue). Always
4032*22dc650dSSadaf Ebrahimi           record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4033*22dc650dSSadaf Ebrahimi           META_ATOMIC and remember that we need two META_KETs at the end. */
4034*22dc650dSSadaf Ebrahimi 
4035*22dc650dSSadaf Ebrahimi           case META_SCRIPT_RUN:
4036*22dc650dSSadaf Ebrahimi           case META_ATOMIC_SCRIPT_RUN:
4037*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
4038*22dc650dSSadaf Ebrahimi           *parsed_pattern++ = META_SCRIPT_RUN;
4039*22dc650dSSadaf Ebrahimi           nest_depth++;
4040*22dc650dSSadaf Ebrahimi           ptr++;
4041*22dc650dSSadaf Ebrahimi           if (meta == META_ATOMIC_SCRIPT_RUN)
4042*22dc650dSSadaf Ebrahimi             {
4043*22dc650dSSadaf Ebrahimi             *parsed_pattern++ = META_ATOMIC;
4044*22dc650dSSadaf Ebrahimi             if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4045*22dc650dSSadaf Ebrahimi             else if (++top_nest >= end_nests)
4046*22dc650dSSadaf Ebrahimi               {
4047*22dc650dSSadaf Ebrahimi               errorcode = ERR84;
4048*22dc650dSSadaf Ebrahimi               goto FAILED;
4049*22dc650dSSadaf Ebrahimi               }
4050*22dc650dSSadaf Ebrahimi             top_nest->nest_depth = nest_depth;
4051*22dc650dSSadaf Ebrahimi             top_nest->flags = NSF_ATOMICSR;
4052*22dc650dSSadaf Ebrahimi             top_nest->options = options & PARSE_TRACKED_OPTIONS;
4053*22dc650dSSadaf Ebrahimi             top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4054*22dc650dSSadaf Ebrahimi             }
4055*22dc650dSSadaf Ebrahimi           break;
4056*22dc650dSSadaf Ebrahimi #else  /* SUPPORT_UNICODE */
4057*22dc650dSSadaf Ebrahimi           errorcode = ERR96;
4058*22dc650dSSadaf Ebrahimi           goto FAILED;
4059*22dc650dSSadaf Ebrahimi #endif
4060*22dc650dSSadaf Ebrahimi           }
4061*22dc650dSSadaf Ebrahimi         }
4062*22dc650dSSadaf Ebrahimi 
4063*22dc650dSSadaf Ebrahimi 
4064*22dc650dSSadaf Ebrahimi       /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4065*22dc650dSSadaf Ebrahimi 
4066*22dc650dSSadaf Ebrahimi       else
4067*22dc650dSSadaf Ebrahimi         {
4068*22dc650dSSadaf Ebrahimi         vn = verbnames;
4069*22dc650dSSadaf Ebrahimi         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4070*22dc650dSSadaf Ebrahimi           &errorcode, cb)) goto FAILED;
4071*22dc650dSSadaf Ebrahimi         if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4072*22dc650dSSadaf Ebrahimi                               *ptr != CHAR_RIGHT_PARENTHESIS))
4073*22dc650dSSadaf Ebrahimi           {
4074*22dc650dSSadaf Ebrahimi           errorcode = ERR60;  /* Malformed */
4075*22dc650dSSadaf Ebrahimi           goto FAILED;
4076*22dc650dSSadaf Ebrahimi           }
4077*22dc650dSSadaf Ebrahimi 
4078*22dc650dSSadaf Ebrahimi         /* Scan the table of verb names */
4079*22dc650dSSadaf Ebrahimi 
4080*22dc650dSSadaf Ebrahimi         for (i = 0; i < verbcount; i++)
4081*22dc650dSSadaf Ebrahimi           {
4082*22dc650dSSadaf Ebrahimi           if (namelen == verbs[i].len &&
4083*22dc650dSSadaf Ebrahimi               PRIV(strncmp_c8)(name, vn, namelen) == 0)
4084*22dc650dSSadaf Ebrahimi             break;
4085*22dc650dSSadaf Ebrahimi           vn += verbs[i].len + 1;
4086*22dc650dSSadaf Ebrahimi           }
4087*22dc650dSSadaf Ebrahimi 
4088*22dc650dSSadaf Ebrahimi         if (i >= verbcount)
4089*22dc650dSSadaf Ebrahimi           {
4090*22dc650dSSadaf Ebrahimi           errorcode = ERR60;  /* Verb not recognized */
4091*22dc650dSSadaf Ebrahimi           goto FAILED;
4092*22dc650dSSadaf Ebrahimi           }
4093*22dc650dSSadaf Ebrahimi 
4094*22dc650dSSadaf Ebrahimi         /* An empty argument is treated as no argument. */
4095*22dc650dSSadaf Ebrahimi 
4096*22dc650dSSadaf Ebrahimi         if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4097*22dc650dSSadaf Ebrahimi              ptr[1] == CHAR_RIGHT_PARENTHESIS)
4098*22dc650dSSadaf Ebrahimi           ptr++;    /* Advance to the closing parens */
4099*22dc650dSSadaf Ebrahimi 
4100*22dc650dSSadaf Ebrahimi         /* Check for mandatory non-empty argument; this is (*MARK) */
4101*22dc650dSSadaf Ebrahimi 
4102*22dc650dSSadaf Ebrahimi         if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4103*22dc650dSSadaf Ebrahimi           {
4104*22dc650dSSadaf Ebrahimi           errorcode = ERR66;
4105*22dc650dSSadaf Ebrahimi           goto FAILED;
4106*22dc650dSSadaf Ebrahimi           }
4107*22dc650dSSadaf Ebrahimi 
4108*22dc650dSSadaf Ebrahimi         /* Remember where this verb, possibly with a preceding (*MARK), starts,
4109*22dc650dSSadaf Ebrahimi         for handling quantified (*ACCEPT). */
4110*22dc650dSSadaf Ebrahimi 
4111*22dc650dSSadaf Ebrahimi         verbstartptr = parsed_pattern;
4112*22dc650dSSadaf Ebrahimi         okquantifier = (verbs[i].meta == META_ACCEPT);
4113*22dc650dSSadaf Ebrahimi 
4114*22dc650dSSadaf Ebrahimi         /* It appears that Perl allows any characters whatsoever, other than a
4115*22dc650dSSadaf Ebrahimi         closing parenthesis, to appear in arguments ("names"), so we no longer
4116*22dc650dSSadaf Ebrahimi         insist on letters, digits, and underscores. Perl does not, however, do
4117*22dc650dSSadaf Ebrahimi         any interpretation within arguments, and has no means of including a
4118*22dc650dSSadaf Ebrahimi         closing parenthesis. PCRE supports escape processing but only when it
4119*22dc650dSSadaf Ebrahimi         is requested by an option. We set inverbname TRUE here, and let the
4120*22dc650dSSadaf Ebrahimi         main loop take care of this so that escape and \x processing is done by
4121*22dc650dSSadaf Ebrahimi         the main code above. */
4122*22dc650dSSadaf Ebrahimi 
4123*22dc650dSSadaf Ebrahimi         if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
4124*22dc650dSSadaf Ebrahimi           {
4125*22dc650dSSadaf Ebrahimi           /* Some optional arguments can be treated as a preceding (*MARK) */
4126*22dc650dSSadaf Ebrahimi 
4127*22dc650dSSadaf Ebrahimi           if (verbs[i].has_arg < 0)
4128*22dc650dSSadaf Ebrahimi             {
4129*22dc650dSSadaf Ebrahimi             add_after_mark = verbs[i].meta;
4130*22dc650dSSadaf Ebrahimi             *parsed_pattern++ = META_MARK;
4131*22dc650dSSadaf Ebrahimi             }
4132*22dc650dSSadaf Ebrahimi 
4133*22dc650dSSadaf Ebrahimi           /* The remaining verbs with arguments (except *MARK) need a different
4134*22dc650dSSadaf Ebrahimi           opcode. */
4135*22dc650dSSadaf Ebrahimi 
4136*22dc650dSSadaf Ebrahimi           else
4137*22dc650dSSadaf Ebrahimi             {
4138*22dc650dSSadaf Ebrahimi             *parsed_pattern++ = verbs[i].meta +
4139*22dc650dSSadaf Ebrahimi               ((verbs[i].meta != META_MARK)? 0x00010000u:0);
4140*22dc650dSSadaf Ebrahimi             }
4141*22dc650dSSadaf Ebrahimi 
4142*22dc650dSSadaf Ebrahimi           /* Set up for reading the name in the main loop. */
4143*22dc650dSSadaf Ebrahimi 
4144*22dc650dSSadaf Ebrahimi           verblengthptr = parsed_pattern++;
4145*22dc650dSSadaf Ebrahimi           verbnamestart = ptr;
4146*22dc650dSSadaf Ebrahimi           inverbname = TRUE;
4147*22dc650dSSadaf Ebrahimi           }
4148*22dc650dSSadaf Ebrahimi         else  /* No verb "name" argument */
4149*22dc650dSSadaf Ebrahimi           {
4150*22dc650dSSadaf Ebrahimi           *parsed_pattern++ = verbs[i].meta;
4151*22dc650dSSadaf Ebrahimi           }
4152*22dc650dSSadaf Ebrahimi         }     /* End of (*VERB) handling */
4153*22dc650dSSadaf Ebrahimi       break;  /* Done with this parenthesis */
4154*22dc650dSSadaf Ebrahimi       }       /* End of groups that don't start with (? */
4155*22dc650dSSadaf Ebrahimi 
4156*22dc650dSSadaf Ebrahimi 
4157*22dc650dSSadaf Ebrahimi     /* ---- Items starting (? ---- */
4158*22dc650dSSadaf Ebrahimi 
4159*22dc650dSSadaf Ebrahimi     /* The type of item is determined by what follows (?. Handle (?| and option
4160*22dc650dSSadaf Ebrahimi     changes under "default" because both need a new block on the nest stack.
4161*22dc650dSSadaf Ebrahimi     Comments starting with (?# are handled above. Note that there is some
4162*22dc650dSSadaf Ebrahimi     ambiguity about the sequence (?- because if a digit follows it's a relative
4163*22dc650dSSadaf Ebrahimi     recursion or subroutine call whereas otherwise it's an option unsetting. */
4164*22dc650dSSadaf Ebrahimi 
4165*22dc650dSSadaf Ebrahimi     if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4166*22dc650dSSadaf Ebrahimi 
4167*22dc650dSSadaf Ebrahimi     switch(*ptr)
4168*22dc650dSSadaf Ebrahimi       {
4169*22dc650dSSadaf Ebrahimi       default:
4170*22dc650dSSadaf Ebrahimi       if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4171*22dc650dSSadaf Ebrahimi         goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
4172*22dc650dSSadaf Ebrahimi 
4173*22dc650dSSadaf Ebrahimi       /* We now have either (?| or a (possibly empty) option setting,
4174*22dc650dSSadaf Ebrahimi       optionally followed by a non-capturing group. */
4175*22dc650dSSadaf Ebrahimi 
4176*22dc650dSSadaf Ebrahimi       nest_depth++;
4177*22dc650dSSadaf Ebrahimi       if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4178*22dc650dSSadaf Ebrahimi       else if (++top_nest >= end_nests)
4179*22dc650dSSadaf Ebrahimi         {
4180*22dc650dSSadaf Ebrahimi         errorcode = ERR84;
4181*22dc650dSSadaf Ebrahimi         goto FAILED;
4182*22dc650dSSadaf Ebrahimi         }
4183*22dc650dSSadaf Ebrahimi       top_nest->nest_depth = nest_depth;
4184*22dc650dSSadaf Ebrahimi       top_nest->flags = 0;
4185*22dc650dSSadaf Ebrahimi       top_nest->options = options & PARSE_TRACKED_OPTIONS;
4186*22dc650dSSadaf Ebrahimi       top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4187*22dc650dSSadaf Ebrahimi 
4188*22dc650dSSadaf Ebrahimi       /* Start of non-capturing group that resets the capture count for each
4189*22dc650dSSadaf Ebrahimi       branch. */
4190*22dc650dSSadaf Ebrahimi 
4191*22dc650dSSadaf Ebrahimi       if (*ptr == CHAR_VERTICAL_LINE)
4192*22dc650dSSadaf Ebrahimi         {
4193*22dc650dSSadaf Ebrahimi         top_nest->reset_group = (uint16_t)cb->bracount;
4194*22dc650dSSadaf Ebrahimi         top_nest->max_group = (uint16_t)cb->bracount;
4195*22dc650dSSadaf Ebrahimi         top_nest->flags |= NSF_RESET;
4196*22dc650dSSadaf Ebrahimi         cb->external_flags |= PCRE2_DUPCAPUSED;
4197*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = META_NOCAPTURE;
4198*22dc650dSSadaf Ebrahimi         ptr++;
4199*22dc650dSSadaf Ebrahimi         }
4200*22dc650dSSadaf Ebrahimi 
4201*22dc650dSSadaf Ebrahimi       /* Scan for options aimnrsxJU to be set or unset. */
4202*22dc650dSSadaf Ebrahimi 
4203*22dc650dSSadaf Ebrahimi       else
4204*22dc650dSSadaf Ebrahimi         {
4205*22dc650dSSadaf Ebrahimi         BOOL hyphenok = TRUE;
4206*22dc650dSSadaf Ebrahimi         uint32_t oldoptions = options;
4207*22dc650dSSadaf Ebrahimi         uint32_t oldxoptions = xoptions;
4208*22dc650dSSadaf Ebrahimi 
4209*22dc650dSSadaf Ebrahimi         top_nest->reset_group = 0;
4210*22dc650dSSadaf Ebrahimi         top_nest->max_group = 0;
4211*22dc650dSSadaf Ebrahimi         set = unset = 0;
4212*22dc650dSSadaf Ebrahimi         optset = &set;
4213*22dc650dSSadaf Ebrahimi         xset = xunset = 0;
4214*22dc650dSSadaf Ebrahimi         xoptset = &xset;
4215*22dc650dSSadaf Ebrahimi 
4216*22dc650dSSadaf Ebrahimi         /* ^ at the start unsets irmnsx and disables the subsequent use of - */
4217*22dc650dSSadaf Ebrahimi 
4218*22dc650dSSadaf Ebrahimi         if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4219*22dc650dSSadaf Ebrahimi           {
4220*22dc650dSSadaf Ebrahimi           options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
4221*22dc650dSSadaf Ebrahimi                        PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
4222*22dc650dSSadaf Ebrahimi           xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
4223*22dc650dSSadaf Ebrahimi           hyphenok = FALSE;
4224*22dc650dSSadaf Ebrahimi           ptr++;
4225*22dc650dSSadaf Ebrahimi           }
4226*22dc650dSSadaf Ebrahimi 
4227*22dc650dSSadaf Ebrahimi         while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4228*22dc650dSSadaf Ebrahimi                                *ptr != CHAR_COLON)
4229*22dc650dSSadaf Ebrahimi           {
4230*22dc650dSSadaf Ebrahimi           switch (*ptr++)
4231*22dc650dSSadaf Ebrahimi             {
4232*22dc650dSSadaf Ebrahimi             case CHAR_MINUS:
4233*22dc650dSSadaf Ebrahimi             if (!hyphenok)
4234*22dc650dSSadaf Ebrahimi               {
4235*22dc650dSSadaf Ebrahimi               errorcode = ERR94;
4236*22dc650dSSadaf Ebrahimi               ptr--;  /* Correct the offset */
4237*22dc650dSSadaf Ebrahimi               goto FAILED;
4238*22dc650dSSadaf Ebrahimi               }
4239*22dc650dSSadaf Ebrahimi             optset = &unset;
4240*22dc650dSSadaf Ebrahimi             xoptset = &xunset;
4241*22dc650dSSadaf Ebrahimi             hyphenok = FALSE;
4242*22dc650dSSadaf Ebrahimi             break;
4243*22dc650dSSadaf Ebrahimi 
4244*22dc650dSSadaf Ebrahimi             /* There are some two-character sequences that start with 'a'. */
4245*22dc650dSSadaf Ebrahimi 
4246*22dc650dSSadaf Ebrahimi             case CHAR_a:
4247*22dc650dSSadaf Ebrahimi             if (ptr < ptrend)
4248*22dc650dSSadaf Ebrahimi               {
4249*22dc650dSSadaf Ebrahimi               if (*ptr == CHAR_D)
4250*22dc650dSSadaf Ebrahimi                 {
4251*22dc650dSSadaf Ebrahimi                 *xoptset |= PCRE2_EXTRA_ASCII_BSD;
4252*22dc650dSSadaf Ebrahimi                 ptr++;
4253*22dc650dSSadaf Ebrahimi                 break;
4254*22dc650dSSadaf Ebrahimi                 }
4255*22dc650dSSadaf Ebrahimi               if (*ptr == CHAR_P)
4256*22dc650dSSadaf Ebrahimi                 {
4257*22dc650dSSadaf Ebrahimi                 *xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
4258*22dc650dSSadaf Ebrahimi                 ptr++;
4259*22dc650dSSadaf Ebrahimi                 break;
4260*22dc650dSSadaf Ebrahimi                 }
4261*22dc650dSSadaf Ebrahimi               if (*ptr == CHAR_S)
4262*22dc650dSSadaf Ebrahimi                 {
4263*22dc650dSSadaf Ebrahimi                 *xoptset |= PCRE2_EXTRA_ASCII_BSS;
4264*22dc650dSSadaf Ebrahimi                 ptr++;
4265*22dc650dSSadaf Ebrahimi                 break;
4266*22dc650dSSadaf Ebrahimi                 }
4267*22dc650dSSadaf Ebrahimi               if (*ptr == CHAR_T)
4268*22dc650dSSadaf Ebrahimi                 {
4269*22dc650dSSadaf Ebrahimi                 *xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
4270*22dc650dSSadaf Ebrahimi                 ptr++;
4271*22dc650dSSadaf Ebrahimi                 break;
4272*22dc650dSSadaf Ebrahimi                 }
4273*22dc650dSSadaf Ebrahimi               if (*ptr == CHAR_W)
4274*22dc650dSSadaf Ebrahimi                 {
4275*22dc650dSSadaf Ebrahimi                 *xoptset |= PCRE2_EXTRA_ASCII_BSW;
4276*22dc650dSSadaf Ebrahimi                 ptr++;
4277*22dc650dSSadaf Ebrahimi                 break;
4278*22dc650dSSadaf Ebrahimi                 }
4279*22dc650dSSadaf Ebrahimi               }
4280*22dc650dSSadaf Ebrahimi             *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
4281*22dc650dSSadaf Ebrahimi                         PCRE2_EXTRA_ASCII_BSW|
4282*22dc650dSSadaf Ebrahimi                         PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
4283*22dc650dSSadaf Ebrahimi             break;
4284*22dc650dSSadaf Ebrahimi 
4285*22dc650dSSadaf Ebrahimi             case CHAR_J:  /* Record that it changed in the external options */
4286*22dc650dSSadaf Ebrahimi             *optset |= PCRE2_DUPNAMES;
4287*22dc650dSSadaf Ebrahimi             cb->external_flags |= PCRE2_JCHANGED;
4288*22dc650dSSadaf Ebrahimi             break;
4289*22dc650dSSadaf Ebrahimi 
4290*22dc650dSSadaf Ebrahimi             case CHAR_i: *optset |= PCRE2_CASELESS; break;
4291*22dc650dSSadaf Ebrahimi             case CHAR_m: *optset |= PCRE2_MULTILINE; break;
4292*22dc650dSSadaf Ebrahimi             case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
4293*22dc650dSSadaf Ebrahimi             case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
4294*22dc650dSSadaf Ebrahimi             case CHAR_s: *optset |= PCRE2_DOTALL; break;
4295*22dc650dSSadaf Ebrahimi             case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4296*22dc650dSSadaf Ebrahimi 
4297*22dc650dSSadaf Ebrahimi             /* If x appears twice it also sets the extended-more option. */
4298*22dc650dSSadaf Ebrahimi 
4299*22dc650dSSadaf Ebrahimi             case CHAR_x:
4300*22dc650dSSadaf Ebrahimi             *optset |= PCRE2_EXTENDED;
4301*22dc650dSSadaf Ebrahimi             if (ptr < ptrend && *ptr == CHAR_x)
4302*22dc650dSSadaf Ebrahimi               {
4303*22dc650dSSadaf Ebrahimi               *optset |= PCRE2_EXTENDED_MORE;
4304*22dc650dSSadaf Ebrahimi               ptr++;
4305*22dc650dSSadaf Ebrahimi               }
4306*22dc650dSSadaf Ebrahimi             break;
4307*22dc650dSSadaf Ebrahimi 
4308*22dc650dSSadaf Ebrahimi             default:
4309*22dc650dSSadaf Ebrahimi             errorcode = ERR11;
4310*22dc650dSSadaf Ebrahimi             ptr--;    /* Correct the offset */
4311*22dc650dSSadaf Ebrahimi             goto FAILED;
4312*22dc650dSSadaf Ebrahimi             }
4313*22dc650dSSadaf Ebrahimi           }
4314*22dc650dSSadaf Ebrahimi 
4315*22dc650dSSadaf Ebrahimi         /* If we are setting extended without extended-more, ensure that any
4316*22dc650dSSadaf Ebrahimi         existing extended-more gets unset. Also, unsetting extended must also
4317*22dc650dSSadaf Ebrahimi         unset extended-more. */
4318*22dc650dSSadaf Ebrahimi 
4319*22dc650dSSadaf Ebrahimi         if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4320*22dc650dSSadaf Ebrahimi             (unset & PCRE2_EXTENDED) != 0)
4321*22dc650dSSadaf Ebrahimi           unset |= PCRE2_EXTENDED_MORE;
4322*22dc650dSSadaf Ebrahimi 
4323*22dc650dSSadaf Ebrahimi         options = (options | set) & (~unset);
4324*22dc650dSSadaf Ebrahimi         xoptions = (xoptions | xset) & (~xunset);
4325*22dc650dSSadaf Ebrahimi 
4326*22dc650dSSadaf Ebrahimi         /* If the options ended with ')' this is not the start of a nested
4327*22dc650dSSadaf Ebrahimi         group with option changes, so the options change at this level.
4328*22dc650dSSadaf Ebrahimi         In this case, if the previous level set up a nest block, discard the
4329*22dc650dSSadaf Ebrahimi         one we have just created. Otherwise adjust it for the previous level.
4330*22dc650dSSadaf Ebrahimi         If the options ended with ':' we are starting a non-capturing group,
4331*22dc650dSSadaf Ebrahimi         possibly with an options setting. */
4332*22dc650dSSadaf Ebrahimi 
4333*22dc650dSSadaf Ebrahimi         if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4334*22dc650dSSadaf Ebrahimi         if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4335*22dc650dSSadaf Ebrahimi           {
4336*22dc650dSSadaf Ebrahimi           nest_depth--;  /* This is not a nested group after all. */
4337*22dc650dSSadaf Ebrahimi           if (top_nest > (nest_save *)(cb->start_workspace) &&
4338*22dc650dSSadaf Ebrahimi               (top_nest-1)->nest_depth == nest_depth) top_nest--;
4339*22dc650dSSadaf Ebrahimi           else top_nest->nest_depth = nest_depth;
4340*22dc650dSSadaf Ebrahimi           }
4341*22dc650dSSadaf Ebrahimi         else *parsed_pattern++ = META_NOCAPTURE;
4342*22dc650dSSadaf Ebrahimi 
4343*22dc650dSSadaf Ebrahimi         /* If nothing changed, no need to record. */
4344*22dc650dSSadaf Ebrahimi 
4345*22dc650dSSadaf Ebrahimi         if (options != oldoptions || xoptions != oldxoptions)
4346*22dc650dSSadaf Ebrahimi           {
4347*22dc650dSSadaf Ebrahimi           *parsed_pattern++ = META_OPTIONS;
4348*22dc650dSSadaf Ebrahimi           *parsed_pattern++ = options;
4349*22dc650dSSadaf Ebrahimi           *parsed_pattern++ = xoptions;
4350*22dc650dSSadaf Ebrahimi           }
4351*22dc650dSSadaf Ebrahimi         }     /* End options processing */
4352*22dc650dSSadaf Ebrahimi       break;  /* End default case after (? */
4353*22dc650dSSadaf Ebrahimi 
4354*22dc650dSSadaf Ebrahimi 
4355*22dc650dSSadaf Ebrahimi       /* ---- Python syntax support ---- */
4356*22dc650dSSadaf Ebrahimi 
4357*22dc650dSSadaf Ebrahimi       case CHAR_P:
4358*22dc650dSSadaf Ebrahimi       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4359*22dc650dSSadaf Ebrahimi 
4360*22dc650dSSadaf Ebrahimi       /* (?P<name> is the same as (?<name>, which defines a named group. */
4361*22dc650dSSadaf Ebrahimi 
4362*22dc650dSSadaf Ebrahimi       if (*ptr == CHAR_LESS_THAN_SIGN)
4363*22dc650dSSadaf Ebrahimi         {
4364*22dc650dSSadaf Ebrahimi         terminator = CHAR_GREATER_THAN_SIGN;
4365*22dc650dSSadaf Ebrahimi         goto DEFINE_NAME;
4366*22dc650dSSadaf Ebrahimi         }
4367*22dc650dSSadaf Ebrahimi 
4368*22dc650dSSadaf Ebrahimi       /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4369*22dc650dSSadaf Ebrahimi       call. */
4370*22dc650dSSadaf Ebrahimi 
4371*22dc650dSSadaf Ebrahimi       if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4372*22dc650dSSadaf Ebrahimi 
4373*22dc650dSSadaf Ebrahimi       /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4374*22dc650dSSadaf Ebrahimi       else after (?P is an error. */
4375*22dc650dSSadaf Ebrahimi 
4376*22dc650dSSadaf Ebrahimi       if (*ptr != CHAR_EQUALS_SIGN)
4377*22dc650dSSadaf Ebrahimi         {
4378*22dc650dSSadaf Ebrahimi         errorcode = ERR41;
4379*22dc650dSSadaf Ebrahimi         goto FAILED;
4380*22dc650dSSadaf Ebrahimi         }
4381*22dc650dSSadaf Ebrahimi       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4382*22dc650dSSadaf Ebrahimi           &namelen, &errorcode, cb)) goto FAILED;
4383*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = META_BACKREF_BYNAME;
4384*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = namelen;
4385*22dc650dSSadaf Ebrahimi       PUTOFFSET(offset, parsed_pattern);
4386*22dc650dSSadaf Ebrahimi       okquantifier = TRUE;
4387*22dc650dSSadaf Ebrahimi       break;   /* End of (?P processing */
4388*22dc650dSSadaf Ebrahimi 
4389*22dc650dSSadaf Ebrahimi 
4390*22dc650dSSadaf Ebrahimi       /* ---- Recursion/subroutine calls by number ---- */
4391*22dc650dSSadaf Ebrahimi 
4392*22dc650dSSadaf Ebrahimi       case CHAR_R:
4393*22dc650dSSadaf Ebrahimi       i = 0;         /* (?R) == (?R0) */
4394*22dc650dSSadaf Ebrahimi       ptr++;
4395*22dc650dSSadaf Ebrahimi       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4396*22dc650dSSadaf Ebrahimi         {
4397*22dc650dSSadaf Ebrahimi         errorcode = ERR58;
4398*22dc650dSSadaf Ebrahimi         goto FAILED;
4399*22dc650dSSadaf Ebrahimi         }
4400*22dc650dSSadaf Ebrahimi       goto SET_RECURSION;
4401*22dc650dSSadaf Ebrahimi 
4402*22dc650dSSadaf Ebrahimi       /* An item starting (?- followed by a digit comes here via the "default"
4403*22dc650dSSadaf Ebrahimi       case because (?- followed by a non-digit is an options setting. */
4404*22dc650dSSadaf Ebrahimi 
4405*22dc650dSSadaf Ebrahimi       case CHAR_PLUS:
4406*22dc650dSSadaf Ebrahimi       if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4407*22dc650dSSadaf Ebrahimi         {
4408*22dc650dSSadaf Ebrahimi         errorcode = ERR29;   /* Missing number */
4409*22dc650dSSadaf Ebrahimi         goto FAILED;
4410*22dc650dSSadaf Ebrahimi         }
4411*22dc650dSSadaf Ebrahimi       /* Fall through */
4412*22dc650dSSadaf Ebrahimi 
4413*22dc650dSSadaf Ebrahimi       case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4414*22dc650dSSadaf Ebrahimi       case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4415*22dc650dSSadaf Ebrahimi       RECURSION_BYNUMBER:
4416*22dc650dSSadaf Ebrahimi       if (!read_number(&ptr, ptrend,
4417*22dc650dSSadaf Ebrahimi           (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4418*22dc650dSSadaf Ebrahimi           MAX_GROUP_NUMBER, ERR61,
4419*22dc650dSSadaf Ebrahimi           &i, &errorcode)) goto FAILED;
4420*22dc650dSSadaf Ebrahimi       if (i < 0)  /* NB (?0) is permitted */
4421*22dc650dSSadaf Ebrahimi         {
4422*22dc650dSSadaf Ebrahimi         errorcode = ERR15;   /* Unknown group */
4423*22dc650dSSadaf Ebrahimi         goto FAILED_BACK;
4424*22dc650dSSadaf Ebrahimi         }
4425*22dc650dSSadaf Ebrahimi       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4426*22dc650dSSadaf Ebrahimi         goto UNCLOSED_PARENTHESIS;
4427*22dc650dSSadaf Ebrahimi 
4428*22dc650dSSadaf Ebrahimi       SET_RECURSION:
4429*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4430*22dc650dSSadaf Ebrahimi       offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4431*22dc650dSSadaf Ebrahimi       ptr++;
4432*22dc650dSSadaf Ebrahimi       PUTOFFSET(offset, parsed_pattern);
4433*22dc650dSSadaf Ebrahimi       okquantifier = TRUE;
4434*22dc650dSSadaf Ebrahimi       break;  /* End of recursive call by number handling */
4435*22dc650dSSadaf Ebrahimi 
4436*22dc650dSSadaf Ebrahimi 
4437*22dc650dSSadaf Ebrahimi       /* ---- Recursion/subroutine calls by name ---- */
4438*22dc650dSSadaf Ebrahimi 
4439*22dc650dSSadaf Ebrahimi       case CHAR_AMPERSAND:
4440*22dc650dSSadaf Ebrahimi       RECURSE_BY_NAME:
4441*22dc650dSSadaf Ebrahimi       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4442*22dc650dSSadaf Ebrahimi           &namelen, &errorcode, cb)) goto FAILED;
4443*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = META_RECURSE_BYNAME;
4444*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = namelen;
4445*22dc650dSSadaf Ebrahimi       PUTOFFSET(offset, parsed_pattern);
4446*22dc650dSSadaf Ebrahimi       okquantifier = TRUE;
4447*22dc650dSSadaf Ebrahimi       break;
4448*22dc650dSSadaf Ebrahimi 
4449*22dc650dSSadaf Ebrahimi       /* ---- Callout with numerical or string argument ---- */
4450*22dc650dSSadaf Ebrahimi 
4451*22dc650dSSadaf Ebrahimi       case CHAR_C:
4452*22dc650dSSadaf Ebrahimi       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4453*22dc650dSSadaf Ebrahimi 
4454*22dc650dSSadaf Ebrahimi       /* If the previous item was a condition starting (?(? an assertion,
4455*22dc650dSSadaf Ebrahimi       optionally preceded by a callout, is expected. This is checked later on,
4456*22dc650dSSadaf Ebrahimi       during actual compilation. However we need to identify this kind of
4457*22dc650dSSadaf Ebrahimi       assertion in this pass because it must not be quantified. The value of
4458*22dc650dSSadaf Ebrahimi       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4459*22dc650dSSadaf Ebrahimi       for a callout - still leaving a positive value that identifies the
4460*22dc650dSSadaf Ebrahimi       assertion. Multiple callouts or any other items will make it zero or
4461*22dc650dSSadaf Ebrahimi       less, which doesn't matter because they will cause an error later. */
4462*22dc650dSSadaf Ebrahimi 
4463*22dc650dSSadaf Ebrahimi       expect_cond_assert = prev_expect_cond_assert - 1;
4464*22dc650dSSadaf Ebrahimi 
4465*22dc650dSSadaf Ebrahimi       /* If previous_callout is not NULL, it means this follows a previous
4466*22dc650dSSadaf Ebrahimi       callout. If it was a manual callout, do nothing; this means its "length
4467*22dc650dSSadaf Ebrahimi       of next pattern item" field will remain zero. If it was an automatic
4468*22dc650dSSadaf Ebrahimi       callout, abolish it. */
4469*22dc650dSSadaf Ebrahimi 
4470*22dc650dSSadaf Ebrahimi       if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4471*22dc650dSSadaf Ebrahimi           previous_callout == parsed_pattern - 4 &&
4472*22dc650dSSadaf Ebrahimi           parsed_pattern[-1] == 255)
4473*22dc650dSSadaf Ebrahimi         parsed_pattern = previous_callout;
4474*22dc650dSSadaf Ebrahimi 
4475*22dc650dSSadaf Ebrahimi       /* Save for updating next pattern item length, and skip one item before
4476*22dc650dSSadaf Ebrahimi       completing. */
4477*22dc650dSSadaf Ebrahimi 
4478*22dc650dSSadaf Ebrahimi       previous_callout = parsed_pattern;
4479*22dc650dSSadaf Ebrahimi       after_manual_callout = 1;
4480*22dc650dSSadaf Ebrahimi 
4481*22dc650dSSadaf Ebrahimi       /* Handle a string argument; specific delimiter is required. */
4482*22dc650dSSadaf Ebrahimi 
4483*22dc650dSSadaf Ebrahimi       if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4484*22dc650dSSadaf Ebrahimi         {
4485*22dc650dSSadaf Ebrahimi         PCRE2_SIZE calloutlength;
4486*22dc650dSSadaf Ebrahimi         PCRE2_SPTR startptr = ptr;
4487*22dc650dSSadaf Ebrahimi 
4488*22dc650dSSadaf Ebrahimi         delimiter = 0;
4489*22dc650dSSadaf Ebrahimi         for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4490*22dc650dSSadaf Ebrahimi           {
4491*22dc650dSSadaf Ebrahimi           if (*ptr == PRIV(callout_start_delims)[i])
4492*22dc650dSSadaf Ebrahimi             {
4493*22dc650dSSadaf Ebrahimi             delimiter = PRIV(callout_end_delims)[i];
4494*22dc650dSSadaf Ebrahimi             break;
4495*22dc650dSSadaf Ebrahimi             }
4496*22dc650dSSadaf Ebrahimi           }
4497*22dc650dSSadaf Ebrahimi         if (delimiter == 0)
4498*22dc650dSSadaf Ebrahimi           {
4499*22dc650dSSadaf Ebrahimi           errorcode = ERR82;
4500*22dc650dSSadaf Ebrahimi           goto FAILED;
4501*22dc650dSSadaf Ebrahimi           }
4502*22dc650dSSadaf Ebrahimi 
4503*22dc650dSSadaf Ebrahimi         *parsed_pattern = META_CALLOUT_STRING;
4504*22dc650dSSadaf Ebrahimi         parsed_pattern += 3;   /* Skip pattern info */
4505*22dc650dSSadaf Ebrahimi 
4506*22dc650dSSadaf Ebrahimi         for (;;)
4507*22dc650dSSadaf Ebrahimi           {
4508*22dc650dSSadaf Ebrahimi           if (++ptr >= ptrend)
4509*22dc650dSSadaf Ebrahimi             {
4510*22dc650dSSadaf Ebrahimi             errorcode = ERR81;
4511*22dc650dSSadaf Ebrahimi             ptr = startptr;   /* To give a more useful message */
4512*22dc650dSSadaf Ebrahimi             goto FAILED;
4513*22dc650dSSadaf Ebrahimi             }
4514*22dc650dSSadaf Ebrahimi           if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4515*22dc650dSSadaf Ebrahimi             break;
4516*22dc650dSSadaf Ebrahimi           }
4517*22dc650dSSadaf Ebrahimi 
4518*22dc650dSSadaf Ebrahimi         calloutlength = (PCRE2_SIZE)(ptr - startptr);
4519*22dc650dSSadaf Ebrahimi         if (calloutlength > UINT32_MAX)
4520*22dc650dSSadaf Ebrahimi           {
4521*22dc650dSSadaf Ebrahimi           errorcode = ERR72;
4522*22dc650dSSadaf Ebrahimi           goto FAILED;
4523*22dc650dSSadaf Ebrahimi           }
4524*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = (uint32_t)calloutlength;
4525*22dc650dSSadaf Ebrahimi         offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4526*22dc650dSSadaf Ebrahimi         PUTOFFSET(offset, parsed_pattern);
4527*22dc650dSSadaf Ebrahimi         }
4528*22dc650dSSadaf Ebrahimi 
4529*22dc650dSSadaf Ebrahimi       /* Handle a callout with an optional numerical argument, which must be
4530*22dc650dSSadaf Ebrahimi       less than or equal to 255. A missing argument gives 0. */
4531*22dc650dSSadaf Ebrahimi 
4532*22dc650dSSadaf Ebrahimi       else
4533*22dc650dSSadaf Ebrahimi         {
4534*22dc650dSSadaf Ebrahimi         int n = 0;
4535*22dc650dSSadaf Ebrahimi         *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
4536*22dc650dSSadaf Ebrahimi         parsed_pattern += 3;                       /* Skip pattern info */
4537*22dc650dSSadaf Ebrahimi         while (ptr < ptrend && IS_DIGIT(*ptr))
4538*22dc650dSSadaf Ebrahimi           {
4539*22dc650dSSadaf Ebrahimi           n = n * 10 + *ptr++ - CHAR_0;
4540*22dc650dSSadaf Ebrahimi           if (n > 255)
4541*22dc650dSSadaf Ebrahimi             {
4542*22dc650dSSadaf Ebrahimi             errorcode = ERR38;
4543*22dc650dSSadaf Ebrahimi             goto FAILED;
4544*22dc650dSSadaf Ebrahimi             }
4545*22dc650dSSadaf Ebrahimi           }
4546*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = n;
4547*22dc650dSSadaf Ebrahimi         }
4548*22dc650dSSadaf Ebrahimi 
4549*22dc650dSSadaf Ebrahimi       /* Both formats must have a closing parenthesis */
4550*22dc650dSSadaf Ebrahimi 
4551*22dc650dSSadaf Ebrahimi       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4552*22dc650dSSadaf Ebrahimi         {
4553*22dc650dSSadaf Ebrahimi         errorcode = ERR39;
4554*22dc650dSSadaf Ebrahimi         goto FAILED;
4555*22dc650dSSadaf Ebrahimi         }
4556*22dc650dSSadaf Ebrahimi       ptr++;
4557*22dc650dSSadaf Ebrahimi 
4558*22dc650dSSadaf Ebrahimi       /* Remember the offset to the next item in the pattern, and set a default
4559*22dc650dSSadaf Ebrahimi       length. This should get updated after the next item is read. */
4560*22dc650dSSadaf Ebrahimi 
4561*22dc650dSSadaf Ebrahimi       previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4562*22dc650dSSadaf Ebrahimi       previous_callout[2] = 0;
4563*22dc650dSSadaf Ebrahimi       break;                  /* End callout */
4564*22dc650dSSadaf Ebrahimi 
4565*22dc650dSSadaf Ebrahimi 
4566*22dc650dSSadaf Ebrahimi       /* ---- Conditional group ---- */
4567*22dc650dSSadaf Ebrahimi 
4568*22dc650dSSadaf Ebrahimi       /* A condition can be an assertion, a number (referring to a numbered
4569*22dc650dSSadaf Ebrahimi       group's having been set), a name (referring to a named group), or 'R',
4570*22dc650dSSadaf Ebrahimi       referring to overall recursion. R<digits> and R&name are also permitted
4571*22dc650dSSadaf Ebrahimi       for recursion state tests. Numbers may be preceded by + or - to specify a
4572*22dc650dSSadaf Ebrahimi       relative group number.
4573*22dc650dSSadaf Ebrahimi 
4574*22dc650dSSadaf Ebrahimi       There are several syntaxes for testing a named group: (?(name)) is used
4575*22dc650dSSadaf Ebrahimi       by Python; Perl 5.10 onwards uses (?(<name>)) or (?('name')).
4576*22dc650dSSadaf Ebrahimi 
4577*22dc650dSSadaf Ebrahimi       There are two unfortunate ambiguities. 'R' can be the recursive thing or
4578*22dc650dSSadaf Ebrahimi       the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4579*22dc650dSSadaf Ebrahimi       the Perl DEFINE feature or the Python named test. We look for a name
4580*22dc650dSSadaf Ebrahimi       first; if not found, we try the other case.
4581*22dc650dSSadaf Ebrahimi 
4582*22dc650dSSadaf Ebrahimi       For compatibility with auto-callouts, we allow a callout to be specified
4583*22dc650dSSadaf Ebrahimi       before a condition that is an assertion. */
4584*22dc650dSSadaf Ebrahimi 
4585*22dc650dSSadaf Ebrahimi       case CHAR_LEFT_PARENTHESIS:
4586*22dc650dSSadaf Ebrahimi       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4587*22dc650dSSadaf Ebrahimi       nest_depth++;
4588*22dc650dSSadaf Ebrahimi 
4589*22dc650dSSadaf Ebrahimi       /* If the next character is ? or * there must be an assertion next
4590*22dc650dSSadaf Ebrahimi       (optionally preceded by a callout). We do not check this here, but
4591*22dc650dSSadaf Ebrahimi       instead we set expect_cond_assert to 2. If this is still greater than
4592*22dc650dSSadaf Ebrahimi       zero (callouts decrement it) when the next assertion is read, it will be
4593*22dc650dSSadaf Ebrahimi       marked as a condition that must not be repeated. A value greater than
4594*22dc650dSSadaf Ebrahimi       zero also causes checking that an assertion (possibly with callout)
4595*22dc650dSSadaf Ebrahimi       follows. */
4596*22dc650dSSadaf Ebrahimi 
4597*22dc650dSSadaf Ebrahimi       if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4598*22dc650dSSadaf Ebrahimi         {
4599*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = META_COND_ASSERT;
4600*22dc650dSSadaf Ebrahimi         ptr--;   /* Pull pointer back to the opening parenthesis. */
4601*22dc650dSSadaf Ebrahimi         expect_cond_assert = 2;
4602*22dc650dSSadaf Ebrahimi         break;  /* End of conditional */
4603*22dc650dSSadaf Ebrahimi         }
4604*22dc650dSSadaf Ebrahimi 
4605*22dc650dSSadaf Ebrahimi       /* Handle (?([+-]number)... */
4606*22dc650dSSadaf Ebrahimi 
4607*22dc650dSSadaf Ebrahimi       if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4608*22dc650dSSadaf Ebrahimi           &errorcode))
4609*22dc650dSSadaf Ebrahimi         {
4610*22dc650dSSadaf Ebrahimi         if (i <= 0)
4611*22dc650dSSadaf Ebrahimi           {
4612*22dc650dSSadaf Ebrahimi           errorcode = ERR15;
4613*22dc650dSSadaf Ebrahimi           goto FAILED;
4614*22dc650dSSadaf Ebrahimi           }
4615*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = META_COND_NUMBER;
4616*22dc650dSSadaf Ebrahimi         offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4617*22dc650dSSadaf Ebrahimi         PUTOFFSET(offset, parsed_pattern);
4618*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = i;
4619*22dc650dSSadaf Ebrahimi         }
4620*22dc650dSSadaf Ebrahimi       else if (errorcode != 0) goto FAILED;   /* Number too big */
4621*22dc650dSSadaf Ebrahimi 
4622*22dc650dSSadaf Ebrahimi       /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4623*22dc650dSSadaf Ebrahimi 
4624*22dc650dSSadaf Ebrahimi       else if (ptrend - ptr >= 10 &&
4625*22dc650dSSadaf Ebrahimi                PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4626*22dc650dSSadaf Ebrahimi                ptr[7] != CHAR_RIGHT_PARENTHESIS)
4627*22dc650dSSadaf Ebrahimi         {
4628*22dc650dSSadaf Ebrahimi         uint32_t ge = 0;
4629*22dc650dSSadaf Ebrahimi         int major = 0;
4630*22dc650dSSadaf Ebrahimi         int minor = 0;
4631*22dc650dSSadaf Ebrahimi 
4632*22dc650dSSadaf Ebrahimi         ptr += 7;
4633*22dc650dSSadaf Ebrahimi         if (*ptr == CHAR_GREATER_THAN_SIGN)
4634*22dc650dSSadaf Ebrahimi           {
4635*22dc650dSSadaf Ebrahimi           ge = 1;
4636*22dc650dSSadaf Ebrahimi           ptr++;
4637*22dc650dSSadaf Ebrahimi           }
4638*22dc650dSSadaf Ebrahimi 
4639*22dc650dSSadaf Ebrahimi         /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4640*22dc650dSSadaf Ebrahimi         references its argument twice. */
4641*22dc650dSSadaf Ebrahimi 
4642*22dc650dSSadaf Ebrahimi         if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4643*22dc650dSSadaf Ebrahimi           goto BAD_VERSION_CONDITION;
4644*22dc650dSSadaf Ebrahimi 
4645*22dc650dSSadaf Ebrahimi         if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4646*22dc650dSSadaf Ebrahimi           goto FAILED;
4647*22dc650dSSadaf Ebrahimi 
4648*22dc650dSSadaf Ebrahimi         if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4649*22dc650dSSadaf Ebrahimi         if (*ptr == CHAR_DOT)
4650*22dc650dSSadaf Ebrahimi           {
4651*22dc650dSSadaf Ebrahimi           if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4652*22dc650dSSadaf Ebrahimi           minor = (*ptr++ - CHAR_0) * 10;
4653*22dc650dSSadaf Ebrahimi           if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4654*22dc650dSSadaf Ebrahimi           if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4655*22dc650dSSadaf Ebrahimi           if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4656*22dc650dSSadaf Ebrahimi             goto BAD_VERSION_CONDITION;
4657*22dc650dSSadaf Ebrahimi           }
4658*22dc650dSSadaf Ebrahimi 
4659*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = META_COND_VERSION;
4660*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = ge;
4661*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = major;
4662*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = minor;
4663*22dc650dSSadaf Ebrahimi         }
4664*22dc650dSSadaf Ebrahimi 
4665*22dc650dSSadaf Ebrahimi       /* All the remaining cases now require us to read a name. We cannot at
4666*22dc650dSSadaf Ebrahimi       this stage distinguish ambiguous cases such as (?(R12) which might be a
4667*22dc650dSSadaf Ebrahimi       recursion test by number or a name, because the named groups have not yet
4668*22dc650dSSadaf Ebrahimi       all been identified. Those cases are treated as names, but given a
4669*22dc650dSSadaf Ebrahimi       different META code. */
4670*22dc650dSSadaf Ebrahimi 
4671*22dc650dSSadaf Ebrahimi       else
4672*22dc650dSSadaf Ebrahimi         {
4673*22dc650dSSadaf Ebrahimi         BOOL was_r_ampersand = FALSE;
4674*22dc650dSSadaf Ebrahimi 
4675*22dc650dSSadaf Ebrahimi         if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4676*22dc650dSSadaf Ebrahimi           {
4677*22dc650dSSadaf Ebrahimi           terminator = CHAR_RIGHT_PARENTHESIS;
4678*22dc650dSSadaf Ebrahimi           was_r_ampersand = TRUE;
4679*22dc650dSSadaf Ebrahimi           ptr++;
4680*22dc650dSSadaf Ebrahimi           }
4681*22dc650dSSadaf Ebrahimi         else if (*ptr == CHAR_LESS_THAN_SIGN)
4682*22dc650dSSadaf Ebrahimi           terminator = CHAR_GREATER_THAN_SIGN;
4683*22dc650dSSadaf Ebrahimi         else if (*ptr == CHAR_APOSTROPHE)
4684*22dc650dSSadaf Ebrahimi           terminator = CHAR_APOSTROPHE;
4685*22dc650dSSadaf Ebrahimi         else
4686*22dc650dSSadaf Ebrahimi           {
4687*22dc650dSSadaf Ebrahimi           terminator = CHAR_RIGHT_PARENTHESIS;
4688*22dc650dSSadaf Ebrahimi           ptr--;   /* Point to char before name */
4689*22dc650dSSadaf Ebrahimi           }
4690*22dc650dSSadaf Ebrahimi         if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4691*22dc650dSSadaf Ebrahimi             &errorcode, cb)) goto FAILED;
4692*22dc650dSSadaf Ebrahimi 
4693*22dc650dSSadaf Ebrahimi         /* Handle (?(R&name) */
4694*22dc650dSSadaf Ebrahimi 
4695*22dc650dSSadaf Ebrahimi         if (was_r_ampersand)
4696*22dc650dSSadaf Ebrahimi           {
4697*22dc650dSSadaf Ebrahimi           *parsed_pattern = META_COND_RNAME;
4698*22dc650dSSadaf Ebrahimi           ptr--;   /* Back to closing parens */
4699*22dc650dSSadaf Ebrahimi           }
4700*22dc650dSSadaf Ebrahimi 
4701*22dc650dSSadaf Ebrahimi         /* Handle (?(name). If the name is "DEFINE" we identify it with a
4702*22dc650dSSadaf Ebrahimi         special code. Likewise if the name consists of R followed only by
4703*22dc650dSSadaf Ebrahimi         digits. Otherwise, handle it like a quoted name. */
4704*22dc650dSSadaf Ebrahimi 
4705*22dc650dSSadaf Ebrahimi         else if (terminator == CHAR_RIGHT_PARENTHESIS)
4706*22dc650dSSadaf Ebrahimi           {
4707*22dc650dSSadaf Ebrahimi           if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4708*22dc650dSSadaf Ebrahimi             *parsed_pattern = META_COND_DEFINE;
4709*22dc650dSSadaf Ebrahimi           else
4710*22dc650dSSadaf Ebrahimi             {
4711*22dc650dSSadaf Ebrahimi             for (i = 1; i < (int)namelen; i++)
4712*22dc650dSSadaf Ebrahimi               if (!IS_DIGIT(name[i])) break;
4713*22dc650dSSadaf Ebrahimi             *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4714*22dc650dSSadaf Ebrahimi               META_COND_RNUMBER : META_COND_NAME;
4715*22dc650dSSadaf Ebrahimi             }
4716*22dc650dSSadaf Ebrahimi           ptr--;   /* Back to closing parens */
4717*22dc650dSSadaf Ebrahimi           }
4718*22dc650dSSadaf Ebrahimi 
4719*22dc650dSSadaf Ebrahimi         /* Handle (?('name') or (?(<name>) */
4720*22dc650dSSadaf Ebrahimi 
4721*22dc650dSSadaf Ebrahimi         else *parsed_pattern = META_COND_NAME;
4722*22dc650dSSadaf Ebrahimi 
4723*22dc650dSSadaf Ebrahimi         /* All these cases except DEFINE end with the name length and offset;
4724*22dc650dSSadaf Ebrahimi         DEFINE just has an offset (for the "too many branches" error). */
4725*22dc650dSSadaf Ebrahimi 
4726*22dc650dSSadaf Ebrahimi         if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4727*22dc650dSSadaf Ebrahimi         PUTOFFSET(offset, parsed_pattern);
4728*22dc650dSSadaf Ebrahimi         }  /* End cases that read a name */
4729*22dc650dSSadaf Ebrahimi 
4730*22dc650dSSadaf Ebrahimi       /* Check the closing parenthesis of the condition */
4731*22dc650dSSadaf Ebrahimi 
4732*22dc650dSSadaf Ebrahimi       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4733*22dc650dSSadaf Ebrahimi         {
4734*22dc650dSSadaf Ebrahimi         errorcode = ERR24;
4735*22dc650dSSadaf Ebrahimi         goto FAILED;
4736*22dc650dSSadaf Ebrahimi         }
4737*22dc650dSSadaf Ebrahimi       ptr++;
4738*22dc650dSSadaf Ebrahimi       break;  /* End of condition processing */
4739*22dc650dSSadaf Ebrahimi 
4740*22dc650dSSadaf Ebrahimi 
4741*22dc650dSSadaf Ebrahimi       /* ---- Atomic group ---- */
4742*22dc650dSSadaf Ebrahimi 
4743*22dc650dSSadaf Ebrahimi       case CHAR_GREATER_THAN_SIGN:
4744*22dc650dSSadaf Ebrahimi       ATOMIC_GROUP:                          /* Come from (*atomic: */
4745*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = META_ATOMIC;
4746*22dc650dSSadaf Ebrahimi       nest_depth++;
4747*22dc650dSSadaf Ebrahimi       ptr++;
4748*22dc650dSSadaf Ebrahimi       break;
4749*22dc650dSSadaf Ebrahimi 
4750*22dc650dSSadaf Ebrahimi 
4751*22dc650dSSadaf Ebrahimi       /* ---- Lookahead assertions ---- */
4752*22dc650dSSadaf Ebrahimi 
4753*22dc650dSSadaf Ebrahimi       case CHAR_EQUALS_SIGN:
4754*22dc650dSSadaf Ebrahimi       POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
4755*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = META_LOOKAHEAD;
4756*22dc650dSSadaf Ebrahimi       ptr++;
4757*22dc650dSSadaf Ebrahimi       goto POST_ASSERTION;
4758*22dc650dSSadaf Ebrahimi 
4759*22dc650dSSadaf Ebrahimi       case CHAR_ASTERISK:
4760*22dc650dSSadaf Ebrahimi       POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (?* */
4761*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = META_LOOKAHEAD_NA;
4762*22dc650dSSadaf Ebrahimi       ptr++;
4763*22dc650dSSadaf Ebrahimi       goto POST_ASSERTION;
4764*22dc650dSSadaf Ebrahimi 
4765*22dc650dSSadaf Ebrahimi       case CHAR_EXCLAMATION_MARK:
4766*22dc650dSSadaf Ebrahimi       NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
4767*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = META_LOOKAHEADNOT;
4768*22dc650dSSadaf Ebrahimi       ptr++;
4769*22dc650dSSadaf Ebrahimi       goto POST_ASSERTION;
4770*22dc650dSSadaf Ebrahimi 
4771*22dc650dSSadaf Ebrahimi 
4772*22dc650dSSadaf Ebrahimi       /* ---- Lookbehind assertions ---- */
4773*22dc650dSSadaf Ebrahimi 
4774*22dc650dSSadaf Ebrahimi       /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4775*22dc650dSSadaf Ebrahimi       is the start of the name of a capturing group. */
4776*22dc650dSSadaf Ebrahimi 
4777*22dc650dSSadaf Ebrahimi       case CHAR_LESS_THAN_SIGN:
4778*22dc650dSSadaf Ebrahimi       if (ptrend - ptr <= 1 ||
4779*22dc650dSSadaf Ebrahimi          (ptr[1] != CHAR_EQUALS_SIGN &&
4780*22dc650dSSadaf Ebrahimi           ptr[1] != CHAR_EXCLAMATION_MARK &&
4781*22dc650dSSadaf Ebrahimi           ptr[1] != CHAR_ASTERISK))
4782*22dc650dSSadaf Ebrahimi         {
4783*22dc650dSSadaf Ebrahimi         terminator = CHAR_GREATER_THAN_SIGN;
4784*22dc650dSSadaf Ebrahimi         goto DEFINE_NAME;
4785*22dc650dSSadaf Ebrahimi         }
4786*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4787*22dc650dSSadaf Ebrahimi         META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4788*22dc650dSSadaf Ebrahimi         META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4789*22dc650dSSadaf Ebrahimi 
4790*22dc650dSSadaf Ebrahimi       POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
4791*22dc650dSSadaf Ebrahimi       *has_lookbehind = TRUE;
4792*22dc650dSSadaf Ebrahimi       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4793*22dc650dSSadaf Ebrahimi       PUTOFFSET(offset, parsed_pattern);
4794*22dc650dSSadaf Ebrahimi       ptr += 2;
4795*22dc650dSSadaf Ebrahimi       /* Fall through */
4796*22dc650dSSadaf Ebrahimi 
4797*22dc650dSSadaf Ebrahimi       /* If the previous item was a condition starting (?(? an assertion,
4798*22dc650dSSadaf Ebrahimi       optionally preceded by a callout, is expected. This is checked later on,
4799*22dc650dSSadaf Ebrahimi       during actual compilation. However we need to identify this kind of
4800*22dc650dSSadaf Ebrahimi       assertion in this pass because it must not be qualified. The value of
4801*22dc650dSSadaf Ebrahimi       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4802*22dc650dSSadaf Ebrahimi       for a callout - still leaving a positive value that identifies the
4803*22dc650dSSadaf Ebrahimi       assertion. Multiple callouts or any other items will make it zero or
4804*22dc650dSSadaf Ebrahimi       less, which doesn't matter because they will cause an error later. */
4805*22dc650dSSadaf Ebrahimi 
4806*22dc650dSSadaf Ebrahimi       POST_ASSERTION:
4807*22dc650dSSadaf Ebrahimi       nest_depth++;
4808*22dc650dSSadaf Ebrahimi       if (prev_expect_cond_assert > 0)
4809*22dc650dSSadaf Ebrahimi         {
4810*22dc650dSSadaf Ebrahimi         if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4811*22dc650dSSadaf Ebrahimi         else if (++top_nest >= end_nests)
4812*22dc650dSSadaf Ebrahimi           {
4813*22dc650dSSadaf Ebrahimi           errorcode = ERR84;
4814*22dc650dSSadaf Ebrahimi           goto FAILED;
4815*22dc650dSSadaf Ebrahimi           }
4816*22dc650dSSadaf Ebrahimi         top_nest->nest_depth = nest_depth;
4817*22dc650dSSadaf Ebrahimi         top_nest->flags = NSF_CONDASSERT;
4818*22dc650dSSadaf Ebrahimi         top_nest->options = options & PARSE_TRACKED_OPTIONS;
4819*22dc650dSSadaf Ebrahimi         top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4820*22dc650dSSadaf Ebrahimi         }
4821*22dc650dSSadaf Ebrahimi       break;
4822*22dc650dSSadaf Ebrahimi 
4823*22dc650dSSadaf Ebrahimi 
4824*22dc650dSSadaf Ebrahimi       /* ---- Define a named group ---- */
4825*22dc650dSSadaf Ebrahimi 
4826*22dc650dSSadaf Ebrahimi       /* A named group may be defined as (?'name') or (?<name>). In the latter
4827*22dc650dSSadaf Ebrahimi       case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4828*22dc650dSSadaf Ebrahimi       terminator set to '>'. */
4829*22dc650dSSadaf Ebrahimi 
4830*22dc650dSSadaf Ebrahimi       case CHAR_APOSTROPHE:
4831*22dc650dSSadaf Ebrahimi       terminator = CHAR_APOSTROPHE;    /* Terminator */
4832*22dc650dSSadaf Ebrahimi 
4833*22dc650dSSadaf Ebrahimi       DEFINE_NAME:
4834*22dc650dSSadaf Ebrahimi       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4835*22dc650dSSadaf Ebrahimi           &errorcode, cb)) goto FAILED;
4836*22dc650dSSadaf Ebrahimi 
4837*22dc650dSSadaf Ebrahimi       /* We have a name for this capturing group. It is also assigned a number,
4838*22dc650dSSadaf Ebrahimi       which is its primary means of identification. */
4839*22dc650dSSadaf Ebrahimi 
4840*22dc650dSSadaf Ebrahimi       if (cb->bracount >= MAX_GROUP_NUMBER)
4841*22dc650dSSadaf Ebrahimi         {
4842*22dc650dSSadaf Ebrahimi         errorcode = ERR97;
4843*22dc650dSSadaf Ebrahimi         goto FAILED;
4844*22dc650dSSadaf Ebrahimi         }
4845*22dc650dSSadaf Ebrahimi       cb->bracount++;
4846*22dc650dSSadaf Ebrahimi       *parsed_pattern++ = META_CAPTURE | cb->bracount;
4847*22dc650dSSadaf Ebrahimi       nest_depth++;
4848*22dc650dSSadaf Ebrahimi 
4849*22dc650dSSadaf Ebrahimi       /* Check not too many names */
4850*22dc650dSSadaf Ebrahimi 
4851*22dc650dSSadaf Ebrahimi       if (cb->names_found >= MAX_NAME_COUNT)
4852*22dc650dSSadaf Ebrahimi         {
4853*22dc650dSSadaf Ebrahimi         errorcode = ERR49;
4854*22dc650dSSadaf Ebrahimi         goto FAILED;
4855*22dc650dSSadaf Ebrahimi         }
4856*22dc650dSSadaf Ebrahimi 
4857*22dc650dSSadaf Ebrahimi       /* Adjust the entry size to accommodate the longest name found. */
4858*22dc650dSSadaf Ebrahimi 
4859*22dc650dSSadaf Ebrahimi       if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4860*22dc650dSSadaf Ebrahimi         cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4861*22dc650dSSadaf Ebrahimi 
4862*22dc650dSSadaf Ebrahimi       /* Scan the list to check for duplicates. For duplicate names, if the
4863*22dc650dSSadaf Ebrahimi       number is the same, break the loop, which causes the name to be
4864*22dc650dSSadaf Ebrahimi       discarded; otherwise, if DUPNAMES is not set, give an error.
4865*22dc650dSSadaf Ebrahimi       If it is set, allow the name with a different number, but continue
4866*22dc650dSSadaf Ebrahimi       scanning in case this is a duplicate with the same number. For
4867*22dc650dSSadaf Ebrahimi       non-duplicate names, give an error if the number is duplicated. */
4868*22dc650dSSadaf Ebrahimi 
4869*22dc650dSSadaf Ebrahimi       isdupname = FALSE;
4870*22dc650dSSadaf Ebrahimi       ng = cb->named_groups;
4871*22dc650dSSadaf Ebrahimi       for (i = 0; i < cb->names_found; i++, ng++)
4872*22dc650dSSadaf Ebrahimi         {
4873*22dc650dSSadaf Ebrahimi         if (namelen == ng->length &&
4874*22dc650dSSadaf Ebrahimi             PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4875*22dc650dSSadaf Ebrahimi           {
4876*22dc650dSSadaf Ebrahimi           if (ng->number == cb->bracount) break;
4877*22dc650dSSadaf Ebrahimi           if ((options & PCRE2_DUPNAMES) == 0)
4878*22dc650dSSadaf Ebrahimi             {
4879*22dc650dSSadaf Ebrahimi             errorcode = ERR43;
4880*22dc650dSSadaf Ebrahimi             goto FAILED;
4881*22dc650dSSadaf Ebrahimi             }
4882*22dc650dSSadaf Ebrahimi           isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
4883*22dc650dSSadaf Ebrahimi           cb->dupnames = TRUE;              /* Duplicate names exist */
4884*22dc650dSSadaf Ebrahimi           }
4885*22dc650dSSadaf Ebrahimi         else if (ng->number == cb->bracount)
4886*22dc650dSSadaf Ebrahimi           {
4887*22dc650dSSadaf Ebrahimi           errorcode = ERR65;
4888*22dc650dSSadaf Ebrahimi           goto FAILED;
4889*22dc650dSSadaf Ebrahimi           }
4890*22dc650dSSadaf Ebrahimi         }
4891*22dc650dSSadaf Ebrahimi 
4892*22dc650dSSadaf Ebrahimi       if (i < cb->names_found) break;   /* Ignore duplicate with same number */
4893*22dc650dSSadaf Ebrahimi 
4894*22dc650dSSadaf Ebrahimi       /* Increase the list size if necessary */
4895*22dc650dSSadaf Ebrahimi 
4896*22dc650dSSadaf Ebrahimi       if (cb->names_found >= cb->named_group_list_size)
4897*22dc650dSSadaf Ebrahimi         {
4898*22dc650dSSadaf Ebrahimi         uint32_t newsize = cb->named_group_list_size * 2;
4899*22dc650dSSadaf Ebrahimi         named_group *newspace =
4900*22dc650dSSadaf Ebrahimi           cb->cx->memctl.malloc(newsize * sizeof(named_group),
4901*22dc650dSSadaf Ebrahimi           cb->cx->memctl.memory_data);
4902*22dc650dSSadaf Ebrahimi         if (newspace == NULL)
4903*22dc650dSSadaf Ebrahimi           {
4904*22dc650dSSadaf Ebrahimi           errorcode = ERR21;
4905*22dc650dSSadaf Ebrahimi           goto FAILED;
4906*22dc650dSSadaf Ebrahimi           }
4907*22dc650dSSadaf Ebrahimi 
4908*22dc650dSSadaf Ebrahimi         memcpy(newspace, cb->named_groups,
4909*22dc650dSSadaf Ebrahimi           cb->named_group_list_size * sizeof(named_group));
4910*22dc650dSSadaf Ebrahimi         if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4911*22dc650dSSadaf Ebrahimi           cb->cx->memctl.free((void *)cb->named_groups,
4912*22dc650dSSadaf Ebrahimi           cb->cx->memctl.memory_data);
4913*22dc650dSSadaf Ebrahimi         cb->named_groups = newspace;
4914*22dc650dSSadaf Ebrahimi         cb->named_group_list_size = newsize;
4915*22dc650dSSadaf Ebrahimi         }
4916*22dc650dSSadaf Ebrahimi 
4917*22dc650dSSadaf Ebrahimi       /* Add this name to the list */
4918*22dc650dSSadaf Ebrahimi 
4919*22dc650dSSadaf Ebrahimi       cb->named_groups[cb->names_found].name = name;
4920*22dc650dSSadaf Ebrahimi       cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4921*22dc650dSSadaf Ebrahimi       cb->named_groups[cb->names_found].number = cb->bracount;
4922*22dc650dSSadaf Ebrahimi       cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4923*22dc650dSSadaf Ebrahimi       cb->names_found++;
4924*22dc650dSSadaf Ebrahimi       break;
4925*22dc650dSSadaf Ebrahimi       }        /* End of (? switch */
4926*22dc650dSSadaf Ebrahimi     break;     /* End of ( handling */
4927*22dc650dSSadaf Ebrahimi 
4928*22dc650dSSadaf Ebrahimi 
4929*22dc650dSSadaf Ebrahimi     /* ---- Branch terminators ---- */
4930*22dc650dSSadaf Ebrahimi 
4931*22dc650dSSadaf Ebrahimi     /* Alternation: reset the capture count if we are in a (?| group. */
4932*22dc650dSSadaf Ebrahimi 
4933*22dc650dSSadaf Ebrahimi     case CHAR_VERTICAL_LINE:
4934*22dc650dSSadaf Ebrahimi     if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4935*22dc650dSSadaf Ebrahimi         (top_nest->flags & NSF_RESET) != 0)
4936*22dc650dSSadaf Ebrahimi       {
4937*22dc650dSSadaf Ebrahimi       if (cb->bracount > top_nest->max_group)
4938*22dc650dSSadaf Ebrahimi         top_nest->max_group = (uint16_t)cb->bracount;
4939*22dc650dSSadaf Ebrahimi       cb->bracount = top_nest->reset_group;
4940*22dc650dSSadaf Ebrahimi       }
4941*22dc650dSSadaf Ebrahimi     *parsed_pattern++ = META_ALT;
4942*22dc650dSSadaf Ebrahimi     break;
4943*22dc650dSSadaf Ebrahimi 
4944*22dc650dSSadaf Ebrahimi     /* End of group; reset the capture count to the maximum if we are in a (?|
4945*22dc650dSSadaf Ebrahimi     group and/or reset the options that are tracked during parsing. Disallow
4946*22dc650dSSadaf Ebrahimi     quantifier for a condition that is an assertion. */
4947*22dc650dSSadaf Ebrahimi 
4948*22dc650dSSadaf Ebrahimi     case CHAR_RIGHT_PARENTHESIS:
4949*22dc650dSSadaf Ebrahimi     okquantifier = TRUE;
4950*22dc650dSSadaf Ebrahimi     if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4951*22dc650dSSadaf Ebrahimi       {
4952*22dc650dSSadaf Ebrahimi       options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4953*22dc650dSSadaf Ebrahimi       xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
4954*22dc650dSSadaf Ebrahimi       if ((top_nest->flags & NSF_RESET) != 0 &&
4955*22dc650dSSadaf Ebrahimi           top_nest->max_group > cb->bracount)
4956*22dc650dSSadaf Ebrahimi         cb->bracount = top_nest->max_group;
4957*22dc650dSSadaf Ebrahimi       if ((top_nest->flags & NSF_CONDASSERT) != 0)
4958*22dc650dSSadaf Ebrahimi         okquantifier = FALSE;
4959*22dc650dSSadaf Ebrahimi 
4960*22dc650dSSadaf Ebrahimi       if ((top_nest->flags & NSF_ATOMICSR) != 0)
4961*22dc650dSSadaf Ebrahimi         {
4962*22dc650dSSadaf Ebrahimi         *parsed_pattern++ = META_KET;
4963*22dc650dSSadaf Ebrahimi         }
4964*22dc650dSSadaf Ebrahimi 
4965*22dc650dSSadaf Ebrahimi       if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4966*22dc650dSSadaf Ebrahimi         else top_nest--;
4967*22dc650dSSadaf Ebrahimi       }
4968*22dc650dSSadaf Ebrahimi     if (nest_depth == 0)    /* Unmatched closing parenthesis */
4969*22dc650dSSadaf Ebrahimi       {
4970*22dc650dSSadaf Ebrahimi       errorcode = ERR22;
4971*22dc650dSSadaf Ebrahimi       goto FAILED_BACK;
4972*22dc650dSSadaf Ebrahimi       }
4973*22dc650dSSadaf Ebrahimi     nest_depth--;
4974*22dc650dSSadaf Ebrahimi     *parsed_pattern++ = META_KET;
4975*22dc650dSSadaf Ebrahimi     break;
4976*22dc650dSSadaf Ebrahimi     }  /* End of switch on pattern character */
4977*22dc650dSSadaf Ebrahimi   }    /* End of main character scan loop */
4978*22dc650dSSadaf Ebrahimi 
4979*22dc650dSSadaf Ebrahimi /* End of pattern reached. Check for missing ) at the end of a verb name. */
4980*22dc650dSSadaf Ebrahimi 
4981*22dc650dSSadaf Ebrahimi if (inverbname && ptr >= ptrend)
4982*22dc650dSSadaf Ebrahimi   {
4983*22dc650dSSadaf Ebrahimi   errorcode = ERR60;
4984*22dc650dSSadaf Ebrahimi   goto FAILED;
4985*22dc650dSSadaf Ebrahimi   }
4986*22dc650dSSadaf Ebrahimi 
4987*22dc650dSSadaf Ebrahimi /* Manage callout for the final item */
4988*22dc650dSSadaf Ebrahimi 
4989*22dc650dSSadaf Ebrahimi PARSED_END:
4990*22dc650dSSadaf Ebrahimi parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4991*22dc650dSSadaf Ebrahimi   parsed_pattern, cb);
4992*22dc650dSSadaf Ebrahimi 
4993*22dc650dSSadaf Ebrahimi /* Insert trailing items for word and line matching (features provided for the
4994*22dc650dSSadaf Ebrahimi benefit of pcre2grep). */
4995*22dc650dSSadaf Ebrahimi 
4996*22dc650dSSadaf Ebrahimi if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
4997*22dc650dSSadaf Ebrahimi   {
4998*22dc650dSSadaf Ebrahimi   *parsed_pattern++ = META_KET;
4999*22dc650dSSadaf Ebrahimi   *parsed_pattern++ = META_DOLLAR;
5000*22dc650dSSadaf Ebrahimi   }
5001*22dc650dSSadaf Ebrahimi else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5002*22dc650dSSadaf Ebrahimi   {
5003*22dc650dSSadaf Ebrahimi   *parsed_pattern++ = META_KET;
5004*22dc650dSSadaf Ebrahimi   *parsed_pattern++ = META_ESCAPE + ESC_b;
5005*22dc650dSSadaf Ebrahimi   }
5006*22dc650dSSadaf Ebrahimi 
5007*22dc650dSSadaf Ebrahimi /* Terminate the parsed pattern, then return success if all groups are closed.
5008*22dc650dSSadaf Ebrahimi Otherwise we have unclosed parentheses. */
5009*22dc650dSSadaf Ebrahimi 
5010*22dc650dSSadaf Ebrahimi if (parsed_pattern >= parsed_pattern_end)
5011*22dc650dSSadaf Ebrahimi   {
5012*22dc650dSSadaf Ebrahimi   errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
5013*22dc650dSSadaf Ebrahimi   goto FAILED;
5014*22dc650dSSadaf Ebrahimi   }
5015*22dc650dSSadaf Ebrahimi 
5016*22dc650dSSadaf Ebrahimi *parsed_pattern = META_END;
5017*22dc650dSSadaf Ebrahimi if (nest_depth == 0) return 0;
5018*22dc650dSSadaf Ebrahimi 
5019*22dc650dSSadaf Ebrahimi UNCLOSED_PARENTHESIS:
5020*22dc650dSSadaf Ebrahimi errorcode = ERR14;
5021*22dc650dSSadaf Ebrahimi 
5022*22dc650dSSadaf Ebrahimi /* Come here for all failures. */
5023*22dc650dSSadaf Ebrahimi 
5024*22dc650dSSadaf Ebrahimi FAILED:
5025*22dc650dSSadaf Ebrahimi cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5026*22dc650dSSadaf Ebrahimi return errorcode;
5027*22dc650dSSadaf Ebrahimi 
5028*22dc650dSSadaf Ebrahimi /* Some errors need to indicate the previous character. */
5029*22dc650dSSadaf Ebrahimi 
5030*22dc650dSSadaf Ebrahimi FAILED_BACK:
5031*22dc650dSSadaf Ebrahimi ptr--;
5032*22dc650dSSadaf Ebrahimi goto FAILED;
5033*22dc650dSSadaf Ebrahimi 
5034*22dc650dSSadaf Ebrahimi /* This failure happens several times. */
5035*22dc650dSSadaf Ebrahimi 
5036*22dc650dSSadaf Ebrahimi BAD_VERSION_CONDITION:
5037*22dc650dSSadaf Ebrahimi errorcode = ERR79;
5038*22dc650dSSadaf Ebrahimi goto FAILED;
5039*22dc650dSSadaf Ebrahimi }
5040*22dc650dSSadaf Ebrahimi 
5041*22dc650dSSadaf Ebrahimi 
5042*22dc650dSSadaf Ebrahimi 
5043*22dc650dSSadaf Ebrahimi /*************************************************
5044*22dc650dSSadaf Ebrahimi *       Find first significant opcode            *
5045*22dc650dSSadaf Ebrahimi *************************************************/
5046*22dc650dSSadaf Ebrahimi 
5047*22dc650dSSadaf Ebrahimi /* This is called by several functions that scan a compiled expression looking
5048*22dc650dSSadaf Ebrahimi for a fixed first character, or an anchoring opcode etc. It skips over things
5049*22dc650dSSadaf Ebrahimi that do not influence this. For some calls, it makes sense to skip negative
5050*22dc650dSSadaf Ebrahimi forward and all backward assertions, and also the \b assertion; for others it
5051*22dc650dSSadaf Ebrahimi does not.
5052*22dc650dSSadaf Ebrahimi 
5053*22dc650dSSadaf Ebrahimi Arguments:
5054*22dc650dSSadaf Ebrahimi   code         pointer to the start of the group
5055*22dc650dSSadaf Ebrahimi   skipassert   TRUE if certain assertions are to be skipped
5056*22dc650dSSadaf Ebrahimi 
5057*22dc650dSSadaf Ebrahimi Returns:       pointer to the first significant opcode
5058*22dc650dSSadaf Ebrahimi */
5059*22dc650dSSadaf Ebrahimi 
5060*22dc650dSSadaf Ebrahimi static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)5061*22dc650dSSadaf Ebrahimi first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5062*22dc650dSSadaf Ebrahimi {
5063*22dc650dSSadaf Ebrahimi for (;;)
5064*22dc650dSSadaf Ebrahimi   {
5065*22dc650dSSadaf Ebrahimi   switch ((int)*code)
5066*22dc650dSSadaf Ebrahimi     {
5067*22dc650dSSadaf Ebrahimi     case OP_ASSERT_NOT:
5068*22dc650dSSadaf Ebrahimi     case OP_ASSERTBACK:
5069*22dc650dSSadaf Ebrahimi     case OP_ASSERTBACK_NOT:
5070*22dc650dSSadaf Ebrahimi     case OP_ASSERTBACK_NA:
5071*22dc650dSSadaf Ebrahimi     if (!skipassert) return code;
5072*22dc650dSSadaf Ebrahimi     do code += GET(code, 1); while (*code == OP_ALT);
5073*22dc650dSSadaf Ebrahimi     code += PRIV(OP_lengths)[*code];
5074*22dc650dSSadaf Ebrahimi     break;
5075*22dc650dSSadaf Ebrahimi 
5076*22dc650dSSadaf Ebrahimi     case OP_WORD_BOUNDARY:
5077*22dc650dSSadaf Ebrahimi     case OP_NOT_WORD_BOUNDARY:
5078*22dc650dSSadaf Ebrahimi     case OP_UCP_WORD_BOUNDARY:
5079*22dc650dSSadaf Ebrahimi     case OP_NOT_UCP_WORD_BOUNDARY:
5080*22dc650dSSadaf Ebrahimi     if (!skipassert) return code;
5081*22dc650dSSadaf Ebrahimi     /* Fall through */
5082*22dc650dSSadaf Ebrahimi 
5083*22dc650dSSadaf Ebrahimi     case OP_CALLOUT:
5084*22dc650dSSadaf Ebrahimi     case OP_CREF:
5085*22dc650dSSadaf Ebrahimi     case OP_DNCREF:
5086*22dc650dSSadaf Ebrahimi     case OP_RREF:
5087*22dc650dSSadaf Ebrahimi     case OP_DNRREF:
5088*22dc650dSSadaf Ebrahimi     case OP_FALSE:
5089*22dc650dSSadaf Ebrahimi     case OP_TRUE:
5090*22dc650dSSadaf Ebrahimi     code += PRIV(OP_lengths)[*code];
5091*22dc650dSSadaf Ebrahimi     break;
5092*22dc650dSSadaf Ebrahimi 
5093*22dc650dSSadaf Ebrahimi     case OP_CALLOUT_STR:
5094*22dc650dSSadaf Ebrahimi     code += GET(code, 1 + 2*LINK_SIZE);
5095*22dc650dSSadaf Ebrahimi     break;
5096*22dc650dSSadaf Ebrahimi 
5097*22dc650dSSadaf Ebrahimi     case OP_SKIPZERO:
5098*22dc650dSSadaf Ebrahimi     code += 2 + GET(code, 2) + LINK_SIZE;
5099*22dc650dSSadaf Ebrahimi     break;
5100*22dc650dSSadaf Ebrahimi 
5101*22dc650dSSadaf Ebrahimi     case OP_COND:
5102*22dc650dSSadaf Ebrahimi     case OP_SCOND:
5103*22dc650dSSadaf Ebrahimi     if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
5104*22dc650dSSadaf Ebrahimi         code[GET(code, 1)] != OP_KET)      /* More than one branch */
5105*22dc650dSSadaf Ebrahimi       return code;
5106*22dc650dSSadaf Ebrahimi     code += GET(code, 1) + 1 + LINK_SIZE;
5107*22dc650dSSadaf Ebrahimi     break;
5108*22dc650dSSadaf Ebrahimi 
5109*22dc650dSSadaf Ebrahimi     case OP_MARK:
5110*22dc650dSSadaf Ebrahimi     case OP_COMMIT_ARG:
5111*22dc650dSSadaf Ebrahimi     case OP_PRUNE_ARG:
5112*22dc650dSSadaf Ebrahimi     case OP_SKIP_ARG:
5113*22dc650dSSadaf Ebrahimi     case OP_THEN_ARG:
5114*22dc650dSSadaf Ebrahimi     code += code[1] + PRIV(OP_lengths)[*code];
5115*22dc650dSSadaf Ebrahimi     break;
5116*22dc650dSSadaf Ebrahimi 
5117*22dc650dSSadaf Ebrahimi     default:
5118*22dc650dSSadaf Ebrahimi     return code;
5119*22dc650dSSadaf Ebrahimi     }
5120*22dc650dSSadaf Ebrahimi   }
5121*22dc650dSSadaf Ebrahimi /* Control never reaches here */
5122*22dc650dSSadaf Ebrahimi }
5123*22dc650dSSadaf Ebrahimi 
5124*22dc650dSSadaf Ebrahimi 
5125*22dc650dSSadaf Ebrahimi 
#ifdef SUPPORT_UNICODE
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Scan upwards through the code points from *cptr to d inclusive (a class
range in UCP mode, which may be a single character) and report the next piece
of other-case information. Each call consumes part of the input range and
stores the continuation point in *cptr, so the caller can simply call again
until -1 is returned.

Two kinds of result are produced:

  . A character that has a caseless set (more than one other case) is handed
    back on its own. Only *ocptr is written in this case; it holds the
    character itself, not one of its other cases.

  . A character with exactly one, different, other case starts a run. The run
    is extended while the following input characters have no caseless set and
    their other cases continue consecutively.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range
  restricted  TRUE if caseless restriction applies

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases;
                 for this return, *ocptr contains the original
*/

static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr, BOOL restricted)
{
uint32_t ch = *cptr;
uint32_t first_oc = 0;
uint32_t expect;

/* Phase 1: locate the first character that has any other case. */

while (ch <= d)
  {
  unsigned int setoff;

  /* In the 32-bit library values beyond the Unicode maximum can appear in the
  range; they terminate the scan. */

#if PCRE2_CODE_UNIT_WIDTH == 32
  if (ch > MAX_UTF_CODE_POINT) return -1;
#endif

  /* A non-zero caseset offset means multiple other cases. When the caseless
  restriction applies, sets whose first entry is an ASCII value are ignored and
  the character is then treated via its single other case below. */

  setoff = UCD_CASESET(ch);
  if (setoff != 0 && (!restricted || PRIV(ucd_caseless_sets)[setoff] > 127))
    {
    *ocptr = ch;          /* The character that owns the set */
    *cptr = ch + 1;       /* Where the next call resumes */
    return (int)setoff;
    }

  /* Otherwise look for a single other case that differs from the character
  itself. */

  first_oc = UCD_OTHERCASE(ch);
  if (first_oc != ch) break;
  ch++;
  }

/* Falling out of the loop without a hit means the input is exhausted. */

if (ch > d) return -1;

/* Phase 2: ch has exactly one other case. Extend the run for as long as the
next input character has no caseless set and its other case is the next
consecutive value. */

*ocptr = first_oc;
expect = first_oc + 1;

for (ch++; ch <= d; ch++, expect++)
  {
  if (UCD_CASESET(ch) != 0) break;
  if (UCD_OTHERCASE(ch) != expect) break;
  }

*odptr = expect - 1;      /* Last value in the othercase run */
*cptr = ch;               /* Where the next call resumes */
return 0;
}
#endif  /* SUPPORT_UNICODE */
5203*22dc650dSSadaf Ebrahimi 
5204*22dc650dSSadaf Ebrahimi 
5205*22dc650dSSadaf Ebrahimi 
5206*22dc650dSSadaf Ebrahimi /*************************************************
5207*22dc650dSSadaf Ebrahimi * Add a character or range to a class (internal) *
5208*22dc650dSSadaf Ebrahimi *************************************************/
5209*22dc650dSSadaf Ebrahimi 
5210*22dc650dSSadaf Ebrahimi /* This function packages up the logic of adding a character or range of
5211*22dc650dSSadaf Ebrahimi characters to a class. The character values in the arguments will be within the
5212*22dc650dSSadaf Ebrahimi valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
5213*22dc650dSSadaf Ebrahimi called only from within the "add to class" group of functions, some of which
5214*22dc650dSSadaf Ebrahimi are recursive and mutually recursive. The external entry point is
5215*22dc650dSSadaf Ebrahimi add_to_class().
5216*22dc650dSSadaf Ebrahimi 
5217*22dc650dSSadaf Ebrahimi Arguments:
5218*22dc650dSSadaf Ebrahimi   classbits     the bit map for characters < 256
5219*22dc650dSSadaf Ebrahimi   uchardptr     points to the pointer for extra data
5220*22dc650dSSadaf Ebrahimi   options       the options bits
5221*22dc650dSSadaf Ebrahimi   xoptions      the extra options bits
5222*22dc650dSSadaf Ebrahimi   cb            compile data
5223*22dc650dSSadaf Ebrahimi   start         start of range character
5224*22dc650dSSadaf Ebrahimi   end           end of range character
5225*22dc650dSSadaf Ebrahimi 
5226*22dc650dSSadaf Ebrahimi Returns:        the number of < 256 characters added
5227*22dc650dSSadaf Ebrahimi                 the pointer to extra data is updated
5228*22dc650dSSadaf Ebrahimi */
5229*22dc650dSSadaf Ebrahimi 
5230*22dc650dSSadaf Ebrahimi static unsigned int
add_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,uint32_t start,uint32_t end)5231*22dc650dSSadaf Ebrahimi add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5232*22dc650dSSadaf Ebrahimi   uint32_t options, uint32_t xoptions, compile_block *cb, uint32_t start,
5233*22dc650dSSadaf Ebrahimi   uint32_t end)
5234*22dc650dSSadaf Ebrahimi {
5235*22dc650dSSadaf Ebrahimi uint32_t c;
5236*22dc650dSSadaf Ebrahimi uint32_t classbits_end = (end <= 0xff ? end : 0xff);
5237*22dc650dSSadaf Ebrahimi unsigned int n8 = 0;
5238*22dc650dSSadaf Ebrahimi 
5239*22dc650dSSadaf Ebrahimi /* If caseless matching is required, scan the range and process alternate
5240*22dc650dSSadaf Ebrahimi cases. In Unicode, there are 8-bit characters that have alternate cases that
5241*22dc650dSSadaf Ebrahimi are greater than 255 and vice-versa (though these may be ignored if caseless
5242*22dc650dSSadaf Ebrahimi restriction is in force). Sometimes we can just extend the original range. */
5243*22dc650dSSadaf Ebrahimi 
5244*22dc650dSSadaf Ebrahimi if ((options & PCRE2_CASELESS) != 0)
5245*22dc650dSSadaf Ebrahimi   {
5246*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
5247*22dc650dSSadaf Ebrahimi   if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
5248*22dc650dSSadaf Ebrahimi     {
5249*22dc650dSSadaf Ebrahimi     int rc;
5250*22dc650dSSadaf Ebrahimi     uint32_t oc, od;
5251*22dc650dSSadaf Ebrahimi 
5252*22dc650dSSadaf Ebrahimi     options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
5253*22dc650dSSadaf Ebrahimi     c = start;
5254*22dc650dSSadaf Ebrahimi 
5255*22dc650dSSadaf Ebrahimi     while ((rc = get_othercase_range(&c, end, &oc, &od,
5256*22dc650dSSadaf Ebrahimi              (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)) >= 0)
5257*22dc650dSSadaf Ebrahimi       {
5258*22dc650dSSadaf Ebrahimi       /* Handle a single character that has more than one other case. */
5259*22dc650dSSadaf Ebrahimi 
5260*22dc650dSSadaf Ebrahimi       if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr,
5261*22dc650dSSadaf Ebrahimi         options, xoptions, cb, PRIV(ucd_caseless_sets) + rc, oc);
5262*22dc650dSSadaf Ebrahimi 
5263*22dc650dSSadaf Ebrahimi       /* Do nothing if the other case range is within the original range. */
5264*22dc650dSSadaf Ebrahimi 
5265*22dc650dSSadaf Ebrahimi       else if (oc >= cb->class_range_start && od <= cb->class_range_end)
5266*22dc650dSSadaf Ebrahimi         continue;
5267*22dc650dSSadaf Ebrahimi 
5268*22dc650dSSadaf Ebrahimi       /* Extend the original range if there is overlap, noting that if oc < c,
5269*22dc650dSSadaf Ebrahimi       we can't have od > end because a subrange is always shorter than the
5270*22dc650dSSadaf Ebrahimi       basic range. Otherwise, use a recursive call to add the additional range.
5271*22dc650dSSadaf Ebrahimi       */
5272*22dc650dSSadaf Ebrahimi 
5273*22dc650dSSadaf Ebrahimi       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
5274*22dc650dSSadaf Ebrahimi       else if (od > end && oc <= end + 1)
5275*22dc650dSSadaf Ebrahimi         {
5276*22dc650dSSadaf Ebrahimi         end = od;       /* Extend upwards */
5277*22dc650dSSadaf Ebrahimi         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
5278*22dc650dSSadaf Ebrahimi         }
5279*22dc650dSSadaf Ebrahimi       else n8 += add_to_class_internal(classbits, uchardptr, options, xoptions,
5280*22dc650dSSadaf Ebrahimi         cb, oc, od);
5281*22dc650dSSadaf Ebrahimi       }
5282*22dc650dSSadaf Ebrahimi     }
5283*22dc650dSSadaf Ebrahimi   else
5284*22dc650dSSadaf Ebrahimi #else
5285*22dc650dSSadaf Ebrahimi   (void)xoptions;   /* Avoid compiler warning */
5286*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
5287*22dc650dSSadaf Ebrahimi 
5288*22dc650dSSadaf Ebrahimi   /* Not UTF mode */
5289*22dc650dSSadaf Ebrahimi 
5290*22dc650dSSadaf Ebrahimi   for (c = start; c <= classbits_end; c++)
5291*22dc650dSSadaf Ebrahimi     {
5292*22dc650dSSadaf Ebrahimi     SETBIT(classbits, cb->fcc[c]);
5293*22dc650dSSadaf Ebrahimi     n8++;
5294*22dc650dSSadaf Ebrahimi     }
5295*22dc650dSSadaf Ebrahimi   }
5296*22dc650dSSadaf Ebrahimi 
5297*22dc650dSSadaf Ebrahimi /* Now handle the originally supplied range. Adjust the final value according
5298*22dc650dSSadaf Ebrahimi to the bit length - this means that the same lists of (e.g.) horizontal spaces
5299*22dc650dSSadaf Ebrahimi can be used in all cases. */
5300*22dc650dSSadaf Ebrahimi 
5301*22dc650dSSadaf Ebrahimi if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
5302*22dc650dSSadaf Ebrahimi   end = MAX_NON_UTF_CHAR;
5303*22dc650dSSadaf Ebrahimi 
5304*22dc650dSSadaf Ebrahimi if (start > cb->class_range_start && end < cb->class_range_end) return n8;
5305*22dc650dSSadaf Ebrahimi 
5306*22dc650dSSadaf Ebrahimi /* Use the bitmap for characters < 256. Otherwise use extra data.*/
5307*22dc650dSSadaf Ebrahimi 
5308*22dc650dSSadaf Ebrahimi for (c = start; c <= classbits_end; c++)
5309*22dc650dSSadaf Ebrahimi   {
5310*22dc650dSSadaf Ebrahimi   /* Regardless of start, c will always be <= 255. */
5311*22dc650dSSadaf Ebrahimi   SETBIT(classbits, c);
5312*22dc650dSSadaf Ebrahimi   n8++;
5313*22dc650dSSadaf Ebrahimi   }
5314*22dc650dSSadaf Ebrahimi 
5315*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
5316*22dc650dSSadaf Ebrahimi if (start <= 0xff) start = 0xff + 1;
5317*22dc650dSSadaf Ebrahimi 
5318*22dc650dSSadaf Ebrahimi if (end >= start)
5319*22dc650dSSadaf Ebrahimi   {
5320*22dc650dSSadaf Ebrahimi   PCRE2_UCHAR *uchardata = *uchardptr;
5321*22dc650dSSadaf Ebrahimi 
5322*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
5323*22dc650dSSadaf Ebrahimi   if ((options & PCRE2_UTF) != 0)
5324*22dc650dSSadaf Ebrahimi     {
5325*22dc650dSSadaf Ebrahimi     if (start < end)
5326*22dc650dSSadaf Ebrahimi       {
5327*22dc650dSSadaf Ebrahimi       *uchardata++ = XCL_RANGE;
5328*22dc650dSSadaf Ebrahimi       uchardata += PRIV(ord2utf)(start, uchardata);
5329*22dc650dSSadaf Ebrahimi       uchardata += PRIV(ord2utf)(end, uchardata);
5330*22dc650dSSadaf Ebrahimi       }
5331*22dc650dSSadaf Ebrahimi     else if (start == end)
5332*22dc650dSSadaf Ebrahimi       {
5333*22dc650dSSadaf Ebrahimi       *uchardata++ = XCL_SINGLE;
5334*22dc650dSSadaf Ebrahimi       uchardata += PRIV(ord2utf)(start, uchardata);
5335*22dc650dSSadaf Ebrahimi       }
5336*22dc650dSSadaf Ebrahimi     }
5337*22dc650dSSadaf Ebrahimi   else
5338*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
5339*22dc650dSSadaf Ebrahimi 
5340*22dc650dSSadaf Ebrahimi   /* Without UTF support, character values are constrained by the bit length,
5341*22dc650dSSadaf Ebrahimi   and can only be > 256 for 16-bit and 32-bit libraries. */
5342*22dc650dSSadaf Ebrahimi 
5343*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
5344*22dc650dSSadaf Ebrahimi     {}
5345*22dc650dSSadaf Ebrahimi #else
5346*22dc650dSSadaf Ebrahimi   if (start < end)
5347*22dc650dSSadaf Ebrahimi     {
5348*22dc650dSSadaf Ebrahimi     *uchardata++ = XCL_RANGE;
5349*22dc650dSSadaf Ebrahimi     *uchardata++ = start;
5350*22dc650dSSadaf Ebrahimi     *uchardata++ = end;
5351*22dc650dSSadaf Ebrahimi     }
5352*22dc650dSSadaf Ebrahimi   else if (start == end)
5353*22dc650dSSadaf Ebrahimi     {
5354*22dc650dSSadaf Ebrahimi     *uchardata++ = XCL_SINGLE;
5355*22dc650dSSadaf Ebrahimi     *uchardata++ = start;
5356*22dc650dSSadaf Ebrahimi     }
5357*22dc650dSSadaf Ebrahimi #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
5358*22dc650dSSadaf Ebrahimi   *uchardptr = uchardata;   /* Updata extra data pointer */
5359*22dc650dSSadaf Ebrahimi   }
5360*22dc650dSSadaf Ebrahimi #else  /* SUPPORT_WIDE_CHARS */
5361*22dc650dSSadaf Ebrahimi   (void)uchardptr;          /* Avoid compiler warning */
5362*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_WIDE_CHARS */
5363*22dc650dSSadaf Ebrahimi 
5364*22dc650dSSadaf Ebrahimi return n8;    /* Number of 8-bit characters */
5365*22dc650dSSadaf Ebrahimi }
5366*22dc650dSSadaf Ebrahimi 
5367*22dc650dSSadaf Ebrahimi 
5368*22dc650dSSadaf Ebrahimi 
#ifdef SUPPORT_UNICODE
/*************************************************
* Add a list of characters to a class (internal) *
*************************************************/

/* Add a list of case-equivalent characters to a class when in UTF mode. The
list is split into runs of consecutive values, and each run is passed to
add_to_class_internal() as one range. This function is called only from within
add_to_class_internal(), with which it is mutually recursive.

Arguments:
  classbits     the bit map for characters < 256
  uchardptr     points to the pointer for extra data
  options       the options bits
  xoptions      the extra options bits
  cb            contains pointers to tables etc.
  p             points to row of 32-bit values, terminated by NOTACHAR
  except        character to omit; this is used when adding lists of
                  case-equivalent characters to avoid including the one we
                  already know about

Returns:        the number of < 256 characters added
                the pointer to extra data is updated
*/

static unsigned int
add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
  uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p,
  unsigned int except)
{
unsigned int total = 0;

for (; p[0] < NOTACHAR; p++)
  {
  const uint32_t *run_end;

  /* The excluded character is skipped as a single item. */

  if (p[0] == except) continue;

  /* Advance run_end over entries that continue consecutively from p[0]. */

  run_end = p;
  while (run_end[1] == run_end[0] + 1) run_end++;

  total += add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
    p[0], run_end[0]);

  p = run_end;    /* The loop increment steps past the run */
  }

return total;
}
#endif
5413*22dc650dSSadaf Ebrahimi 
5414*22dc650dSSadaf Ebrahimi 
5415*22dc650dSSadaf Ebrahimi 
5416*22dc650dSSadaf Ebrahimi /*************************************************
5417*22dc650dSSadaf Ebrahimi *   External entry point for add range to class  *
5418*22dc650dSSadaf Ebrahimi *************************************************/
5419*22dc650dSSadaf Ebrahimi 
5420*22dc650dSSadaf Ebrahimi /* This function sets the overall range so that the internal functions can try
5421*22dc650dSSadaf Ebrahimi to avoid duplication when handling case-independence.
5422*22dc650dSSadaf Ebrahimi 
5423*22dc650dSSadaf Ebrahimi Arguments:
5424*22dc650dSSadaf Ebrahimi   classbits     the bit map for characters < 256
5425*22dc650dSSadaf Ebrahimi   uchardptr     points to the pointer for extra data
5426*22dc650dSSadaf Ebrahimi   options       the options bits
5427*22dc650dSSadaf Ebrahimi   xoptions      the extra options bits
5428*22dc650dSSadaf Ebrahimi   cb            compile data
5429*22dc650dSSadaf Ebrahimi   start         start of range character
5430*22dc650dSSadaf Ebrahimi   end           end of range character
5431*22dc650dSSadaf Ebrahimi 
5432*22dc650dSSadaf Ebrahimi Returns:        the number of < 256 characters added
5433*22dc650dSSadaf Ebrahimi                 the pointer to extra data is updated
5434*22dc650dSSadaf Ebrahimi */
5435*22dc650dSSadaf Ebrahimi 
5436*22dc650dSSadaf Ebrahimi static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,uint32_t start,uint32_t end)5437*22dc650dSSadaf Ebrahimi add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5438*22dc650dSSadaf Ebrahimi   uint32_t xoptions, compile_block *cb, uint32_t start, uint32_t end)
5439*22dc650dSSadaf Ebrahimi {
5440*22dc650dSSadaf Ebrahimi cb->class_range_start = start;
5441*22dc650dSSadaf Ebrahimi cb->class_range_end = end;
5442*22dc650dSSadaf Ebrahimi return add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5443*22dc650dSSadaf Ebrahimi   start, end);
5444*22dc650dSSadaf Ebrahimi }
5445*22dc650dSSadaf Ebrahimi 
5446*22dc650dSSadaf Ebrahimi 
5447*22dc650dSSadaf Ebrahimi /*************************************************
5448*22dc650dSSadaf Ebrahimi *   External entry point for add list to class   *
5449*22dc650dSSadaf Ebrahimi *************************************************/
5450*22dc650dSSadaf Ebrahimi 
5451*22dc650dSSadaf Ebrahimi /* This function is used for adding a list of horizontal or vertical whitespace
5452*22dc650dSSadaf Ebrahimi characters to a class. The list must be in order so that ranges of characters
5453*22dc650dSSadaf Ebrahimi can be detected and handled appropriately. This function sets the overall range
5454*22dc650dSSadaf Ebrahimi so that the internal functions can try to avoid duplication when handling
5455*22dc650dSSadaf Ebrahimi case-independence.
5456*22dc650dSSadaf Ebrahimi 
5457*22dc650dSSadaf Ebrahimi Arguments:
5458*22dc650dSSadaf Ebrahimi   classbits     the bit map for characters < 256
5459*22dc650dSSadaf Ebrahimi   uchardptr     points to the pointer for extra data
5460*22dc650dSSadaf Ebrahimi   options       the options bits
5461*22dc650dSSadaf Ebrahimi   xoptions      the extra options bits
5462*22dc650dSSadaf Ebrahimi   cb            contains pointers to tables etc.
5463*22dc650dSSadaf Ebrahimi   p             points to row of 32-bit values, terminated by NOTACHAR
5464*22dc650dSSadaf Ebrahimi   except        character to omit; this is used when adding lists of
5465*22dc650dSSadaf Ebrahimi                   case-equivalent characters to avoid including the one we
5466*22dc650dSSadaf Ebrahimi                   already know about
5467*22dc650dSSadaf Ebrahimi 
5468*22dc650dSSadaf Ebrahimi Returns:        the number of < 256 characters added
5469*22dc650dSSadaf Ebrahimi                 the pointer to extra data is updated
5470*22dc650dSSadaf Ebrahimi */
5471*22dc650dSSadaf Ebrahimi 
5472*22dc650dSSadaf Ebrahimi static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p,unsigned int except)5473*22dc650dSSadaf Ebrahimi add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5474*22dc650dSSadaf Ebrahimi   uint32_t xoptions, compile_block *cb, const uint32_t *p, unsigned int except)
5475*22dc650dSSadaf Ebrahimi {
5476*22dc650dSSadaf Ebrahimi unsigned int n8 = 0;
5477*22dc650dSSadaf Ebrahimi while (p[0] < NOTACHAR)
5478*22dc650dSSadaf Ebrahimi   {
5479*22dc650dSSadaf Ebrahimi   unsigned int n = 0;
5480*22dc650dSSadaf Ebrahimi   if (p[0] != except)
5481*22dc650dSSadaf Ebrahimi     {
5482*22dc650dSSadaf Ebrahimi     while(p[n+1] == p[0] + n + 1) n++;
5483*22dc650dSSadaf Ebrahimi     cb->class_range_start = p[0];
5484*22dc650dSSadaf Ebrahimi     cb->class_range_end = p[n];
5485*22dc650dSSadaf Ebrahimi     n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5486*22dc650dSSadaf Ebrahimi       p[0], p[n]);
5487*22dc650dSSadaf Ebrahimi     }
5488*22dc650dSSadaf Ebrahimi   p += n + 1;
5489*22dc650dSSadaf Ebrahimi   }
5490*22dc650dSSadaf Ebrahimi return n8;
5491*22dc650dSSadaf Ebrahimi }
5492*22dc650dSSadaf Ebrahimi 
5493*22dc650dSSadaf Ebrahimi 
5494*22dc650dSSadaf Ebrahimi 
5495*22dc650dSSadaf Ebrahimi /*************************************************
5496*22dc650dSSadaf Ebrahimi *    Add characters not in a list to a class     *
5497*22dc650dSSadaf Ebrahimi *************************************************/
5498*22dc650dSSadaf Ebrahimi 
5499*22dc650dSSadaf Ebrahimi /* This function is used for adding the complement of a list of horizontal or
5500*22dc650dSSadaf Ebrahimi vertical whitespace to a class. The list must be in order.
5501*22dc650dSSadaf Ebrahimi 
5502*22dc650dSSadaf Ebrahimi Arguments:
5503*22dc650dSSadaf Ebrahimi   classbits     the bit map for characters < 256
5504*22dc650dSSadaf Ebrahimi   uchardptr     points to the pointer for extra data
5505*22dc650dSSadaf Ebrahimi   options       the options bits
5506*22dc650dSSadaf Ebrahimi   xoptions      the extra options bits
5507*22dc650dSSadaf Ebrahimi   cb            contains pointers to tables etc.
5508*22dc650dSSadaf Ebrahimi   p             points to row of 32-bit values, terminated by NOTACHAR
5509*22dc650dSSadaf Ebrahimi 
5510*22dc650dSSadaf Ebrahimi Returns:        the number of < 256 characters added
5511*22dc650dSSadaf Ebrahimi                 the pointer to extra data is updated
5512*22dc650dSSadaf Ebrahimi */
5513*22dc650dSSadaf Ebrahimi 
5514*22dc650dSSadaf Ebrahimi static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p)5515*22dc650dSSadaf Ebrahimi add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5516*22dc650dSSadaf Ebrahimi   uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p)
5517*22dc650dSSadaf Ebrahimi {
5518*22dc650dSSadaf Ebrahimi BOOL utf = (options & PCRE2_UTF) != 0;
5519*22dc650dSSadaf Ebrahimi unsigned int n8 = 0;
5520*22dc650dSSadaf Ebrahimi if (p[0] > 0)
5521*22dc650dSSadaf Ebrahimi   n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, 0, p[0] - 1);
5522*22dc650dSSadaf Ebrahimi while (p[0] < NOTACHAR)
5523*22dc650dSSadaf Ebrahimi   {
5524*22dc650dSSadaf Ebrahimi   while (p[1] == p[0] + 1) p++;
5525*22dc650dSSadaf Ebrahimi   n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, p[0] + 1,
5526*22dc650dSSadaf Ebrahimi     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5527*22dc650dSSadaf Ebrahimi   p++;
5528*22dc650dSSadaf Ebrahimi   }
5529*22dc650dSSadaf Ebrahimi return n8;
5530*22dc650dSSadaf Ebrahimi }
5531*22dc650dSSadaf Ebrahimi 
5532*22dc650dSSadaf Ebrahimi 
5533*22dc650dSSadaf Ebrahimi 
5534*22dc650dSSadaf Ebrahimi /*************************************************
5535*22dc650dSSadaf Ebrahimi *    Find details of duplicate group names       *
5536*22dc650dSSadaf Ebrahimi *************************************************/
5537*22dc650dSSadaf Ebrahimi 
5538*22dc650dSSadaf Ebrahimi /* This is called from compile_branch() when it needs to know the index and
5539*22dc650dSSadaf Ebrahimi count of duplicates in the names table when processing named backreferences,
5540*22dc650dSSadaf Ebrahimi either directly, or as conditions.
5541*22dc650dSSadaf Ebrahimi 
5542*22dc650dSSadaf Ebrahimi Arguments:
5543*22dc650dSSadaf Ebrahimi   name          points to the name
5544*22dc650dSSadaf Ebrahimi   length        the length of the name
5545*22dc650dSSadaf Ebrahimi   indexptr      where to put the index
5546*22dc650dSSadaf Ebrahimi   countptr      where to put the count of duplicates
5547*22dc650dSSadaf Ebrahimi   errorcodeptr  where to put an error code
5548*22dc650dSSadaf Ebrahimi   cb            the compile block
5549*22dc650dSSadaf Ebrahimi 
5550*22dc650dSSadaf Ebrahimi Returns:        TRUE if OK, FALSE if not, error code set
5551*22dc650dSSadaf Ebrahimi */
5552*22dc650dSSadaf Ebrahimi 
5553*22dc650dSSadaf Ebrahimi static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)5554*22dc650dSSadaf Ebrahimi find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5555*22dc650dSSadaf Ebrahimi   int *countptr, int *errorcodeptr, compile_block *cb)
5556*22dc650dSSadaf Ebrahimi {
5557*22dc650dSSadaf Ebrahimi uint32_t i, groupnumber;
5558*22dc650dSSadaf Ebrahimi int count;
5559*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *slot = cb->name_table;
5560*22dc650dSSadaf Ebrahimi 
5561*22dc650dSSadaf Ebrahimi /* Find the first entry in the table */
5562*22dc650dSSadaf Ebrahimi 
5563*22dc650dSSadaf Ebrahimi for (i = 0; i < cb->names_found; i++)
5564*22dc650dSSadaf Ebrahimi   {
5565*22dc650dSSadaf Ebrahimi   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5566*22dc650dSSadaf Ebrahimi       slot[IMM2_SIZE+length] == 0) break;
5567*22dc650dSSadaf Ebrahimi   slot += cb->name_entry_size;
5568*22dc650dSSadaf Ebrahimi   }
5569*22dc650dSSadaf Ebrahimi 
5570*22dc650dSSadaf Ebrahimi /* This should not occur, because this function is called only when we know we
5571*22dc650dSSadaf Ebrahimi have duplicate names. Give an internal error. */
5572*22dc650dSSadaf Ebrahimi 
5573*22dc650dSSadaf Ebrahimi if (i >= cb->names_found)
5574*22dc650dSSadaf Ebrahimi   {
5575*22dc650dSSadaf Ebrahimi   *errorcodeptr = ERR53;
5576*22dc650dSSadaf Ebrahimi   cb->erroroffset = name - cb->start_pattern;
5577*22dc650dSSadaf Ebrahimi   return FALSE;
5578*22dc650dSSadaf Ebrahimi   }
5579*22dc650dSSadaf Ebrahimi 
5580*22dc650dSSadaf Ebrahimi /* Record the index and then see how many duplicates there are, updating the
5581*22dc650dSSadaf Ebrahimi backref map and maximum back reference as we do. */
5582*22dc650dSSadaf Ebrahimi 
5583*22dc650dSSadaf Ebrahimi *indexptr = i;
5584*22dc650dSSadaf Ebrahimi count = 0;
5585*22dc650dSSadaf Ebrahimi 
5586*22dc650dSSadaf Ebrahimi for (;;)
5587*22dc650dSSadaf Ebrahimi   {
5588*22dc650dSSadaf Ebrahimi   count++;
5589*22dc650dSSadaf Ebrahimi   groupnumber = GET2(slot,0);
5590*22dc650dSSadaf Ebrahimi   cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5591*22dc650dSSadaf Ebrahimi   if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5592*22dc650dSSadaf Ebrahimi   if (++i >= cb->names_found) break;
5593*22dc650dSSadaf Ebrahimi   slot += cb->name_entry_size;
5594*22dc650dSSadaf Ebrahimi   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5595*22dc650dSSadaf Ebrahimi     (slot+IMM2_SIZE)[length] != 0) break;
5596*22dc650dSSadaf Ebrahimi   }
5597*22dc650dSSadaf Ebrahimi 
5598*22dc650dSSadaf Ebrahimi *countptr = count;
5599*22dc650dSSadaf Ebrahimi return TRUE;
5600*22dc650dSSadaf Ebrahimi }
5601*22dc650dSSadaf Ebrahimi 
5602*22dc650dSSadaf Ebrahimi 
5603*22dc650dSSadaf Ebrahimi 
5604*22dc650dSSadaf Ebrahimi /*************************************************
5605*22dc650dSSadaf Ebrahimi *           Compile one branch                   *
5606*22dc650dSSadaf Ebrahimi *************************************************/
5607*22dc650dSSadaf Ebrahimi 
5608*22dc650dSSadaf Ebrahimi /* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5609*22dc650dSSadaf Ebrahimi the options are changed during the branch, the pointer is used to change the
5610*22dc650dSSadaf Ebrahimi external options bits. This function is used during the pre-compile phase when
5611*22dc650dSSadaf Ebrahimi we are trying to find out the amount of memory needed, as well as during the
5612*22dc650dSSadaf Ebrahimi real compile phase. The value of lengthptr distinguishes the two phases.
5613*22dc650dSSadaf Ebrahimi 
5614*22dc650dSSadaf Ebrahimi Arguments:
5615*22dc650dSSadaf Ebrahimi   optionsptr        pointer to the option bits
5616*22dc650dSSadaf Ebrahimi   xoptionsptr       pointer to the extra option bits
5617*22dc650dSSadaf Ebrahimi   codeptr           points to the pointer to the current code point
5618*22dc650dSSadaf Ebrahimi   pptrptr           points to the current parsed pattern pointer
5619*22dc650dSSadaf Ebrahimi   errorcodeptr      points to error code variable
5620*22dc650dSSadaf Ebrahimi   firstcuptr        place to put the first required code unit
5621*22dc650dSSadaf Ebrahimi   firstcuflagsptr   place to put the first code unit flags
5622*22dc650dSSadaf Ebrahimi   reqcuptr          place to put the last required code unit
5623*22dc650dSSadaf Ebrahimi   reqcuflagsptr     place to put the last required code unit flags
5624*22dc650dSSadaf Ebrahimi   bcptr             points to current branch chain
5625*22dc650dSSadaf Ebrahimi   open_caps         points to current capitem
5626*22dc650dSSadaf Ebrahimi   cb                contains pointers to tables etc.
5627*22dc650dSSadaf Ebrahimi   lengthptr         NULL during the real compile phase
5628*22dc650dSSadaf Ebrahimi                     points to length accumulator during pre-compile phase
5629*22dc650dSSadaf Ebrahimi 
5630*22dc650dSSadaf Ebrahimi Returns:            0 There's been an error, *errorcodeptr is non-zero
5631*22dc650dSSadaf Ebrahimi                    +1 Success, this branch must match at least one character
5632*22dc650dSSadaf Ebrahimi                    -1 Success, this branch may match an empty string
5633*22dc650dSSadaf Ebrahimi */
5634*22dc650dSSadaf Ebrahimi 
5635*22dc650dSSadaf Ebrahimi static int
compile_branch(uint32_t * optionsptr,uint32_t * xoptionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,open_capitem * open_caps,compile_block * cb,PCRE2_SIZE * lengthptr)5636*22dc650dSSadaf Ebrahimi compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
5637*22dc650dSSadaf Ebrahimi   PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
5638*22dc650dSSadaf Ebrahimi   uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
5639*22dc650dSSadaf Ebrahimi   uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
5640*22dc650dSSadaf Ebrahimi   compile_block *cb, PCRE2_SIZE *lengthptr)
5641*22dc650dSSadaf Ebrahimi {
5642*22dc650dSSadaf Ebrahimi int bravalue = 0;
5643*22dc650dSSadaf Ebrahimi int okreturn = -1;
5644*22dc650dSSadaf Ebrahimi int group_return = 0;
5645*22dc650dSSadaf Ebrahimi uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
5646*22dc650dSSadaf Ebrahimi uint32_t greedy_default, greedy_non_default;
5647*22dc650dSSadaf Ebrahimi uint32_t repeat_type, op_type;
5648*22dc650dSSadaf Ebrahimi uint32_t options = *optionsptr;               /* May change dynamically */
5649*22dc650dSSadaf Ebrahimi uint32_t xoptions = *xoptionsptr;             /* May change dynamically */
5650*22dc650dSSadaf Ebrahimi uint32_t firstcu, reqcu;
5651*22dc650dSSadaf Ebrahimi uint32_t zeroreqcu, zerofirstcu;
5652*22dc650dSSadaf Ebrahimi uint32_t escape;
5653*22dc650dSSadaf Ebrahimi uint32_t *pptr = *pptrptr;
5654*22dc650dSSadaf Ebrahimi uint32_t meta, meta_arg;
5655*22dc650dSSadaf Ebrahimi uint32_t firstcuflags, reqcuflags;
5656*22dc650dSSadaf Ebrahimi uint32_t zeroreqcuflags, zerofirstcuflags;
5657*22dc650dSSadaf Ebrahimi uint32_t req_caseopt, reqvary, tempreqvary;
5658*22dc650dSSadaf Ebrahimi PCRE2_SIZE offset = 0;
5659*22dc650dSSadaf Ebrahimi PCRE2_SIZE length_prevgroup = 0;
5660*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *code = *codeptr;
5661*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *last_code = code;
5662*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *orig_code = code;
5663*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *tempcode;
5664*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *previous = NULL;
5665*22dc650dSSadaf Ebrahimi PCRE2_UCHAR op_previous;
5666*22dc650dSSadaf Ebrahimi BOOL groupsetfirstcu = FALSE;
5667*22dc650dSSadaf Ebrahimi BOOL had_accept = FALSE;
5668*22dc650dSSadaf Ebrahimi BOOL matched_char = FALSE;
5669*22dc650dSSadaf Ebrahimi BOOL previous_matched_char = FALSE;
5670*22dc650dSSadaf Ebrahimi BOOL reset_caseful = FALSE;
5671*22dc650dSSadaf Ebrahimi const uint8_t *cbits = cb->cbits;
5672*22dc650dSSadaf Ebrahimi uint8_t classbits[32];
5673*22dc650dSSadaf Ebrahimi 
5674*22dc650dSSadaf Ebrahimi /* We can fish out the UTF setting once and for all into a BOOL, but we must
5675*22dc650dSSadaf Ebrahimi not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
5676*22dc650dSSadaf Ebrahimi as we process the pattern. */
5677*22dc650dSSadaf Ebrahimi 
5678*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
5679*22dc650dSSadaf Ebrahimi BOOL utf = (options & PCRE2_UTF) != 0;
5680*22dc650dSSadaf Ebrahimi BOOL ucp = (options & PCRE2_UCP) != 0;
5681*22dc650dSSadaf Ebrahimi #else  /* No Unicode support */
5682*22dc650dSSadaf Ebrahimi BOOL utf = FALSE;
5683*22dc650dSSadaf Ebrahimi #endif
5684*22dc650dSSadaf Ebrahimi 
5685*22dc650dSSadaf Ebrahimi /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5686*22dc650dSSadaf Ebrahimi class_uchardata always so that it can be passed to add_to_class() always,
5687*22dc650dSSadaf Ebrahimi though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5688*22dc650dSSadaf Ebrahimi alternative calls for the different cases. */
5689*22dc650dSSadaf Ebrahimi 
5690*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *class_uchardata;
5691*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
5692*22dc650dSSadaf Ebrahimi BOOL xclass;
5693*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *class_uchardata_base;
5694*22dc650dSSadaf Ebrahimi #endif
5695*22dc650dSSadaf Ebrahimi 
5696*22dc650dSSadaf Ebrahimi /* Set up the default and non-default settings for greediness */
5697*22dc650dSSadaf Ebrahimi 
5698*22dc650dSSadaf Ebrahimi greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5699*22dc650dSSadaf Ebrahimi greedy_non_default = greedy_default ^ 1;
5700*22dc650dSSadaf Ebrahimi 
5701*22dc650dSSadaf Ebrahimi /* Initialize no first unit, no required unit. REQ_UNSET means "no char
5702*22dc650dSSadaf Ebrahimi matching encountered yet". It gets changed to REQ_NONE if we hit something that
5703*22dc650dSSadaf Ebrahimi matches a non-fixed first unit; reqcu just remains unset if we never find one.
5704*22dc650dSSadaf Ebrahimi 
5705*22dc650dSSadaf Ebrahimi When we hit a repeat whose minimum is zero, we may have to adjust these values
5706*22dc650dSSadaf Ebrahimi to take the zero repeat into account. This is implemented by setting them to
5707*22dc650dSSadaf Ebrahimi zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5708*22dc650dSSadaf Ebrahimi item types that can be repeated set these backoff variables appropriately. */
5709*22dc650dSSadaf Ebrahimi 
5710*22dc650dSSadaf Ebrahimi firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5711*22dc650dSSadaf Ebrahimi firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5712*22dc650dSSadaf Ebrahimi 
5713*22dc650dSSadaf Ebrahimi /* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
5714*22dc650dSSadaf Ebrahimi according to the current setting of the caseless flag. The REQ_CASELESS value
5715*22dc650dSSadaf Ebrahimi leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
5716*22dc650dSSadaf Ebrahimi to record the case status of the value. This is used only for ASCII characters.
5717*22dc650dSSadaf Ebrahimi */
5718*22dc650dSSadaf Ebrahimi 
5719*22dc650dSSadaf Ebrahimi req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
5720*22dc650dSSadaf Ebrahimi 
5721*22dc650dSSadaf Ebrahimi /* Switch on next META item until the end of the branch */
5722*22dc650dSSadaf Ebrahimi 
5723*22dc650dSSadaf Ebrahimi for (;; pptr++)
5724*22dc650dSSadaf Ebrahimi   {
5725*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
5726*22dc650dSSadaf Ebrahimi   BOOL xclass_has_prop;
5727*22dc650dSSadaf Ebrahimi #endif
5728*22dc650dSSadaf Ebrahimi   BOOL negate_class;
5729*22dc650dSSadaf Ebrahimi   BOOL should_flip_negation;
5730*22dc650dSSadaf Ebrahimi   BOOL match_all_or_no_wide_chars;
5731*22dc650dSSadaf Ebrahimi   BOOL possessive_quantifier;
5732*22dc650dSSadaf Ebrahimi   BOOL note_group_empty;
5733*22dc650dSSadaf Ebrahimi   int class_has_8bitchar;
5734*22dc650dSSadaf Ebrahimi   uint32_t mclength;
5735*22dc650dSSadaf Ebrahimi   uint32_t skipunits;
5736*22dc650dSSadaf Ebrahimi   uint32_t subreqcu, subfirstcu;
5737*22dc650dSSadaf Ebrahimi   uint32_t groupnumber;
5738*22dc650dSSadaf Ebrahimi   uint32_t verbarglen, verbculen;
5739*22dc650dSSadaf Ebrahimi   uint32_t subreqcuflags, subfirstcuflags;
5740*22dc650dSSadaf Ebrahimi   open_capitem *oc;
5741*22dc650dSSadaf Ebrahimi   PCRE2_UCHAR mcbuffer[8];
5742*22dc650dSSadaf Ebrahimi 
5743*22dc650dSSadaf Ebrahimi   /* Get next META item in the pattern and its potential argument. */
5744*22dc650dSSadaf Ebrahimi 
5745*22dc650dSSadaf Ebrahimi   meta = META_CODE(*pptr);
5746*22dc650dSSadaf Ebrahimi   meta_arg = META_DATA(*pptr);
5747*22dc650dSSadaf Ebrahimi 
5748*22dc650dSSadaf Ebrahimi   /* If we are in the pre-compile phase, accumulate the length used for the
5749*22dc650dSSadaf Ebrahimi   previous cycle of this loop, unless the next item is a quantifier. */
5750*22dc650dSSadaf Ebrahimi 
5751*22dc650dSSadaf Ebrahimi   if (lengthptr != NULL)
5752*22dc650dSSadaf Ebrahimi     {
5753*22dc650dSSadaf Ebrahimi     if (code > cb->start_workspace + cb->workspace_size -
5754*22dc650dSSadaf Ebrahimi         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
5755*22dc650dSSadaf Ebrahimi       {
5756*22dc650dSSadaf Ebrahimi       *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5757*22dc650dSSadaf Ebrahimi         ERR52 : ERR86;
5758*22dc650dSSadaf Ebrahimi       return 0;
5759*22dc650dSSadaf Ebrahimi       }
5760*22dc650dSSadaf Ebrahimi 
5761*22dc650dSSadaf Ebrahimi     /* There is at least one situation where code goes backwards: this is the
5762*22dc650dSSadaf Ebrahimi     case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5763*22dc650dSSadaf Ebrahimi     is processed, the whole class is eliminated. However, it is created first,
5764*22dc650dSSadaf Ebrahimi     so we have to allow memory for it. Therefore, don't ever reduce the length
5765*22dc650dSSadaf Ebrahimi     at this point. */
5766*22dc650dSSadaf Ebrahimi 
5767*22dc650dSSadaf Ebrahimi     if (code < last_code) code = last_code;
5768*22dc650dSSadaf Ebrahimi 
5769*22dc650dSSadaf Ebrahimi     /* If the next thing is not a quantifier, we add the length of the previous
5770*22dc650dSSadaf Ebrahimi     item into the total, and reset the code pointer to the start of the
5771*22dc650dSSadaf Ebrahimi     workspace. Otherwise leave the previous item available to be quantified. */
5772*22dc650dSSadaf Ebrahimi 
5773*22dc650dSSadaf Ebrahimi     if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5774*22dc650dSSadaf Ebrahimi       {
5775*22dc650dSSadaf Ebrahimi       if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5776*22dc650dSSadaf Ebrahimi         {
5777*22dc650dSSadaf Ebrahimi         *errorcodeptr = ERR20;   /* Integer overflow */
5778*22dc650dSSadaf Ebrahimi         return 0;
5779*22dc650dSSadaf Ebrahimi         }
5780*22dc650dSSadaf Ebrahimi       *lengthptr += (PCRE2_SIZE)(code - orig_code);
5781*22dc650dSSadaf Ebrahimi       if (*lengthptr > MAX_PATTERN_SIZE)
5782*22dc650dSSadaf Ebrahimi         {
5783*22dc650dSSadaf Ebrahimi         *errorcodeptr = ERR20;   /* Pattern is too large */
5784*22dc650dSSadaf Ebrahimi         return 0;
5785*22dc650dSSadaf Ebrahimi         }
5786*22dc650dSSadaf Ebrahimi       code = orig_code;
5787*22dc650dSSadaf Ebrahimi       }
5788*22dc650dSSadaf Ebrahimi 
5789*22dc650dSSadaf Ebrahimi     /* Remember where this code item starts so we can catch the "backwards"
5790*22dc650dSSadaf Ebrahimi     case above next time round. */
5791*22dc650dSSadaf Ebrahimi 
5792*22dc650dSSadaf Ebrahimi     last_code = code;
5793*22dc650dSSadaf Ebrahimi     }
5794*22dc650dSSadaf Ebrahimi 
5795*22dc650dSSadaf Ebrahimi   /* Process the next parsed pattern item. If it is not a quantifier, remember
5796*22dc650dSSadaf Ebrahimi   where it starts so that it can be quantified when a quantifier follows.
5797*22dc650dSSadaf Ebrahimi   Checking for the legality of quantifiers happens in parse_regex(), except for
5798*22dc650dSSadaf Ebrahimi   a quantifier after an assertion that is a condition. */
5799*22dc650dSSadaf Ebrahimi 
5800*22dc650dSSadaf Ebrahimi   if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5801*22dc650dSSadaf Ebrahimi     {
5802*22dc650dSSadaf Ebrahimi     previous = code;
5803*22dc650dSSadaf Ebrahimi     if (matched_char && !had_accept) okreturn = 1;
5804*22dc650dSSadaf Ebrahimi     }
5805*22dc650dSSadaf Ebrahimi 
5806*22dc650dSSadaf Ebrahimi   previous_matched_char = matched_char;
5807*22dc650dSSadaf Ebrahimi   matched_char = FALSE;
5808*22dc650dSSadaf Ebrahimi   note_group_empty = FALSE;
5809*22dc650dSSadaf Ebrahimi   skipunits = 0;         /* Default value for most subgroups */
5810*22dc650dSSadaf Ebrahimi 
5811*22dc650dSSadaf Ebrahimi   switch(meta)
5812*22dc650dSSadaf Ebrahimi     {
5813*22dc650dSSadaf Ebrahimi     /* ===================================================================*/
5814*22dc650dSSadaf Ebrahimi     /* The branch terminates at pattern end or | or ) */
5815*22dc650dSSadaf Ebrahimi 
5816*22dc650dSSadaf Ebrahimi     case META_END:
5817*22dc650dSSadaf Ebrahimi     case META_ALT:
5818*22dc650dSSadaf Ebrahimi     case META_KET:
5819*22dc650dSSadaf Ebrahimi     *firstcuptr = firstcu;
5820*22dc650dSSadaf Ebrahimi     *firstcuflagsptr = firstcuflags;
5821*22dc650dSSadaf Ebrahimi     *reqcuptr = reqcu;
5822*22dc650dSSadaf Ebrahimi     *reqcuflagsptr = reqcuflags;
5823*22dc650dSSadaf Ebrahimi     *codeptr = code;
5824*22dc650dSSadaf Ebrahimi     *pptrptr = pptr;
5825*22dc650dSSadaf Ebrahimi     return okreturn;
5826*22dc650dSSadaf Ebrahimi 
5827*22dc650dSSadaf Ebrahimi 
5828*22dc650dSSadaf Ebrahimi     /* ===================================================================*/
5829*22dc650dSSadaf Ebrahimi     /* Handle single-character metacharacters. In multiline mode, ^ disables
5830*22dc650dSSadaf Ebrahimi     the setting of any following char as a first character. */
5831*22dc650dSSadaf Ebrahimi 
5832*22dc650dSSadaf Ebrahimi     case META_CIRCUMFLEX:
5833*22dc650dSSadaf Ebrahimi     if ((options & PCRE2_MULTILINE) != 0)
5834*22dc650dSSadaf Ebrahimi       {
5835*22dc650dSSadaf Ebrahimi       if (firstcuflags == REQ_UNSET)
5836*22dc650dSSadaf Ebrahimi         zerofirstcuflags = firstcuflags = REQ_NONE;
5837*22dc650dSSadaf Ebrahimi       *code++ = OP_CIRCM;
5838*22dc650dSSadaf Ebrahimi       }
5839*22dc650dSSadaf Ebrahimi     else *code++ = OP_CIRC;
5840*22dc650dSSadaf Ebrahimi     break;
5841*22dc650dSSadaf Ebrahimi 
5842*22dc650dSSadaf Ebrahimi     case META_DOLLAR:
5843*22dc650dSSadaf Ebrahimi     *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5844*22dc650dSSadaf Ebrahimi     break;
5845*22dc650dSSadaf Ebrahimi 
5846*22dc650dSSadaf Ebrahimi     /* There can never be a first char if '.' is first, whatever happens about
5847*22dc650dSSadaf Ebrahimi     repeats. The value of reqcu doesn't change either. */
5848*22dc650dSSadaf Ebrahimi 
5849*22dc650dSSadaf Ebrahimi     case META_DOT:
5850*22dc650dSSadaf Ebrahimi     matched_char = TRUE;
5851*22dc650dSSadaf Ebrahimi     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5852*22dc650dSSadaf Ebrahimi     zerofirstcu = firstcu;
5853*22dc650dSSadaf Ebrahimi     zerofirstcuflags = firstcuflags;
5854*22dc650dSSadaf Ebrahimi     zeroreqcu = reqcu;
5855*22dc650dSSadaf Ebrahimi     zeroreqcuflags = reqcuflags;
5856*22dc650dSSadaf Ebrahimi     *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5857*22dc650dSSadaf Ebrahimi     break;
5858*22dc650dSSadaf Ebrahimi 
5859*22dc650dSSadaf Ebrahimi 
5860*22dc650dSSadaf Ebrahimi     /* ===================================================================*/
5861*22dc650dSSadaf Ebrahimi     /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5862*22dc650dSSadaf Ebrahimi     Otherwise, an initial ']' is taken as a data character. When empty classes
5863*22dc650dSSadaf Ebrahimi     are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5864*22dc650dSSadaf Ebrahimi     match any character, so generate OP_ALLANY. */
5865*22dc650dSSadaf Ebrahimi 
5866*22dc650dSSadaf Ebrahimi     case META_CLASS_EMPTY:
5867*22dc650dSSadaf Ebrahimi     case META_CLASS_EMPTY_NOT:
5868*22dc650dSSadaf Ebrahimi     matched_char = TRUE;
5869*22dc650dSSadaf Ebrahimi     *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5870*22dc650dSSadaf Ebrahimi     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5871*22dc650dSSadaf Ebrahimi     zerofirstcu = firstcu;
5872*22dc650dSSadaf Ebrahimi     zerofirstcuflags = firstcuflags;
5873*22dc650dSSadaf Ebrahimi     break;
5874*22dc650dSSadaf Ebrahimi 
5875*22dc650dSSadaf Ebrahimi 
5876*22dc650dSSadaf Ebrahimi     /* ===================================================================*/
5877*22dc650dSSadaf Ebrahimi     /* Non-empty character class. If the included characters are all < 256, we
5878*22dc650dSSadaf Ebrahimi     build a 32-byte bitmap of the permitted characters, except in the special
5879*22dc650dSSadaf Ebrahimi     case where there is only one such character. For negated classes, we build
5880*22dc650dSSadaf Ebrahimi     the map as usual, then invert it at the end. However, we use a different
5881*22dc650dSSadaf Ebrahimi     opcode so that data characters > 255 can be handled correctly.
5882*22dc650dSSadaf Ebrahimi 
5883*22dc650dSSadaf Ebrahimi     If the class contains characters outside the 0-255 range, a different
5884*22dc650dSSadaf Ebrahimi     opcode is compiled. It may optionally have a bit map for characters < 256,
5885*22dc650dSSadaf Ebrahimi     but those above are explicitly listed afterwards. A flag code unit tells
5886*22dc650dSSadaf Ebrahimi     whether the bitmap is present, and whether this is a negated class or
5887*22dc650dSSadaf Ebrahimi     not. */
5888*22dc650dSSadaf Ebrahimi 
5889*22dc650dSSadaf Ebrahimi     case META_CLASS_NOT:
5890*22dc650dSSadaf Ebrahimi     case META_CLASS:
5891*22dc650dSSadaf Ebrahimi     matched_char = TRUE;
5892*22dc650dSSadaf Ebrahimi     negate_class = meta == META_CLASS_NOT;
5893*22dc650dSSadaf Ebrahimi 
5894*22dc650dSSadaf Ebrahimi     /* We can optimize the case of a single character in a class by generating
5895*22dc650dSSadaf Ebrahimi     OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5896*22dc650dSSadaf Ebrahimi     negative. In the negative case there can be no first char if this item is
5897*22dc650dSSadaf Ebrahimi     first, whatever repeat count may follow. In the case of reqcu, save the
5898*22dc650dSSadaf Ebrahimi     previous value for reinstating. */
5899*22dc650dSSadaf Ebrahimi 
5900*22dc650dSSadaf Ebrahimi     /* NOTE: at present this optimization is not effective if the only
5901*22dc650dSSadaf Ebrahimi     character in a class in 32-bit, non-UCP mode has its top bit set. */
5902*22dc650dSSadaf Ebrahimi 
5903*22dc650dSSadaf Ebrahimi     if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5904*22dc650dSSadaf Ebrahimi       {
5905*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
5906*22dc650dSSadaf Ebrahimi       uint32_t d;
5907*22dc650dSSadaf Ebrahimi #endif
5908*22dc650dSSadaf Ebrahimi       uint32_t c = pptr[1];
5909*22dc650dSSadaf Ebrahimi 
5910*22dc650dSSadaf Ebrahimi       pptr += 2;                 /* Move on to class end */
5911*22dc650dSSadaf Ebrahimi       if (meta == META_CLASS)    /* A positive one-char class can be */
5912*22dc650dSSadaf Ebrahimi         {                        /* handled as a normal literal character. */
5913*22dc650dSSadaf Ebrahimi         meta = c;                /* Set up the character */
5914*22dc650dSSadaf Ebrahimi         goto NORMAL_CHAR_SET;
5915*22dc650dSSadaf Ebrahimi         }
5916*22dc650dSSadaf Ebrahimi 
5917*22dc650dSSadaf Ebrahimi       /* Handle a negative one-character class */
5918*22dc650dSSadaf Ebrahimi 
5919*22dc650dSSadaf Ebrahimi       zeroreqcu = reqcu;
5920*22dc650dSSadaf Ebrahimi       zeroreqcuflags = reqcuflags;
5921*22dc650dSSadaf Ebrahimi       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5922*22dc650dSSadaf Ebrahimi       zerofirstcu = firstcu;
5923*22dc650dSSadaf Ebrahimi       zerofirstcuflags = firstcuflags;
5924*22dc650dSSadaf Ebrahimi 
5925*22dc650dSSadaf Ebrahimi       /* For caseless UTF or UCP mode, check whether this character has more
5926*22dc650dSSadaf Ebrahimi       than one other case. If so, generate a special OP_NOTPROP item instead of
5927*22dc650dSSadaf Ebrahimi       OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
5928*22dc650dSSadaf Ebrahimi       caseless set that starts with an ASCII character. */
5929*22dc650dSSadaf Ebrahimi 
5930*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
5931*22dc650dSSadaf Ebrahimi       if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5932*22dc650dSSadaf Ebrahimi           (d = UCD_CASESET(c)) != 0 &&
5933*22dc650dSSadaf Ebrahimi           ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
5934*22dc650dSSadaf Ebrahimi           PRIV(ucd_caseless_sets)[d] > 127))
5935*22dc650dSSadaf Ebrahimi         {
5936*22dc650dSSadaf Ebrahimi         *code++ = OP_NOTPROP;
5937*22dc650dSSadaf Ebrahimi         *code++ = PT_CLIST;
5938*22dc650dSSadaf Ebrahimi         *code++ = d;
5939*22dc650dSSadaf Ebrahimi         break;   /* We are finished with this class */
5940*22dc650dSSadaf Ebrahimi         }
5941*22dc650dSSadaf Ebrahimi #endif
5942*22dc650dSSadaf Ebrahimi       /* Char has only one other (usable) case, or UCP not available */
5943*22dc650dSSadaf Ebrahimi 
5944*22dc650dSSadaf Ebrahimi       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5945*22dc650dSSadaf Ebrahimi       code += PUTCHAR(c, code);
5946*22dc650dSSadaf Ebrahimi       break;   /* We are finished with this class */
5947*22dc650dSSadaf Ebrahimi       }        /* End of 1-char optimization */
5948*22dc650dSSadaf Ebrahimi 
5949*22dc650dSSadaf Ebrahimi     /* Handle character classes that contain more than just one literal
5950*22dc650dSSadaf Ebrahimi     character. If there are exactly two characters in a positive class, see if
5951*22dc650dSSadaf Ebrahimi     they are case partners. This can be optimized to generate a caseless single
5952*22dc650dSSadaf Ebrahimi     character match (which also sets first/required code units if relevant).
5953*22dc650dSSadaf Ebrahimi     When casing restrictions apply, ignore a caseless set if both characters
5954*22dc650dSSadaf Ebrahimi     are ASCII. */
5955*22dc650dSSadaf Ebrahimi 
5956*22dc650dSSadaf Ebrahimi     if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5957*22dc650dSSadaf Ebrahimi         pptr[3] == META_CLASS_END)
5958*22dc650dSSadaf Ebrahimi       {
5959*22dc650dSSadaf Ebrahimi       uint32_t c = pptr[1];
5960*22dc650dSSadaf Ebrahimi 
5961*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
5962*22dc650dSSadaf Ebrahimi       if (UCD_CASESET(c) == 0 ||
5963*22dc650dSSadaf Ebrahimi          ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
5964*22dc650dSSadaf Ebrahimi          c < 128 && pptr[2] < 128))
5965*22dc650dSSadaf Ebrahimi #endif
5966*22dc650dSSadaf Ebrahimi         {
5967*22dc650dSSadaf Ebrahimi         uint32_t d;
5968*22dc650dSSadaf Ebrahimi 
5969*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
5970*22dc650dSSadaf Ebrahimi         if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5971*22dc650dSSadaf Ebrahimi #endif
5972*22dc650dSSadaf Ebrahimi           {
5973*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
5974*22dc650dSSadaf Ebrahimi           if (c > 255) d = c; else
5975*22dc650dSSadaf Ebrahimi #endif
5976*22dc650dSSadaf Ebrahimi           d = TABLE_GET(c, cb->fcc, c);
5977*22dc650dSSadaf Ebrahimi           }
5978*22dc650dSSadaf Ebrahimi 
5979*22dc650dSSadaf Ebrahimi         if (c != d && pptr[2] == d)
5980*22dc650dSSadaf Ebrahimi           {
5981*22dc650dSSadaf Ebrahimi           pptr += 3;                 /* Move on to class end */
5982*22dc650dSSadaf Ebrahimi           meta = c;
5983*22dc650dSSadaf Ebrahimi           if ((options & PCRE2_CASELESS) == 0)
5984*22dc650dSSadaf Ebrahimi             {
5985*22dc650dSSadaf Ebrahimi             reset_caseful = TRUE;
5986*22dc650dSSadaf Ebrahimi             options |= PCRE2_CASELESS;
5987*22dc650dSSadaf Ebrahimi             req_caseopt = REQ_CASELESS;
5988*22dc650dSSadaf Ebrahimi             }
5989*22dc650dSSadaf Ebrahimi           goto CLASS_CASELESS_CHAR;
5990*22dc650dSSadaf Ebrahimi           }
5991*22dc650dSSadaf Ebrahimi         }
5992*22dc650dSSadaf Ebrahimi       }
5993*22dc650dSSadaf Ebrahimi 
5994*22dc650dSSadaf Ebrahimi     /* If a non-extended class contains a negative special such as \S, we need
5995*22dc650dSSadaf Ebrahimi     to flip the negation flag at the end, so that support for characters > 255
5996*22dc650dSSadaf Ebrahimi     works correctly (they are all included in the class). An extended class may
5997*22dc650dSSadaf Ebrahimi     need to insert specific matching or non-matching code for wide characters.
5998*22dc650dSSadaf Ebrahimi     */
5999*22dc650dSSadaf Ebrahimi 
6000*22dc650dSSadaf Ebrahimi     should_flip_negation = match_all_or_no_wide_chars = FALSE;
6001*22dc650dSSadaf Ebrahimi 
6002*22dc650dSSadaf Ebrahimi     /* Extended class (xclass) will be used when characters > 255
6003*22dc650dSSadaf Ebrahimi     might match. */
6004*22dc650dSSadaf Ebrahimi 
6005*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
6006*22dc650dSSadaf Ebrahimi     xclass = FALSE;
6007*22dc650dSSadaf Ebrahimi     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
6008*22dc650dSSadaf Ebrahimi     class_uchardata_base = class_uchardata;   /* Save the start */
6009*22dc650dSSadaf Ebrahimi #endif
6010*22dc650dSSadaf Ebrahimi 
6011*22dc650dSSadaf Ebrahimi     /* For optimization purposes, we track some properties of the class:
6012*22dc650dSSadaf Ebrahimi     class_has_8bitchar will be non-zero if the class contains at least one
6013*22dc650dSSadaf Ebrahimi     character with a code point less than 256; xclass_has_prop will be TRUE if
6014*22dc650dSSadaf Ebrahimi     Unicode property checks are present in the class. */
6015*22dc650dSSadaf Ebrahimi 
6016*22dc650dSSadaf Ebrahimi     class_has_8bitchar = 0;
6017*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
6018*22dc650dSSadaf Ebrahimi     xclass_has_prop = FALSE;
6019*22dc650dSSadaf Ebrahimi #endif
6020*22dc650dSSadaf Ebrahimi 
6021*22dc650dSSadaf Ebrahimi     /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
6022*22dc650dSSadaf Ebrahimi     in a temporary bit of memory, in case the class contains fewer than two
6023*22dc650dSSadaf Ebrahimi     8-bit characters because in that case the compiled code doesn't use the bit
6024*22dc650dSSadaf Ebrahimi     map. */
6025*22dc650dSSadaf Ebrahimi 
6026*22dc650dSSadaf Ebrahimi     memset(classbits, 0, 32 * sizeof(uint8_t));
6027*22dc650dSSadaf Ebrahimi 
6028*22dc650dSSadaf Ebrahimi     /* Process items until META_CLASS_END is reached. */
6029*22dc650dSSadaf Ebrahimi 
6030*22dc650dSSadaf Ebrahimi     while ((meta = *(++pptr)) != META_CLASS_END)
6031*22dc650dSSadaf Ebrahimi       {
6032*22dc650dSSadaf Ebrahimi       /* Handle POSIX classes such as [:alpha:] etc. */
6033*22dc650dSSadaf Ebrahimi 
6034*22dc650dSSadaf Ebrahimi       if (meta == META_POSIX || meta == META_POSIX_NEG)
6035*22dc650dSSadaf Ebrahimi         {
6036*22dc650dSSadaf Ebrahimi         BOOL local_negate = (meta == META_POSIX_NEG);
6037*22dc650dSSadaf Ebrahimi         int posix_class = *(++pptr);
6038*22dc650dSSadaf Ebrahimi         int taboffset, tabopt;
6039*22dc650dSSadaf Ebrahimi         uint8_t pbits[32];
6040*22dc650dSSadaf Ebrahimi 
6041*22dc650dSSadaf Ebrahimi         should_flip_negation = local_negate;  /* Note negative special */
6042*22dc650dSSadaf Ebrahimi 
6043*22dc650dSSadaf Ebrahimi         /* If matching is caseless, upper and lower are converted to alpha.
6044*22dc650dSSadaf Ebrahimi         This relies on the fact that the class table starts with alpha,
6045*22dc650dSSadaf Ebrahimi         lower, upper as the first 3 entries. */
6046*22dc650dSSadaf Ebrahimi 
6047*22dc650dSSadaf Ebrahimi         if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
6048*22dc650dSSadaf Ebrahimi           posix_class = 0;
6049*22dc650dSSadaf Ebrahimi 
6050*22dc650dSSadaf Ebrahimi         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
6051*22dc650dSSadaf Ebrahimi         different escape sequences that use Unicode properties \p or \P.
6052*22dc650dSSadaf Ebrahimi         Others that are not available via \p or \P have to generate
6053*22dc650dSSadaf Ebrahimi         XCL_PROP/XCL_NOTPROP directly, which is done here. */
6054*22dc650dSSadaf Ebrahimi 
6055*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
6056*22dc650dSSadaf Ebrahimi         if ((options & PCRE2_UCP) != 0 &&
6057*22dc650dSSadaf Ebrahimi             (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
6058*22dc650dSSadaf Ebrahimi           {
6059*22dc650dSSadaf Ebrahimi           switch(posix_class)
6060*22dc650dSSadaf Ebrahimi             {
6061*22dc650dSSadaf Ebrahimi             case PC_GRAPH:
6062*22dc650dSSadaf Ebrahimi             case PC_PRINT:
6063*22dc650dSSadaf Ebrahimi             case PC_PUNCT:
6064*22dc650dSSadaf Ebrahimi             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
6065*22dc650dSSadaf Ebrahimi             *class_uchardata++ = (PCRE2_UCHAR)
6066*22dc650dSSadaf Ebrahimi               ((posix_class == PC_GRAPH)? PT_PXGRAPH :
6067*22dc650dSSadaf Ebrahimi                (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
6068*22dc650dSSadaf Ebrahimi             *class_uchardata++ = 0;
6069*22dc650dSSadaf Ebrahimi             xclass_has_prop = TRUE;
6070*22dc650dSSadaf Ebrahimi             goto CONTINUE_CLASS;
6071*22dc650dSSadaf Ebrahimi 
6072*22dc650dSSadaf Ebrahimi             /* For the other POSIX classes (ex: ascii) we are going to
6073*22dc650dSSadaf Ebrahimi             fall through to the non-UCP case and build a bit map for
6074*22dc650dSSadaf Ebrahimi             characters with code points less than 256. However, if we are in
6075*22dc650dSSadaf Ebrahimi             a negated POSIX class, characters with code points greater than
6076*22dc650dSSadaf Ebrahimi             255 must either all match or all not match, depending on whether
6077*22dc650dSSadaf Ebrahimi             the whole class is not or is negated. For example, for
6078*22dc650dSSadaf Ebrahimi             [[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
6079*22dc650dSSadaf Ebrahimi             they must not.
6080*22dc650dSSadaf Ebrahimi 
6081*22dc650dSSadaf Ebrahimi             In the special case where there are no xclass items, this is
6082*22dc650dSSadaf Ebrahimi             automatically handled by the use of OP_CLASS or OP_NCLASS, but an
6083*22dc650dSSadaf Ebrahimi             explicit range is needed for OP_XCLASS. Setting a flag here
6084*22dc650dSSadaf Ebrahimi             causes the range to be generated later when it is known that
6085*22dc650dSSadaf Ebrahimi             OP_XCLASS is required. In the 8-bit library this is relevant only in
6086*22dc650dSSadaf Ebrahimi             utf mode, since no wide characters can exist otherwise. */
6087*22dc650dSSadaf Ebrahimi 
6088*22dc650dSSadaf Ebrahimi             default:
6089*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
6090*22dc650dSSadaf Ebrahimi             if (utf)
6091*22dc650dSSadaf Ebrahimi #endif
6092*22dc650dSSadaf Ebrahimi             match_all_or_no_wide_chars |= local_negate;
6093*22dc650dSSadaf Ebrahimi             break;
6094*22dc650dSSadaf Ebrahimi             }
6095*22dc650dSSadaf Ebrahimi           }
6096*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
6097*22dc650dSSadaf Ebrahimi 
6098*22dc650dSSadaf Ebrahimi         /* In the non-UCP case, or when UCP makes no difference, we build the
6099*22dc650dSSadaf Ebrahimi         bit map for the POSIX class in a chunk of local store because we may
6100*22dc650dSSadaf Ebrahimi         be adding and subtracting from it, and we don't want to subtract bits
6101*22dc650dSSadaf Ebrahimi         that may be in the main map already. At the end we or the result into
6102*22dc650dSSadaf Ebrahimi         the bit map that is being built. */
6103*22dc650dSSadaf Ebrahimi 
6104*22dc650dSSadaf Ebrahimi         posix_class *= 3;
6105*22dc650dSSadaf Ebrahimi 
6106*22dc650dSSadaf Ebrahimi         /* Copy in the first table (always present) */
6107*22dc650dSSadaf Ebrahimi 
6108*22dc650dSSadaf Ebrahimi         memcpy(pbits, cbits + posix_class_maps[posix_class],
6109*22dc650dSSadaf Ebrahimi           32 * sizeof(uint8_t));
6110*22dc650dSSadaf Ebrahimi 
6111*22dc650dSSadaf Ebrahimi         /* If there is a second table, add or remove it as required. */
6112*22dc650dSSadaf Ebrahimi 
6113*22dc650dSSadaf Ebrahimi         taboffset = posix_class_maps[posix_class + 1];
6114*22dc650dSSadaf Ebrahimi         tabopt = posix_class_maps[posix_class + 2];
6115*22dc650dSSadaf Ebrahimi 
6116*22dc650dSSadaf Ebrahimi         if (taboffset >= 0)
6117*22dc650dSSadaf Ebrahimi           {
6118*22dc650dSSadaf Ebrahimi           if (tabopt >= 0)
6119*22dc650dSSadaf Ebrahimi             for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
6120*22dc650dSSadaf Ebrahimi           else
6121*22dc650dSSadaf Ebrahimi             for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
6122*22dc650dSSadaf Ebrahimi           }
6123*22dc650dSSadaf Ebrahimi 
6124*22dc650dSSadaf Ebrahimi         /* Now see if we need to remove any special characters. An option
6125*22dc650dSSadaf Ebrahimi         value of 1 removes vertical space and 2 removes underscore. */
6126*22dc650dSSadaf Ebrahimi 
6127*22dc650dSSadaf Ebrahimi         if (tabopt < 0) tabopt = -tabopt;
6128*22dc650dSSadaf Ebrahimi         if (tabopt == 1) pbits[1] &= ~0x3c;
6129*22dc650dSSadaf Ebrahimi           else if (tabopt == 2) pbits[11] &= 0x7f;
6130*22dc650dSSadaf Ebrahimi 
6131*22dc650dSSadaf Ebrahimi         /* Add the POSIX table or its complement into the main table that is
6132*22dc650dSSadaf Ebrahimi         being built and we are done. */
6133*22dc650dSSadaf Ebrahimi 
6134*22dc650dSSadaf Ebrahimi         if (local_negate)
6135*22dc650dSSadaf Ebrahimi           for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]);
6136*22dc650dSSadaf Ebrahimi         else
6137*22dc650dSSadaf Ebrahimi           for (int i = 0; i < 32; i++) classbits[i] |= pbits[i];
6138*22dc650dSSadaf Ebrahimi 
6139*22dc650dSSadaf Ebrahimi         /* Every class contains at least one < 256 character. */
6140*22dc650dSSadaf Ebrahimi 
6141*22dc650dSSadaf Ebrahimi         class_has_8bitchar = 1;
6142*22dc650dSSadaf Ebrahimi         goto CONTINUE_CLASS;    /* End of POSIX handling */
6143*22dc650dSSadaf Ebrahimi         }
6144*22dc650dSSadaf Ebrahimi 
6145*22dc650dSSadaf Ebrahimi       /* Other than POSIX classes, the only items we should encounter are
6146*22dc650dSSadaf Ebrahimi       \d-type escapes and literal characters (possibly as ranges). */
6147*22dc650dSSadaf Ebrahimi 
6148*22dc650dSSadaf Ebrahimi       if (meta == META_BIGVALUE)
6149*22dc650dSSadaf Ebrahimi         {
6150*22dc650dSSadaf Ebrahimi         meta = *(++pptr);
6151*22dc650dSSadaf Ebrahimi         goto CLASS_LITERAL;
6152*22dc650dSSadaf Ebrahimi         }
6153*22dc650dSSadaf Ebrahimi 
6154*22dc650dSSadaf Ebrahimi       /* Any other non-literal must be an escape */
6155*22dc650dSSadaf Ebrahimi 
6156*22dc650dSSadaf Ebrahimi       if (meta >= META_END)
6157*22dc650dSSadaf Ebrahimi         {
6158*22dc650dSSadaf Ebrahimi         if (META_CODE(meta) != META_ESCAPE)
6159*22dc650dSSadaf Ebrahimi           {
6160*22dc650dSSadaf Ebrahimi #ifdef DEBUG_SHOW_PARSED
6161*22dc650dSSadaf Ebrahimi           fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
6162*22dc650dSSadaf Ebrahimi                           "in character class\n", meta);
6163*22dc650dSSadaf Ebrahimi #endif
6164*22dc650dSSadaf Ebrahimi           *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
6165*22dc650dSSadaf Ebrahimi           return 0;
6166*22dc650dSSadaf Ebrahimi           }
6167*22dc650dSSadaf Ebrahimi         escape = META_DATA(meta);
6168*22dc650dSSadaf Ebrahimi 
6169*22dc650dSSadaf Ebrahimi         /* Every class contains at least one < 256 character. */
6170*22dc650dSSadaf Ebrahimi 
6171*22dc650dSSadaf Ebrahimi         class_has_8bitchar++;
6172*22dc650dSSadaf Ebrahimi 
6173*22dc650dSSadaf Ebrahimi         switch(escape)
6174*22dc650dSSadaf Ebrahimi           {
6175*22dc650dSSadaf Ebrahimi           case ESC_d:
6176*22dc650dSSadaf Ebrahimi           for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
6177*22dc650dSSadaf Ebrahimi           break;
6178*22dc650dSSadaf Ebrahimi 
6179*22dc650dSSadaf Ebrahimi           case ESC_D:
6180*22dc650dSSadaf Ebrahimi           should_flip_negation = TRUE;
6181*22dc650dSSadaf Ebrahimi           for (int i = 0; i < 32; i++)
6182*22dc650dSSadaf Ebrahimi             classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
6183*22dc650dSSadaf Ebrahimi           break;
6184*22dc650dSSadaf Ebrahimi 
6185*22dc650dSSadaf Ebrahimi           case ESC_w:
6186*22dc650dSSadaf Ebrahimi           for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
6187*22dc650dSSadaf Ebrahimi           break;
6188*22dc650dSSadaf Ebrahimi 
6189*22dc650dSSadaf Ebrahimi           case ESC_W:
6190*22dc650dSSadaf Ebrahimi           should_flip_negation = TRUE;
6191*22dc650dSSadaf Ebrahimi           for (int i = 0; i < 32; i++)
6192*22dc650dSSadaf Ebrahimi             classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
6193*22dc650dSSadaf Ebrahimi           break;
6194*22dc650dSSadaf Ebrahimi 
6195*22dc650dSSadaf Ebrahimi           /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
6196*22dc650dSSadaf Ebrahimi           5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
6197*22dc650dSSadaf Ebrahimi           previously set by something earlier in the character class.
6198*22dc650dSSadaf Ebrahimi           Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
6199*22dc650dSSadaf Ebrahimi           we could just adjust the appropriate bit. From PCRE 8.34 we no
6200*22dc650dSSadaf Ebrahimi           longer treat \s and \S specially. */
6201*22dc650dSSadaf Ebrahimi 
6202*22dc650dSSadaf Ebrahimi           case ESC_s:
6203*22dc650dSSadaf Ebrahimi           for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
6204*22dc650dSSadaf Ebrahimi           break;
6205*22dc650dSSadaf Ebrahimi 
6206*22dc650dSSadaf Ebrahimi           case ESC_S:
6207*22dc650dSSadaf Ebrahimi           should_flip_negation = TRUE;
6208*22dc650dSSadaf Ebrahimi           for (int i = 0; i < 32; i++)
6209*22dc650dSSadaf Ebrahimi             classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
6210*22dc650dSSadaf Ebrahimi           break;
6211*22dc650dSSadaf Ebrahimi 
6212*22dc650dSSadaf Ebrahimi           /* When adding the horizontal or vertical space lists to a class, or
6213*22dc650dSSadaf Ebrahimi           their complements, disable PCRE2_CASELESS, because it just wastes
6214*22dc650dSSadaf Ebrahimi           time, and in the "not-x" UTF cases can create unwanted duplicates in
6215*22dc650dSSadaf Ebrahimi           the XCLASS list (provoked by characters that have more than one other
6216*22dc650dSSadaf Ebrahimi           case and by both cases being in the same "not-x" sublist). */
6217*22dc650dSSadaf Ebrahimi 
6218*22dc650dSSadaf Ebrahimi           case ESC_h:
6219*22dc650dSSadaf Ebrahimi           (void)add_list_to_class(classbits, &class_uchardata,
6220*22dc650dSSadaf Ebrahimi             options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list),
6221*22dc650dSSadaf Ebrahimi               NOTACHAR);
6222*22dc650dSSadaf Ebrahimi           break;
6223*22dc650dSSadaf Ebrahimi 
6224*22dc650dSSadaf Ebrahimi           case ESC_H:
6225*22dc650dSSadaf Ebrahimi           (void)add_not_list_to_class(classbits, &class_uchardata,
6226*22dc650dSSadaf Ebrahimi             options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list));
6227*22dc650dSSadaf Ebrahimi           break;
6228*22dc650dSSadaf Ebrahimi 
6229*22dc650dSSadaf Ebrahimi           case ESC_v:
6230*22dc650dSSadaf Ebrahimi           (void)add_list_to_class(classbits, &class_uchardata,
6231*22dc650dSSadaf Ebrahimi             options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list),
6232*22dc650dSSadaf Ebrahimi               NOTACHAR);
6233*22dc650dSSadaf Ebrahimi           break;
6234*22dc650dSSadaf Ebrahimi 
6235*22dc650dSSadaf Ebrahimi           case ESC_V:
6236*22dc650dSSadaf Ebrahimi           (void)add_not_list_to_class(classbits, &class_uchardata,
6237*22dc650dSSadaf Ebrahimi             options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list));
6238*22dc650dSSadaf Ebrahimi           break;
6239*22dc650dSSadaf Ebrahimi 
6240*22dc650dSSadaf Ebrahimi           /* If Unicode is not supported, \P and \p are not allowed and are
6241*22dc650dSSadaf Ebrahimi           faulted at parse time, so will never appear here. */
6242*22dc650dSSadaf Ebrahimi 
6243*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
6244*22dc650dSSadaf Ebrahimi           case ESC_p:
6245*22dc650dSSadaf Ebrahimi           case ESC_P:
6246*22dc650dSSadaf Ebrahimi             {
6247*22dc650dSSadaf Ebrahimi             uint32_t ptype = *(++pptr) >> 16;
6248*22dc650dSSadaf Ebrahimi             uint32_t pdata = *pptr & 0xffff;
6249*22dc650dSSadaf Ebrahimi             *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
6250*22dc650dSSadaf Ebrahimi             *class_uchardata++ = ptype;
6251*22dc650dSSadaf Ebrahimi             *class_uchardata++ = pdata;
6252*22dc650dSSadaf Ebrahimi             xclass_has_prop = TRUE;
6253*22dc650dSSadaf Ebrahimi             class_has_8bitchar--;                /* Undo! */
6254*22dc650dSSadaf Ebrahimi             }
6255*22dc650dSSadaf Ebrahimi           break;
6256*22dc650dSSadaf Ebrahimi #endif
6257*22dc650dSSadaf Ebrahimi           }
6258*22dc650dSSadaf Ebrahimi 
6259*22dc650dSSadaf Ebrahimi         goto CONTINUE_CLASS;
6260*22dc650dSSadaf Ebrahimi         }  /* End handling \d-type escapes */
6261*22dc650dSSadaf Ebrahimi 
6262*22dc650dSSadaf Ebrahimi       /* A literal character may be followed by a range meta. At parse time
6263*22dc650dSSadaf Ebrahimi       there are checks for out-of-order characters, for ranges where the two
6264*22dc650dSSadaf Ebrahimi       characters are equal, and for hyphens that cannot indicate a range. At
6265*22dc650dSSadaf Ebrahimi       this point, therefore, no checking is needed. */
6266*22dc650dSSadaf Ebrahimi 
6267*22dc650dSSadaf Ebrahimi       else
6268*22dc650dSSadaf Ebrahimi         {
6269*22dc650dSSadaf Ebrahimi         uint32_t c, d;
6270*22dc650dSSadaf Ebrahimi 
6271*22dc650dSSadaf Ebrahimi         CLASS_LITERAL:
6272*22dc650dSSadaf Ebrahimi         c = d = meta;
6273*22dc650dSSadaf Ebrahimi 
6274*22dc650dSSadaf Ebrahimi         /* Remember if \r or \n were explicitly used */
6275*22dc650dSSadaf Ebrahimi 
6276*22dc650dSSadaf Ebrahimi         if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6277*22dc650dSSadaf Ebrahimi 
6278*22dc650dSSadaf Ebrahimi         /* Process a character range */
6279*22dc650dSSadaf Ebrahimi 
6280*22dc650dSSadaf Ebrahimi         if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
6281*22dc650dSSadaf Ebrahimi           {
6282*22dc650dSSadaf Ebrahimi #ifdef EBCDIC
6283*22dc650dSSadaf Ebrahimi           BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
6284*22dc650dSSadaf Ebrahimi #endif
6285*22dc650dSSadaf Ebrahimi           pptr += 2;
6286*22dc650dSSadaf Ebrahimi           d = *pptr;
6287*22dc650dSSadaf Ebrahimi           if (d == META_BIGVALUE) d = *(++pptr);
6288*22dc650dSSadaf Ebrahimi 
6289*22dc650dSSadaf Ebrahimi           /* Remember an explicit \r or \n, and add the range to the class. */
6290*22dc650dSSadaf Ebrahimi 
6291*22dc650dSSadaf Ebrahimi           if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6292*22dc650dSSadaf Ebrahimi 
6293*22dc650dSSadaf Ebrahimi           /* In an EBCDIC environment, Perl treats alphabetic ranges specially
6294*22dc650dSSadaf Ebrahimi           because there are holes in the encoding, and simply using the range
6295*22dc650dSSadaf Ebrahimi           A-Z (for example) would include the characters in the holes. This
6296*22dc650dSSadaf Ebrahimi           applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
6297*22dc650dSSadaf Ebrahimi 
6298*22dc650dSSadaf Ebrahimi #ifdef EBCDIC
6299*22dc650dSSadaf Ebrahimi           if (range_is_literal &&
6300*22dc650dSSadaf Ebrahimi                (cb->ctypes[c] & ctype_letter) != 0 &&
6301*22dc650dSSadaf Ebrahimi                (cb->ctypes[d] & ctype_letter) != 0 &&
6302*22dc650dSSadaf Ebrahimi                (c <= CHAR_z) == (d <= CHAR_z))
6303*22dc650dSSadaf Ebrahimi             {
6304*22dc650dSSadaf Ebrahimi             uint32_t uc = (d <= CHAR_z)? 0 : 64;
6305*22dc650dSSadaf Ebrahimi             uint32_t C = c - uc;
6306*22dc650dSSadaf Ebrahimi             uint32_t D = d - uc;
6307*22dc650dSSadaf Ebrahimi 
6308*22dc650dSSadaf Ebrahimi             if (C <= CHAR_i)
6309*22dc650dSSadaf Ebrahimi               {
6310*22dc650dSSadaf Ebrahimi               class_has_8bitchar +=
6311*22dc650dSSadaf Ebrahimi                 add_to_class(classbits, &class_uchardata, options, xoptions,
6312*22dc650dSSadaf Ebrahimi                   cb, C + uc, ((D < CHAR_i)? D : CHAR_i) + uc);
6313*22dc650dSSadaf Ebrahimi               C = CHAR_j;
6314*22dc650dSSadaf Ebrahimi               }
6315*22dc650dSSadaf Ebrahimi 
6316*22dc650dSSadaf Ebrahimi             if (C <= D && C <= CHAR_r)
6317*22dc650dSSadaf Ebrahimi               {
6318*22dc650dSSadaf Ebrahimi               class_has_8bitchar +=
6319*22dc650dSSadaf Ebrahimi                 add_to_class(classbits, &class_uchardata, options, xoptions,
6320*22dc650dSSadaf Ebrahimi                   cb, C + uc, ((D < CHAR_r)? D : CHAR_r) + uc);
6321*22dc650dSSadaf Ebrahimi               C = CHAR_s;
6322*22dc650dSSadaf Ebrahimi               }
6323*22dc650dSSadaf Ebrahimi 
6324*22dc650dSSadaf Ebrahimi             if (C <= D)
6325*22dc650dSSadaf Ebrahimi               {
6326*22dc650dSSadaf Ebrahimi               class_has_8bitchar +=
6327*22dc650dSSadaf Ebrahimi                 add_to_class(classbits, &class_uchardata, options, xoptions,
6328*22dc650dSSadaf Ebrahimi                   cb, C + uc, D + uc);
6329*22dc650dSSadaf Ebrahimi               }
6330*22dc650dSSadaf Ebrahimi             }
6331*22dc650dSSadaf Ebrahimi           else
6332*22dc650dSSadaf Ebrahimi #endif
6333*22dc650dSSadaf Ebrahimi           /* Not an EBCDIC special range */
6334*22dc650dSSadaf Ebrahimi 
6335*22dc650dSSadaf Ebrahimi           class_has_8bitchar += add_to_class(classbits, &class_uchardata,
6336*22dc650dSSadaf Ebrahimi             options, xoptions, cb, c, d);
6337*22dc650dSSadaf Ebrahimi           goto CONTINUE_CLASS;   /* Go get the next char in the class */
6338*22dc650dSSadaf Ebrahimi           }  /* End of range handling */
6339*22dc650dSSadaf Ebrahimi 
6340*22dc650dSSadaf Ebrahimi 
6341*22dc650dSSadaf Ebrahimi         /* Handle a single character. */
6342*22dc650dSSadaf Ebrahimi 
6343*22dc650dSSadaf Ebrahimi         class_has_8bitchar +=
6344*22dc650dSSadaf Ebrahimi           add_to_class(classbits, &class_uchardata, options, xoptions, cb,
6345*22dc650dSSadaf Ebrahimi             meta, meta);
6346*22dc650dSSadaf Ebrahimi         }
6347*22dc650dSSadaf Ebrahimi 
6348*22dc650dSSadaf Ebrahimi       /* Continue to the next item in the class. */
6349*22dc650dSSadaf Ebrahimi 
6350*22dc650dSSadaf Ebrahimi       CONTINUE_CLASS:
6351*22dc650dSSadaf Ebrahimi 
6352*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
6353*22dc650dSSadaf Ebrahimi       /* If any wide characters or Unicode properties have been encountered,
6354*22dc650dSSadaf Ebrahimi       set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
6355*22dc650dSSadaf Ebrahimi       of the extra data and reset the pointer. This is so that very large
6356*22dc650dSSadaf Ebrahimi       classes that contain a zillion wide characters or Unicode property tests
6357*22dc650dSSadaf Ebrahimi       do not overwrite the workspace (which is on the stack). */
6358*22dc650dSSadaf Ebrahimi 
6359*22dc650dSSadaf Ebrahimi       if (class_uchardata > class_uchardata_base)
6360*22dc650dSSadaf Ebrahimi         {
6361*22dc650dSSadaf Ebrahimi         xclass = TRUE;
6362*22dc650dSSadaf Ebrahimi         if (lengthptr != NULL)
6363*22dc650dSSadaf Ebrahimi           {
6364*22dc650dSSadaf Ebrahimi           *lengthptr += class_uchardata - class_uchardata_base;
6365*22dc650dSSadaf Ebrahimi           class_uchardata = class_uchardata_base;
6366*22dc650dSSadaf Ebrahimi           }
6367*22dc650dSSadaf Ebrahimi         }
6368*22dc650dSSadaf Ebrahimi #endif
6369*22dc650dSSadaf Ebrahimi 
6370*22dc650dSSadaf Ebrahimi       continue;  /* Needed to avoid error when not supporting wide chars */
6371*22dc650dSSadaf Ebrahimi       }   /* End of main class-processing loop */
6372*22dc650dSSadaf Ebrahimi 
6373*22dc650dSSadaf Ebrahimi     /* If this class is the first thing in the branch, there can be no first
6374*22dc650dSSadaf Ebrahimi     char setting, whatever the repeat count. Any reqcu setting must remain
6375*22dc650dSSadaf Ebrahimi     unchanged after any kind of repeat. */
6376*22dc650dSSadaf Ebrahimi 
6377*22dc650dSSadaf Ebrahimi     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6378*22dc650dSSadaf Ebrahimi     zerofirstcu = firstcu;
6379*22dc650dSSadaf Ebrahimi     zerofirstcuflags = firstcuflags;
6380*22dc650dSSadaf Ebrahimi     zeroreqcu = reqcu;
6381*22dc650dSSadaf Ebrahimi     zeroreqcuflags = reqcuflags;
6382*22dc650dSSadaf Ebrahimi 
6383*22dc650dSSadaf Ebrahimi     /* If there are characters with values > 255, or Unicode property settings
6384*22dc650dSSadaf Ebrahimi     (\p or \P), we have to compile an extended class, with its own opcode,
6385*22dc650dSSadaf Ebrahimi     unless there were no property settings and there was a negated special such
6386*22dc650dSSadaf Ebrahimi     as \S in the class, and PCRE2_UCP is not set, because in that case all
6387*22dc650dSSadaf Ebrahimi     characters > 255 are in or not in the class, so any that were explicitly
6388*22dc650dSSadaf Ebrahimi     given as well can be ignored.
6389*22dc650dSSadaf Ebrahimi 
6390*22dc650dSSadaf Ebrahimi     In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
6391*22dc650dSSadaf Ebrahimi     present in a class, we either have to match or not match all wide
6392*22dc650dSSadaf Ebrahimi     characters (depending on whether the whole class is or is not negated).
6393*22dc650dSSadaf Ebrahimi     This requirement is indicated by match_all_or_no_wide_chars being true.
6394*22dc650dSSadaf Ebrahimi     We do this by including an explicit range, which works in both cases.
6395*22dc650dSSadaf Ebrahimi     This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6396*22dc650dSSadaf Ebrahimi     cannot be any wide characters in 8-bit non-UTF mode.
6397*22dc650dSSadaf Ebrahimi 
6398*22dc650dSSadaf Ebrahimi     When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6399*22dc650dSSadaf Ebrahimi     class where \S etc is present without PCRE2_UCP, causing an extended class
6400*22dc650dSSadaf Ebrahimi     to be compiled, we make sure that all characters > 255 are included by
6401*22dc650dSSadaf Ebrahimi     forcing match_all_or_no_wide_chars to be true.
6402*22dc650dSSadaf Ebrahimi 
6403*22dc650dSSadaf Ebrahimi     If, when generating an xclass, there are no characters < 256, we can omit
6404*22dc650dSSadaf Ebrahimi     the bitmap in the actual compiled code. */
6405*22dc650dSSadaf Ebrahimi 
6406*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS  /* Defined for 16/32 bits, or 8-bit with Unicode */
6407*22dc650dSSadaf Ebrahimi     if (xclass && (
6408*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
6409*22dc650dSSadaf Ebrahimi         (options & PCRE2_UCP) != 0 ||
6410*22dc650dSSadaf Ebrahimi #endif
6411*22dc650dSSadaf Ebrahimi         xclass_has_prop || !should_flip_negation))
6412*22dc650dSSadaf Ebrahimi       {
6413*22dc650dSSadaf Ebrahimi       if (match_all_or_no_wide_chars || (
6414*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
6415*22dc650dSSadaf Ebrahimi            utf &&
6416*22dc650dSSadaf Ebrahimi #endif
6417*22dc650dSSadaf Ebrahimi            should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6418*22dc650dSSadaf Ebrahimi         {
6419*22dc650dSSadaf Ebrahimi         *class_uchardata++ = XCL_RANGE;
6420*22dc650dSSadaf Ebrahimi         if (utf)   /* Will always be utf in the 8-bit library */
6421*22dc650dSSadaf Ebrahimi           {
6422*22dc650dSSadaf Ebrahimi           class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6423*22dc650dSSadaf Ebrahimi           class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6424*22dc650dSSadaf Ebrahimi           }
6425*22dc650dSSadaf Ebrahimi         else       /* Can only happen for the 16-bit & 32-bit libraries */
6426*22dc650dSSadaf Ebrahimi           {
6427*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 16
6428*22dc650dSSadaf Ebrahimi           *class_uchardata++ = 0x100;
6429*22dc650dSSadaf Ebrahimi           *class_uchardata++ = 0xffffu;
6430*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 32
6431*22dc650dSSadaf Ebrahimi           *class_uchardata++ = 0x100;
6432*22dc650dSSadaf Ebrahimi           *class_uchardata++ = 0xffffffffu;
6433*22dc650dSSadaf Ebrahimi #endif
6434*22dc650dSSadaf Ebrahimi           }
6435*22dc650dSSadaf Ebrahimi         }
6436*22dc650dSSadaf Ebrahimi       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
6437*22dc650dSSadaf Ebrahimi       *code++ = OP_XCLASS;
6438*22dc650dSSadaf Ebrahimi       code += LINK_SIZE;
6439*22dc650dSSadaf Ebrahimi       *code = negate_class? XCL_NOT:0;
6440*22dc650dSSadaf Ebrahimi       if (xclass_has_prop) *code |= XCL_HASPROP;
6441*22dc650dSSadaf Ebrahimi 
6442*22dc650dSSadaf Ebrahimi       /* If the map is required, move up the extra data to make room for it;
6443*22dc650dSSadaf Ebrahimi       otherwise just move the code pointer to the end of the extra data. */
6444*22dc650dSSadaf Ebrahimi 
6445*22dc650dSSadaf Ebrahimi       if (class_has_8bitchar > 0)
6446*22dc650dSSadaf Ebrahimi         {
6447*22dc650dSSadaf Ebrahimi         *code++ |= XCL_MAP;
6448*22dc650dSSadaf Ebrahimi         (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6449*22dc650dSSadaf Ebrahimi           CU2BYTES(class_uchardata - code));
6450*22dc650dSSadaf Ebrahimi         if (negate_class && !xclass_has_prop)
6451*22dc650dSSadaf Ebrahimi           {
6452*22dc650dSSadaf Ebrahimi           /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6453*22dc650dSSadaf Ebrahimi           for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6454*22dc650dSSadaf Ebrahimi           }
6455*22dc650dSSadaf Ebrahimi         memcpy(code, classbits, 32);
6456*22dc650dSSadaf Ebrahimi         code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6457*22dc650dSSadaf Ebrahimi         }
6458*22dc650dSSadaf Ebrahimi       else code = class_uchardata;
6459*22dc650dSSadaf Ebrahimi 
6460*22dc650dSSadaf Ebrahimi       /* Now fill in the complete length of the item */
6461*22dc650dSSadaf Ebrahimi 
6462*22dc650dSSadaf Ebrahimi       PUT(previous, 1, (int)(code - previous));
6463*22dc650dSSadaf Ebrahimi       break;   /* End of class handling */
6464*22dc650dSSadaf Ebrahimi       }
6465*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_WIDE_CHARS */
6466*22dc650dSSadaf Ebrahimi 
6467*22dc650dSSadaf Ebrahimi     /* If there are no characters > 255, or they are all to be included or
6468*22dc650dSSadaf Ebrahimi     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6469*22dc650dSSadaf Ebrahimi     whole class was negated and whether there were negative specials such as \S
6470*22dc650dSSadaf Ebrahimi     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6471*22dc650dSSadaf Ebrahimi     negating it if necessary. */
6472*22dc650dSSadaf Ebrahimi 
6473*22dc650dSSadaf Ebrahimi     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6474*22dc650dSSadaf Ebrahimi     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
6475*22dc650dSSadaf Ebrahimi       {
6476*22dc650dSSadaf Ebrahimi       if (negate_class)
6477*22dc650dSSadaf Ebrahimi         {
6478*22dc650dSSadaf Ebrahimi        /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6479*22dc650dSSadaf Ebrahimi        for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6480*22dc650dSSadaf Ebrahimi        }
6481*22dc650dSSadaf Ebrahimi       memcpy(code, classbits, 32);
6482*22dc650dSSadaf Ebrahimi       }
6483*22dc650dSSadaf Ebrahimi     code += 32 / sizeof(PCRE2_UCHAR);
6484*22dc650dSSadaf Ebrahimi     break;  /* End of class processing */
6485*22dc650dSSadaf Ebrahimi 
6486*22dc650dSSadaf Ebrahimi 
6487*22dc650dSSadaf Ebrahimi     /* ===================================================================*/
6488*22dc650dSSadaf Ebrahimi     /* Deal with (*VERB)s. */
6489*22dc650dSSadaf Ebrahimi 
6490*22dc650dSSadaf Ebrahimi     /* Check for open captures before ACCEPT and close those that are within
6491*22dc650dSSadaf Ebrahimi     the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6492*22dc650dSSadaf Ebrahimi     assertion. In the first pass, just accumulate the length required;
6493*22dc650dSSadaf Ebrahimi     otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6494*22dc650dSSadaf Ebrahimi     workspace overflow. Do not set firstcu after *ACCEPT. */
6495*22dc650dSSadaf Ebrahimi 
6496*22dc650dSSadaf Ebrahimi     case META_ACCEPT:
6497*22dc650dSSadaf Ebrahimi     cb->had_accept = had_accept = TRUE;
6498*22dc650dSSadaf Ebrahimi     for (oc = open_caps;
6499*22dc650dSSadaf Ebrahimi          oc != NULL && oc->assert_depth >= cb->assert_depth;
6500*22dc650dSSadaf Ebrahimi          oc = oc->next)
6501*22dc650dSSadaf Ebrahimi       {
6502*22dc650dSSadaf Ebrahimi       if (lengthptr != NULL)
6503*22dc650dSSadaf Ebrahimi         {
6504*22dc650dSSadaf Ebrahimi         *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6505*22dc650dSSadaf Ebrahimi         }
6506*22dc650dSSadaf Ebrahimi       else
6507*22dc650dSSadaf Ebrahimi         {
6508*22dc650dSSadaf Ebrahimi         *code++ = OP_CLOSE;
6509*22dc650dSSadaf Ebrahimi         PUT2INC(code, 0, oc->number);
6510*22dc650dSSadaf Ebrahimi         }
6511*22dc650dSSadaf Ebrahimi       }
6512*22dc650dSSadaf Ebrahimi     *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6513*22dc650dSSadaf Ebrahimi     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6514*22dc650dSSadaf Ebrahimi     break;
6515*22dc650dSSadaf Ebrahimi 
6516*22dc650dSSadaf Ebrahimi     case META_PRUNE:
6517*22dc650dSSadaf Ebrahimi     case META_SKIP:
6518*22dc650dSSadaf Ebrahimi     cb->had_pruneorskip = TRUE;
6519*22dc650dSSadaf Ebrahimi     /* Fall through */
6520*22dc650dSSadaf Ebrahimi     case META_COMMIT:
6521*22dc650dSSadaf Ebrahimi     case META_FAIL:
6522*22dc650dSSadaf Ebrahimi     *code++ = verbops[(meta - META_MARK) >> 16];
6523*22dc650dSSadaf Ebrahimi     break;
6524*22dc650dSSadaf Ebrahimi 
6525*22dc650dSSadaf Ebrahimi     case META_THEN:
6526*22dc650dSSadaf Ebrahimi     cb->external_flags |= PCRE2_HASTHEN;
6527*22dc650dSSadaf Ebrahimi     *code++ = OP_THEN;
6528*22dc650dSSadaf Ebrahimi     break;
6529*22dc650dSSadaf Ebrahimi 
6530*22dc650dSSadaf Ebrahimi     /* Handle verbs with arguments. Arguments can be very long, especially in
6531*22dc650dSSadaf Ebrahimi     16- and 32-bit modes, and can overflow the workspace in the first pass.
6532*22dc650dSSadaf Ebrahimi     However, the argument length is constrained to be small enough to fit in
6533*22dc650dSSadaf Ebrahimi     one code unit. This check happens in parse_regex(). In the first pass,
6534*22dc650dSSadaf Ebrahimi     instead of putting the argument into memory, we just update the length
6535*22dc650dSSadaf Ebrahimi     counter and set up an empty argument. */
6536*22dc650dSSadaf Ebrahimi 
6537*22dc650dSSadaf Ebrahimi     case META_THEN_ARG:
6538*22dc650dSSadaf Ebrahimi     cb->external_flags |= PCRE2_HASTHEN;
6539*22dc650dSSadaf Ebrahimi     goto VERB_ARG;
6540*22dc650dSSadaf Ebrahimi 
6541*22dc650dSSadaf Ebrahimi     case META_PRUNE_ARG:
6542*22dc650dSSadaf Ebrahimi     case META_SKIP_ARG:
6543*22dc650dSSadaf Ebrahimi     cb->had_pruneorskip = TRUE;
6544*22dc650dSSadaf Ebrahimi     /* Fall through */
6545*22dc650dSSadaf Ebrahimi     case META_MARK:
6546*22dc650dSSadaf Ebrahimi     case META_COMMIT_ARG:
6547*22dc650dSSadaf Ebrahimi     VERB_ARG:
6548*22dc650dSSadaf Ebrahimi     *code++ = verbops[(meta - META_MARK) >> 16];
6549*22dc650dSSadaf Ebrahimi     /* The length is in characters. */
6550*22dc650dSSadaf Ebrahimi     verbarglen = *(++pptr);
6551*22dc650dSSadaf Ebrahimi     verbculen = 0;
6552*22dc650dSSadaf Ebrahimi     tempcode = code++;
6553*22dc650dSSadaf Ebrahimi     for (int i = 0; i < (int)verbarglen; i++)
6554*22dc650dSSadaf Ebrahimi       {
6555*22dc650dSSadaf Ebrahimi       meta = *(++pptr);
6556*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
6557*22dc650dSSadaf Ebrahimi       if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6558*22dc650dSSadaf Ebrahimi #endif
6559*22dc650dSSadaf Ebrahimi         {
6560*22dc650dSSadaf Ebrahimi         mclength = 1;
6561*22dc650dSSadaf Ebrahimi         mcbuffer[0] = meta;
6562*22dc650dSSadaf Ebrahimi         }
6563*22dc650dSSadaf Ebrahimi       if (lengthptr != NULL) *lengthptr += mclength; else
6564*22dc650dSSadaf Ebrahimi         {
6565*22dc650dSSadaf Ebrahimi         memcpy(code, mcbuffer, CU2BYTES(mclength));
6566*22dc650dSSadaf Ebrahimi         code += mclength;
6567*22dc650dSSadaf Ebrahimi         verbculen += mclength;
6568*22dc650dSSadaf Ebrahimi         }
6569*22dc650dSSadaf Ebrahimi       }
6570*22dc650dSSadaf Ebrahimi 
6571*22dc650dSSadaf Ebrahimi     *tempcode = verbculen;   /* Fill in the code unit length */
6572*22dc650dSSadaf Ebrahimi     *code++ = 0;             /* Terminating zero */
6573*22dc650dSSadaf Ebrahimi     break;
6574*22dc650dSSadaf Ebrahimi 
6575*22dc650dSSadaf Ebrahimi 
6576*22dc650dSSadaf Ebrahimi     /* ===================================================================*/
6577*22dc650dSSadaf Ebrahimi     /* Handle options change. The new setting must be passed back for use in
6578*22dc650dSSadaf Ebrahimi     subsequent branches. Reset the greedy defaults and the case value for
6579*22dc650dSSadaf Ebrahimi     firstcu and reqcu. */
6580*22dc650dSSadaf Ebrahimi 
6581*22dc650dSSadaf Ebrahimi     case META_OPTIONS:
6582*22dc650dSSadaf Ebrahimi     *optionsptr = options = *(++pptr);
6583*22dc650dSSadaf Ebrahimi     *xoptionsptr = xoptions = *(++pptr);
6584*22dc650dSSadaf Ebrahimi     greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6585*22dc650dSSadaf Ebrahimi     greedy_non_default = greedy_default ^ 1;
6586*22dc650dSSadaf Ebrahimi     req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6587*22dc650dSSadaf Ebrahimi     break;
6588*22dc650dSSadaf Ebrahimi 
6589*22dc650dSSadaf Ebrahimi 
6590*22dc650dSSadaf Ebrahimi     /* ===================================================================*/
6591*22dc650dSSadaf Ebrahimi     /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6592*22dc650dSSadaf Ebrahimi     because it could be a numerical check on recursion, or a name check on a
6593*22dc650dSSadaf Ebrahimi     group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6594*22dc650dSSadaf Ebrahimi     we can handle it either way. We first try for a name; if not found, process
6595*22dc650dSSadaf Ebrahimi     the number. */
6596*22dc650dSSadaf Ebrahimi 
6597*22dc650dSSadaf Ebrahimi     case META_COND_RNUMBER:   /* (?(Rdigits) */
6598*22dc650dSSadaf Ebrahimi     case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
6599*22dc650dSSadaf Ebrahimi     case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6600*22dc650dSSadaf Ebrahimi     bravalue = OP_COND;
6601*22dc650dSSadaf Ebrahimi       {
6602*22dc650dSSadaf Ebrahimi       int count, index;
6603*22dc650dSSadaf Ebrahimi       unsigned int i;
6604*22dc650dSSadaf Ebrahimi       PCRE2_SPTR name;
6605*22dc650dSSadaf Ebrahimi       named_group *ng = cb->named_groups;
6606*22dc650dSSadaf Ebrahimi       uint32_t length = *(++pptr);
6607*22dc650dSSadaf Ebrahimi 
6608*22dc650dSSadaf Ebrahimi       GETPLUSOFFSET(offset, pptr);
6609*22dc650dSSadaf Ebrahimi       name = cb->start_pattern + offset;
6610*22dc650dSSadaf Ebrahimi 
6611*22dc650dSSadaf Ebrahimi       /* In the first pass, the names generated in the pre-pass are available,
6612*22dc650dSSadaf Ebrahimi       but the main name table has not yet been created. Scan the list of names
6613*22dc650dSSadaf Ebrahimi       generated in the pre-pass in order to get a number and whether or not
6614*22dc650dSSadaf Ebrahimi       this name is duplicated. If it is not duplicated, we can handle it as a
6615*22dc650dSSadaf Ebrahimi       numerical group. */
6616*22dc650dSSadaf Ebrahimi 
6617*22dc650dSSadaf Ebrahimi       for (i = 0; i < cb->names_found; i++, ng++)
6618*22dc650dSSadaf Ebrahimi         {
6619*22dc650dSSadaf Ebrahimi         if (length == ng->length &&
6620*22dc650dSSadaf Ebrahimi             PRIV(strncmp)(name, ng->name, length) == 0)
6621*22dc650dSSadaf Ebrahimi           {
6622*22dc650dSSadaf Ebrahimi           if (!ng->isdup)
6623*22dc650dSSadaf Ebrahimi             {
6624*22dc650dSSadaf Ebrahimi             code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6625*22dc650dSSadaf Ebrahimi             PUT2(code, 2+LINK_SIZE, ng->number);
6626*22dc650dSSadaf Ebrahimi             if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6627*22dc650dSSadaf Ebrahimi             skipunits = 1+IMM2_SIZE;
6628*22dc650dSSadaf Ebrahimi             goto GROUP_PROCESS_NOTE_EMPTY;
6629*22dc650dSSadaf Ebrahimi             }
6630*22dc650dSSadaf Ebrahimi           break;  /* Found a duplicated name */
6631*22dc650dSSadaf Ebrahimi           }
6632*22dc650dSSadaf Ebrahimi         }
6633*22dc650dSSadaf Ebrahimi 
6634*22dc650dSSadaf Ebrahimi       /* If the name was not found we have a bad reference, unless we are
6635*22dc650dSSadaf Ebrahimi       dealing with R<digits>, which is treated as a recursion test by number.
6636*22dc650dSSadaf Ebrahimi       */
6637*22dc650dSSadaf Ebrahimi 
6638*22dc650dSSadaf Ebrahimi       if (i >= cb->names_found)
6639*22dc650dSSadaf Ebrahimi         {
6640*22dc650dSSadaf Ebrahimi         groupnumber = 0;
6641*22dc650dSSadaf Ebrahimi         if (meta == META_COND_RNUMBER)
6642*22dc650dSSadaf Ebrahimi           {
6643*22dc650dSSadaf Ebrahimi           for (i = 1; i < length; i++)
6644*22dc650dSSadaf Ebrahimi             {
6645*22dc650dSSadaf Ebrahimi             groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6646*22dc650dSSadaf Ebrahimi             if (groupnumber > MAX_GROUP_NUMBER)
6647*22dc650dSSadaf Ebrahimi               {
6648*22dc650dSSadaf Ebrahimi               *errorcodeptr = ERR61;
6649*22dc650dSSadaf Ebrahimi               cb->erroroffset = offset + i;
6650*22dc650dSSadaf Ebrahimi               return 0;
6651*22dc650dSSadaf Ebrahimi               }
6652*22dc650dSSadaf Ebrahimi             }
6653*22dc650dSSadaf Ebrahimi           }
6654*22dc650dSSadaf Ebrahimi 
6655*22dc650dSSadaf Ebrahimi         if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6656*22dc650dSSadaf Ebrahimi           {
6657*22dc650dSSadaf Ebrahimi           *errorcodeptr = ERR15;
6658*22dc650dSSadaf Ebrahimi           cb->erroroffset = offset;
6659*22dc650dSSadaf Ebrahimi           return 0;
6660*22dc650dSSadaf Ebrahimi           }
6661*22dc650dSSadaf Ebrahimi 
6662*22dc650dSSadaf Ebrahimi         /* (?(Rdigits) is treated as a recursion test by number. A value of
6663*22dc650dSSadaf Ebrahimi         zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6664*22dc650dSSadaf Ebrahimi         translated into RREF_ANY (which is 0xffff). */
6665*22dc650dSSadaf Ebrahimi 
6666*22dc650dSSadaf Ebrahimi         if (groupnumber == 0) groupnumber = RREF_ANY;
6667*22dc650dSSadaf Ebrahimi         code[1+LINK_SIZE] = OP_RREF;
6668*22dc650dSSadaf Ebrahimi         PUT2(code, 2+LINK_SIZE, groupnumber);
6669*22dc650dSSadaf Ebrahimi         skipunits = 1+IMM2_SIZE;
6670*22dc650dSSadaf Ebrahimi         goto GROUP_PROCESS_NOTE_EMPTY;
6671*22dc650dSSadaf Ebrahimi         }
6672*22dc650dSSadaf Ebrahimi 
6673*22dc650dSSadaf Ebrahimi       /* A duplicated name was found. Note that if an R<digits> name is found
6674*22dc650dSSadaf Ebrahimi       (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6675*22dc650dSSadaf Ebrahimi 
6676*22dc650dSSadaf Ebrahimi       code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6677*22dc650dSSadaf Ebrahimi 
6678*22dc650dSSadaf Ebrahimi       /* We have a duplicated name. In the compile pass we have to search the
6679*22dc650dSSadaf Ebrahimi       main table in order to get the index and count values. */
6680*22dc650dSSadaf Ebrahimi 
6681*22dc650dSSadaf Ebrahimi       count = 0;  /* Values for first pass (avoids compiler warning) */
6682*22dc650dSSadaf Ebrahimi       index = 0;
6683*22dc650dSSadaf Ebrahimi       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6684*22dc650dSSadaf Ebrahimi             &count, errorcodeptr, cb)) return 0;
6685*22dc650dSSadaf Ebrahimi 
6686*22dc650dSSadaf Ebrahimi       /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6687*22dc650dSSadaf Ebrahimi       insert appropriate data values. */
6688*22dc650dSSadaf Ebrahimi 
6689*22dc650dSSadaf Ebrahimi       code[1+LINK_SIZE]++;
6690*22dc650dSSadaf Ebrahimi       skipunits = 1+2*IMM2_SIZE;
6691*22dc650dSSadaf Ebrahimi       PUT2(code, 2+LINK_SIZE, index);
6692*22dc650dSSadaf Ebrahimi       PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6693*22dc650dSSadaf Ebrahimi       }
6694*22dc650dSSadaf Ebrahimi     goto GROUP_PROCESS_NOTE_EMPTY;
6695*22dc650dSSadaf Ebrahimi 
6696*22dc650dSSadaf Ebrahimi     /* The DEFINE condition is always false. Its internal groups may never
6697*22dc650dSSadaf Ebrahimi     be called, so matched_char must remain false, hence the jump to
6698*22dc650dSSadaf Ebrahimi     GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6699*22dc650dSSadaf Ebrahimi 
6700*22dc650dSSadaf Ebrahimi     case META_COND_DEFINE:
6701*22dc650dSSadaf Ebrahimi     bravalue = OP_COND;
6702*22dc650dSSadaf Ebrahimi     GETPLUSOFFSET(offset, pptr);
6703*22dc650dSSadaf Ebrahimi     code[1+LINK_SIZE] = OP_DEFINE;
6704*22dc650dSSadaf Ebrahimi     skipunits = 1;
6705*22dc650dSSadaf Ebrahimi     goto GROUP_PROCESS;
6706*22dc650dSSadaf Ebrahimi 
6707*22dc650dSSadaf Ebrahimi     /* Conditional test of a group's being set. */
6708*22dc650dSSadaf Ebrahimi 
6709*22dc650dSSadaf Ebrahimi     case META_COND_NUMBER:
6710*22dc650dSSadaf Ebrahimi     bravalue = OP_COND;
6711*22dc650dSSadaf Ebrahimi     GETPLUSOFFSET(offset, pptr);
6712*22dc650dSSadaf Ebrahimi     groupnumber = *(++pptr);
6713*22dc650dSSadaf Ebrahimi     if (groupnumber > cb->bracount)
6714*22dc650dSSadaf Ebrahimi       {
6715*22dc650dSSadaf Ebrahimi       *errorcodeptr = ERR15;
6716*22dc650dSSadaf Ebrahimi       cb->erroroffset = offset;
6717*22dc650dSSadaf Ebrahimi       return 0;
6718*22dc650dSSadaf Ebrahimi       }
6719*22dc650dSSadaf Ebrahimi     if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6720*22dc650dSSadaf Ebrahimi     offset -= 2;   /* Point at initial ( for too many branches error */
6721*22dc650dSSadaf Ebrahimi     code[1+LINK_SIZE] = OP_CREF;
6722*22dc650dSSadaf Ebrahimi     skipunits = 1+IMM2_SIZE;
6723*22dc650dSSadaf Ebrahimi     PUT2(code, 2+LINK_SIZE, groupnumber);
6724*22dc650dSSadaf Ebrahimi     goto GROUP_PROCESS_NOTE_EMPTY;
6725*22dc650dSSadaf Ebrahimi 
6726*22dc650dSSadaf Ebrahimi     /* Test for the PCRE2 version. */
6727*22dc650dSSadaf Ebrahimi 
6728*22dc650dSSadaf Ebrahimi     case META_COND_VERSION:
6729*22dc650dSSadaf Ebrahimi     bravalue = OP_COND;
6730*22dc650dSSadaf Ebrahimi     if (pptr[1] > 0)
6731*22dc650dSSadaf Ebrahimi       code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6732*22dc650dSSadaf Ebrahimi         (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6733*22dc650dSSadaf Ebrahimi           OP_TRUE : OP_FALSE;
6734*22dc650dSSadaf Ebrahimi     else
6735*22dc650dSSadaf Ebrahimi       code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6736*22dc650dSSadaf Ebrahimi         OP_TRUE : OP_FALSE;
6737*22dc650dSSadaf Ebrahimi     skipunits = 1;
6738*22dc650dSSadaf Ebrahimi     pptr += 3;
6739*22dc650dSSadaf Ebrahimi     goto GROUP_PROCESS_NOTE_EMPTY;
6740*22dc650dSSadaf Ebrahimi 
6741*22dc650dSSadaf Ebrahimi     /* The condition is an assertion, possibly preceded by a callout. */
6742*22dc650dSSadaf Ebrahimi 
6743*22dc650dSSadaf Ebrahimi     case META_COND_ASSERT:
6744*22dc650dSSadaf Ebrahimi     bravalue = OP_COND;
6745*22dc650dSSadaf Ebrahimi     goto GROUP_PROCESS_NOTE_EMPTY;
6746*22dc650dSSadaf Ebrahimi 
6747*22dc650dSSadaf Ebrahimi 
6748*22dc650dSSadaf Ebrahimi     /* ===================================================================*/
6749*22dc650dSSadaf Ebrahimi     /* Handle all kinds of nested bracketed groups. The non-capturing,
6750*22dc650dSSadaf Ebrahimi     non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6751*22dc650dSSadaf Ebrahimi 
6752*22dc650dSSadaf Ebrahimi     case META_LOOKAHEAD:
6753*22dc650dSSadaf Ebrahimi     bravalue = OP_ASSERT;
6754*22dc650dSSadaf Ebrahimi     cb->assert_depth += 1;
6755*22dc650dSSadaf Ebrahimi     goto GROUP_PROCESS;
6756*22dc650dSSadaf Ebrahimi 
6757*22dc650dSSadaf Ebrahimi     case META_LOOKAHEAD_NA:
6758*22dc650dSSadaf Ebrahimi     bravalue = OP_ASSERT_NA;
6759*22dc650dSSadaf Ebrahimi     cb->assert_depth += 1;
6760*22dc650dSSadaf Ebrahimi     goto GROUP_PROCESS;
6761*22dc650dSSadaf Ebrahimi 
6762*22dc650dSSadaf Ebrahimi     /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6763*22dc650dSSadaf Ebrahimi     thing to do, but Perl allows all assertions to be quantified, and when
6764*22dc650dSSadaf Ebrahimi     they contain capturing parentheses there may be a potential use for
6765*22dc650dSSadaf Ebrahimi     this feature. Not that that applies to a quantified (?!) but we allow
6766*22dc650dSSadaf Ebrahimi     it for uniformity. */
6767*22dc650dSSadaf Ebrahimi 
6768*22dc650dSSadaf Ebrahimi     case META_LOOKAHEADNOT:
6769*22dc650dSSadaf Ebrahimi     if (pptr[1] == META_KET &&
6770*22dc650dSSadaf Ebrahimi          (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6771*22dc650dSSadaf Ebrahimi       {
6772*22dc650dSSadaf Ebrahimi       *code++ = OP_FAIL;
6773*22dc650dSSadaf Ebrahimi       pptr++;
6774*22dc650dSSadaf Ebrahimi       }
6775*22dc650dSSadaf Ebrahimi     else
6776*22dc650dSSadaf Ebrahimi       {
6777*22dc650dSSadaf Ebrahimi       bravalue = OP_ASSERT_NOT;
6778*22dc650dSSadaf Ebrahimi       cb->assert_depth += 1;
6779*22dc650dSSadaf Ebrahimi       goto GROUP_PROCESS;
6780*22dc650dSSadaf Ebrahimi       }
6781*22dc650dSSadaf Ebrahimi     break;
6782*22dc650dSSadaf Ebrahimi 
6783*22dc650dSSadaf Ebrahimi     case META_LOOKBEHIND:
6784*22dc650dSSadaf Ebrahimi     bravalue = OP_ASSERTBACK;
6785*22dc650dSSadaf Ebrahimi     cb->assert_depth += 1;
6786*22dc650dSSadaf Ebrahimi     goto GROUP_PROCESS;
6787*22dc650dSSadaf Ebrahimi 
6788*22dc650dSSadaf Ebrahimi     case META_LOOKBEHINDNOT:
6789*22dc650dSSadaf Ebrahimi     bravalue = OP_ASSERTBACK_NOT;
6790*22dc650dSSadaf Ebrahimi     cb->assert_depth += 1;
6791*22dc650dSSadaf Ebrahimi     goto GROUP_PROCESS;
6792*22dc650dSSadaf Ebrahimi 
6793*22dc650dSSadaf Ebrahimi     case META_LOOKBEHIND_NA:
6794*22dc650dSSadaf Ebrahimi     bravalue = OP_ASSERTBACK_NA;
6795*22dc650dSSadaf Ebrahimi     cb->assert_depth += 1;
6796*22dc650dSSadaf Ebrahimi     goto GROUP_PROCESS;
6797*22dc650dSSadaf Ebrahimi 
6798*22dc650dSSadaf Ebrahimi     case META_ATOMIC:
6799*22dc650dSSadaf Ebrahimi     bravalue = OP_ONCE;
6800*22dc650dSSadaf Ebrahimi     goto GROUP_PROCESS_NOTE_EMPTY;
6801*22dc650dSSadaf Ebrahimi 
6802*22dc650dSSadaf Ebrahimi     case META_SCRIPT_RUN:
6803*22dc650dSSadaf Ebrahimi     bravalue = OP_SCRIPT_RUN;
6804*22dc650dSSadaf Ebrahimi     goto GROUP_PROCESS_NOTE_EMPTY;
6805*22dc650dSSadaf Ebrahimi 
6806*22dc650dSSadaf Ebrahimi     case META_NOCAPTURE:
6807*22dc650dSSadaf Ebrahimi     bravalue = OP_BRA;
6808*22dc650dSSadaf Ebrahimi     /* Fall through */
6809*22dc650dSSadaf Ebrahimi 
6810*22dc650dSSadaf Ebrahimi     /* Process nested bracketed regex. The nesting depth is maintained for the
6811*22dc650dSSadaf Ebrahimi     benefit of the stackguard function. The test for too deep nesting is now
6812*22dc650dSSadaf Ebrahimi     done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6813*22dc650dSSadaf Ebrahimi     others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6814*22dc650dSSadaf Ebrahimi     note of whether or not they may match an empty string. */
6815*22dc650dSSadaf Ebrahimi 
6816*22dc650dSSadaf Ebrahimi     GROUP_PROCESS_NOTE_EMPTY:
6817*22dc650dSSadaf Ebrahimi     note_group_empty = TRUE;
6818*22dc650dSSadaf Ebrahimi 
6819*22dc650dSSadaf Ebrahimi     GROUP_PROCESS:
6820*22dc650dSSadaf Ebrahimi     cb->parens_depth += 1;
6821*22dc650dSSadaf Ebrahimi     *code = bravalue;
6822*22dc650dSSadaf Ebrahimi     pptr++;
6823*22dc650dSSadaf Ebrahimi     tempcode = code;
6824*22dc650dSSadaf Ebrahimi     tempreqvary = cb->req_varyopt;        /* Save value before group */
6825*22dc650dSSadaf Ebrahimi     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6826*22dc650dSSadaf Ebrahimi 
6827*22dc650dSSadaf Ebrahimi     if ((group_return =
6828*22dc650dSSadaf Ebrahimi          compile_regex(
6829*22dc650dSSadaf Ebrahimi          options,                         /* The options state */
6830*22dc650dSSadaf Ebrahimi          xoptions,                        /* The extra options state */
6831*22dc650dSSadaf Ebrahimi          &tempcode,                       /* Where to put code (updated) */
6832*22dc650dSSadaf Ebrahimi          &pptr,                           /* Input pointer (updated) */
6833*22dc650dSSadaf Ebrahimi          errorcodeptr,                    /* Where to put an error message */
6834*22dc650dSSadaf Ebrahimi          skipunits,                       /* Skip over bracket number */
6835*22dc650dSSadaf Ebrahimi          &subfirstcu,                     /* For possible first char */
6836*22dc650dSSadaf Ebrahimi          &subfirstcuflags,
6837*22dc650dSSadaf Ebrahimi          &subreqcu,                       /* For possible last char */
6838*22dc650dSSadaf Ebrahimi          &subreqcuflags,
6839*22dc650dSSadaf Ebrahimi          bcptr,                           /* Current branch chain */
6840*22dc650dSSadaf Ebrahimi          open_caps,                       /* Pointer to capture stack */
6841*22dc650dSSadaf Ebrahimi          cb,                              /* Compile data block */
6842*22dc650dSSadaf Ebrahimi          (lengthptr == NULL)? NULL :      /* Actual compile phase */
6843*22dc650dSSadaf Ebrahimi            &length_prevgroup              /* Pre-compile phase */
6844*22dc650dSSadaf Ebrahimi          )) == 0)
6845*22dc650dSSadaf Ebrahimi       return 0;  /* Error */
6846*22dc650dSSadaf Ebrahimi 
6847*22dc650dSSadaf Ebrahimi     cb->parens_depth -= 1;
6848*22dc650dSSadaf Ebrahimi 
6849*22dc650dSSadaf Ebrahimi     /* If that was a non-conditional significant group (not an assertion, not a
6850*22dc650dSSadaf Ebrahimi     DEFINE) that matches at least one character, then the current item matches
6851*22dc650dSSadaf Ebrahimi     a character. Conditionals are handled below. */
6852*22dc650dSSadaf Ebrahimi 
6853*22dc650dSSadaf Ebrahimi     if (note_group_empty && bravalue != OP_COND && group_return > 0)
6854*22dc650dSSadaf Ebrahimi       matched_char = TRUE;
6855*22dc650dSSadaf Ebrahimi 
6856*22dc650dSSadaf Ebrahimi     /* If we've just compiled an assertion, pop the assert depth. */
6857*22dc650dSSadaf Ebrahimi 
6858*22dc650dSSadaf Ebrahimi     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6859*22dc650dSSadaf Ebrahimi       cb->assert_depth -= 1;
6860*22dc650dSSadaf Ebrahimi 
6861*22dc650dSSadaf Ebrahimi     /* At the end of compiling, code is still pointing to the start of the
6862*22dc650dSSadaf Ebrahimi     group, while tempcode has been updated to point past the end of the group.
6863*22dc650dSSadaf Ebrahimi     The parsed pattern pointer (pptr) is on the closing META_KET.
6864*22dc650dSSadaf Ebrahimi 
6865*22dc650dSSadaf Ebrahimi     If this is a conditional bracket, check that there are no more than
6866*22dc650dSSadaf Ebrahimi     two branches in the group, or just one if it's a DEFINE group. We do this
6867*22dc650dSSadaf Ebrahimi     in the real compile phase, not in the pre-pass, where the whole group may
6868*22dc650dSSadaf Ebrahimi     not be available. */
6869*22dc650dSSadaf Ebrahimi 
6870*22dc650dSSadaf Ebrahimi     if (bravalue == OP_COND && lengthptr == NULL)
6871*22dc650dSSadaf Ebrahimi       {
6872*22dc650dSSadaf Ebrahimi       PCRE2_UCHAR *tc = code;
6873*22dc650dSSadaf Ebrahimi       int condcount = 0;
6874*22dc650dSSadaf Ebrahimi 
6875*22dc650dSSadaf Ebrahimi       do {
6876*22dc650dSSadaf Ebrahimi          condcount++;
6877*22dc650dSSadaf Ebrahimi          tc += GET(tc,1);
6878*22dc650dSSadaf Ebrahimi          }
6879*22dc650dSSadaf Ebrahimi       while (*tc != OP_KET);
6880*22dc650dSSadaf Ebrahimi 
6881*22dc650dSSadaf Ebrahimi       /* A DEFINE group is never obeyed inline (the "condition" is always
6882*22dc650dSSadaf Ebrahimi       false). It must have only one branch. Having checked this, change the
6883*22dc650dSSadaf Ebrahimi       opcode to OP_FALSE. */
6884*22dc650dSSadaf Ebrahimi 
6885*22dc650dSSadaf Ebrahimi       if (code[LINK_SIZE+1] == OP_DEFINE)
6886*22dc650dSSadaf Ebrahimi         {
6887*22dc650dSSadaf Ebrahimi         if (condcount > 1)
6888*22dc650dSSadaf Ebrahimi           {
6889*22dc650dSSadaf Ebrahimi           cb->erroroffset = offset;
6890*22dc650dSSadaf Ebrahimi           *errorcodeptr = ERR54;
6891*22dc650dSSadaf Ebrahimi           return 0;
6892*22dc650dSSadaf Ebrahimi           }
6893*22dc650dSSadaf Ebrahimi         code[LINK_SIZE+1] = OP_FALSE;
6894*22dc650dSSadaf Ebrahimi         bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6895*22dc650dSSadaf Ebrahimi         }
6896*22dc650dSSadaf Ebrahimi 
6897*22dc650dSSadaf Ebrahimi       /* A "normal" conditional group. If there is just one branch, we must not
6898*22dc650dSSadaf Ebrahimi       make use of its firstcu or reqcu, because this is equivalent to an
6899*22dc650dSSadaf Ebrahimi       empty second branch. Also, it may match an empty string. If there are two
6900*22dc650dSSadaf Ebrahimi       branches, this item must match a character if the group must. */
6901*22dc650dSSadaf Ebrahimi 
6902*22dc650dSSadaf Ebrahimi       else
6903*22dc650dSSadaf Ebrahimi         {
6904*22dc650dSSadaf Ebrahimi         if (condcount > 2)
6905*22dc650dSSadaf Ebrahimi           {
6906*22dc650dSSadaf Ebrahimi           cb->erroroffset = offset;
6907*22dc650dSSadaf Ebrahimi           *errorcodeptr = ERR27;
6908*22dc650dSSadaf Ebrahimi           return 0;
6909*22dc650dSSadaf Ebrahimi           }
6910*22dc650dSSadaf Ebrahimi         if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6911*22dc650dSSadaf Ebrahimi           else if (group_return > 0) matched_char = TRUE;
6912*22dc650dSSadaf Ebrahimi         }
6913*22dc650dSSadaf Ebrahimi       }
6914*22dc650dSSadaf Ebrahimi 
6915*22dc650dSSadaf Ebrahimi     /* In the pre-compile phase, update the length by the length of the group,
6916*22dc650dSSadaf Ebrahimi     less the brackets at either end. Then reduce the compiled code to just a
6917*22dc650dSSadaf Ebrahimi     set of non-capturing brackets so that it doesn't use much memory if it is
6918*22dc650dSSadaf Ebrahimi     duplicated by a quantifier.*/
6919*22dc650dSSadaf Ebrahimi 
6920*22dc650dSSadaf Ebrahimi     if (lengthptr != NULL)
6921*22dc650dSSadaf Ebrahimi       {
6922*22dc650dSSadaf Ebrahimi       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6923*22dc650dSSadaf Ebrahimi         {
6924*22dc650dSSadaf Ebrahimi         *errorcodeptr = ERR20;
6925*22dc650dSSadaf Ebrahimi         return 0;
6926*22dc650dSSadaf Ebrahimi         }
6927*22dc650dSSadaf Ebrahimi       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6928*22dc650dSSadaf Ebrahimi       code++;   /* This already contains bravalue */
6929*22dc650dSSadaf Ebrahimi       PUTINC(code, 0, 1 + LINK_SIZE);
6930*22dc650dSSadaf Ebrahimi       *code++ = OP_KET;
6931*22dc650dSSadaf Ebrahimi       PUTINC(code, 0, 1 + LINK_SIZE);
6932*22dc650dSSadaf Ebrahimi       break;    /* No need to waste time with special character handling */
6933*22dc650dSSadaf Ebrahimi       }
6934*22dc650dSSadaf Ebrahimi 
6935*22dc650dSSadaf Ebrahimi     /* Otherwise update the main code pointer to the end of the group. */
6936*22dc650dSSadaf Ebrahimi 
6937*22dc650dSSadaf Ebrahimi     code = tempcode;
6938*22dc650dSSadaf Ebrahimi 
6939*22dc650dSSadaf Ebrahimi     /* For a DEFINE group, required and first character settings are not
6940*22dc650dSSadaf Ebrahimi     relevant. */
6941*22dc650dSSadaf Ebrahimi 
6942*22dc650dSSadaf Ebrahimi     if (bravalue == OP_DEFINE) break;
6943*22dc650dSSadaf Ebrahimi 
6944*22dc650dSSadaf Ebrahimi     /* Handle updating of the required and first code units for other types of
6945*22dc650dSSadaf Ebrahimi     group. Update for normal brackets of all kinds, and conditions with two
6946*22dc650dSSadaf Ebrahimi     branches (see code above). If the bracket is followed by a quantifier with
6947*22dc650dSSadaf Ebrahimi     zero repeat, we have to back off. Hence the definition of zeroreqcu and
6948*22dc650dSSadaf Ebrahimi     zerofirstcu outside the main loop so that they can be accessed for the back
6949*22dc650dSSadaf Ebrahimi     off. */
6950*22dc650dSSadaf Ebrahimi 
6951*22dc650dSSadaf Ebrahimi     zeroreqcu = reqcu;
6952*22dc650dSSadaf Ebrahimi     zeroreqcuflags = reqcuflags;
6953*22dc650dSSadaf Ebrahimi     zerofirstcu = firstcu;
6954*22dc650dSSadaf Ebrahimi     zerofirstcuflags = firstcuflags;
6955*22dc650dSSadaf Ebrahimi     groupsetfirstcu = FALSE;
6956*22dc650dSSadaf Ebrahimi 
6957*22dc650dSSadaf Ebrahimi     if (bravalue >= OP_ONCE)  /* Not an assertion */
6958*22dc650dSSadaf Ebrahimi       {
6959*22dc650dSSadaf Ebrahimi       /* If we have not yet set a firstcu in this branch, take it from the
6960*22dc650dSSadaf Ebrahimi       subpattern, remembering that it was set here so that a repeat of more
6961*22dc650dSSadaf Ebrahimi       than one can replicate it as reqcu if necessary. If the subpattern has
6962*22dc650dSSadaf Ebrahimi       no firstcu, set "none" for the whole branch. In both cases, a zero
6963*22dc650dSSadaf Ebrahimi       repeat forces firstcu to "none". */
6964*22dc650dSSadaf Ebrahimi 
6965*22dc650dSSadaf Ebrahimi       if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6966*22dc650dSSadaf Ebrahimi         {
6967*22dc650dSSadaf Ebrahimi         if (subfirstcuflags < REQ_NONE)
6968*22dc650dSSadaf Ebrahimi           {
6969*22dc650dSSadaf Ebrahimi           firstcu = subfirstcu;
6970*22dc650dSSadaf Ebrahimi           firstcuflags = subfirstcuflags;
6971*22dc650dSSadaf Ebrahimi           groupsetfirstcu = TRUE;
6972*22dc650dSSadaf Ebrahimi           }
6973*22dc650dSSadaf Ebrahimi         else firstcuflags = REQ_NONE;
6974*22dc650dSSadaf Ebrahimi         zerofirstcuflags = REQ_NONE;
6975*22dc650dSSadaf Ebrahimi         }
6976*22dc650dSSadaf Ebrahimi 
6977*22dc650dSSadaf Ebrahimi       /* If firstcu was previously set, convert the subpattern's firstcu
6978*22dc650dSSadaf Ebrahimi       into reqcu if there wasn't one, using the vary flag that was in
6979*22dc650dSSadaf Ebrahimi       existence beforehand. */
6980*22dc650dSSadaf Ebrahimi 
6981*22dc650dSSadaf Ebrahimi       else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
6982*22dc650dSSadaf Ebrahimi         {
6983*22dc650dSSadaf Ebrahimi         subreqcu = subfirstcu;
6984*22dc650dSSadaf Ebrahimi         subreqcuflags = subfirstcuflags | tempreqvary;
6985*22dc650dSSadaf Ebrahimi         }
6986*22dc650dSSadaf Ebrahimi 
6987*22dc650dSSadaf Ebrahimi       /* If the subpattern set a required code unit (or set a first code unit
6988*22dc650dSSadaf Ebrahimi       that isn't really the first code unit - see above), set it. */
6989*22dc650dSSadaf Ebrahimi 
6990*22dc650dSSadaf Ebrahimi       if (subreqcuflags < REQ_NONE)
6991*22dc650dSSadaf Ebrahimi         {
6992*22dc650dSSadaf Ebrahimi         reqcu = subreqcu;
6993*22dc650dSSadaf Ebrahimi         reqcuflags = subreqcuflags;
6994*22dc650dSSadaf Ebrahimi         }
6995*22dc650dSSadaf Ebrahimi       }
6996*22dc650dSSadaf Ebrahimi 
6997*22dc650dSSadaf Ebrahimi     /* For a forward assertion, we take the reqcu, if set, provided that the
6998*22dc650dSSadaf Ebrahimi     group has also set a firstcu. This can be helpful if the pattern that
6999*22dc650dSSadaf Ebrahimi     follows the assertion doesn't set a different char. For example, it's
7000*22dc650dSSadaf Ebrahimi     useful for /(?=abcde).+/. We can't set firstcu for an assertion, however,
7001*22dc650dSSadaf Ebrahimi     because it leads to an incorrect effect for patterns such as /(?=a)a.+/ when
7002*22dc650dSSadaf Ebrahimi     the "real" "a" would then become a reqcu instead of a firstcu. This is
7003*22dc650dSSadaf Ebrahimi     overcome by a scan at the end if there's no firstcu, looking for an
7004*22dc650dSSadaf Ebrahimi     asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
7005*22dc650dSSadaf Ebrahimi     we must only take the reqcu when the group also set a firstcu. Otherwise,
7006*22dc650dSSadaf Ebrahimi     in that example, 'X' ends up set for both. */
7007*22dc650dSSadaf Ebrahimi 
7008*22dc650dSSadaf Ebrahimi     else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
7009*22dc650dSSadaf Ebrahimi              subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
7010*22dc650dSSadaf Ebrahimi       {
7011*22dc650dSSadaf Ebrahimi       reqcu = subreqcu;
7012*22dc650dSSadaf Ebrahimi       reqcuflags = subreqcuflags;
7013*22dc650dSSadaf Ebrahimi       }
7014*22dc650dSSadaf Ebrahimi 
7015*22dc650dSSadaf Ebrahimi     break;  /* End of nested group handling */
7016*22dc650dSSadaf Ebrahimi 
7017*22dc650dSSadaf Ebrahimi 
7018*22dc650dSSadaf Ebrahimi     /* ===================================================================*/
7019*22dc650dSSadaf Ebrahimi     /* Handle named backreferences and recursions. */
7020*22dc650dSSadaf Ebrahimi 
7021*22dc650dSSadaf Ebrahimi     case META_BACKREF_BYNAME:
7022*22dc650dSSadaf Ebrahimi     case META_RECURSE_BYNAME:
7023*22dc650dSSadaf Ebrahimi       {
7024*22dc650dSSadaf Ebrahimi       int count, index;
7025*22dc650dSSadaf Ebrahimi       PCRE2_SPTR name;
7026*22dc650dSSadaf Ebrahimi       BOOL is_dupname = FALSE;
7027*22dc650dSSadaf Ebrahimi       named_group *ng = cb->named_groups;
7028*22dc650dSSadaf Ebrahimi       uint32_t length = *(++pptr);
7029*22dc650dSSadaf Ebrahimi 
7030*22dc650dSSadaf Ebrahimi       GETPLUSOFFSET(offset, pptr);
7031*22dc650dSSadaf Ebrahimi       name = cb->start_pattern + offset;
7032*22dc650dSSadaf Ebrahimi 
7033*22dc650dSSadaf Ebrahimi       /* In the first pass, the names generated in the pre-pass are available,
7034*22dc650dSSadaf Ebrahimi       but the main name table has not yet been created. Scan the list of names
7035*22dc650dSSadaf Ebrahimi       generated in the pre-pass in order to get a number and whether or not
7036*22dc650dSSadaf Ebrahimi       this name is duplicated. */
7037*22dc650dSSadaf Ebrahimi 
7038*22dc650dSSadaf Ebrahimi       groupnumber = 0;
7039*22dc650dSSadaf Ebrahimi       for (unsigned int i = 0; i < cb->names_found; i++, ng++)
7040*22dc650dSSadaf Ebrahimi         {
7041*22dc650dSSadaf Ebrahimi         if (length == ng->length &&
7042*22dc650dSSadaf Ebrahimi             PRIV(strncmp)(name, ng->name, length) == 0)
7043*22dc650dSSadaf Ebrahimi           {
7044*22dc650dSSadaf Ebrahimi           is_dupname = ng->isdup;
7045*22dc650dSSadaf Ebrahimi           groupnumber = ng->number;
7046*22dc650dSSadaf Ebrahimi 
7047*22dc650dSSadaf Ebrahimi           /* For a recursion, that's all that is needed. We can now go to
7048*22dc650dSSadaf Ebrahimi           the code that handles numerical recursion, applying it to the first
7049*22dc650dSSadaf Ebrahimi           group with the given name. */
7050*22dc650dSSadaf Ebrahimi 
7051*22dc650dSSadaf Ebrahimi           if (meta == META_RECURSE_BYNAME)
7052*22dc650dSSadaf Ebrahimi             {
7053*22dc650dSSadaf Ebrahimi             meta_arg = groupnumber;
7054*22dc650dSSadaf Ebrahimi             goto HANDLE_NUMERICAL_RECURSION;
7055*22dc650dSSadaf Ebrahimi             }
7056*22dc650dSSadaf Ebrahimi 
7057*22dc650dSSadaf Ebrahimi           /* For a back reference, update the back reference map and the
7058*22dc650dSSadaf Ebrahimi           maximum back reference. */
7059*22dc650dSSadaf Ebrahimi 
7060*22dc650dSSadaf Ebrahimi           cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
7061*22dc650dSSadaf Ebrahimi           if (groupnumber > cb->top_backref)
7062*22dc650dSSadaf Ebrahimi             cb->top_backref = groupnumber;
7063*22dc650dSSadaf Ebrahimi           }
7064*22dc650dSSadaf Ebrahimi         }
7065*22dc650dSSadaf Ebrahimi 
7066*22dc650dSSadaf Ebrahimi       /* If the name was not found we have a bad reference. */
7067*22dc650dSSadaf Ebrahimi 
7068*22dc650dSSadaf Ebrahimi       if (groupnumber == 0)
7069*22dc650dSSadaf Ebrahimi         {
7070*22dc650dSSadaf Ebrahimi         *errorcodeptr = ERR15;
7071*22dc650dSSadaf Ebrahimi         cb->erroroffset = offset;
7072*22dc650dSSadaf Ebrahimi         return 0;
7073*22dc650dSSadaf Ebrahimi         }
7074*22dc650dSSadaf Ebrahimi 
7075*22dc650dSSadaf Ebrahimi       /* If a back reference name is not duplicated, we can handle it as
7076*22dc650dSSadaf Ebrahimi       a numerical reference. */
7077*22dc650dSSadaf Ebrahimi 
7078*22dc650dSSadaf Ebrahimi       if (!is_dupname)
7079*22dc650dSSadaf Ebrahimi         {
7080*22dc650dSSadaf Ebrahimi         meta_arg = groupnumber;
7081*22dc650dSSadaf Ebrahimi         goto HANDLE_SINGLE_REFERENCE;
7082*22dc650dSSadaf Ebrahimi         }
7083*22dc650dSSadaf Ebrahimi 
7084*22dc650dSSadaf Ebrahimi       /* If a back reference name is duplicated, we generate a different
7085*22dc650dSSadaf Ebrahimi       opcode to a numerical back reference. In the second pass we must
7086*22dc650dSSadaf Ebrahimi       search for the index and count in the final name table. */
7087*22dc650dSSadaf Ebrahimi 
7088*22dc650dSSadaf Ebrahimi       count = 0;  /* Values for first pass (avoids compiler warning) */
7089*22dc650dSSadaf Ebrahimi       index = 0;
7090*22dc650dSSadaf Ebrahimi       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
7091*22dc650dSSadaf Ebrahimi             &count, errorcodeptr, cb)) return 0;
7092*22dc650dSSadaf Ebrahimi 
7093*22dc650dSSadaf Ebrahimi       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7094*22dc650dSSadaf Ebrahimi       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7095*22dc650dSSadaf Ebrahimi       PUT2INC(code, 0, index);
7096*22dc650dSSadaf Ebrahimi       PUT2INC(code, 0, count);
7097*22dc650dSSadaf Ebrahimi       }
7098*22dc650dSSadaf Ebrahimi     break;
7099*22dc650dSSadaf Ebrahimi 
7100*22dc650dSSadaf Ebrahimi 
7101*22dc650dSSadaf Ebrahimi     /* ===================================================================*/
7102*22dc650dSSadaf Ebrahimi     /* Handle a numerical callout. */
7103*22dc650dSSadaf Ebrahimi 
7104*22dc650dSSadaf Ebrahimi     case META_CALLOUT_NUMBER:
7105*22dc650dSSadaf Ebrahimi     code[0] = OP_CALLOUT;
7106*22dc650dSSadaf Ebrahimi     PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7107*22dc650dSSadaf Ebrahimi     PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7108*22dc650dSSadaf Ebrahimi     code[1 + 2*LINK_SIZE] = pptr[3];
7109*22dc650dSSadaf Ebrahimi     pptr += 3;
7110*22dc650dSSadaf Ebrahimi     code += PRIV(OP_lengths)[OP_CALLOUT];
7111*22dc650dSSadaf Ebrahimi     break;
7112*22dc650dSSadaf Ebrahimi 
7113*22dc650dSSadaf Ebrahimi 
7114*22dc650dSSadaf Ebrahimi     /* ===================================================================*/
7115*22dc650dSSadaf Ebrahimi     /* Handle a callout with a string argument. In the pre-pass we just compute
7116*22dc650dSSadaf Ebrahimi     the length without generating anything. The length in pptr[3] includes both
7117*22dc650dSSadaf Ebrahimi     delimiters; in the actual compile only the first one is copied, but a
7118*22dc650dSSadaf Ebrahimi     terminating zero is added. Any doubled delimiters within the string make
7119*22dc650dSSadaf Ebrahimi     this an overestimate, but it is not worth bothering about. */
7120*22dc650dSSadaf Ebrahimi 
7121*22dc650dSSadaf Ebrahimi     case META_CALLOUT_STRING:
7122*22dc650dSSadaf Ebrahimi     if (lengthptr != NULL)
7123*22dc650dSSadaf Ebrahimi       {
7124*22dc650dSSadaf Ebrahimi       *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7125*22dc650dSSadaf Ebrahimi       pptr += 3;
7126*22dc650dSSadaf Ebrahimi       SKIPOFFSET(pptr);
7127*22dc650dSSadaf Ebrahimi       }
7128*22dc650dSSadaf Ebrahimi 
7129*22dc650dSSadaf Ebrahimi     /* In the real compile we can copy the string. The starting delimiter is
7130*22dc650dSSadaf Ebrahimi      included so that the client can discover it if they want. We also pass the
7131*22dc650dSSadaf Ebrahimi      start offset to help a script language give better error messages. */
7132*22dc650dSSadaf Ebrahimi 
7133*22dc650dSSadaf Ebrahimi     else
7134*22dc650dSSadaf Ebrahimi       {
7135*22dc650dSSadaf Ebrahimi       PCRE2_SPTR pp;
7136*22dc650dSSadaf Ebrahimi       uint32_t delimiter;
7137*22dc650dSSadaf Ebrahimi       uint32_t length = pptr[3];
7138*22dc650dSSadaf Ebrahimi       PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7139*22dc650dSSadaf Ebrahimi 
7140*22dc650dSSadaf Ebrahimi       code[0] = OP_CALLOUT_STR;
7141*22dc650dSSadaf Ebrahimi       PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7142*22dc650dSSadaf Ebrahimi       PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7143*22dc650dSSadaf Ebrahimi 
7144*22dc650dSSadaf Ebrahimi       pptr += 3;
7145*22dc650dSSadaf Ebrahimi       GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
7146*22dc650dSSadaf Ebrahimi       pp = cb->start_pattern + offset;
7147*22dc650dSSadaf Ebrahimi       delimiter = *callout_string++ = *pp++;
7148*22dc650dSSadaf Ebrahimi       if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7149*22dc650dSSadaf Ebrahimi         delimiter = CHAR_RIGHT_CURLY_BRACKET;
7150*22dc650dSSadaf Ebrahimi       PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
7151*22dc650dSSadaf Ebrahimi 
7152*22dc650dSSadaf Ebrahimi       /* The syntax of the pattern was checked in the parsing scan. The length
7153*22dc650dSSadaf Ebrahimi       includes both delimiters, but we have passed the opening one just above,
7154*22dc650dSSadaf Ebrahimi       so we reduce length before testing it. The test is for > 1 because we do
7155*22dc650dSSadaf Ebrahimi       not want to copy the final delimiter. This also ensures that pp[1] is
7156*22dc650dSSadaf Ebrahimi       accessible. */
7157*22dc650dSSadaf Ebrahimi 
7158*22dc650dSSadaf Ebrahimi       while (--length > 1)
7159*22dc650dSSadaf Ebrahimi         {
7160*22dc650dSSadaf Ebrahimi         if (*pp == delimiter && pp[1] == delimiter)
7161*22dc650dSSadaf Ebrahimi           {
7162*22dc650dSSadaf Ebrahimi           *callout_string++ = delimiter;
7163*22dc650dSSadaf Ebrahimi           pp += 2;
7164*22dc650dSSadaf Ebrahimi           length--;
7165*22dc650dSSadaf Ebrahimi           }
7166*22dc650dSSadaf Ebrahimi         else *callout_string++ = *pp++;
7167*22dc650dSSadaf Ebrahimi         }
7168*22dc650dSSadaf Ebrahimi       *callout_string++ = CHAR_NUL;
7169*22dc650dSSadaf Ebrahimi 
7170*22dc650dSSadaf Ebrahimi       /* Set the length of the entire item, then advance to its end. */
7171*22dc650dSSadaf Ebrahimi 
7172*22dc650dSSadaf Ebrahimi       PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7173*22dc650dSSadaf Ebrahimi       code = callout_string;
7174*22dc650dSSadaf Ebrahimi       }
7175*22dc650dSSadaf Ebrahimi     break;
7176*22dc650dSSadaf Ebrahimi 
7177*22dc650dSSadaf Ebrahimi 
7178*22dc650dSSadaf Ebrahimi     /* ===================================================================*/
7179*22dc650dSSadaf Ebrahimi     /* Handle repetition. The different types are all sorted out in the parsing
7180*22dc650dSSadaf Ebrahimi     pass. */
7181*22dc650dSSadaf Ebrahimi 
7182*22dc650dSSadaf Ebrahimi     case META_MINMAX_PLUS:
7183*22dc650dSSadaf Ebrahimi     case META_MINMAX_QUERY:
7184*22dc650dSSadaf Ebrahimi     case META_MINMAX:
7185*22dc650dSSadaf Ebrahimi     repeat_min = *(++pptr);
7186*22dc650dSSadaf Ebrahimi     repeat_max = *(++pptr);
7187*22dc650dSSadaf Ebrahimi     goto REPEAT;
7188*22dc650dSSadaf Ebrahimi 
7189*22dc650dSSadaf Ebrahimi     case META_ASTERISK:
7190*22dc650dSSadaf Ebrahimi     case META_ASTERISK_PLUS:
7191*22dc650dSSadaf Ebrahimi     case META_ASTERISK_QUERY:
7192*22dc650dSSadaf Ebrahimi     repeat_min = 0;
7193*22dc650dSSadaf Ebrahimi     repeat_max = REPEAT_UNLIMITED;
7194*22dc650dSSadaf Ebrahimi     goto REPEAT;
7195*22dc650dSSadaf Ebrahimi 
7196*22dc650dSSadaf Ebrahimi     case META_PLUS:
7197*22dc650dSSadaf Ebrahimi     case META_PLUS_PLUS:
7198*22dc650dSSadaf Ebrahimi     case META_PLUS_QUERY:
7199*22dc650dSSadaf Ebrahimi     repeat_min = 1;
7200*22dc650dSSadaf Ebrahimi     repeat_max = REPEAT_UNLIMITED;
7201*22dc650dSSadaf Ebrahimi     goto REPEAT;
7202*22dc650dSSadaf Ebrahimi 
7203*22dc650dSSadaf Ebrahimi     case META_QUERY:
7204*22dc650dSSadaf Ebrahimi     case META_QUERY_PLUS:
7205*22dc650dSSadaf Ebrahimi     case META_QUERY_QUERY:
7206*22dc650dSSadaf Ebrahimi     repeat_min = 0;
7207*22dc650dSSadaf Ebrahimi     repeat_max = 1;
7208*22dc650dSSadaf Ebrahimi 
7209*22dc650dSSadaf Ebrahimi     REPEAT:
7210*22dc650dSSadaf Ebrahimi     if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7211*22dc650dSSadaf Ebrahimi 
7212*22dc650dSSadaf Ebrahimi     /* Remember whether this is a variable length repeat, and default to
7213*22dc650dSSadaf Ebrahimi     single-char opcodes. */
7214*22dc650dSSadaf Ebrahimi 
7215*22dc650dSSadaf Ebrahimi     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7216*22dc650dSSadaf Ebrahimi     op_type = 0;
7217*22dc650dSSadaf Ebrahimi 
7218*22dc650dSSadaf Ebrahimi     /* Adjust first and required code units for a zero repeat. */
7219*22dc650dSSadaf Ebrahimi 
7220*22dc650dSSadaf Ebrahimi     if (repeat_min == 0)
7221*22dc650dSSadaf Ebrahimi       {
7222*22dc650dSSadaf Ebrahimi       firstcu = zerofirstcu;
7223*22dc650dSSadaf Ebrahimi       firstcuflags = zerofirstcuflags;
7224*22dc650dSSadaf Ebrahimi       reqcu = zeroreqcu;
7225*22dc650dSSadaf Ebrahimi       reqcuflags = zeroreqcuflags;
7226*22dc650dSSadaf Ebrahimi       }
7227*22dc650dSSadaf Ebrahimi 
7228*22dc650dSSadaf Ebrahimi     /* Note the greediness and possessiveness. */
7229*22dc650dSSadaf Ebrahimi 
7230*22dc650dSSadaf Ebrahimi     switch (meta)
7231*22dc650dSSadaf Ebrahimi       {
7232*22dc650dSSadaf Ebrahimi       case META_MINMAX_PLUS:
7233*22dc650dSSadaf Ebrahimi       case META_ASTERISK_PLUS:
7234*22dc650dSSadaf Ebrahimi       case META_PLUS_PLUS:
7235*22dc650dSSadaf Ebrahimi       case META_QUERY_PLUS:
7236*22dc650dSSadaf Ebrahimi       repeat_type = 0;                  /* Force greedy */
7237*22dc650dSSadaf Ebrahimi       possessive_quantifier = TRUE;
7238*22dc650dSSadaf Ebrahimi       break;
7239*22dc650dSSadaf Ebrahimi 
7240*22dc650dSSadaf Ebrahimi       case META_MINMAX_QUERY:
7241*22dc650dSSadaf Ebrahimi       case META_ASTERISK_QUERY:
7242*22dc650dSSadaf Ebrahimi       case META_PLUS_QUERY:
7243*22dc650dSSadaf Ebrahimi       case META_QUERY_QUERY:
7244*22dc650dSSadaf Ebrahimi       repeat_type = greedy_non_default;
7245*22dc650dSSadaf Ebrahimi       possessive_quantifier = FALSE;
7246*22dc650dSSadaf Ebrahimi       break;
7247*22dc650dSSadaf Ebrahimi 
7248*22dc650dSSadaf Ebrahimi       default:
7249*22dc650dSSadaf Ebrahimi       repeat_type = greedy_default;
7250*22dc650dSSadaf Ebrahimi       possessive_quantifier = FALSE;
7251*22dc650dSSadaf Ebrahimi       break;
7252*22dc650dSSadaf Ebrahimi       }
7253*22dc650dSSadaf Ebrahimi 
7254*22dc650dSSadaf Ebrahimi     /* Save start of previous item, in case we have to move it up in order to
7255*22dc650dSSadaf Ebrahimi     insert something before it, and remember what it was. */
7256*22dc650dSSadaf Ebrahimi 
7257*22dc650dSSadaf Ebrahimi     tempcode = previous;
7258*22dc650dSSadaf Ebrahimi     op_previous = *previous;
7259*22dc650dSSadaf Ebrahimi 
7260*22dc650dSSadaf Ebrahimi     /* Now handle repetition for the different types of item. If the repeat
7261*22dc650dSSadaf Ebrahimi     minimum and the repeat maximum are both 1, we can ignore the quantifier for
7262*22dc650dSSadaf Ebrahimi     non-parenthesized items, as they have only one alternative. For anything in
7263*22dc650dSSadaf Ebrahimi     parentheses, we must not ignore if {1} is possessive. */
7264*22dc650dSSadaf Ebrahimi 
7265*22dc650dSSadaf Ebrahimi     switch (op_previous)
7266*22dc650dSSadaf Ebrahimi       {
7267*22dc650dSSadaf Ebrahimi       /* If previous was a character or negated character match, abolish the
7268*22dc650dSSadaf Ebrahimi       item and generate a repeat item instead. If a char item has a minimum of
7269*22dc650dSSadaf Ebrahimi       more than one, ensure that it is set in reqcu - it might not be if a
7270*22dc650dSSadaf Ebrahimi       sequence such as x{3} is the first thing in a branch because the x will
7271*22dc650dSSadaf Ebrahimi       have gone into firstcu instead.  */
7272*22dc650dSSadaf Ebrahimi 
7273*22dc650dSSadaf Ebrahimi       case OP_CHAR:
7274*22dc650dSSadaf Ebrahimi       case OP_CHARI:
7275*22dc650dSSadaf Ebrahimi       case OP_NOT:
7276*22dc650dSSadaf Ebrahimi       case OP_NOTI:
7277*22dc650dSSadaf Ebrahimi       if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7278*22dc650dSSadaf Ebrahimi       op_type = chartypeoffset[op_previous - OP_CHAR];
7279*22dc650dSSadaf Ebrahimi 
7280*22dc650dSSadaf Ebrahimi       /* Deal with UTF characters that take up more than one code unit. */
7281*22dc650dSSadaf Ebrahimi 
7282*22dc650dSSadaf Ebrahimi #ifdef MAYBE_UTF_MULTI
7283*22dc650dSSadaf Ebrahimi       if (utf && NOT_FIRSTCU(code[-1]))
7284*22dc650dSSadaf Ebrahimi         {
7285*22dc650dSSadaf Ebrahimi         PCRE2_UCHAR *lastchar = code - 1;
7286*22dc650dSSadaf Ebrahimi         BACKCHAR(lastchar);
7287*22dc650dSSadaf Ebrahimi         mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
7288*22dc650dSSadaf Ebrahimi         memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
7289*22dc650dSSadaf Ebrahimi         }
7290*22dc650dSSadaf Ebrahimi       else
7291*22dc650dSSadaf Ebrahimi #endif  /* MAYBE_UTF_MULTI */
7292*22dc650dSSadaf Ebrahimi 
7293*22dc650dSSadaf Ebrahimi       /* Handle the case of a single code unit - either with no UTF support, or
7294*22dc650dSSadaf Ebrahimi       with UTF disabled, or for a single-code-unit UTF character. In the latter
7295*22dc650dSSadaf Ebrahimi       case, for a repeated positive match, get the caseless flag for the
7296*22dc650dSSadaf Ebrahimi       required code unit from the previous character, because a class like [Aa]
7297*22dc650dSSadaf Ebrahimi       sets a caseless A but by now the req_caseopt flag has been reset. */
7298*22dc650dSSadaf Ebrahimi 
7299*22dc650dSSadaf Ebrahimi         {
7300*22dc650dSSadaf Ebrahimi         mcbuffer[0] = code[-1];
7301*22dc650dSSadaf Ebrahimi         mclength = 1;
7302*22dc650dSSadaf Ebrahimi         if (op_previous <= OP_CHARI && repeat_min > 1)
7303*22dc650dSSadaf Ebrahimi           {
7304*22dc650dSSadaf Ebrahimi           reqcu = mcbuffer[0];
7305*22dc650dSSadaf Ebrahimi           reqcuflags = cb->req_varyopt;
7306*22dc650dSSadaf Ebrahimi           if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7307*22dc650dSSadaf Ebrahimi           }
7308*22dc650dSSadaf Ebrahimi         }
7309*22dc650dSSadaf Ebrahimi       goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
7310*22dc650dSSadaf Ebrahimi 
7311*22dc650dSSadaf Ebrahimi       /* If previous was a character class or a back reference, we put the
7312*22dc650dSSadaf Ebrahimi       repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7313*22dc650dSSadaf Ebrahimi 
7314*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
7315*22dc650dSSadaf Ebrahimi       case OP_XCLASS:
7316*22dc650dSSadaf Ebrahimi #endif
7317*22dc650dSSadaf Ebrahimi       case OP_CLASS:
7318*22dc650dSSadaf Ebrahimi       case OP_NCLASS:
7319*22dc650dSSadaf Ebrahimi       case OP_REF:
7320*22dc650dSSadaf Ebrahimi       case OP_REFI:
7321*22dc650dSSadaf Ebrahimi       case OP_DNREF:
7322*22dc650dSSadaf Ebrahimi       case OP_DNREFI:
7323*22dc650dSSadaf Ebrahimi 
7324*22dc650dSSadaf Ebrahimi       if (repeat_max == 0)
7325*22dc650dSSadaf Ebrahimi         {
7326*22dc650dSSadaf Ebrahimi         code = previous;
7327*22dc650dSSadaf Ebrahimi         goto END_REPEAT;
7328*22dc650dSSadaf Ebrahimi         }
7329*22dc650dSSadaf Ebrahimi       if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7330*22dc650dSSadaf Ebrahimi 
7331*22dc650dSSadaf Ebrahimi       if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7332*22dc650dSSadaf Ebrahimi         *code++ = OP_CRSTAR + repeat_type;
7333*22dc650dSSadaf Ebrahimi       else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7334*22dc650dSSadaf Ebrahimi         *code++ = OP_CRPLUS + repeat_type;
7335*22dc650dSSadaf Ebrahimi       else if (repeat_min == 0 && repeat_max == 1)
7336*22dc650dSSadaf Ebrahimi         *code++ = OP_CRQUERY + repeat_type;
7337*22dc650dSSadaf Ebrahimi       else
7338*22dc650dSSadaf Ebrahimi         {
7339*22dc650dSSadaf Ebrahimi         *code++ = OP_CRRANGE + repeat_type;
7340*22dc650dSSadaf Ebrahimi         PUT2INC(code, 0, repeat_min);
7341*22dc650dSSadaf Ebrahimi         if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
7342*22dc650dSSadaf Ebrahimi         PUT2INC(code, 0, repeat_max);
7343*22dc650dSSadaf Ebrahimi         }
7344*22dc650dSSadaf Ebrahimi       break;
7345*22dc650dSSadaf Ebrahimi 
7346*22dc650dSSadaf Ebrahimi       /* If previous is OP_FAIL, it was generated by an empty class []
7347*22dc650dSSadaf Ebrahimi       (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
7348*22dc650dSSadaf Ebrahimi       generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
7349*22dc650dSSadaf Ebrahimi       time. We can just ignore this repeat. */
7350*22dc650dSSadaf Ebrahimi 
7351*22dc650dSSadaf Ebrahimi       case OP_FAIL:
7352*22dc650dSSadaf Ebrahimi       goto END_REPEAT;
7353*22dc650dSSadaf Ebrahimi 
7354*22dc650dSSadaf Ebrahimi       /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7355*22dc650dSSadaf Ebrahimi       because pcre2_match() could not handle backtracking into recursively
7356*22dc650dSSadaf Ebrahimi       called groups. Now that this backtracking is available, we no longer need
7357*22dc650dSSadaf Ebrahimi       to do this. However, we still need to replicate recursions as we do for
7358*22dc650dSSadaf Ebrahimi       groups so as to have independent backtracking points. We can replicate
7359*22dc650dSSadaf Ebrahimi       for the minimum number of repeats directly. For optional repeats we now
7360*22dc650dSSadaf Ebrahimi       wrap the recursion in OP_BRA brackets and make use of the bracket
7361*22dc650dSSadaf Ebrahimi       repetition. */
7362*22dc650dSSadaf Ebrahimi 
7363*22dc650dSSadaf Ebrahimi       case OP_RECURSE:
7364*22dc650dSSadaf Ebrahimi       if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7365*22dc650dSSadaf Ebrahimi         goto END_REPEAT;
7366*22dc650dSSadaf Ebrahimi 
7367*22dc650dSSadaf Ebrahimi       /* Generate unwrapped repeats for a non-zero minimum, except when the
7368*22dc650dSSadaf Ebrahimi       minimum is 1 and the maximum unlimited, because that can be handled with
7369*22dc650dSSadaf Ebrahimi       OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7370*22dc650dSSadaf Ebrahimi       minimum, we just need to generate the appropriate additional copies.
7371*22dc650dSSadaf Ebrahimi       Otherwise we need to generate one more, to simulate the situation when
7372*22dc650dSSadaf Ebrahimi       the minimum is zero. */
7373*22dc650dSSadaf Ebrahimi 
7374*22dc650dSSadaf Ebrahimi       if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7375*22dc650dSSadaf Ebrahimi         {
7376*22dc650dSSadaf Ebrahimi         int replicate = repeat_min;
7377*22dc650dSSadaf Ebrahimi         if (repeat_min == repeat_max) replicate--;
7378*22dc650dSSadaf Ebrahimi 
7379*22dc650dSSadaf Ebrahimi         /* In the pre-compile phase, we don't actually do the replication. We
7380*22dc650dSSadaf Ebrahimi         just adjust the length as if we had. Do some paranoid checks for
7381*22dc650dSSadaf Ebrahimi         potential integer overflow. */
7382*22dc650dSSadaf Ebrahimi 
7383*22dc650dSSadaf Ebrahimi         if (lengthptr != NULL)
7384*22dc650dSSadaf Ebrahimi           {
7385*22dc650dSSadaf Ebrahimi           PCRE2_SIZE delta;
7386*22dc650dSSadaf Ebrahimi           if (PRIV(ckd_smul)(&delta, replicate, 1 + LINK_SIZE) ||
7387*22dc650dSSadaf Ebrahimi               OFLOW_MAX - *lengthptr < delta)
7388*22dc650dSSadaf Ebrahimi             {
7389*22dc650dSSadaf Ebrahimi             *errorcodeptr = ERR20;
7390*22dc650dSSadaf Ebrahimi             return 0;
7391*22dc650dSSadaf Ebrahimi             }
7392*22dc650dSSadaf Ebrahimi           *lengthptr += delta;
7393*22dc650dSSadaf Ebrahimi           }
7394*22dc650dSSadaf Ebrahimi 
7395*22dc650dSSadaf Ebrahimi         else for (int i = 0; i < replicate; i++)
7396*22dc650dSSadaf Ebrahimi           {
7397*22dc650dSSadaf Ebrahimi           memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7398*22dc650dSSadaf Ebrahimi           previous = code;
7399*22dc650dSSadaf Ebrahimi           code += 1 + LINK_SIZE;
7400*22dc650dSSadaf Ebrahimi           }
7401*22dc650dSSadaf Ebrahimi 
7402*22dc650dSSadaf Ebrahimi         /* If the number of repeats is fixed, we are done. Otherwise, adjust
7403*22dc650dSSadaf Ebrahimi         the counts and fall through. */
7404*22dc650dSSadaf Ebrahimi 
7405*22dc650dSSadaf Ebrahimi         if (repeat_min == repeat_max) break;
7406*22dc650dSSadaf Ebrahimi         if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7407*22dc650dSSadaf Ebrahimi         repeat_min = 0;
7408*22dc650dSSadaf Ebrahimi         }
7409*22dc650dSSadaf Ebrahimi 
7410*22dc650dSSadaf Ebrahimi       /* Wrap the recursion call in OP_BRA brackets. */
7411*22dc650dSSadaf Ebrahimi 
7412*22dc650dSSadaf Ebrahimi       (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7413*22dc650dSSadaf Ebrahimi       op_previous = *previous = OP_BRA;
7414*22dc650dSSadaf Ebrahimi       PUT(previous, 1, 2 + 2*LINK_SIZE);
7415*22dc650dSSadaf Ebrahimi       previous[2 + 2*LINK_SIZE] = OP_KET;
7416*22dc650dSSadaf Ebrahimi       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7417*22dc650dSSadaf Ebrahimi       code += 2 + 2 * LINK_SIZE;
7418*22dc650dSSadaf Ebrahimi       length_prevgroup = 3 + 3*LINK_SIZE;
7419*22dc650dSSadaf Ebrahimi       group_return = -1;  /* Set "may match empty string" */
7420*22dc650dSSadaf Ebrahimi 
7421*22dc650dSSadaf Ebrahimi       /* Now treat as a repeated OP_BRA. */
7422*22dc650dSSadaf Ebrahimi       /* Fall through */
7423*22dc650dSSadaf Ebrahimi 
7424*22dc650dSSadaf Ebrahimi       /* If previous was a bracket group, we may have to replicate it in
7425*22dc650dSSadaf Ebrahimi       certain cases. Note that at this point we can encounter only the "basic"
7426*22dc650dSSadaf Ebrahimi       bracket opcodes such as BRA and CBRA, as this is the place where they get
7427*22dc650dSSadaf Ebrahimi       converted into the more special varieties such as BRAPOS and SBRA.
7428*22dc650dSSadaf Ebrahimi       Originally, PCRE did not allow repetition of assertions, but now it does,
7429*22dc650dSSadaf Ebrahimi       for Perl compatibility. */
7430*22dc650dSSadaf Ebrahimi 
7431*22dc650dSSadaf Ebrahimi       case OP_ASSERT:
7432*22dc650dSSadaf Ebrahimi       case OP_ASSERT_NOT:
7433*22dc650dSSadaf Ebrahimi       case OP_ASSERT_NA:
7434*22dc650dSSadaf Ebrahimi       case OP_ASSERTBACK:
7435*22dc650dSSadaf Ebrahimi       case OP_ASSERTBACK_NOT:
7436*22dc650dSSadaf Ebrahimi       case OP_ASSERTBACK_NA:
7437*22dc650dSSadaf Ebrahimi       case OP_ONCE:
7438*22dc650dSSadaf Ebrahimi       case OP_SCRIPT_RUN:
7439*22dc650dSSadaf Ebrahimi       case OP_BRA:
7440*22dc650dSSadaf Ebrahimi       case OP_CBRA:
7441*22dc650dSSadaf Ebrahimi       case OP_COND:
7442*22dc650dSSadaf Ebrahimi         {
7443*22dc650dSSadaf Ebrahimi         int len = (int)(code - previous);
7444*22dc650dSSadaf Ebrahimi         PCRE2_UCHAR *bralink = NULL;
7445*22dc650dSSadaf Ebrahimi         PCRE2_UCHAR *brazeroptr = NULL;
7446*22dc650dSSadaf Ebrahimi 
7447*22dc650dSSadaf Ebrahimi         if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7448*22dc650dSSadaf Ebrahimi           goto END_REPEAT;
7449*22dc650dSSadaf Ebrahimi 
7450*22dc650dSSadaf Ebrahimi         /* Repeating a DEFINE group (or any group where the condition is always
7451*22dc650dSSadaf Ebrahimi         FALSE and there is only one branch) is pointless, but Perl allows the
7452*22dc650dSSadaf Ebrahimi         syntax, so we just ignore the repeat. */
7453*22dc650dSSadaf Ebrahimi 
7454*22dc650dSSadaf Ebrahimi         if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7455*22dc650dSSadaf Ebrahimi             previous[GET(previous, 1)] != OP_ALT)
7456*22dc650dSSadaf Ebrahimi           goto END_REPEAT;
7457*22dc650dSSadaf Ebrahimi 
7458*22dc650dSSadaf Ebrahimi         /* Perl allows all assertions to be quantified, and when they contain
7459*22dc650dSSadaf Ebrahimi         capturing parentheses and/or are optional there are potential uses for
7460*22dc650dSSadaf Ebrahimi         this feature. PCRE2 used to force the maximum quantifier to 1 on the
7461*22dc650dSSadaf Ebrahimi         invalid grounds that further repetition was never useful. This was
7462*22dc650dSSadaf Ebrahimi         always a bit pointless, since an assertion could be wrapped with a
7463*22dc650dSSadaf Ebrahimi         repeated group to achieve the effect. General repetition is now
7464*22dc650dSSadaf Ebrahimi         permitted, but if the maximum is unlimited it is set to one more than
7465*22dc650dSSadaf Ebrahimi         the minimum. */
7466*22dc650dSSadaf Ebrahimi 
7467*22dc650dSSadaf Ebrahimi         if (op_previous < OP_ONCE)    /* Assertion */
7468*22dc650dSSadaf Ebrahimi           {
7469*22dc650dSSadaf Ebrahimi           if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7470*22dc650dSSadaf Ebrahimi           }
7471*22dc650dSSadaf Ebrahimi 
7472*22dc650dSSadaf Ebrahimi         /* The case of a zero minimum is special because of the need to stick
7473*22dc650dSSadaf Ebrahimi         OP_BRAZERO in front of it, and because the group appears once in the
7474*22dc650dSSadaf Ebrahimi         data, whereas in other cases it appears the minimum number of times. For
7475*22dc650dSSadaf Ebrahimi         this reason, it is simplest to treat this case separately, as otherwise
7476*22dc650dSSadaf Ebrahimi         the code gets far too messy. There are several special subcases when the
7477*22dc650dSSadaf Ebrahimi         minimum is zero. */
7478*22dc650dSSadaf Ebrahimi 
7479*22dc650dSSadaf Ebrahimi         if (repeat_min == 0)
7480*22dc650dSSadaf Ebrahimi           {
7481*22dc650dSSadaf Ebrahimi           /* If the maximum is also zero, we used to just omit the group from
7482*22dc650dSSadaf Ebrahimi           the output altogether, like this:
7483*22dc650dSSadaf Ebrahimi 
7484*22dc650dSSadaf Ebrahimi           ** if (repeat_max == 0)
7485*22dc650dSSadaf Ebrahimi           **   {
7486*22dc650dSSadaf Ebrahimi           **   code = previous;
7487*22dc650dSSadaf Ebrahimi           **   goto END_REPEAT;
7488*22dc650dSSadaf Ebrahimi           **   }
7489*22dc650dSSadaf Ebrahimi 
7490*22dc650dSSadaf Ebrahimi           However, that fails when a group or a subgroup within it is
7491*22dc650dSSadaf Ebrahimi           referenced as a subroutine from elsewhere in the pattern, so now we
7492*22dc650dSSadaf Ebrahimi           stick in OP_SKIPZERO in front of it so that it is skipped on
7493*22dc650dSSadaf Ebrahimi           execution. As we don't have a list of which groups are referenced, we
7494*22dc650dSSadaf Ebrahimi           cannot do this selectively.
7495*22dc650dSSadaf Ebrahimi 
7496*22dc650dSSadaf Ebrahimi           If the maximum is 1 or unlimited, we just have to stick in the
7497*22dc650dSSadaf Ebrahimi           BRAZERO and do no more at this point. */
7498*22dc650dSSadaf Ebrahimi 
7499*22dc650dSSadaf Ebrahimi           if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7500*22dc650dSSadaf Ebrahimi             {
7501*22dc650dSSadaf Ebrahimi             (void)memmove(previous + 1, previous, CU2BYTES(len));
7502*22dc650dSSadaf Ebrahimi             code++;
7503*22dc650dSSadaf Ebrahimi             if (repeat_max == 0)
7504*22dc650dSSadaf Ebrahimi               {
7505*22dc650dSSadaf Ebrahimi               *previous++ = OP_SKIPZERO;
7506*22dc650dSSadaf Ebrahimi               goto END_REPEAT;
7507*22dc650dSSadaf Ebrahimi               }
7508*22dc650dSSadaf Ebrahimi             brazeroptr = previous;    /* Save for possessive optimizing */
7509*22dc650dSSadaf Ebrahimi             *previous++ = OP_BRAZERO + repeat_type;
7510*22dc650dSSadaf Ebrahimi             }
7511*22dc650dSSadaf Ebrahimi 
7512*22dc650dSSadaf Ebrahimi           /* If the maximum is greater than 1 and limited, we have to replicate
7513*22dc650dSSadaf Ebrahimi           in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7514*22dc650dSSadaf Ebrahimi           The first one has to be handled carefully because it's the original
7515*22dc650dSSadaf Ebrahimi           copy, which has to be moved up. The remainder can be handled by code
7516*22dc650dSSadaf Ebrahimi           that is common with the non-zero minimum case below. We have to
7517*22dc650dSSadaf Ebrahimi           adjust the value of repeat_max, since one less copy is required. */
7518*22dc650dSSadaf Ebrahimi 
7519*22dc650dSSadaf Ebrahimi           else
7520*22dc650dSSadaf Ebrahimi             {
7521*22dc650dSSadaf Ebrahimi             int linkoffset;
7522*22dc650dSSadaf Ebrahimi             (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7523*22dc650dSSadaf Ebrahimi             code += 2 + LINK_SIZE;
7524*22dc650dSSadaf Ebrahimi             *previous++ = OP_BRAZERO + repeat_type;
7525*22dc650dSSadaf Ebrahimi             *previous++ = OP_BRA;
7526*22dc650dSSadaf Ebrahimi 
7527*22dc650dSSadaf Ebrahimi             /* We chain together the bracket link offset fields that have to be
7528*22dc650dSSadaf Ebrahimi             filled in later when the ends of the brackets are reached. */
7529*22dc650dSSadaf Ebrahimi 
7530*22dc650dSSadaf Ebrahimi             linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7531*22dc650dSSadaf Ebrahimi             bralink = previous;
7532*22dc650dSSadaf Ebrahimi             PUTINC(previous, 0, linkoffset);
7533*22dc650dSSadaf Ebrahimi             }
7534*22dc650dSSadaf Ebrahimi 
7535*22dc650dSSadaf Ebrahimi           if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7536*22dc650dSSadaf Ebrahimi           }
7537*22dc650dSSadaf Ebrahimi 
7538*22dc650dSSadaf Ebrahimi         /* If the minimum is greater than zero, replicate the group as many
7539*22dc650dSSadaf Ebrahimi         times as necessary, and adjust the maximum to the number of subsequent
7540*22dc650dSSadaf Ebrahimi         copies that we need. */
7541*22dc650dSSadaf Ebrahimi 
7542*22dc650dSSadaf Ebrahimi         else
7543*22dc650dSSadaf Ebrahimi           {
7544*22dc650dSSadaf Ebrahimi           if (repeat_min > 1)
7545*22dc650dSSadaf Ebrahimi             {
7546*22dc650dSSadaf Ebrahimi             /* In the pre-compile phase, we don't actually do the replication.
7547*22dc650dSSadaf Ebrahimi             We just adjust the length as if we had. Do some paranoid checks for
7548*22dc650dSSadaf Ebrahimi             potential integer overflow. */
7549*22dc650dSSadaf Ebrahimi 
7550*22dc650dSSadaf Ebrahimi             if (lengthptr != NULL)
7551*22dc650dSSadaf Ebrahimi               {
7552*22dc650dSSadaf Ebrahimi               PCRE2_SIZE delta;
7553*22dc650dSSadaf Ebrahimi               if (PRIV(ckd_smul)(&delta, repeat_min - 1,
7554*22dc650dSSadaf Ebrahimi                                  (int)length_prevgroup) ||
7555*22dc650dSSadaf Ebrahimi                   OFLOW_MAX - *lengthptr < delta)
7556*22dc650dSSadaf Ebrahimi                 {
7557*22dc650dSSadaf Ebrahimi                 *errorcodeptr = ERR20;
7558*22dc650dSSadaf Ebrahimi                 return 0;
7559*22dc650dSSadaf Ebrahimi                 }
7560*22dc650dSSadaf Ebrahimi               *lengthptr += delta;
7561*22dc650dSSadaf Ebrahimi               }
7562*22dc650dSSadaf Ebrahimi 
7563*22dc650dSSadaf Ebrahimi             /* This is compiling for real. If there is a set first code unit
7564*22dc650dSSadaf Ebrahimi             for the group, and we have not yet set a "required code unit", set
7565*22dc650dSSadaf Ebrahimi             it. */
7566*22dc650dSSadaf Ebrahimi 
7567*22dc650dSSadaf Ebrahimi             else
7568*22dc650dSSadaf Ebrahimi               {
7569*22dc650dSSadaf Ebrahimi               if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7570*22dc650dSSadaf Ebrahimi                 {
7571*22dc650dSSadaf Ebrahimi                 reqcu = firstcu;
7572*22dc650dSSadaf Ebrahimi                 reqcuflags = firstcuflags;
7573*22dc650dSSadaf Ebrahimi                 }
7574*22dc650dSSadaf Ebrahimi               for (uint32_t i = 1; i < repeat_min; i++)
7575*22dc650dSSadaf Ebrahimi                 {
7576*22dc650dSSadaf Ebrahimi                 memcpy(code, previous, CU2BYTES(len));
7577*22dc650dSSadaf Ebrahimi                 code += len;
7578*22dc650dSSadaf Ebrahimi                 }
7579*22dc650dSSadaf Ebrahimi               }
7580*22dc650dSSadaf Ebrahimi             }
7581*22dc650dSSadaf Ebrahimi 
7582*22dc650dSSadaf Ebrahimi           if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7583*22dc650dSSadaf Ebrahimi           }
7584*22dc650dSSadaf Ebrahimi 
7585*22dc650dSSadaf Ebrahimi         /* This code is common to both the zero and non-zero minimum cases. If
7586*22dc650dSSadaf Ebrahimi         the maximum is limited, it replicates the group in a nested fashion,
7587*22dc650dSSadaf Ebrahimi         remembering the bracket starts on a stack. In the case of a zero
7588*22dc650dSSadaf Ebrahimi         minimum, the first one was set up above. In all cases the repeat_max
7589*22dc650dSSadaf Ebrahimi         now specifies the number of additional copies needed. Again, we must
7590*22dc650dSSadaf Ebrahimi         remember to replicate entries on the forward reference list. */
7591*22dc650dSSadaf Ebrahimi 
7592*22dc650dSSadaf Ebrahimi         if (repeat_max != REPEAT_UNLIMITED)
7593*22dc650dSSadaf Ebrahimi           {
7594*22dc650dSSadaf Ebrahimi           /* In the pre-compile phase, we don't actually do the replication. We
7595*22dc650dSSadaf Ebrahimi           just adjust the length as if we had. For each repetition we must add
7596*22dc650dSSadaf Ebrahimi           1 to the length for BRAZERO and for all but the last repetition we
7597*22dc650dSSadaf Ebrahimi           must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7598*22dc650dSSadaf Ebrahimi           paranoid checks to avoid integer overflow. */
7599*22dc650dSSadaf Ebrahimi 
7600*22dc650dSSadaf Ebrahimi           if (lengthptr != NULL && repeat_max > 0)
7601*22dc650dSSadaf Ebrahimi             {
7602*22dc650dSSadaf Ebrahimi             PCRE2_SIZE delta;
7603*22dc650dSSadaf Ebrahimi             if (PRIV(ckd_smul)(&delta, repeat_max,
7604*22dc650dSSadaf Ebrahimi                                (int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7605*22dc650dSSadaf Ebrahimi                 OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7606*22dc650dSSadaf Ebrahimi               {
7607*22dc650dSSadaf Ebrahimi               *errorcodeptr = ERR20;
7608*22dc650dSSadaf Ebrahimi               return 0;
7609*22dc650dSSadaf Ebrahimi               }
7610*22dc650dSSadaf Ebrahimi             delta -= (2 + 2*LINK_SIZE);   /* Last one doesn't nest */
7611*22dc650dSSadaf Ebrahimi             *lengthptr += delta;
7612*22dc650dSSadaf Ebrahimi             }
7613*22dc650dSSadaf Ebrahimi 
7614*22dc650dSSadaf Ebrahimi           /* This is compiling for real */
7615*22dc650dSSadaf Ebrahimi 
7616*22dc650dSSadaf Ebrahimi           else for (uint32_t i = repeat_max; i >= 1; i--)
7617*22dc650dSSadaf Ebrahimi             {
7618*22dc650dSSadaf Ebrahimi             *code++ = OP_BRAZERO + repeat_type;
7619*22dc650dSSadaf Ebrahimi 
7620*22dc650dSSadaf Ebrahimi             /* All but the final copy start a new nesting, maintaining the
7621*22dc650dSSadaf Ebrahimi             chain of brackets outstanding. */
7622*22dc650dSSadaf Ebrahimi 
7623*22dc650dSSadaf Ebrahimi             if (i != 1)
7624*22dc650dSSadaf Ebrahimi               {
7625*22dc650dSSadaf Ebrahimi               int linkoffset;
7626*22dc650dSSadaf Ebrahimi               *code++ = OP_BRA;
7627*22dc650dSSadaf Ebrahimi               linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7628*22dc650dSSadaf Ebrahimi               bralink = code;
7629*22dc650dSSadaf Ebrahimi               PUTINC(code, 0, linkoffset);
7630*22dc650dSSadaf Ebrahimi               }
7631*22dc650dSSadaf Ebrahimi 
7632*22dc650dSSadaf Ebrahimi             memcpy(code, previous, CU2BYTES(len));
7633*22dc650dSSadaf Ebrahimi             code += len;
7634*22dc650dSSadaf Ebrahimi             }
7635*22dc650dSSadaf Ebrahimi 
7636*22dc650dSSadaf Ebrahimi           /* Now chain through the pending brackets, and fill in their length
7637*22dc650dSSadaf Ebrahimi           fields (which are holding the chain links pro tem). */
7638*22dc650dSSadaf Ebrahimi 
7639*22dc650dSSadaf Ebrahimi           while (bralink != NULL)
7640*22dc650dSSadaf Ebrahimi             {
7641*22dc650dSSadaf Ebrahimi             int oldlinkoffset;
7642*22dc650dSSadaf Ebrahimi             int linkoffset = (int)(code - bralink + 1);
7643*22dc650dSSadaf Ebrahimi             PCRE2_UCHAR *bra = code - linkoffset;
7644*22dc650dSSadaf Ebrahimi             oldlinkoffset = GET(bra, 1);
7645*22dc650dSSadaf Ebrahimi             bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7646*22dc650dSSadaf Ebrahimi             *code++ = OP_KET;
7647*22dc650dSSadaf Ebrahimi             PUTINC(code, 0, linkoffset);
7648*22dc650dSSadaf Ebrahimi             PUT(bra, 1, linkoffset);
7649*22dc650dSSadaf Ebrahimi             }
7650*22dc650dSSadaf Ebrahimi           }
7651*22dc650dSSadaf Ebrahimi 
7652*22dc650dSSadaf Ebrahimi         /* If the maximum is unlimited, set a repeater in the final copy. For
7653*22dc650dSSadaf Ebrahimi         SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7654*22dc650dSSadaf Ebrahimi         possessively repeated ONCE brackets can be converted into non-capturing
7655*22dc650dSSadaf Ebrahimi         brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7656*22dc650dSSadaf Ebrahimi         saves having to deal with possessive ONCEs specially.
7657*22dc650dSSadaf Ebrahimi 
7658*22dc650dSSadaf Ebrahimi         Otherwise, when we are doing the actual compile phase, check to see
7659*22dc650dSSadaf Ebrahimi         whether this group is one that could match an empty string. If so,
7660*22dc650dSSadaf Ebrahimi         convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7661*22dc650dSSadaf Ebrahimi         that runtime checking can be done. [This check is also applied to ONCE
7662*22dc650dSSadaf Ebrahimi         and SCRIPT_RUN groups at runtime, but in a different way.]
7663*22dc650dSSadaf Ebrahimi 
7664*22dc650dSSadaf Ebrahimi         Then, if the quantifier was possessive and the bracket is not a
7665*22dc650dSSadaf Ebrahimi         conditional, we convert the BRA code to the POS form, and the KET code
7666*22dc650dSSadaf Ebrahimi         to KETRPOS. (It turns out to be convenient at runtime to detect this
7667*22dc650dSSadaf Ebrahimi         kind of subpattern at both the start and at the end.) The use of
7668*22dc650dSSadaf Ebrahimi         special opcodes makes it possible to reduce greatly the stack usage in
7669*22dc650dSSadaf Ebrahimi         pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7670*22dc650dSSadaf Ebrahimi         OP_BRAPOSZERO.
7671*22dc650dSSadaf Ebrahimi 
7672*22dc650dSSadaf Ebrahimi         Then, if the minimum number of matches is 1 or 0, cancel the possessive
7673*22dc650dSSadaf Ebrahimi         flag so that the default action below, of wrapping everything inside
7674*22dc650dSSadaf Ebrahimi         atomic brackets, does not happen. When the minimum is greater than 1,
7675*22dc650dSSadaf Ebrahimi         there will be earlier copies of the group, and so we still have to wrap
7676*22dc650dSSadaf Ebrahimi         the whole thing. */
7677*22dc650dSSadaf Ebrahimi 
7678*22dc650dSSadaf Ebrahimi         else
7679*22dc650dSSadaf Ebrahimi           {
7680*22dc650dSSadaf Ebrahimi           PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7681*22dc650dSSadaf Ebrahimi           PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7682*22dc650dSSadaf Ebrahimi 
7683*22dc650dSSadaf Ebrahimi           /* Convert possessive ONCE brackets to non-capturing */
7684*22dc650dSSadaf Ebrahimi 
7685*22dc650dSSadaf Ebrahimi           if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7686*22dc650dSSadaf Ebrahimi 
7687*22dc650dSSadaf Ebrahimi           /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7688*22dc650dSSadaf Ebrahimi           to do is to set the KET. */
7689*22dc650dSSadaf Ebrahimi 
7690*22dc650dSSadaf Ebrahimi           if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7691*22dc650dSSadaf Ebrahimi             *ketcode = OP_KETRMAX + repeat_type;
7692*22dc650dSSadaf Ebrahimi 
7693*22dc650dSSadaf Ebrahimi           /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7694*22dc650dSSadaf Ebrahimi           (which have been converted to non-capturing above). */
7695*22dc650dSSadaf Ebrahimi 
7696*22dc650dSSadaf Ebrahimi           else
7697*22dc650dSSadaf Ebrahimi             {
7698*22dc650dSSadaf Ebrahimi             /* In the compile phase, adjust the opcode if the group can match
7699*22dc650dSSadaf Ebrahimi             an empty string. For a conditional group with only one branch, the
7700*22dc650dSSadaf Ebrahimi             value of group_return will not show "could be empty", so we must
7701*22dc650dSSadaf Ebrahimi             check that separately. */
7702*22dc650dSSadaf Ebrahimi 
7703*22dc650dSSadaf Ebrahimi             if (lengthptr == NULL)
7704*22dc650dSSadaf Ebrahimi               {
7705*22dc650dSSadaf Ebrahimi               if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7706*22dc650dSSadaf Ebrahimi               if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7707*22dc650dSSadaf Ebrahimi                 *bracode = OP_SCOND;
7708*22dc650dSSadaf Ebrahimi               }
7709*22dc650dSSadaf Ebrahimi 
7710*22dc650dSSadaf Ebrahimi             /* Handle possessive quantifiers. */
7711*22dc650dSSadaf Ebrahimi 
7712*22dc650dSSadaf Ebrahimi             if (possessive_quantifier)
7713*22dc650dSSadaf Ebrahimi               {
7714*22dc650dSSadaf Ebrahimi               /* For COND brackets, we wrap the whole thing in a possessively
7715*22dc650dSSadaf Ebrahimi               repeated non-capturing bracket, because we have not invented POS
7716*22dc650dSSadaf Ebrahimi               versions of the COND opcodes. */
7717*22dc650dSSadaf Ebrahimi 
7718*22dc650dSSadaf Ebrahimi               if (*bracode == OP_COND || *bracode == OP_SCOND)
7719*22dc650dSSadaf Ebrahimi                 {
7720*22dc650dSSadaf Ebrahimi                 int nlen = (int)(code - bracode);
7721*22dc650dSSadaf Ebrahimi                 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7722*22dc650dSSadaf Ebrahimi                 code += 1 + LINK_SIZE;
7723*22dc650dSSadaf Ebrahimi                 nlen += 1 + LINK_SIZE;
7724*22dc650dSSadaf Ebrahimi                 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7725*22dc650dSSadaf Ebrahimi                 *code++ = OP_KETRPOS;
7726*22dc650dSSadaf Ebrahimi                 PUTINC(code, 0, nlen);
7727*22dc650dSSadaf Ebrahimi                 PUT(bracode, 1, nlen);
7728*22dc650dSSadaf Ebrahimi                 }
7729*22dc650dSSadaf Ebrahimi 
7730*22dc650dSSadaf Ebrahimi               /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7731*22dc650dSSadaf Ebrahimi 
7732*22dc650dSSadaf Ebrahimi               else
7733*22dc650dSSadaf Ebrahimi                 {
7734*22dc650dSSadaf Ebrahimi                 *bracode += 1;              /* Switch to xxxPOS opcodes */
7735*22dc650dSSadaf Ebrahimi                 *ketcode = OP_KETRPOS;
7736*22dc650dSSadaf Ebrahimi                 }
7737*22dc650dSSadaf Ebrahimi 
7738*22dc650dSSadaf Ebrahimi               /* If the minimum is zero, mark it as possessive, then unset the
7739*22dc650dSSadaf Ebrahimi               possessive flag when the minimum is 0 or 1. */
7740*22dc650dSSadaf Ebrahimi 
7741*22dc650dSSadaf Ebrahimi               if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7742*22dc650dSSadaf Ebrahimi               if (repeat_min < 2) possessive_quantifier = FALSE;
7743*22dc650dSSadaf Ebrahimi               }
7744*22dc650dSSadaf Ebrahimi 
7745*22dc650dSSadaf Ebrahimi             /* Non-possessive quantifier */
7746*22dc650dSSadaf Ebrahimi 
7747*22dc650dSSadaf Ebrahimi             else *ketcode = OP_KETRMAX + repeat_type;
7748*22dc650dSSadaf Ebrahimi             }
7749*22dc650dSSadaf Ebrahimi           }
7750*22dc650dSSadaf Ebrahimi         }
7751*22dc650dSSadaf Ebrahimi       break;
7752*22dc650dSSadaf Ebrahimi 
7753*22dc650dSSadaf Ebrahimi       /* If previous was a character type match (\d or similar), abolish it and
7754*22dc650dSSadaf Ebrahimi       create a suitable repeat item. The code is shared with single-character
7755*22dc650dSSadaf Ebrahimi       repeats by setting op_type to add a suitable offset into repeat_type.
7756*22dc650dSSadaf Ebrahimi       Note that the Unicode property types will be present only when
7757*22dc650dSSadaf Ebrahimi       SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7758*22dc650dSSadaf Ebrahimi       here because it just makes it horribly messy. */
7759*22dc650dSSadaf Ebrahimi 
7760*22dc650dSSadaf Ebrahimi       default:
7761*22dc650dSSadaf Ebrahimi       if (op_previous >= OP_EODN)   /* Not a character type - internal error */
7762*22dc650dSSadaf Ebrahimi         {
7763*22dc650dSSadaf Ebrahimi         *errorcodeptr = ERR10;
7764*22dc650dSSadaf Ebrahimi         return 0;
7765*22dc650dSSadaf Ebrahimi         }
7766*22dc650dSSadaf Ebrahimi       else
7767*22dc650dSSadaf Ebrahimi         {
7768*22dc650dSSadaf Ebrahimi         int prop_type, prop_value;
7769*22dc650dSSadaf Ebrahimi         PCRE2_UCHAR *oldcode;
7770*22dc650dSSadaf Ebrahimi 
7771*22dc650dSSadaf Ebrahimi         if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7772*22dc650dSSadaf Ebrahimi 
7773*22dc650dSSadaf Ebrahimi         op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7774*22dc650dSSadaf Ebrahimi         mclength = 0;                         /* Not a character */
7775*22dc650dSSadaf Ebrahimi 
7776*22dc650dSSadaf Ebrahimi         if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7777*22dc650dSSadaf Ebrahimi           {
7778*22dc650dSSadaf Ebrahimi           prop_type = previous[1];
7779*22dc650dSSadaf Ebrahimi           prop_value = previous[2];
7780*22dc650dSSadaf Ebrahimi           }
7781*22dc650dSSadaf Ebrahimi         else
7782*22dc650dSSadaf Ebrahimi           {
7783*22dc650dSSadaf Ebrahimi           /* Come here from just above with a character in mcbuffer/mclength. */
7784*22dc650dSSadaf Ebrahimi           OUTPUT_SINGLE_REPEAT:
7785*22dc650dSSadaf Ebrahimi           prop_type = prop_value = -1;
7786*22dc650dSSadaf Ebrahimi           }
7787*22dc650dSSadaf Ebrahimi 
7788*22dc650dSSadaf Ebrahimi         /* At this point, if prop_type == prop_value == -1 we either have a
7789*22dc650dSSadaf Ebrahimi         character in mcbuffer when mclength is greater than zero, or we have
7790*22dc650dSSadaf Ebrahimi         mclength zero, in which case there is a non-property character type in
7791*22dc650dSSadaf Ebrahimi         op_previous. If prop_type/value are not negative, we have a property
7792*22dc650dSSadaf Ebrahimi         character type in op_previous. */
7793*22dc650dSSadaf Ebrahimi 
7794*22dc650dSSadaf Ebrahimi         oldcode = code;                   /* Save where we were */
7795*22dc650dSSadaf Ebrahimi         code = previous;                  /* Usually overwrite previous item */
7796*22dc650dSSadaf Ebrahimi 
7797*22dc650dSSadaf Ebrahimi         /* If the maximum is zero then the minimum must also be zero; Perl allows
7798*22dc650dSSadaf Ebrahimi         this case, so we do too - by simply omitting the item altogether. */
7799*22dc650dSSadaf Ebrahimi 
7800*22dc650dSSadaf Ebrahimi         if (repeat_max == 0) goto END_REPEAT;
7801*22dc650dSSadaf Ebrahimi 
7802*22dc650dSSadaf Ebrahimi         /* Combine the op_type with the repeat_type */
7803*22dc650dSSadaf Ebrahimi 
7804*22dc650dSSadaf Ebrahimi         repeat_type += op_type;
7805*22dc650dSSadaf Ebrahimi 
7806*22dc650dSSadaf Ebrahimi         /* A minimum of zero is handled either as the special case * or ?, or as
7807*22dc650dSSadaf Ebrahimi         an UPTO, with the maximum given. */
7808*22dc650dSSadaf Ebrahimi 
7809*22dc650dSSadaf Ebrahimi         if (repeat_min == 0)
7810*22dc650dSSadaf Ebrahimi           {
7811*22dc650dSSadaf Ebrahimi           if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7812*22dc650dSSadaf Ebrahimi             else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7813*22dc650dSSadaf Ebrahimi           else
7814*22dc650dSSadaf Ebrahimi             {
7815*22dc650dSSadaf Ebrahimi             *code++ = OP_UPTO + repeat_type;
7816*22dc650dSSadaf Ebrahimi             PUT2INC(code, 0, repeat_max);
7817*22dc650dSSadaf Ebrahimi             }
7818*22dc650dSSadaf Ebrahimi           }
7819*22dc650dSSadaf Ebrahimi 
7820*22dc650dSSadaf Ebrahimi         /* A repeat minimum of 1 is optimized into some special cases. If the
7821*22dc650dSSadaf Ebrahimi         maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7822*22dc650dSSadaf Ebrahimi         left in place and, if the maximum is greater than 1, we use OP_UPTO with
7823*22dc650dSSadaf Ebrahimi         one less than the maximum. */
7824*22dc650dSSadaf Ebrahimi 
7825*22dc650dSSadaf Ebrahimi         else if (repeat_min == 1)
7826*22dc650dSSadaf Ebrahimi           {
7827*22dc650dSSadaf Ebrahimi           if (repeat_max == REPEAT_UNLIMITED)
7828*22dc650dSSadaf Ebrahimi             *code++ = OP_PLUS + repeat_type;
7829*22dc650dSSadaf Ebrahimi           else
7830*22dc650dSSadaf Ebrahimi             {
7831*22dc650dSSadaf Ebrahimi             code = oldcode;  /* Leave previous item in place */
7832*22dc650dSSadaf Ebrahimi             if (repeat_max == 1) goto END_REPEAT;
7833*22dc650dSSadaf Ebrahimi             *code++ = OP_UPTO + repeat_type;
7834*22dc650dSSadaf Ebrahimi             PUT2INC(code, 0, repeat_max - 1);
7835*22dc650dSSadaf Ebrahimi             }
7836*22dc650dSSadaf Ebrahimi           }
7837*22dc650dSSadaf Ebrahimi 
7838*22dc650dSSadaf Ebrahimi         /* The case {n,n} is just an EXACT, while the general case {n,m} is
7839*22dc650dSSadaf Ebrahimi         handled as an EXACT followed by an UPTO or STAR or QUERY. */
7840*22dc650dSSadaf Ebrahimi 
7841*22dc650dSSadaf Ebrahimi         else
7842*22dc650dSSadaf Ebrahimi           {
7843*22dc650dSSadaf Ebrahimi           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7844*22dc650dSSadaf Ebrahimi           PUT2INC(code, 0, repeat_min);
7845*22dc650dSSadaf Ebrahimi 
7846*22dc650dSSadaf Ebrahimi           /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7847*22dc650dSSadaf Ebrahimi           and then generate the second opcode. For a repeated Unicode property
7848*22dc650dSSadaf Ebrahimi           match, there are two extra values that define the required property,
7849*22dc650dSSadaf Ebrahimi           and mclength is set zero to indicate this. */
7850*22dc650dSSadaf Ebrahimi 
7851*22dc650dSSadaf Ebrahimi           if (repeat_max != repeat_min)
7852*22dc650dSSadaf Ebrahimi             {
7853*22dc650dSSadaf Ebrahimi             if (mclength > 0)
7854*22dc650dSSadaf Ebrahimi               {
7855*22dc650dSSadaf Ebrahimi               memcpy(code, mcbuffer, CU2BYTES(mclength));
7856*22dc650dSSadaf Ebrahimi               code += mclength;
7857*22dc650dSSadaf Ebrahimi               }
7858*22dc650dSSadaf Ebrahimi             else
7859*22dc650dSSadaf Ebrahimi               {
7860*22dc650dSSadaf Ebrahimi               *code++ = op_previous;
7861*22dc650dSSadaf Ebrahimi               if (prop_type >= 0)
7862*22dc650dSSadaf Ebrahimi                 {
7863*22dc650dSSadaf Ebrahimi                 *code++ = prop_type;
7864*22dc650dSSadaf Ebrahimi                 *code++ = prop_value;
7865*22dc650dSSadaf Ebrahimi                 }
7866*22dc650dSSadaf Ebrahimi               }
7867*22dc650dSSadaf Ebrahimi 
7868*22dc650dSSadaf Ebrahimi             /* Now set up the following opcode */
7869*22dc650dSSadaf Ebrahimi 
7870*22dc650dSSadaf Ebrahimi             if (repeat_max == REPEAT_UNLIMITED)
7871*22dc650dSSadaf Ebrahimi               *code++ = OP_STAR + repeat_type;
7872*22dc650dSSadaf Ebrahimi             else
7873*22dc650dSSadaf Ebrahimi               {
7874*22dc650dSSadaf Ebrahimi               repeat_max -= repeat_min;
7875*22dc650dSSadaf Ebrahimi               if (repeat_max == 1)
7876*22dc650dSSadaf Ebrahimi                 {
7877*22dc650dSSadaf Ebrahimi                 *code++ = OP_QUERY + repeat_type;
7878*22dc650dSSadaf Ebrahimi                 }
7879*22dc650dSSadaf Ebrahimi               else
7880*22dc650dSSadaf Ebrahimi                 {
7881*22dc650dSSadaf Ebrahimi                 *code++ = OP_UPTO + repeat_type;
7882*22dc650dSSadaf Ebrahimi                 PUT2INC(code, 0, repeat_max);
7883*22dc650dSSadaf Ebrahimi                 }
7884*22dc650dSSadaf Ebrahimi               }
7885*22dc650dSSadaf Ebrahimi             }
7886*22dc650dSSadaf Ebrahimi           }
7887*22dc650dSSadaf Ebrahimi 
7888*22dc650dSSadaf Ebrahimi         /* Fill in the character or character type for the final opcode. */
7889*22dc650dSSadaf Ebrahimi 
7890*22dc650dSSadaf Ebrahimi         if (mclength > 0)
7891*22dc650dSSadaf Ebrahimi           {
7892*22dc650dSSadaf Ebrahimi           memcpy(code, mcbuffer, CU2BYTES(mclength));
7893*22dc650dSSadaf Ebrahimi           code += mclength;
7894*22dc650dSSadaf Ebrahimi           }
7895*22dc650dSSadaf Ebrahimi         else
7896*22dc650dSSadaf Ebrahimi           {
7897*22dc650dSSadaf Ebrahimi           *code++ = op_previous;
7898*22dc650dSSadaf Ebrahimi           if (prop_type >= 0)
7899*22dc650dSSadaf Ebrahimi             {
7900*22dc650dSSadaf Ebrahimi             *code++ = prop_type;
7901*22dc650dSSadaf Ebrahimi             *code++ = prop_value;
7902*22dc650dSSadaf Ebrahimi             }
7903*22dc650dSSadaf Ebrahimi           }
7904*22dc650dSSadaf Ebrahimi         }
7905*22dc650dSSadaf Ebrahimi       break;
7906*22dc650dSSadaf Ebrahimi       }  /* End of switch on different op_previous values */
7907*22dc650dSSadaf Ebrahimi 
7908*22dc650dSSadaf Ebrahimi 
7909*22dc650dSSadaf Ebrahimi     /* If the character following a repeat is '+', possessive_quantifier is
7910*22dc650dSSadaf Ebrahimi     TRUE. For some opcodes, there are special alternative opcodes for this
7911*22dc650dSSadaf Ebrahimi     case. For anything else, we wrap the entire repeated item inside OP_ONCE
7912*22dc650dSSadaf Ebrahimi     brackets. Logically, the '+' notation is just syntactic sugar, taken from
7913*22dc650dSSadaf Ebrahimi     Sun's Java package, but the special opcodes can optimize it.
7914*22dc650dSSadaf Ebrahimi 
7915*22dc650dSSadaf Ebrahimi     Some (but not all) possessively repeated subpatterns have already been
7916*22dc650dSSadaf Ebrahimi     completely handled in the code just above. For them, possessive_quantifier
7917*22dc650dSSadaf Ebrahimi     is always FALSE at this stage. Note that the repeated item starts at
7918*22dc650dSSadaf Ebrahimi     tempcode, not at previous, which might be the first part of a string whose
7919*22dc650dSSadaf Ebrahimi     (former) last char we repeated. */
7920*22dc650dSSadaf Ebrahimi 
7921*22dc650dSSadaf Ebrahimi     if (possessive_quantifier)
7922*22dc650dSSadaf Ebrahimi       {
7923*22dc650dSSadaf Ebrahimi       int len;
7924*22dc650dSSadaf Ebrahimi 
7925*22dc650dSSadaf Ebrahimi       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7926*22dc650dSSadaf Ebrahimi       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7927*22dc650dSSadaf Ebrahimi       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7928*22dc650dSSadaf Ebrahimi       remains is greater than zero, there's a further opcode that can be
7929*22dc650dSSadaf Ebrahimi       handled. If not, do nothing, leaving the EXACT alone. */
7930*22dc650dSSadaf Ebrahimi 
7931*22dc650dSSadaf Ebrahimi       switch(*tempcode)
7932*22dc650dSSadaf Ebrahimi         {
7933*22dc650dSSadaf Ebrahimi         case OP_TYPEEXACT:
7934*22dc650dSSadaf Ebrahimi         tempcode += PRIV(OP_lengths)[*tempcode] +
7935*22dc650dSSadaf Ebrahimi           ((tempcode[1 + IMM2_SIZE] == OP_PROP
7936*22dc650dSSadaf Ebrahimi           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7937*22dc650dSSadaf Ebrahimi         break;
7938*22dc650dSSadaf Ebrahimi 
7939*22dc650dSSadaf Ebrahimi         /* CHAR opcodes are used for exacts whose count is 1. */
7940*22dc650dSSadaf Ebrahimi 
7941*22dc650dSSadaf Ebrahimi         case OP_CHAR:
7942*22dc650dSSadaf Ebrahimi         case OP_CHARI:
7943*22dc650dSSadaf Ebrahimi         case OP_NOT:
7944*22dc650dSSadaf Ebrahimi         case OP_NOTI:
7945*22dc650dSSadaf Ebrahimi         case OP_EXACT:
7946*22dc650dSSadaf Ebrahimi         case OP_EXACTI:
7947*22dc650dSSadaf Ebrahimi         case OP_NOTEXACT:
7948*22dc650dSSadaf Ebrahimi         case OP_NOTEXACTI:
7949*22dc650dSSadaf Ebrahimi         tempcode += PRIV(OP_lengths)[*tempcode];
7950*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
7951*22dc650dSSadaf Ebrahimi         if (utf && HAS_EXTRALEN(tempcode[-1]))
7952*22dc650dSSadaf Ebrahimi           tempcode += GET_EXTRALEN(tempcode[-1]);
7953*22dc650dSSadaf Ebrahimi #endif
7954*22dc650dSSadaf Ebrahimi         break;
7955*22dc650dSSadaf Ebrahimi 
7956*22dc650dSSadaf Ebrahimi         /* For the class opcodes, the repeat operator appears at the end;
7957*22dc650dSSadaf Ebrahimi         adjust tempcode to point to it. */
7958*22dc650dSSadaf Ebrahimi 
7959*22dc650dSSadaf Ebrahimi         case OP_CLASS:
7960*22dc650dSSadaf Ebrahimi         case OP_NCLASS:
7961*22dc650dSSadaf Ebrahimi         tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7962*22dc650dSSadaf Ebrahimi         break;
7963*22dc650dSSadaf Ebrahimi 
7964*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
7965*22dc650dSSadaf Ebrahimi         case OP_XCLASS:
7966*22dc650dSSadaf Ebrahimi         tempcode += GET(tempcode, 1);
7967*22dc650dSSadaf Ebrahimi         break;
7968*22dc650dSSadaf Ebrahimi #endif
7969*22dc650dSSadaf Ebrahimi         }
7970*22dc650dSSadaf Ebrahimi 
7971*22dc650dSSadaf Ebrahimi       /* If tempcode is equal to code (which points to the end of the repeated
7972*22dc650dSSadaf Ebrahimi       item), it means we have skipped an EXACT item but there is no following
7973*22dc650dSSadaf Ebrahimi       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7974*22dc650dSSadaf Ebrahimi       all other cases, tempcode will be pointing to the repeat opcode, and will
7975*22dc650dSSadaf Ebrahimi       be less than code, so the value of len will be greater than 0. */
7976*22dc650dSSadaf Ebrahimi 
7977*22dc650dSSadaf Ebrahimi       len = (int)(code - tempcode);
7978*22dc650dSSadaf Ebrahimi       if (len > 0)
7979*22dc650dSSadaf Ebrahimi         {
7980*22dc650dSSadaf Ebrahimi         unsigned int repcode = *tempcode;
7981*22dc650dSSadaf Ebrahimi 
7982*22dc650dSSadaf Ebrahimi         /* There is a table for possessifying opcodes, all of which are less
7983*22dc650dSSadaf Ebrahimi         than OP_CALLOUT. A zero entry means there is no possessified version.
7984*22dc650dSSadaf Ebrahimi         */
7985*22dc650dSSadaf Ebrahimi 
7986*22dc650dSSadaf Ebrahimi         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7987*22dc650dSSadaf Ebrahimi           *tempcode = opcode_possessify[repcode];
7988*22dc650dSSadaf Ebrahimi 
7989*22dc650dSSadaf Ebrahimi         /* For opcode without a special possessified version, wrap the item in
7990*22dc650dSSadaf Ebrahimi         ONCE brackets. */
7991*22dc650dSSadaf Ebrahimi 
7992*22dc650dSSadaf Ebrahimi         else
7993*22dc650dSSadaf Ebrahimi           {
7994*22dc650dSSadaf Ebrahimi           (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7995*22dc650dSSadaf Ebrahimi           code += 1 + LINK_SIZE;
7996*22dc650dSSadaf Ebrahimi           len += 1 + LINK_SIZE;
7997*22dc650dSSadaf Ebrahimi           tempcode[0] = OP_ONCE;
7998*22dc650dSSadaf Ebrahimi           *code++ = OP_KET;
7999*22dc650dSSadaf Ebrahimi           PUTINC(code, 0, len);
8000*22dc650dSSadaf Ebrahimi           PUT(tempcode, 1, len);
8001*22dc650dSSadaf Ebrahimi           }
8002*22dc650dSSadaf Ebrahimi         }
8003*22dc650dSSadaf Ebrahimi       }
8004*22dc650dSSadaf Ebrahimi 
8005*22dc650dSSadaf Ebrahimi     /* We set the "follows varying string" flag for subsequently encountered
8006*22dc650dSSadaf Ebrahimi     reqcus if it isn't already set and we have just passed a varying length
8007*22dc650dSSadaf Ebrahimi     item. */
8008*22dc650dSSadaf Ebrahimi 
8009*22dc650dSSadaf Ebrahimi     END_REPEAT:
8010*22dc650dSSadaf Ebrahimi     cb->req_varyopt |= reqvary;
8011*22dc650dSSadaf Ebrahimi     break;
8012*22dc650dSSadaf Ebrahimi 
8013*22dc650dSSadaf Ebrahimi 
8014*22dc650dSSadaf Ebrahimi     /* ===================================================================*/
8015*22dc650dSSadaf Ebrahimi     /* Handle a 32-bit data character with a value greater than META_END. */
8016*22dc650dSSadaf Ebrahimi 
8017*22dc650dSSadaf Ebrahimi     case META_BIGVALUE:
8018*22dc650dSSadaf Ebrahimi     pptr++;
8019*22dc650dSSadaf Ebrahimi     goto NORMAL_CHAR;
8020*22dc650dSSadaf Ebrahimi 
8021*22dc650dSSadaf Ebrahimi 
8022*22dc650dSSadaf Ebrahimi     /* ===============================================================*/
8023*22dc650dSSadaf Ebrahimi     /* Handle a back reference by number, which is the meta argument. The
8024*22dc650dSSadaf Ebrahimi     pattern offsets for back references to group numbers less than 10 are held
8025*22dc650dSSadaf Ebrahimi     in a special vector, to avoid using more than two parsed pattern elements
8026*22dc650dSSadaf Ebrahimi     in 64-bit environments. We only need the offset to the first occurrence,
8027*22dc650dSSadaf Ebrahimi     because if that doesn't fail, subsequent ones will also be OK. */
8028*22dc650dSSadaf Ebrahimi 
8029*22dc650dSSadaf Ebrahimi     case META_BACKREF:
8030*22dc650dSSadaf Ebrahimi     if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
8031*22dc650dSSadaf Ebrahimi       else GETPLUSOFFSET(offset, pptr);
8032*22dc650dSSadaf Ebrahimi 
8033*22dc650dSSadaf Ebrahimi     if (meta_arg > cb->bracount)
8034*22dc650dSSadaf Ebrahimi       {
8035*22dc650dSSadaf Ebrahimi       cb->erroroffset = offset;
8036*22dc650dSSadaf Ebrahimi       *errorcodeptr = ERR15;  /* Non-existent subpattern */
8037*22dc650dSSadaf Ebrahimi       return 0;
8038*22dc650dSSadaf Ebrahimi       }
8039*22dc650dSSadaf Ebrahimi 
8040*22dc650dSSadaf Ebrahimi     /* Come here from named backref handling when the reference is to a
8041*22dc650dSSadaf Ebrahimi     single group (that is, not to a duplicated name). The back reference
8042*22dc650dSSadaf Ebrahimi     data will have already been updated. We must disable firstcu if not
8043*22dc650dSSadaf Ebrahimi     set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
8044*22dc650dSSadaf Ebrahimi     later. */
8045*22dc650dSSadaf Ebrahimi 
8046*22dc650dSSadaf Ebrahimi     HANDLE_SINGLE_REFERENCE:
8047*22dc650dSSadaf Ebrahimi     if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
8048*22dc650dSSadaf Ebrahimi     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
8049*22dc650dSSadaf Ebrahimi     PUT2INC(code, 0, meta_arg);
8050*22dc650dSSadaf Ebrahimi 
8051*22dc650dSSadaf Ebrahimi     /* Update the map of back references, and keep the highest one. We
8052*22dc650dSSadaf Ebrahimi     could do this in parse_regex() for numerical back references, but not
8053*22dc650dSSadaf Ebrahimi     for named back references, because we don't know the numbers to which
8054*22dc650dSSadaf Ebrahimi     named back references refer. So we do it all in this function. */
8055*22dc650dSSadaf Ebrahimi 
8056*22dc650dSSadaf Ebrahimi     cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
8057*22dc650dSSadaf Ebrahimi     if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
8058*22dc650dSSadaf Ebrahimi     break;
8059*22dc650dSSadaf Ebrahimi 
8060*22dc650dSSadaf Ebrahimi 
8061*22dc650dSSadaf Ebrahimi     /* ===============================================================*/
8062*22dc650dSSadaf Ebrahimi     /* Handle recursion by inserting the number of the called group (which is
8063*22dc650dSSadaf Ebrahimi     the meta argument) after OP_RECURSE. At the end of compiling the pattern is
8064*22dc650dSSadaf Ebrahimi     scanned and these numbers are replaced by offsets within the pattern. It is
8065*22dc650dSSadaf Ebrahimi     done like this to avoid problems with forward references and adjusting
8066*22dc650dSSadaf Ebrahimi     offsets when groups are duplicated and moved (as discovered in previous
8067*22dc650dSSadaf Ebrahimi     implementations). Note that a recursion does not have a set first
8068*22dc650dSSadaf Ebrahimi     character. */
8069*22dc650dSSadaf Ebrahimi 
8070*22dc650dSSadaf Ebrahimi     case META_RECURSE:
8071*22dc650dSSadaf Ebrahimi     GETPLUSOFFSET(offset, pptr);
8072*22dc650dSSadaf Ebrahimi     if (meta_arg > cb->bracount)
8073*22dc650dSSadaf Ebrahimi       {
8074*22dc650dSSadaf Ebrahimi       cb->erroroffset = offset;
8075*22dc650dSSadaf Ebrahimi       *errorcodeptr = ERR15;  /* Non-existent subpattern */
8076*22dc650dSSadaf Ebrahimi       return 0;
8077*22dc650dSSadaf Ebrahimi       }
8078*22dc650dSSadaf Ebrahimi     HANDLE_NUMERICAL_RECURSION:
8079*22dc650dSSadaf Ebrahimi     *code = OP_RECURSE;
8080*22dc650dSSadaf Ebrahimi     PUT(code, 1, meta_arg);
8081*22dc650dSSadaf Ebrahimi     code += 1 + LINK_SIZE;
8082*22dc650dSSadaf Ebrahimi     groupsetfirstcu = FALSE;
8083*22dc650dSSadaf Ebrahimi     cb->had_recurse = TRUE;
8084*22dc650dSSadaf Ebrahimi     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8085*22dc650dSSadaf Ebrahimi     zerofirstcu = firstcu;
8086*22dc650dSSadaf Ebrahimi     zerofirstcuflags = firstcuflags;
8087*22dc650dSSadaf Ebrahimi     break;
8088*22dc650dSSadaf Ebrahimi 
8089*22dc650dSSadaf Ebrahimi 
8090*22dc650dSSadaf Ebrahimi     /* ===============================================================*/
8091*22dc650dSSadaf Ebrahimi     /* Handle capturing parentheses; the number is the meta argument. */
8092*22dc650dSSadaf Ebrahimi 
8093*22dc650dSSadaf Ebrahimi     case META_CAPTURE:
8094*22dc650dSSadaf Ebrahimi     bravalue = OP_CBRA;
8095*22dc650dSSadaf Ebrahimi     skipunits = IMM2_SIZE;
8096*22dc650dSSadaf Ebrahimi     PUT2(code, 1+LINK_SIZE, meta_arg);
8097*22dc650dSSadaf Ebrahimi     cb->lastcapture = meta_arg;
8098*22dc650dSSadaf Ebrahimi     goto GROUP_PROCESS_NOTE_EMPTY;
8099*22dc650dSSadaf Ebrahimi 
8100*22dc650dSSadaf Ebrahimi 
8101*22dc650dSSadaf Ebrahimi     /* ===============================================================*/
8102*22dc650dSSadaf Ebrahimi     /* Handle escape sequence items. For ones like \d, the ESC_values are
8103*22dc650dSSadaf Ebrahimi     arranged to be the same as the corresponding OP_values in the default case
8104*22dc650dSSadaf Ebrahimi     when PCRE2_UCP is not set (which is the only case in which they will appear
8105*22dc650dSSadaf Ebrahimi     here).
8106*22dc650dSSadaf Ebrahimi 
8107*22dc650dSSadaf Ebrahimi     Note: \Q and \E are never seen here, as they were dealt with in
8108*22dc650dSSadaf Ebrahimi     parse_regex(). Neither are numerical back references or recursions, which
8109*22dc650dSSadaf Ebrahimi     were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8110*22dc650dSSadaf Ebrahimi     \g, when followed by names, are turned into META_BACKREF_BYNAME or
8111*22dc650dSSadaf Ebrahimi     META_RECURSE_BYNAME. */
8112*22dc650dSSadaf Ebrahimi 
8113*22dc650dSSadaf Ebrahimi     case META_ESCAPE:
8114*22dc650dSSadaf Ebrahimi 
8115*22dc650dSSadaf Ebrahimi     /* We can test for escape sequences that consume a character because their
8116*22dc650dSSadaf Ebrahimi     values lie between ESC_b and ESC_Z; this may have to change if any new ones
8117*22dc650dSSadaf Ebrahimi     are ever created. For these sequences, we disable the setting of a first
8118*22dc650dSSadaf Ebrahimi     character if it hasn't already been set. */
8119*22dc650dSSadaf Ebrahimi 
8120*22dc650dSSadaf Ebrahimi     if (meta_arg > ESC_b && meta_arg < ESC_Z)
8121*22dc650dSSadaf Ebrahimi       {
8122*22dc650dSSadaf Ebrahimi       matched_char = TRUE;
8123*22dc650dSSadaf Ebrahimi       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8124*22dc650dSSadaf Ebrahimi       }
8125*22dc650dSSadaf Ebrahimi 
8126*22dc650dSSadaf Ebrahimi     /* Set values to reset to if this is followed by a zero repeat. */
8127*22dc650dSSadaf Ebrahimi 
8128*22dc650dSSadaf Ebrahimi     zerofirstcu = firstcu;
8129*22dc650dSSadaf Ebrahimi     zerofirstcuflags = firstcuflags;
8130*22dc650dSSadaf Ebrahimi     zeroreqcu = reqcu;
8131*22dc650dSSadaf Ebrahimi     zeroreqcuflags = reqcuflags;
8132*22dc650dSSadaf Ebrahimi 
8133*22dc650dSSadaf Ebrahimi     /* If Unicode is not supported, \P and \p are not allowed and are
8134*22dc650dSSadaf Ebrahimi     faulted at parse time, so will never appear here. */
8135*22dc650dSSadaf Ebrahimi 
8136*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
8137*22dc650dSSadaf Ebrahimi     if (meta_arg == ESC_P || meta_arg == ESC_p)
8138*22dc650dSSadaf Ebrahimi       {
8139*22dc650dSSadaf Ebrahimi       uint32_t ptype = *(++pptr) >> 16;
8140*22dc650dSSadaf Ebrahimi       uint32_t pdata = *pptr & 0xffff;
8141*22dc650dSSadaf Ebrahimi 
8142*22dc650dSSadaf Ebrahimi       /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
8143*22dc650dSSadaf Ebrahimi       from the auto-anchoring code. */
8144*22dc650dSSadaf Ebrahimi 
8145*22dc650dSSadaf Ebrahimi       if (meta_arg == ESC_p && ptype == PT_ANY)
8146*22dc650dSSadaf Ebrahimi         {
8147*22dc650dSSadaf Ebrahimi         *code++ = OP_ALLANY;
8148*22dc650dSSadaf Ebrahimi         }
8149*22dc650dSSadaf Ebrahimi       else
8150*22dc650dSSadaf Ebrahimi         {
8151*22dc650dSSadaf Ebrahimi         *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8152*22dc650dSSadaf Ebrahimi         *code++ = ptype;
8153*22dc650dSSadaf Ebrahimi         *code++ = pdata;
8154*22dc650dSSadaf Ebrahimi         }
8155*22dc650dSSadaf Ebrahimi       break;  /* End META_ESCAPE */
8156*22dc650dSSadaf Ebrahimi       }
8157*22dc650dSSadaf Ebrahimi #endif
8158*22dc650dSSadaf Ebrahimi 
8159*22dc650dSSadaf Ebrahimi     /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8160*22dc650dSSadaf Ebrahimi     done. However, there's an option, in case anyone was relying on it. */
8161*22dc650dSSadaf Ebrahimi 
8162*22dc650dSSadaf Ebrahimi     if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8163*22dc650dSSadaf Ebrahimi         (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8164*22dc650dSSadaf Ebrahimi       {
8165*22dc650dSSadaf Ebrahimi       *errorcodeptr = ERR99;
8166*22dc650dSSadaf Ebrahimi       return 0;
8167*22dc650dSSadaf Ebrahimi       }
8168*22dc650dSSadaf Ebrahimi 
8169*22dc650dSSadaf Ebrahimi     /* For the rest (including \X when Unicode is supported - if not it's
8170*22dc650dSSadaf Ebrahimi     faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8171*22dc650dSSadaf Ebrahimi     not set; if it is set, most of them do not show up here because they are
8172*22dc650dSSadaf Ebrahimi     converted into Unicode property tests in parse_regex().
8173*22dc650dSSadaf Ebrahimi 
8174*22dc650dSSadaf Ebrahimi     In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8175*22dc650dSSadaf Ebrahimi     instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8176*22dc650dSSadaf Ebrahimi     There are special UCP codes for \B and \b which are used in UCP mode unless
8177*22dc650dSSadaf Ebrahimi     "word" matching is being forced to ASCII.
8178*22dc650dSSadaf Ebrahimi 
8179*22dc650dSSadaf Ebrahimi     Note that \b and \B do a one-character lookbehind, and \A also behaves as
8180*22dc650dSSadaf Ebrahimi     if it does. */
8181*22dc650dSSadaf Ebrahimi 
8182*22dc650dSSadaf Ebrahimi     switch(meta_arg)
8183*22dc650dSSadaf Ebrahimi       {
8184*22dc650dSSadaf Ebrahimi       case ESC_C:
8185*22dc650dSSadaf Ebrahimi       cb->external_flags |= PCRE2_HASBKC;  /* Record */
8186*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32
8187*22dc650dSSadaf Ebrahimi       meta_arg = OP_ALLANY;
8188*22dc650dSSadaf Ebrahimi #else
8189*22dc650dSSadaf Ebrahimi       if (!utf) meta_arg = OP_ALLANY;
8190*22dc650dSSadaf Ebrahimi #endif
8191*22dc650dSSadaf Ebrahimi       break;
8192*22dc650dSSadaf Ebrahimi 
8193*22dc650dSSadaf Ebrahimi       case ESC_B:
8194*22dc650dSSadaf Ebrahimi       case ESC_b:
8195*22dc650dSSadaf Ebrahimi       if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8196*22dc650dSSadaf Ebrahimi         meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8197*22dc650dSSadaf Ebrahimi           OP_UCP_WORD_BOUNDARY;
8198*22dc650dSSadaf Ebrahimi       /* Fall through */
8199*22dc650dSSadaf Ebrahimi 
8200*22dc650dSSadaf Ebrahimi       case ESC_A:
8201*22dc650dSSadaf Ebrahimi       if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8202*22dc650dSSadaf Ebrahimi       break;
8203*22dc650dSSadaf Ebrahimi       }
8204*22dc650dSSadaf Ebrahimi 
8205*22dc650dSSadaf Ebrahimi     *code++ = meta_arg;
8206*22dc650dSSadaf Ebrahimi     break;  /* End META_ESCAPE */
8207*22dc650dSSadaf Ebrahimi 
8208*22dc650dSSadaf Ebrahimi 
8209*22dc650dSSadaf Ebrahimi     /* ===================================================================*/
8210*22dc650dSSadaf Ebrahimi     /* Handle an unrecognized meta value. A parsed pattern value less than
8211*22dc650dSSadaf Ebrahimi     META_END is a literal. Otherwise we have a problem. */
8212*22dc650dSSadaf Ebrahimi 
8213*22dc650dSSadaf Ebrahimi     default:
8214*22dc650dSSadaf Ebrahimi     if (meta >= META_END)
8215*22dc650dSSadaf Ebrahimi       {
8216*22dc650dSSadaf Ebrahimi #ifdef DEBUG_SHOW_PARSED
8217*22dc650dSSadaf Ebrahimi       fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
8218*22dc650dSSadaf Ebrahimi #endif
8219*22dc650dSSadaf Ebrahimi       *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
8220*22dc650dSSadaf Ebrahimi       return 0;
8221*22dc650dSSadaf Ebrahimi       }
8222*22dc650dSSadaf Ebrahimi 
8223*22dc650dSSadaf Ebrahimi     /* Handle a literal character. We come here by goto in the case of a
8224*22dc650dSSadaf Ebrahimi     32-bit, non-UTF character whose value is greater than META_END. */
8225*22dc650dSSadaf Ebrahimi 
8226*22dc650dSSadaf Ebrahimi     NORMAL_CHAR:
8227*22dc650dSSadaf Ebrahimi     meta = *pptr;     /* Get the full 32 bits */
8228*22dc650dSSadaf Ebrahimi     NORMAL_CHAR_SET:  /* Character is already in meta */
8229*22dc650dSSadaf Ebrahimi     matched_char = TRUE;
8230*22dc650dSSadaf Ebrahimi 
8231*22dc650dSSadaf Ebrahimi     /* For caseless UTF or UCP mode, check whether this character has more than
8232*22dc650dSSadaf Ebrahimi     one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8233*22dc650dSSadaf Ebrahimi     When casing restrictions apply, ignore caseless sets that start with an
8234*22dc650dSSadaf Ebrahimi     ASCII character. */
8235*22dc650dSSadaf Ebrahimi 
8236*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
8237*22dc650dSSadaf Ebrahimi     if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8238*22dc650dSSadaf Ebrahimi       {
8239*22dc650dSSadaf Ebrahimi       uint32_t caseset = UCD_CASESET(meta);
8240*22dc650dSSadaf Ebrahimi       if (caseset != 0 &&
8241*22dc650dSSadaf Ebrahimi            ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
8242*22dc650dSSadaf Ebrahimi            PRIV(ucd_caseless_sets)[caseset] > 127))
8243*22dc650dSSadaf Ebrahimi         {
8244*22dc650dSSadaf Ebrahimi         *code++ = OP_PROP;
8245*22dc650dSSadaf Ebrahimi         *code++ = PT_CLIST;
8246*22dc650dSSadaf Ebrahimi         *code++ = caseset;
8247*22dc650dSSadaf Ebrahimi         if (firstcuflags == REQ_UNSET)
8248*22dc650dSSadaf Ebrahimi           firstcuflags = zerofirstcuflags = REQ_NONE;
8249*22dc650dSSadaf Ebrahimi         break;  /* End handling this meta item */
8250*22dc650dSSadaf Ebrahimi         }
8251*22dc650dSSadaf Ebrahimi       }
8252*22dc650dSSadaf Ebrahimi #endif
8253*22dc650dSSadaf Ebrahimi 
8254*22dc650dSSadaf Ebrahimi     /* Caseful matches, or caseless and not one of the multicase characters. We
8255*22dc650dSSadaf Ebrahimi     come here by goto in the case of a positive class that contains only
8256*22dc650dSSadaf Ebrahimi     case-partners of a character with just two cases; matched_char has already
8257*22dc650dSSadaf Ebrahimi     been set TRUE and options fudged if necessary. */
8258*22dc650dSSadaf Ebrahimi 
8259*22dc650dSSadaf Ebrahimi     CLASS_CASELESS_CHAR:
8260*22dc650dSSadaf Ebrahimi 
8261*22dc650dSSadaf Ebrahimi     /* Get the character's code units into mcbuffer, with the length in
8262*22dc650dSSadaf Ebrahimi     mclength. When not in UTF mode, the length is always 1. */
8263*22dc650dSSadaf Ebrahimi 
8264*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
8265*22dc650dSSadaf Ebrahimi     if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8266*22dc650dSSadaf Ebrahimi #endif
8267*22dc650dSSadaf Ebrahimi       {
8268*22dc650dSSadaf Ebrahimi       mclength = 1;
8269*22dc650dSSadaf Ebrahimi       mcbuffer[0] = meta;
8270*22dc650dSSadaf Ebrahimi       }
8271*22dc650dSSadaf Ebrahimi 
8272*22dc650dSSadaf Ebrahimi     /* Generate the appropriate code */
8273*22dc650dSSadaf Ebrahimi 
8274*22dc650dSSadaf Ebrahimi     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8275*22dc650dSSadaf Ebrahimi     memcpy(code, mcbuffer, CU2BYTES(mclength));
8276*22dc650dSSadaf Ebrahimi     code += mclength;
8277*22dc650dSSadaf Ebrahimi 
8278*22dc650dSSadaf Ebrahimi     /* Remember if \r or \n were seen */
8279*22dc650dSSadaf Ebrahimi 
8280*22dc650dSSadaf Ebrahimi     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8281*22dc650dSSadaf Ebrahimi       cb->external_flags |= PCRE2_HASCRORLF;
8282*22dc650dSSadaf Ebrahimi 
8283*22dc650dSSadaf Ebrahimi     /* Set the first and required code units appropriately. If no previous
8284*22dc650dSSadaf Ebrahimi     first code unit, set it from this character, but revert to none on a zero
8285*22dc650dSSadaf Ebrahimi     repeat. Otherwise, leave the firstcu value alone, and don't change it on
8286*22dc650dSSadaf Ebrahimi     a zero repeat. */
8287*22dc650dSSadaf Ebrahimi 
8288*22dc650dSSadaf Ebrahimi     if (firstcuflags == REQ_UNSET)
8289*22dc650dSSadaf Ebrahimi       {
8290*22dc650dSSadaf Ebrahimi       zerofirstcuflags = REQ_NONE;
8291*22dc650dSSadaf Ebrahimi       zeroreqcu = reqcu;
8292*22dc650dSSadaf Ebrahimi       zeroreqcuflags = reqcuflags;
8293*22dc650dSSadaf Ebrahimi 
8294*22dc650dSSadaf Ebrahimi       /* If the character is more than one code unit long, we can set a single
8295*22dc650dSSadaf Ebrahimi       firstcu only if it is not to be matched caselessly. Multiple possible
8296*22dc650dSSadaf Ebrahimi       starting code units may be picked up later in the studying code. */
8297*22dc650dSSadaf Ebrahimi 
8298*22dc650dSSadaf Ebrahimi       if (mclength == 1 || req_caseopt == 0)
8299*22dc650dSSadaf Ebrahimi         {
8300*22dc650dSSadaf Ebrahimi         firstcu = mcbuffer[0];
8301*22dc650dSSadaf Ebrahimi         firstcuflags = req_caseopt;
8302*22dc650dSSadaf Ebrahimi         if (mclength != 1)
8303*22dc650dSSadaf Ebrahimi           {
8304*22dc650dSSadaf Ebrahimi           reqcu = code[-1];
8305*22dc650dSSadaf Ebrahimi           reqcuflags = cb->req_varyopt;
8306*22dc650dSSadaf Ebrahimi           }
8307*22dc650dSSadaf Ebrahimi         }
8308*22dc650dSSadaf Ebrahimi       else firstcuflags = reqcuflags = REQ_NONE;
8309*22dc650dSSadaf Ebrahimi       }
8310*22dc650dSSadaf Ebrahimi 
8311*22dc650dSSadaf Ebrahimi     /* firstcu was previously set; we can set reqcu only if the length is
8312*22dc650dSSadaf Ebrahimi     1 or the matching is caseful. */
8313*22dc650dSSadaf Ebrahimi 
8314*22dc650dSSadaf Ebrahimi     else
8315*22dc650dSSadaf Ebrahimi       {
8316*22dc650dSSadaf Ebrahimi       zerofirstcu = firstcu;
8317*22dc650dSSadaf Ebrahimi       zerofirstcuflags = firstcuflags;
8318*22dc650dSSadaf Ebrahimi       zeroreqcu = reqcu;
8319*22dc650dSSadaf Ebrahimi       zeroreqcuflags = reqcuflags;
8320*22dc650dSSadaf Ebrahimi       if (mclength == 1 || req_caseopt == 0)
8321*22dc650dSSadaf Ebrahimi         {
8322*22dc650dSSadaf Ebrahimi         reqcu = code[-1];
8323*22dc650dSSadaf Ebrahimi         reqcuflags = req_caseopt | cb->req_varyopt;
8324*22dc650dSSadaf Ebrahimi         }
8325*22dc650dSSadaf Ebrahimi       }
8326*22dc650dSSadaf Ebrahimi 
8327*22dc650dSSadaf Ebrahimi     /* If caselessness was temporarily instated, reset it. */
8328*22dc650dSSadaf Ebrahimi 
8329*22dc650dSSadaf Ebrahimi     if (reset_caseful)
8330*22dc650dSSadaf Ebrahimi       {
8331*22dc650dSSadaf Ebrahimi       options &= ~PCRE2_CASELESS;
8332*22dc650dSSadaf Ebrahimi       req_caseopt = 0;
8333*22dc650dSSadaf Ebrahimi       reset_caseful = FALSE;
8334*22dc650dSSadaf Ebrahimi       }
8335*22dc650dSSadaf Ebrahimi 
8336*22dc650dSSadaf Ebrahimi     break;    /* End literal character handling */
8337*22dc650dSSadaf Ebrahimi     }         /* End of big switch */
8338*22dc650dSSadaf Ebrahimi   }           /* End of big loop */
8339*22dc650dSSadaf Ebrahimi 
8340*22dc650dSSadaf Ebrahimi /* Control never reaches here. */
8341*22dc650dSSadaf Ebrahimi }
8342*22dc650dSSadaf Ebrahimi 
8343*22dc650dSSadaf Ebrahimi 
8344*22dc650dSSadaf Ebrahimi 
8345*22dc650dSSadaf Ebrahimi /*************************************************
8346*22dc650dSSadaf Ebrahimi *   Compile regex: a sequence of alternatives    *
8347*22dc650dSSadaf Ebrahimi *************************************************/
8348*22dc650dSSadaf Ebrahimi 
8349*22dc650dSSadaf Ebrahimi /* On entry, pptr is pointing past the bracket meta, but on return it points to
8350*22dc650dSSadaf Ebrahimi the closing bracket or META_END. The code variable is pointing at the code unit
8351*22dc650dSSadaf Ebrahimi into which the BRA operator has been stored. This function is used during the
8352*22dc650dSSadaf Ebrahimi pre-compile phase when we are trying to find out the amount of memory needed,
8353*22dc650dSSadaf Ebrahimi as well as during the real compile phase. The value of lengthptr distinguishes
8354*22dc650dSSadaf Ebrahimi the two phases.
8355*22dc650dSSadaf Ebrahimi 
8356*22dc650dSSadaf Ebrahimi Arguments:
8357*22dc650dSSadaf Ebrahimi   options           option bits, including any changes for this subpattern
8358*22dc650dSSadaf Ebrahimi   xoptions          extra option bits, ditto
8359*22dc650dSSadaf Ebrahimi   codeptr           -> the address of the current code pointer
8360*22dc650dSSadaf Ebrahimi   pptrptr           -> the address of the current parsed pattern pointer
8361*22dc650dSSadaf Ebrahimi   errorcodeptr      -> pointer to error code variable
8362*22dc650dSSadaf Ebrahimi   skipunits         skip this many code units at start (for brackets and OP_COND)
8363*22dc650dSSadaf Ebrahimi   firstcuptr        place to put the first required code unit
8364*22dc650dSSadaf Ebrahimi   firstcuflagsptr   place to put the first code unit flags
8365*22dc650dSSadaf Ebrahimi   reqcuptr          place to put the last required code unit
8366*22dc650dSSadaf Ebrahimi   reqcuflagsptr     place to put the last required code unit flags
8367*22dc650dSSadaf Ebrahimi   bcptr             pointer to the chain of currently open branches
  open_caps         pointer to the chain of open capturing items
8368*22dc650dSSadaf Ebrahimi   cb                points to the data block with tables pointers etc.
8369*22dc650dSSadaf Ebrahimi   lengthptr         NULL during the real compile phase
8370*22dc650dSSadaf Ebrahimi                     points to length accumulator during pre-compile phase
8371*22dc650dSSadaf Ebrahimi 
8372*22dc650dSSadaf Ebrahimi Returns:            0 There has been an error
8373*22dc650dSSadaf Ebrahimi                    +1 Success, this group must match at least one character
8374*22dc650dSSadaf Ebrahimi                    -1 Success, this group may match an empty string
8375*22dc650dSSadaf Ebrahimi */
8376*22dc650dSSadaf Ebrahimi 
8377*22dc650dSSadaf Ebrahimi static int
compile_regex(uint32_t options,uint32_t xoptions,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t skipunits,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,open_capitem * open_caps,compile_block * cb,PCRE2_SIZE * lengthptr)8378*22dc650dSSadaf Ebrahimi compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8379*22dc650dSSadaf Ebrahimi   uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8380*22dc650dSSadaf Ebrahimi   uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8381*22dc650dSSadaf Ebrahimi   uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8382*22dc650dSSadaf Ebrahimi   compile_block *cb, PCRE2_SIZE *lengthptr)
8383*22dc650dSSadaf Ebrahimi {
8384*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *code = *codeptr;
8385*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *last_branch = code;
8386*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *start_bracket = code;
8387*22dc650dSSadaf Ebrahimi BOOL lookbehind;
8388*22dc650dSSadaf Ebrahimi open_capitem capitem;
8389*22dc650dSSadaf Ebrahimi int capnumber = 0;
8390*22dc650dSSadaf Ebrahimi int okreturn = 1;
8391*22dc650dSSadaf Ebrahimi uint32_t *pptr = *pptrptr;
8392*22dc650dSSadaf Ebrahimi uint32_t firstcu, reqcu;
8393*22dc650dSSadaf Ebrahimi uint32_t lookbehindlength;
8394*22dc650dSSadaf Ebrahimi uint32_t lookbehindminlength;
8395*22dc650dSSadaf Ebrahimi uint32_t firstcuflags, reqcuflags;
8396*22dc650dSSadaf Ebrahimi uint32_t branchfirstcu, branchreqcu;
8397*22dc650dSSadaf Ebrahimi uint32_t branchfirstcuflags, branchreqcuflags;
8398*22dc650dSSadaf Ebrahimi PCRE2_SIZE length;
8399*22dc650dSSadaf Ebrahimi branch_chain bc;
8400*22dc650dSSadaf Ebrahimi 
8401*22dc650dSSadaf Ebrahimi /* If set, call the external function that checks for stack availability. */
8402*22dc650dSSadaf Ebrahimi 
8403*22dc650dSSadaf Ebrahimi if (cb->cx->stack_guard != NULL &&
8404*22dc650dSSadaf Ebrahimi     cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8405*22dc650dSSadaf Ebrahimi   {
8406*22dc650dSSadaf Ebrahimi   *errorcodeptr= ERR33;
8407*22dc650dSSadaf Ebrahimi   return 0;
8408*22dc650dSSadaf Ebrahimi   }
8409*22dc650dSSadaf Ebrahimi 
8410*22dc650dSSadaf Ebrahimi /* Miscellaneous initialization */
8411*22dc650dSSadaf Ebrahimi 
8412*22dc650dSSadaf Ebrahimi bc.outer = bcptr;
8413*22dc650dSSadaf Ebrahimi bc.current_branch = code;
8414*22dc650dSSadaf Ebrahimi 
8415*22dc650dSSadaf Ebrahimi firstcu = reqcu = 0;
8416*22dc650dSSadaf Ebrahimi firstcuflags = reqcuflags = REQ_UNSET;
8417*22dc650dSSadaf Ebrahimi 
8418*22dc650dSSadaf Ebrahimi /* Accumulate the length for use in the pre-compile phase. Start with the
8419*22dc650dSSadaf Ebrahimi length of the BRA and KET and any extra code units that are required at the
8420*22dc650dSSadaf Ebrahimi beginning. We accumulate in a local variable to save frequent testing of
8421*22dc650dSSadaf Ebrahimi lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8422*22dc650dSSadaf Ebrahimi start and end of each alternative, because compiled items are discarded during
8423*22dc650dSSadaf Ebrahimi the pre-compile phase so that the workspace is not exceeded. */
8424*22dc650dSSadaf Ebrahimi 
8425*22dc650dSSadaf Ebrahimi length = 2 + 2*LINK_SIZE + skipunits;
8426*22dc650dSSadaf Ebrahimi 
8427*22dc650dSSadaf Ebrahimi /* Remember if this is a lookbehind assertion, and if it is, save its length
8428*22dc650dSSadaf Ebrahimi and skip over the pattern offset. */
8429*22dc650dSSadaf Ebrahimi 
8430*22dc650dSSadaf Ebrahimi lookbehind = *code == OP_ASSERTBACK ||
8431*22dc650dSSadaf Ebrahimi              *code == OP_ASSERTBACK_NOT ||
8432*22dc650dSSadaf Ebrahimi              *code == OP_ASSERTBACK_NA;
8433*22dc650dSSadaf Ebrahimi 
8434*22dc650dSSadaf Ebrahimi if (lookbehind)
8435*22dc650dSSadaf Ebrahimi   {
8436*22dc650dSSadaf Ebrahimi   lookbehindlength = META_DATA(pptr[-1]);
8437*22dc650dSSadaf Ebrahimi   lookbehindminlength = *pptr;
8438*22dc650dSSadaf Ebrahimi   pptr += SIZEOFFSET;
8439*22dc650dSSadaf Ebrahimi   }
8440*22dc650dSSadaf Ebrahimi else lookbehindlength = lookbehindminlength = 0;
8441*22dc650dSSadaf Ebrahimi 
8442*22dc650dSSadaf Ebrahimi /* If this is a capturing subpattern, add to the chain of open capturing items
8443*22dc650dSSadaf Ebrahimi so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8444*22dc650dSSadaf Ebrahimi need be tested here; changing this opcode to one of its variants, e.g.
8445*22dc650dSSadaf Ebrahimi OP_SCBRAPOS, happens later, after the group has been compiled. */
8446*22dc650dSSadaf Ebrahimi 
8447*22dc650dSSadaf Ebrahimi if (*code == OP_CBRA)
8448*22dc650dSSadaf Ebrahimi   {
8449*22dc650dSSadaf Ebrahimi   capnumber = GET2(code, 1 + LINK_SIZE);
8450*22dc650dSSadaf Ebrahimi   capitem.number = capnumber;
8451*22dc650dSSadaf Ebrahimi   capitem.next = open_caps;
8452*22dc650dSSadaf Ebrahimi   capitem.assert_depth = cb->assert_depth;
8453*22dc650dSSadaf Ebrahimi   open_caps = &capitem;
8454*22dc650dSSadaf Ebrahimi   }
8455*22dc650dSSadaf Ebrahimi 
8456*22dc650dSSadaf Ebrahimi /* Offset is set zero to mark that this bracket is still open */
8457*22dc650dSSadaf Ebrahimi 
8458*22dc650dSSadaf Ebrahimi PUT(code, 1, 0);
8459*22dc650dSSadaf Ebrahimi code += 1 + LINK_SIZE + skipunits;
8460*22dc650dSSadaf Ebrahimi 
8461*22dc650dSSadaf Ebrahimi /* Loop for each alternative branch */
8462*22dc650dSSadaf Ebrahimi 
8463*22dc650dSSadaf Ebrahimi for (;;)
8464*22dc650dSSadaf Ebrahimi   {
8465*22dc650dSSadaf Ebrahimi   int branch_return;
8466*22dc650dSSadaf Ebrahimi 
8467*22dc650dSSadaf Ebrahimi   /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8468*22dc650dSSadaf Ebrahimi   is only a single minimum length for the whole assertion. When the minimum
8469*22dc650dSSadaf Ebrahimi   length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8470*22dc650dSSadaf Ebrahimi   though not necessarily the same length. In this case, the original OP_REVERSE
8471*22dc650dSSadaf Ebrahimi   can be used. It can also be used if a branch in a variable length lookbehind
8472*22dc650dSSadaf Ebrahimi   has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8473*22dc650dSSadaf Ebrahimi   maximum and minimum values. */
8474*22dc650dSSadaf Ebrahimi 
8475*22dc650dSSadaf Ebrahimi   if (lookbehind && lookbehindlength > 0)
8476*22dc650dSSadaf Ebrahimi     {
8477*22dc650dSSadaf Ebrahimi     if (lookbehindminlength == LOOKBEHIND_MAX ||
8478*22dc650dSSadaf Ebrahimi         lookbehindminlength == lookbehindlength)
8479*22dc650dSSadaf Ebrahimi       {
8480*22dc650dSSadaf Ebrahimi       *code++ = OP_REVERSE;
8481*22dc650dSSadaf Ebrahimi       PUT2INC(code, 0, lookbehindlength);
8482*22dc650dSSadaf Ebrahimi       length += 1 + IMM2_SIZE;
8483*22dc650dSSadaf Ebrahimi       }
8484*22dc650dSSadaf Ebrahimi     else
8485*22dc650dSSadaf Ebrahimi       {
8486*22dc650dSSadaf Ebrahimi       *code++ = OP_VREVERSE;
8487*22dc650dSSadaf Ebrahimi       PUT2INC(code, 0, lookbehindminlength);
8488*22dc650dSSadaf Ebrahimi       PUT2INC(code, 0, lookbehindlength);
8489*22dc650dSSadaf Ebrahimi       length += 1 + 2*IMM2_SIZE;
8490*22dc650dSSadaf Ebrahimi       }
8491*22dc650dSSadaf Ebrahimi     }
8492*22dc650dSSadaf Ebrahimi 
8493*22dc650dSSadaf Ebrahimi   /* Now compile the branch; in the pre-compile phase its length gets added
8494*22dc650dSSadaf Ebrahimi   into the length. */
8495*22dc650dSSadaf Ebrahimi 
8496*22dc650dSSadaf Ebrahimi   if ((branch_return =
8497*22dc650dSSadaf Ebrahimi         compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8498*22dc650dSSadaf Ebrahimi           &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8499*22dc650dSSadaf Ebrahimi           &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8500*22dc650dSSadaf Ebrahimi     return 0;
8501*22dc650dSSadaf Ebrahimi 
8502*22dc650dSSadaf Ebrahimi   /* If a branch can match an empty string, so can the whole group. */
8503*22dc650dSSadaf Ebrahimi 
8504*22dc650dSSadaf Ebrahimi   if (branch_return < 0) okreturn = -1;
8505*22dc650dSSadaf Ebrahimi 
8506*22dc650dSSadaf Ebrahimi   /* In the real compile phase, there is some post-processing to be done. */
8507*22dc650dSSadaf Ebrahimi 
8508*22dc650dSSadaf Ebrahimi   if (lengthptr == NULL)
8509*22dc650dSSadaf Ebrahimi     {
8510*22dc650dSSadaf Ebrahimi     /* If this is the first branch, the firstcu and reqcu values for the
8511*22dc650dSSadaf Ebrahimi     branch become the values for the regex. */
8512*22dc650dSSadaf Ebrahimi 
8513*22dc650dSSadaf Ebrahimi     if (*last_branch != OP_ALT)
8514*22dc650dSSadaf Ebrahimi       {
8515*22dc650dSSadaf Ebrahimi       firstcu = branchfirstcu;
8516*22dc650dSSadaf Ebrahimi       firstcuflags = branchfirstcuflags;
8517*22dc650dSSadaf Ebrahimi       reqcu = branchreqcu;
8518*22dc650dSSadaf Ebrahimi       reqcuflags = branchreqcuflags;
8519*22dc650dSSadaf Ebrahimi       }
8520*22dc650dSSadaf Ebrahimi 
8521*22dc650dSSadaf Ebrahimi     /* If this is not the first branch, the first char and reqcu have to
8522*22dc650dSSadaf Ebrahimi     match the values from all the previous branches, except that if the
8523*22dc650dSSadaf Ebrahimi     previous value for reqcu didn't have REQ_VARY set, it can still match,
8524*22dc650dSSadaf Ebrahimi     and we set REQ_VARY for the group from this branch's value. */
8525*22dc650dSSadaf Ebrahimi 
8526*22dc650dSSadaf Ebrahimi     else
8527*22dc650dSSadaf Ebrahimi       {
8528*22dc650dSSadaf Ebrahimi       /* If we previously had a firstcu, but it doesn't match the new branch,
8529*22dc650dSSadaf Ebrahimi       we have to abandon the firstcu for the regex, but if there was
8530*22dc650dSSadaf Ebrahimi       previously no reqcu, it takes on the value of the old firstcu. */
8531*22dc650dSSadaf Ebrahimi 
8532*22dc650dSSadaf Ebrahimi       if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8533*22dc650dSSadaf Ebrahimi         {
8534*22dc650dSSadaf Ebrahimi         if (firstcuflags < REQ_NONE)
8535*22dc650dSSadaf Ebrahimi           {
8536*22dc650dSSadaf Ebrahimi           if (reqcuflags >= REQ_NONE)
8537*22dc650dSSadaf Ebrahimi             {
8538*22dc650dSSadaf Ebrahimi             reqcu = firstcu;
8539*22dc650dSSadaf Ebrahimi             reqcuflags = firstcuflags;
8540*22dc650dSSadaf Ebrahimi             }
8541*22dc650dSSadaf Ebrahimi           }
8542*22dc650dSSadaf Ebrahimi         firstcuflags = REQ_NONE;
8543*22dc650dSSadaf Ebrahimi         }
8544*22dc650dSSadaf Ebrahimi 
8545*22dc650dSSadaf Ebrahimi       /* If we (now or from before) have no firstcu, a firstcu from the
8546*22dc650dSSadaf Ebrahimi       branch becomes a reqcu if there isn't a branch reqcu. */
8547*22dc650dSSadaf Ebrahimi 
8548*22dc650dSSadaf Ebrahimi       if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8549*22dc650dSSadaf Ebrahimi           branchreqcuflags >= REQ_NONE)
8550*22dc650dSSadaf Ebrahimi         {
8551*22dc650dSSadaf Ebrahimi         branchreqcu = branchfirstcu;
8552*22dc650dSSadaf Ebrahimi         branchreqcuflags = branchfirstcuflags;
8553*22dc650dSSadaf Ebrahimi         }
8554*22dc650dSSadaf Ebrahimi 
8555*22dc650dSSadaf Ebrahimi       /* Now ensure that the reqcus match */
8556*22dc650dSSadaf Ebrahimi 
8557*22dc650dSSadaf Ebrahimi       if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8558*22dc650dSSadaf Ebrahimi           reqcu != branchreqcu)
8559*22dc650dSSadaf Ebrahimi         reqcuflags = REQ_NONE;
8560*22dc650dSSadaf Ebrahimi       else
8561*22dc650dSSadaf Ebrahimi         {
8562*22dc650dSSadaf Ebrahimi         reqcu = branchreqcu;
8563*22dc650dSSadaf Ebrahimi         reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8564*22dc650dSSadaf Ebrahimi         }
8565*22dc650dSSadaf Ebrahimi       }
8566*22dc650dSSadaf Ebrahimi     }
8567*22dc650dSSadaf Ebrahimi 
8568*22dc650dSSadaf Ebrahimi   /* Handle reaching the end of the expression, either ')' or end of pattern.
8569*22dc650dSSadaf Ebrahimi   In the real compile phase, go back through the alternative branches and
8570*22dc650dSSadaf Ebrahimi   reverse the chain of offsets, with the field in the BRA item now becoming an
8571*22dc650dSSadaf Ebrahimi   offset to the first alternative. If there are no alternatives, it points to
8572*22dc650dSSadaf Ebrahimi   the end of the group. The length in the terminating ket is always the length
8573*22dc650dSSadaf Ebrahimi   of the whole bracketed item. Return leaving the pointer at the terminating
8574*22dc650dSSadaf Ebrahimi   char. */
8575*22dc650dSSadaf Ebrahimi 
8576*22dc650dSSadaf Ebrahimi   if (META_CODE(*pptr) != META_ALT)
8577*22dc650dSSadaf Ebrahimi     {
8578*22dc650dSSadaf Ebrahimi     if (lengthptr == NULL)
8579*22dc650dSSadaf Ebrahimi       {
8580*22dc650dSSadaf Ebrahimi       PCRE2_SIZE branch_length = code - last_branch;
8581*22dc650dSSadaf Ebrahimi       do
8582*22dc650dSSadaf Ebrahimi         {
8583*22dc650dSSadaf Ebrahimi         PCRE2_SIZE prev_length = GET(last_branch, 1);
8584*22dc650dSSadaf Ebrahimi         PUT(last_branch, 1, branch_length);
8585*22dc650dSSadaf Ebrahimi         branch_length = prev_length;
8586*22dc650dSSadaf Ebrahimi         last_branch -= branch_length;
8587*22dc650dSSadaf Ebrahimi         }
8588*22dc650dSSadaf Ebrahimi       while (branch_length > 0);
8589*22dc650dSSadaf Ebrahimi       }
8590*22dc650dSSadaf Ebrahimi 
8591*22dc650dSSadaf Ebrahimi     /* Fill in the ket */
8592*22dc650dSSadaf Ebrahimi 
8593*22dc650dSSadaf Ebrahimi     *code = OP_KET;
8594*22dc650dSSadaf Ebrahimi     PUT(code, 1, (int)(code - start_bracket));
8595*22dc650dSSadaf Ebrahimi     code += 1 + LINK_SIZE;
8596*22dc650dSSadaf Ebrahimi 
8597*22dc650dSSadaf Ebrahimi     /* Set values to pass back */
8598*22dc650dSSadaf Ebrahimi 
8599*22dc650dSSadaf Ebrahimi     *codeptr = code;
8600*22dc650dSSadaf Ebrahimi     *pptrptr = pptr;
8601*22dc650dSSadaf Ebrahimi     *firstcuptr = firstcu;
8602*22dc650dSSadaf Ebrahimi     *firstcuflagsptr = firstcuflags;
8603*22dc650dSSadaf Ebrahimi     *reqcuptr = reqcu;
8604*22dc650dSSadaf Ebrahimi     *reqcuflagsptr = reqcuflags;
8605*22dc650dSSadaf Ebrahimi     if (lengthptr != NULL)
8606*22dc650dSSadaf Ebrahimi       {
8607*22dc650dSSadaf Ebrahimi       if (OFLOW_MAX - *lengthptr < length)
8608*22dc650dSSadaf Ebrahimi         {
8609*22dc650dSSadaf Ebrahimi         *errorcodeptr = ERR20;
8610*22dc650dSSadaf Ebrahimi         return 0;
8611*22dc650dSSadaf Ebrahimi         }
8612*22dc650dSSadaf Ebrahimi       *lengthptr += length;
8613*22dc650dSSadaf Ebrahimi       }
8614*22dc650dSSadaf Ebrahimi     return okreturn;
8615*22dc650dSSadaf Ebrahimi     }
8616*22dc650dSSadaf Ebrahimi 
8617*22dc650dSSadaf Ebrahimi   /* Another branch follows. In the pre-compile phase, we can move the code
8618*22dc650dSSadaf Ebrahimi   pointer back to where it was for the start of the first branch. (That is,
8619*22dc650dSSadaf Ebrahimi   pretend that each branch is the only one.)
8620*22dc650dSSadaf Ebrahimi 
8621*22dc650dSSadaf Ebrahimi   In the real compile phase, insert an ALT node. Its length field points back
8622*22dc650dSSadaf Ebrahimi   to the previous branch while the bracket remains open. At the end the chain
8623*22dc650dSSadaf Ebrahimi   is reversed. It's done like this so that the start of the bracket has a
8624*22dc650dSSadaf Ebrahimi   zero offset until it is closed, making it possible to detect recursion. */
8625*22dc650dSSadaf Ebrahimi 
8626*22dc650dSSadaf Ebrahimi   if (lengthptr != NULL)
8627*22dc650dSSadaf Ebrahimi     {
8628*22dc650dSSadaf Ebrahimi     code = *codeptr + 1 + LINK_SIZE + skipunits;
8629*22dc650dSSadaf Ebrahimi     length += 1 + LINK_SIZE;
8630*22dc650dSSadaf Ebrahimi     }
8631*22dc650dSSadaf Ebrahimi   else
8632*22dc650dSSadaf Ebrahimi     {
8633*22dc650dSSadaf Ebrahimi     *code = OP_ALT;
8634*22dc650dSSadaf Ebrahimi     PUT(code, 1, (int)(code - last_branch));
8635*22dc650dSSadaf Ebrahimi     bc.current_branch = last_branch = code;
8636*22dc650dSSadaf Ebrahimi     code += 1 + LINK_SIZE;
8637*22dc650dSSadaf Ebrahimi     }
8638*22dc650dSSadaf Ebrahimi 
8639*22dc650dSSadaf Ebrahimi   /* Set the maximum lookbehind length for the next branch (if not in a
8640*22dc650dSSadaf Ebrahimi   lookbehind the value will be zero) and then advance past the vertical bar. */
8641*22dc650dSSadaf Ebrahimi 
8642*22dc650dSSadaf Ebrahimi   lookbehindlength = META_DATA(*pptr);
8643*22dc650dSSadaf Ebrahimi   pptr++;
8644*22dc650dSSadaf Ebrahimi   }
8645*22dc650dSSadaf Ebrahimi /* Control never reaches here */
8646*22dc650dSSadaf Ebrahimi }
8647*22dc650dSSadaf Ebrahimi 
8648*22dc650dSSadaf Ebrahimi 
8649*22dc650dSSadaf Ebrahimi 
8650*22dc650dSSadaf Ebrahimi /*************************************************
8651*22dc650dSSadaf Ebrahimi *          Check for anchored pattern            *
8652*22dc650dSSadaf Ebrahimi *************************************************/
8653*22dc650dSSadaf Ebrahimi 
8654*22dc650dSSadaf Ebrahimi /* Try to find out if this is an anchored regular expression. Consider each
8655*22dc650dSSadaf Ebrahimi alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8656*22dc650dSSadaf Ebrahimi all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8657*22dc650dSSadaf Ebrahimi it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8658*22dc650dSSadaf Ebrahimi be found, because ^ generates OP_CIRCM in that mode.
8659*22dc650dSSadaf Ebrahimi 
8660*22dc650dSSadaf Ebrahimi We can also consider a regex to be anchored if OP_SOM starts all its branches.
8661*22dc650dSSadaf Ebrahimi This is the code for \G, which means "match at start of match position, taking
8662*22dc650dSSadaf Ebrahimi into account the match offset".
8663*22dc650dSSadaf Ebrahimi 
8664*22dc650dSSadaf Ebrahimi A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8665*22dc650dSSadaf Ebrahimi because that will try the rest of the pattern at all possible matching points,
8666*22dc650dSSadaf Ebrahimi so there is no point trying again.... er ....
8667*22dc650dSSadaf Ebrahimi 
8668*22dc650dSSadaf Ebrahimi .... except when the .* appears inside capturing parentheses, and there is a
8669*22dc650dSSadaf Ebrahimi subsequent back reference to those parentheses. We haven't enough information
8670*22dc650dSSadaf Ebrahimi to catch that case precisely.
8671*22dc650dSSadaf Ebrahimi 
8672*22dc650dSSadaf Ebrahimi At first, the best we could do was to detect when .* was in capturing brackets
8673*22dc650dSSadaf Ebrahimi and the highest back reference was greater than or equal to that level.
8674*22dc650dSSadaf Ebrahimi However, by keeping a bitmap of the first 31 back references, we can catch some
8675*22dc650dSSadaf Ebrahimi of the more common cases more precisely.
8676*22dc650dSSadaf Ebrahimi 
8677*22dc650dSSadaf Ebrahimi ... A second exception is when the .* appears inside an atomic group, because
8678*22dc650dSSadaf Ebrahimi this prevents the number of characters it matches from being adjusted.
8679*22dc650dSSadaf Ebrahimi 
8680*22dc650dSSadaf Ebrahimi Arguments:
8681*22dc650dSSadaf Ebrahimi   code           points to start of the compiled pattern
8682*22dc650dSSadaf Ebrahimi   bracket_map    a bitmap of which brackets we are inside while testing; this
8683*22dc650dSSadaf Ebrahimi                    handles up to substring 31; after that we just have to take
8684*22dc650dSSadaf Ebrahimi                    the less precise approach
8685*22dc650dSSadaf Ebrahimi   cb             points to the compile data block
8686*22dc650dSSadaf Ebrahimi   atomcount      atomic group level
8687*22dc650dSSadaf Ebrahimi   inassert       TRUE if in an assertion
8688*22dc650dSSadaf Ebrahimi 
8689*22dc650dSSadaf Ebrahimi Returns:     TRUE or FALSE
8690*22dc650dSSadaf Ebrahimi */
8691*22dc650dSSadaf Ebrahimi 
8692*22dc650dSSadaf Ebrahimi static BOOL
is_anchored(PCRE2_SPTR code,uint32_t bracket_map,compile_block * cb,int atomcount,BOOL inassert)8693*22dc650dSSadaf Ebrahimi is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8694*22dc650dSSadaf Ebrahimi   int atomcount, BOOL inassert)
8695*22dc650dSSadaf Ebrahimi {
8696*22dc650dSSadaf Ebrahimi do {
8697*22dc650dSSadaf Ebrahimi    PCRE2_SPTR scode = first_significant_code(
8698*22dc650dSSadaf Ebrahimi      code + PRIV(OP_lengths)[*code], FALSE);
8699*22dc650dSSadaf Ebrahimi    int op = *scode;
8700*22dc650dSSadaf Ebrahimi 
8701*22dc650dSSadaf Ebrahimi    /* Non-capturing brackets */
8702*22dc650dSSadaf Ebrahimi 
8703*22dc650dSSadaf Ebrahimi    if (op == OP_BRA  || op == OP_BRAPOS ||
8704*22dc650dSSadaf Ebrahimi        op == OP_SBRA || op == OP_SBRAPOS)
8705*22dc650dSSadaf Ebrahimi      {
8706*22dc650dSSadaf Ebrahimi      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8707*22dc650dSSadaf Ebrahimi        return FALSE;
8708*22dc650dSSadaf Ebrahimi      }
8709*22dc650dSSadaf Ebrahimi 
8710*22dc650dSSadaf Ebrahimi    /* Capturing brackets */
8711*22dc650dSSadaf Ebrahimi 
8712*22dc650dSSadaf Ebrahimi    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8713*22dc650dSSadaf Ebrahimi             op == OP_SCBRA || op == OP_SCBRAPOS)
8714*22dc650dSSadaf Ebrahimi      {
8715*22dc650dSSadaf Ebrahimi      int n = GET2(scode, 1+LINK_SIZE);
8716*22dc650dSSadaf Ebrahimi      uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8717*22dc650dSSadaf Ebrahimi      if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8718*22dc650dSSadaf Ebrahimi      }
8719*22dc650dSSadaf Ebrahimi 
8720*22dc650dSSadaf Ebrahimi    /* Positive forward assertion */
8721*22dc650dSSadaf Ebrahimi 
8722*22dc650dSSadaf Ebrahimi    else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8723*22dc650dSSadaf Ebrahimi      {
8724*22dc650dSSadaf Ebrahimi      if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8725*22dc650dSSadaf Ebrahimi      }
8726*22dc650dSSadaf Ebrahimi 
8727*22dc650dSSadaf Ebrahimi    /* Condition. If there is no second branch, it can't be anchored. */
8728*22dc650dSSadaf Ebrahimi 
8729*22dc650dSSadaf Ebrahimi    else if (op == OP_COND || op == OP_SCOND)
8730*22dc650dSSadaf Ebrahimi      {
8731*22dc650dSSadaf Ebrahimi      if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8732*22dc650dSSadaf Ebrahimi      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8733*22dc650dSSadaf Ebrahimi        return FALSE;
8734*22dc650dSSadaf Ebrahimi      }
8735*22dc650dSSadaf Ebrahimi 
8736*22dc650dSSadaf Ebrahimi    /* Atomic groups */
8737*22dc650dSSadaf Ebrahimi 
8738*22dc650dSSadaf Ebrahimi    else if (op == OP_ONCE)
8739*22dc650dSSadaf Ebrahimi      {
8740*22dc650dSSadaf Ebrahimi      if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8741*22dc650dSSadaf Ebrahimi        return FALSE;
8742*22dc650dSSadaf Ebrahimi      }
8743*22dc650dSSadaf Ebrahimi 
8744*22dc650dSSadaf Ebrahimi    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8745*22dc650dSSadaf Ebrahimi    it isn't in brackets that are or may be referenced or inside an atomic
8746*22dc650dSSadaf Ebrahimi    group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8747*22dc650dSSadaf Ebrahimi    because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8748*22dc650dSSadaf Ebrahimi    with the subject "aab", which matches "b", i.e. not at the start of a line.
8749*22dc650dSSadaf Ebrahimi    There is also an option that disables auto-anchoring. */
8750*22dc650dSSadaf Ebrahimi 
8751*22dc650dSSadaf Ebrahimi    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8752*22dc650dSSadaf Ebrahimi              op == OP_TYPEPOSSTAR))
8753*22dc650dSSadaf Ebrahimi      {
8754*22dc650dSSadaf Ebrahimi      if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8755*22dc650dSSadaf Ebrahimi          atomcount > 0 || cb->had_pruneorskip || inassert ||
8756*22dc650dSSadaf Ebrahimi          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8757*22dc650dSSadaf Ebrahimi        return FALSE;
8758*22dc650dSSadaf Ebrahimi      }
8759*22dc650dSSadaf Ebrahimi 
8760*22dc650dSSadaf Ebrahimi    /* Check for explicit anchoring */
8761*22dc650dSSadaf Ebrahimi 
8762*22dc650dSSadaf Ebrahimi    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8763*22dc650dSSadaf Ebrahimi 
8764*22dc650dSSadaf Ebrahimi    code += GET(code, 1);
8765*22dc650dSSadaf Ebrahimi    }
8766*22dc650dSSadaf Ebrahimi while (*code == OP_ALT);   /* Loop for each alternative */
8767*22dc650dSSadaf Ebrahimi return TRUE;
8768*22dc650dSSadaf Ebrahimi }
8769*22dc650dSSadaf Ebrahimi 
8770*22dc650dSSadaf Ebrahimi 
8771*22dc650dSSadaf Ebrahimi 
8772*22dc650dSSadaf Ebrahimi /*************************************************
8773*22dc650dSSadaf Ebrahimi *         Check for starting with ^ or .*        *
8774*22dc650dSSadaf Ebrahimi *************************************************/
8775*22dc650dSSadaf Ebrahimi 
8776*22dc650dSSadaf Ebrahimi /* This is called to find out if every branch starts with ^ or .* so that
8777*22dc650dSSadaf Ebrahimi "first char" processing can be done to speed things up in multiline
8778*22dc650dSSadaf Ebrahimi matching and for non-DOTALL patterns that start with .* (which must start at
8779*22dc650dSSadaf Ebrahimi the beginning or after \n). As in the case of is_anchored() (see above), we
8780*22dc650dSSadaf Ebrahimi have to take account of back references to capturing brackets that contain .*
8781*22dc650dSSadaf Ebrahimi because in that case we can't make the assumption. Also, the appearance of .*
8782*22dc650dSSadaf Ebrahimi inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8783*22dc650dSSadaf Ebrahimi or *SKIP does not count, because once again the assumption no longer holds.
8784*22dc650dSSadaf Ebrahimi 
8785*22dc650dSSadaf Ebrahimi Arguments:
8786*22dc650dSSadaf Ebrahimi   code           points to start of the compiled pattern or a group
8787*22dc650dSSadaf Ebrahimi   bracket_map    a bitmap of which brackets we are inside while testing; this
8788*22dc650dSSadaf Ebrahimi                    handles up to substring 31; after that we just have to take
8789*22dc650dSSadaf Ebrahimi                    the less precise approach
8790*22dc650dSSadaf Ebrahimi   cb             points to the compile data
8791*22dc650dSSadaf Ebrahimi   atomcount      atomic group level
8792*22dc650dSSadaf Ebrahimi   inassert       TRUE if in an assertion
8793*22dc650dSSadaf Ebrahimi 
8794*22dc650dSSadaf Ebrahimi Returns:         TRUE or FALSE
8795*22dc650dSSadaf Ebrahimi */
8796*22dc650dSSadaf Ebrahimi 
8797*22dc650dSSadaf Ebrahimi static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8798*22dc650dSSadaf Ebrahimi is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8799*22dc650dSSadaf Ebrahimi   int atomcount, BOOL inassert)
8800*22dc650dSSadaf Ebrahimi {
8801*22dc650dSSadaf Ebrahimi do {
8802*22dc650dSSadaf Ebrahimi    PCRE2_SPTR scode = first_significant_code(
8803*22dc650dSSadaf Ebrahimi      code + PRIV(OP_lengths)[*code], FALSE);
8804*22dc650dSSadaf Ebrahimi    int op = *scode;
8805*22dc650dSSadaf Ebrahimi 
8806*22dc650dSSadaf Ebrahimi    /* If we are at the start of a conditional assertion group, *both* the
8807*22dc650dSSadaf Ebrahimi    conditional assertion *and* what follows the condition must satisfy the test
8808*22dc650dSSadaf Ebrahimi    for start of line. Other kinds of condition fail. Note that there may be an
8809*22dc650dSSadaf Ebrahimi    auto-callout at the start of a condition. */
8810*22dc650dSSadaf Ebrahimi 
8811*22dc650dSSadaf Ebrahimi    if (op == OP_COND)
8812*22dc650dSSadaf Ebrahimi      {
8813*22dc650dSSadaf Ebrahimi      scode += 1 + LINK_SIZE;
8814*22dc650dSSadaf Ebrahimi 
8815*22dc650dSSadaf Ebrahimi      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8816*22dc650dSSadaf Ebrahimi        else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8817*22dc650dSSadaf Ebrahimi 
8818*22dc650dSSadaf Ebrahimi      switch (*scode)
8819*22dc650dSSadaf Ebrahimi        {
8820*22dc650dSSadaf Ebrahimi        case OP_CREF:
8821*22dc650dSSadaf Ebrahimi        case OP_DNCREF:
8822*22dc650dSSadaf Ebrahimi        case OP_RREF:
8823*22dc650dSSadaf Ebrahimi        case OP_DNRREF:
8824*22dc650dSSadaf Ebrahimi        case OP_FAIL:
8825*22dc650dSSadaf Ebrahimi        case OP_FALSE:
8826*22dc650dSSadaf Ebrahimi        case OP_TRUE:
8827*22dc650dSSadaf Ebrahimi        return FALSE;
8828*22dc650dSSadaf Ebrahimi 
8829*22dc650dSSadaf Ebrahimi        default:     /* Assertion */
8830*22dc650dSSadaf Ebrahimi        if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8831*22dc650dSSadaf Ebrahimi        do scode += GET(scode, 1); while (*scode == OP_ALT);
8832*22dc650dSSadaf Ebrahimi        scode += 1 + LINK_SIZE;
8833*22dc650dSSadaf Ebrahimi        break;
8834*22dc650dSSadaf Ebrahimi        }
8835*22dc650dSSadaf Ebrahimi      scode = first_significant_code(scode, FALSE);
8836*22dc650dSSadaf Ebrahimi      op = *scode;
8837*22dc650dSSadaf Ebrahimi      }
8838*22dc650dSSadaf Ebrahimi 
8839*22dc650dSSadaf Ebrahimi    /* Non-capturing brackets */
8840*22dc650dSSadaf Ebrahimi 
8841*22dc650dSSadaf Ebrahimi    if (op == OP_BRA  || op == OP_BRAPOS ||
8842*22dc650dSSadaf Ebrahimi        op == OP_SBRA || op == OP_SBRAPOS)
8843*22dc650dSSadaf Ebrahimi      {
8844*22dc650dSSadaf Ebrahimi      if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8845*22dc650dSSadaf Ebrahimi        return FALSE;
8846*22dc650dSSadaf Ebrahimi      }
8847*22dc650dSSadaf Ebrahimi 
8848*22dc650dSSadaf Ebrahimi    /* Capturing brackets */
8849*22dc650dSSadaf Ebrahimi 
8850*22dc650dSSadaf Ebrahimi    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8851*22dc650dSSadaf Ebrahimi             op == OP_SCBRA || op == OP_SCBRAPOS)
8852*22dc650dSSadaf Ebrahimi      {
8853*22dc650dSSadaf Ebrahimi      int n = GET2(scode, 1+LINK_SIZE);
8854*22dc650dSSadaf Ebrahimi      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8855*22dc650dSSadaf Ebrahimi      if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8856*22dc650dSSadaf Ebrahimi      }
8857*22dc650dSSadaf Ebrahimi 
8858*22dc650dSSadaf Ebrahimi    /* Positive forward assertions */
8859*22dc650dSSadaf Ebrahimi 
8860*22dc650dSSadaf Ebrahimi    else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8861*22dc650dSSadaf Ebrahimi      {
8862*22dc650dSSadaf Ebrahimi      if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8863*22dc650dSSadaf Ebrahimi        return FALSE;
8864*22dc650dSSadaf Ebrahimi      }
8865*22dc650dSSadaf Ebrahimi 
8866*22dc650dSSadaf Ebrahimi    /* Atomic brackets */
8867*22dc650dSSadaf Ebrahimi 
8868*22dc650dSSadaf Ebrahimi    else if (op == OP_ONCE)
8869*22dc650dSSadaf Ebrahimi      {
8870*22dc650dSSadaf Ebrahimi      if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8871*22dc650dSSadaf Ebrahimi        return FALSE;
8872*22dc650dSSadaf Ebrahimi      }
8873*22dc650dSSadaf Ebrahimi 
8874*22dc650dSSadaf Ebrahimi    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8875*22dc650dSSadaf Ebrahimi    brackets that may be referenced or an assertion, and as long as the pattern
8876*22dc650dSSadaf Ebrahimi    does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8877*22dc650dSSadaf Ebrahimi    for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8878*22dc650dSSadaf Ebrahimi    i.e. not at the start of a line. There is also an option that disables this
8879*22dc650dSSadaf Ebrahimi    optimization. */
8880*22dc650dSSadaf Ebrahimi 
8881*22dc650dSSadaf Ebrahimi    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8882*22dc650dSSadaf Ebrahimi      {
8883*22dc650dSSadaf Ebrahimi      if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8884*22dc650dSSadaf Ebrahimi          atomcount > 0 || cb->had_pruneorskip || inassert ||
8885*22dc650dSSadaf Ebrahimi          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8886*22dc650dSSadaf Ebrahimi        return FALSE;
8887*22dc650dSSadaf Ebrahimi      }
8888*22dc650dSSadaf Ebrahimi 
8889*22dc650dSSadaf Ebrahimi    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8890*22dc650dSSadaf Ebrahimi    in particular that this includes atomic brackets OP_ONCE because the number
8891*22dc650dSSadaf Ebrahimi    of characters matched by .* cannot be adjusted inside them. */
8892*22dc650dSSadaf Ebrahimi 
8893*22dc650dSSadaf Ebrahimi    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8894*22dc650dSSadaf Ebrahimi 
8895*22dc650dSSadaf Ebrahimi    /* Move on to the next alternative */
8896*22dc650dSSadaf Ebrahimi 
8897*22dc650dSSadaf Ebrahimi    code += GET(code, 1);
8898*22dc650dSSadaf Ebrahimi    }
8899*22dc650dSSadaf Ebrahimi while (*code == OP_ALT);  /* Loop for each alternative */
8900*22dc650dSSadaf Ebrahimi return TRUE;
8901*22dc650dSSadaf Ebrahimi }
8902*22dc650dSSadaf Ebrahimi 
8903*22dc650dSSadaf Ebrahimi 
8904*22dc650dSSadaf Ebrahimi 
8905*22dc650dSSadaf Ebrahimi /*************************************************
8906*22dc650dSSadaf Ebrahimi *   Scan compiled regex for recursion reference  *
8907*22dc650dSSadaf Ebrahimi *************************************************/
8908*22dc650dSSadaf Ebrahimi 
8909*22dc650dSSadaf Ebrahimi /* This function scans through a compiled pattern until it finds an instance of
8910*22dc650dSSadaf Ebrahimi OP_RECURSE.
8911*22dc650dSSadaf Ebrahimi 
8912*22dc650dSSadaf Ebrahimi Arguments:
8913*22dc650dSSadaf Ebrahimi   code        points to start of expression
8914*22dc650dSSadaf Ebrahimi   utf         TRUE in UTF mode
8915*22dc650dSSadaf Ebrahimi 
8916*22dc650dSSadaf Ebrahimi Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
8917*22dc650dSSadaf Ebrahimi */
8918*22dc650dSSadaf Ebrahimi 
8919*22dc650dSSadaf Ebrahimi static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8920*22dc650dSSadaf Ebrahimi find_recurse(PCRE2_SPTR code, BOOL utf)
8921*22dc650dSSadaf Ebrahimi {
8922*22dc650dSSadaf Ebrahimi for (;;)
8923*22dc650dSSadaf Ebrahimi   {
8924*22dc650dSSadaf Ebrahimi   PCRE2_UCHAR c = *code;
8925*22dc650dSSadaf Ebrahimi   if (c == OP_END) return NULL;
8926*22dc650dSSadaf Ebrahimi   if (c == OP_RECURSE) return code;
8927*22dc650dSSadaf Ebrahimi 
8928*22dc650dSSadaf Ebrahimi   /* XCLASS is used for classes that cannot be represented just by a bit map.
8929*22dc650dSSadaf Ebrahimi   This includes negated single high-valued characters. CALLOUT_STR is used for
8930*22dc650dSSadaf Ebrahimi   callouts with string arguments. In both cases the length in the table is
8931*22dc650dSSadaf Ebrahimi   zero; the actual length is stored in the compiled code. */
8932*22dc650dSSadaf Ebrahimi 
8933*22dc650dSSadaf Ebrahimi   if (c == OP_XCLASS) code += GET(code, 1);
8934*22dc650dSSadaf Ebrahimi     else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8935*22dc650dSSadaf Ebrahimi 
8936*22dc650dSSadaf Ebrahimi   /* Otherwise, we can get the item's length from the table, except that for
8937*22dc650dSSadaf Ebrahimi   repeated character types, we have to test for \p and \P, which have an extra
8938*22dc650dSSadaf Ebrahimi   two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8939*22dc650dSSadaf Ebrahimi   we must add in its length. */
8940*22dc650dSSadaf Ebrahimi 
8941*22dc650dSSadaf Ebrahimi   else
8942*22dc650dSSadaf Ebrahimi     {
8943*22dc650dSSadaf Ebrahimi     switch(c)
8944*22dc650dSSadaf Ebrahimi       {
8945*22dc650dSSadaf Ebrahimi       case OP_TYPESTAR:
8946*22dc650dSSadaf Ebrahimi       case OP_TYPEMINSTAR:
8947*22dc650dSSadaf Ebrahimi       case OP_TYPEPLUS:
8948*22dc650dSSadaf Ebrahimi       case OP_TYPEMINPLUS:
8949*22dc650dSSadaf Ebrahimi       case OP_TYPEQUERY:
8950*22dc650dSSadaf Ebrahimi       case OP_TYPEMINQUERY:
8951*22dc650dSSadaf Ebrahimi       case OP_TYPEPOSSTAR:
8952*22dc650dSSadaf Ebrahimi       case OP_TYPEPOSPLUS:
8953*22dc650dSSadaf Ebrahimi       case OP_TYPEPOSQUERY:
8954*22dc650dSSadaf Ebrahimi       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8955*22dc650dSSadaf Ebrahimi       break;
8956*22dc650dSSadaf Ebrahimi 
8957*22dc650dSSadaf Ebrahimi       case OP_TYPEPOSUPTO:
8958*22dc650dSSadaf Ebrahimi       case OP_TYPEUPTO:
8959*22dc650dSSadaf Ebrahimi       case OP_TYPEMINUPTO:
8960*22dc650dSSadaf Ebrahimi       case OP_TYPEEXACT:
8961*22dc650dSSadaf Ebrahimi       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8962*22dc650dSSadaf Ebrahimi         code += 2;
8963*22dc650dSSadaf Ebrahimi       break;
8964*22dc650dSSadaf Ebrahimi 
8965*22dc650dSSadaf Ebrahimi       case OP_MARK:
8966*22dc650dSSadaf Ebrahimi       case OP_COMMIT_ARG:
8967*22dc650dSSadaf Ebrahimi       case OP_PRUNE_ARG:
8968*22dc650dSSadaf Ebrahimi       case OP_SKIP_ARG:
8969*22dc650dSSadaf Ebrahimi       case OP_THEN_ARG:
8970*22dc650dSSadaf Ebrahimi       code += code[1];
8971*22dc650dSSadaf Ebrahimi       break;
8972*22dc650dSSadaf Ebrahimi       }
8973*22dc650dSSadaf Ebrahimi 
8974*22dc650dSSadaf Ebrahimi     /* Add in the fixed length from the table */
8975*22dc650dSSadaf Ebrahimi 
8976*22dc650dSSadaf Ebrahimi     code += PRIV(OP_lengths)[c];
8977*22dc650dSSadaf Ebrahimi 
8978*22dc650dSSadaf Ebrahimi     /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8979*22dc650dSSadaf Ebrahimi     be followed by a multi-unit character. The length in the table is a
8980*22dc650dSSadaf Ebrahimi     minimum, so we have to arrange to skip the extra units. */
8981*22dc650dSSadaf Ebrahimi 
8982*22dc650dSSadaf Ebrahimi #ifdef MAYBE_UTF_MULTI
8983*22dc650dSSadaf Ebrahimi     if (utf) switch(c)
8984*22dc650dSSadaf Ebrahimi       {
8985*22dc650dSSadaf Ebrahimi       case OP_CHAR:
8986*22dc650dSSadaf Ebrahimi       case OP_CHARI:
8987*22dc650dSSadaf Ebrahimi       case OP_NOT:
8988*22dc650dSSadaf Ebrahimi       case OP_NOTI:
8989*22dc650dSSadaf Ebrahimi       case OP_EXACT:
8990*22dc650dSSadaf Ebrahimi       case OP_EXACTI:
8991*22dc650dSSadaf Ebrahimi       case OP_NOTEXACT:
8992*22dc650dSSadaf Ebrahimi       case OP_NOTEXACTI:
8993*22dc650dSSadaf Ebrahimi       case OP_UPTO:
8994*22dc650dSSadaf Ebrahimi       case OP_UPTOI:
8995*22dc650dSSadaf Ebrahimi       case OP_NOTUPTO:
8996*22dc650dSSadaf Ebrahimi       case OP_NOTUPTOI:
8997*22dc650dSSadaf Ebrahimi       case OP_MINUPTO:
8998*22dc650dSSadaf Ebrahimi       case OP_MINUPTOI:
8999*22dc650dSSadaf Ebrahimi       case OP_NOTMINUPTO:
9000*22dc650dSSadaf Ebrahimi       case OP_NOTMINUPTOI:
9001*22dc650dSSadaf Ebrahimi       case OP_POSUPTO:
9002*22dc650dSSadaf Ebrahimi       case OP_POSUPTOI:
9003*22dc650dSSadaf Ebrahimi       case OP_NOTPOSUPTO:
9004*22dc650dSSadaf Ebrahimi       case OP_NOTPOSUPTOI:
9005*22dc650dSSadaf Ebrahimi       case OP_STAR:
9006*22dc650dSSadaf Ebrahimi       case OP_STARI:
9007*22dc650dSSadaf Ebrahimi       case OP_NOTSTAR:
9008*22dc650dSSadaf Ebrahimi       case OP_NOTSTARI:
9009*22dc650dSSadaf Ebrahimi       case OP_MINSTAR:
9010*22dc650dSSadaf Ebrahimi       case OP_MINSTARI:
9011*22dc650dSSadaf Ebrahimi       case OP_NOTMINSTAR:
9012*22dc650dSSadaf Ebrahimi       case OP_NOTMINSTARI:
9013*22dc650dSSadaf Ebrahimi       case OP_POSSTAR:
9014*22dc650dSSadaf Ebrahimi       case OP_POSSTARI:
9015*22dc650dSSadaf Ebrahimi       case OP_NOTPOSSTAR:
9016*22dc650dSSadaf Ebrahimi       case OP_NOTPOSSTARI:
9017*22dc650dSSadaf Ebrahimi       case OP_PLUS:
9018*22dc650dSSadaf Ebrahimi       case OP_PLUSI:
9019*22dc650dSSadaf Ebrahimi       case OP_NOTPLUS:
9020*22dc650dSSadaf Ebrahimi       case OP_NOTPLUSI:
9021*22dc650dSSadaf Ebrahimi       case OP_MINPLUS:
9022*22dc650dSSadaf Ebrahimi       case OP_MINPLUSI:
9023*22dc650dSSadaf Ebrahimi       case OP_NOTMINPLUS:
9024*22dc650dSSadaf Ebrahimi       case OP_NOTMINPLUSI:
9025*22dc650dSSadaf Ebrahimi       case OP_POSPLUS:
9026*22dc650dSSadaf Ebrahimi       case OP_POSPLUSI:
9027*22dc650dSSadaf Ebrahimi       case OP_NOTPOSPLUS:
9028*22dc650dSSadaf Ebrahimi       case OP_NOTPOSPLUSI:
9029*22dc650dSSadaf Ebrahimi       case OP_QUERY:
9030*22dc650dSSadaf Ebrahimi       case OP_QUERYI:
9031*22dc650dSSadaf Ebrahimi       case OP_NOTQUERY:
9032*22dc650dSSadaf Ebrahimi       case OP_NOTQUERYI:
9033*22dc650dSSadaf Ebrahimi       case OP_MINQUERY:
9034*22dc650dSSadaf Ebrahimi       case OP_MINQUERYI:
9035*22dc650dSSadaf Ebrahimi       case OP_NOTMINQUERY:
9036*22dc650dSSadaf Ebrahimi       case OP_NOTMINQUERYI:
9037*22dc650dSSadaf Ebrahimi       case OP_POSQUERY:
9038*22dc650dSSadaf Ebrahimi       case OP_POSQUERYI:
9039*22dc650dSSadaf Ebrahimi       case OP_NOTPOSQUERY:
9040*22dc650dSSadaf Ebrahimi       case OP_NOTPOSQUERYI:
9041*22dc650dSSadaf Ebrahimi       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
9042*22dc650dSSadaf Ebrahimi       break;
9043*22dc650dSSadaf Ebrahimi       }
9044*22dc650dSSadaf Ebrahimi #else
9045*22dc650dSSadaf Ebrahimi     (void)(utf);  /* Keep compiler happy by referencing function argument */
9046*22dc650dSSadaf Ebrahimi #endif  /* MAYBE_UTF_MULTI */
9047*22dc650dSSadaf Ebrahimi     }
9048*22dc650dSSadaf Ebrahimi   }
9049*22dc650dSSadaf Ebrahimi }
9050*22dc650dSSadaf Ebrahimi 
9051*22dc650dSSadaf Ebrahimi 
9052*22dc650dSSadaf Ebrahimi 
9053*22dc650dSSadaf Ebrahimi /*************************************************
9054*22dc650dSSadaf Ebrahimi *    Check for asserted fixed first code unit    *
9055*22dc650dSSadaf Ebrahimi *************************************************/
9056*22dc650dSSadaf Ebrahimi 
9057*22dc650dSSadaf Ebrahimi /* During compilation, the "first code unit" settings from forward assertions
9058*22dc650dSSadaf Ebrahimi are discarded, because they can cause conflicts with actual literals that
9059*22dc650dSSadaf Ebrahimi follow. However, if we end up without a first code unit setting for an
9060*22dc650dSSadaf Ebrahimi unanchored pattern, it is worth scanning the regex to see if there is an
9061*22dc650dSSadaf Ebrahimi initial asserted first code unit. If all branches start with the same asserted
9062*22dc650dSSadaf Ebrahimi code unit, or with a non-conditional bracket all of whose alternatives start
9063*22dc650dSSadaf Ebrahimi with the same asserted code unit (recurse ad lib), then we return that code
9064*22dc650dSSadaf Ebrahimi unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9065*22dc650dSSadaf Ebrahimi REQ_NONE in the flags.
9066*22dc650dSSadaf Ebrahimi 
9067*22dc650dSSadaf Ebrahimi Arguments:
9068*22dc650dSSadaf Ebrahimi   code       points to start of compiled pattern
9069*22dc650dSSadaf Ebrahimi   flags      points to the first code unit flags
9070*22dc650dSSadaf Ebrahimi   inassert   non-zero if in an assertion
9071*22dc650dSSadaf Ebrahimi 
9072*22dc650dSSadaf Ebrahimi Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
9073*22dc650dSSadaf Ebrahimi */
9074*22dc650dSSadaf Ebrahimi 
9075*22dc650dSSadaf Ebrahimi static uint32_t
find_firstassertedcu(PCRE2_SPTR code,uint32_t * flags,uint32_t inassert)9076*22dc650dSSadaf Ebrahimi find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9077*22dc650dSSadaf Ebrahimi {
9078*22dc650dSSadaf Ebrahimi uint32_t c = 0;
9079*22dc650dSSadaf Ebrahimi uint32_t cflags = REQ_NONE;
9080*22dc650dSSadaf Ebrahimi 
9081*22dc650dSSadaf Ebrahimi *flags = REQ_NONE;
9082*22dc650dSSadaf Ebrahimi do {
9083*22dc650dSSadaf Ebrahimi    uint32_t d;
9084*22dc650dSSadaf Ebrahimi    uint32_t dflags;
9085*22dc650dSSadaf Ebrahimi    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9086*22dc650dSSadaf Ebrahimi              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9087*22dc650dSSadaf Ebrahimi    PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9088*22dc650dSSadaf Ebrahimi    PCRE2_UCHAR op = *scode;
9089*22dc650dSSadaf Ebrahimi 
9090*22dc650dSSadaf Ebrahimi    switch(op)
9091*22dc650dSSadaf Ebrahimi      {
9092*22dc650dSSadaf Ebrahimi      default:
9093*22dc650dSSadaf Ebrahimi      return 0;
9094*22dc650dSSadaf Ebrahimi 
9095*22dc650dSSadaf Ebrahimi      case OP_BRA:
9096*22dc650dSSadaf Ebrahimi      case OP_BRAPOS:
9097*22dc650dSSadaf Ebrahimi      case OP_CBRA:
9098*22dc650dSSadaf Ebrahimi      case OP_SCBRA:
9099*22dc650dSSadaf Ebrahimi      case OP_CBRAPOS:
9100*22dc650dSSadaf Ebrahimi      case OP_SCBRAPOS:
9101*22dc650dSSadaf Ebrahimi      case OP_ASSERT:
9102*22dc650dSSadaf Ebrahimi      case OP_ASSERT_NA:
9103*22dc650dSSadaf Ebrahimi      case OP_ONCE:
9104*22dc650dSSadaf Ebrahimi      case OP_SCRIPT_RUN:
9105*22dc650dSSadaf Ebrahimi      d = find_firstassertedcu(scode, &dflags, inassert +
9106*22dc650dSSadaf Ebrahimi        ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9107*22dc650dSSadaf Ebrahimi      if (dflags >= REQ_NONE) return 0;
9108*22dc650dSSadaf Ebrahimi      if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9109*22dc650dSSadaf Ebrahimi        else if (c != d || cflags != dflags) return 0;
9110*22dc650dSSadaf Ebrahimi      break;
9111*22dc650dSSadaf Ebrahimi 
9112*22dc650dSSadaf Ebrahimi      case OP_EXACT:
9113*22dc650dSSadaf Ebrahimi      scode += IMM2_SIZE;
9114*22dc650dSSadaf Ebrahimi      /* Fall through */
9115*22dc650dSSadaf Ebrahimi 
9116*22dc650dSSadaf Ebrahimi      case OP_CHAR:
9117*22dc650dSSadaf Ebrahimi      case OP_PLUS:
9118*22dc650dSSadaf Ebrahimi      case OP_MINPLUS:
9119*22dc650dSSadaf Ebrahimi      case OP_POSPLUS:
9120*22dc650dSSadaf Ebrahimi      if (inassert == 0) return 0;
9121*22dc650dSSadaf Ebrahimi      if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9122*22dc650dSSadaf Ebrahimi        else if (c != scode[1]) return 0;
9123*22dc650dSSadaf Ebrahimi      break;
9124*22dc650dSSadaf Ebrahimi 
9125*22dc650dSSadaf Ebrahimi      case OP_EXACTI:
9126*22dc650dSSadaf Ebrahimi      scode += IMM2_SIZE;
9127*22dc650dSSadaf Ebrahimi      /* Fall through */
9128*22dc650dSSadaf Ebrahimi 
9129*22dc650dSSadaf Ebrahimi      case OP_CHARI:
9130*22dc650dSSadaf Ebrahimi      case OP_PLUSI:
9131*22dc650dSSadaf Ebrahimi      case OP_MINPLUSI:
9132*22dc650dSSadaf Ebrahimi      case OP_POSPLUSI:
9133*22dc650dSSadaf Ebrahimi      if (inassert == 0) return 0;
9134*22dc650dSSadaf Ebrahimi 
9135*22dc650dSSadaf Ebrahimi      /* If the character is more than one code unit long, we cannot set its
9136*22dc650dSSadaf Ebrahimi      first code unit when matching caselessly. Later scanning may pick up
9137*22dc650dSSadaf Ebrahimi      multiple code units. */
9138*22dc650dSSadaf Ebrahimi 
9139*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
9140*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
9141*22dc650dSSadaf Ebrahimi      if (scode[1] >= 0x80) return 0;
9142*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 16
9143*22dc650dSSadaf Ebrahimi      if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9144*22dc650dSSadaf Ebrahimi #endif
9145*22dc650dSSadaf Ebrahimi #endif
9146*22dc650dSSadaf Ebrahimi 
9147*22dc650dSSadaf Ebrahimi      if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9148*22dc650dSSadaf Ebrahimi        else if (c != scode[1]) return 0;
9149*22dc650dSSadaf Ebrahimi      break;
9150*22dc650dSSadaf Ebrahimi      }
9151*22dc650dSSadaf Ebrahimi 
9152*22dc650dSSadaf Ebrahimi    code += GET(code, 1);
9153*22dc650dSSadaf Ebrahimi    }
9154*22dc650dSSadaf Ebrahimi while (*code == OP_ALT);
9155*22dc650dSSadaf Ebrahimi 
9156*22dc650dSSadaf Ebrahimi *flags = cflags;
9157*22dc650dSSadaf Ebrahimi return c;
9158*22dc650dSSadaf Ebrahimi }
9159*22dc650dSSadaf Ebrahimi 
9160*22dc650dSSadaf Ebrahimi 
9161*22dc650dSSadaf Ebrahimi 
9162*22dc650dSSadaf Ebrahimi /*************************************************
9163*22dc650dSSadaf Ebrahimi *     Add an entry to the name/number table      *
9164*22dc650dSSadaf Ebrahimi *************************************************/
9165*22dc650dSSadaf Ebrahimi 
9166*22dc650dSSadaf Ebrahimi /* This function is called between compiling passes to add an entry to the
9167*22dc650dSSadaf Ebrahimi name/number table, maintaining alphabetical order. Checking for permitted
9168*22dc650dSSadaf Ebrahimi and forbidden duplicates has already been done.
9169*22dc650dSSadaf Ebrahimi 
9170*22dc650dSSadaf Ebrahimi Arguments:
9171*22dc650dSSadaf Ebrahimi   cb           the compile data block
9172*22dc650dSSadaf Ebrahimi   name         the name to add
9173*22dc650dSSadaf Ebrahimi   length       the length of the name
9174*22dc650dSSadaf Ebrahimi   groupno      the group number
9175*22dc650dSSadaf Ebrahimi   tablecount   the count of names in the table so far
9176*22dc650dSSadaf Ebrahimi 
9177*22dc650dSSadaf Ebrahimi Returns:       nothing
9178*22dc650dSSadaf Ebrahimi */
9179*22dc650dSSadaf Ebrahimi 
9180*22dc650dSSadaf Ebrahimi static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)9181*22dc650dSSadaf Ebrahimi add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
9182*22dc650dSSadaf Ebrahimi   unsigned int groupno, uint32_t tablecount)
9183*22dc650dSSadaf Ebrahimi {
9184*22dc650dSSadaf Ebrahimi uint32_t i;
9185*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *slot = cb->name_table;
9186*22dc650dSSadaf Ebrahimi 
9187*22dc650dSSadaf Ebrahimi for (i = 0; i < tablecount; i++)
9188*22dc650dSSadaf Ebrahimi   {
9189*22dc650dSSadaf Ebrahimi   int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
9190*22dc650dSSadaf Ebrahimi   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
9191*22dc650dSSadaf Ebrahimi     crc = -1; /* Current name is a substring */
9192*22dc650dSSadaf Ebrahimi 
9193*22dc650dSSadaf Ebrahimi   /* Make space in the table and break the loop for an earlier name. For a
9194*22dc650dSSadaf Ebrahimi   duplicate or later name, carry on. We do this for duplicates so that in the
9195*22dc650dSSadaf Ebrahimi   simple case (when ?(| is not used) they are in order of their numbers. In all
9196*22dc650dSSadaf Ebrahimi   cases they are in the order in which they appear in the pattern. */
9197*22dc650dSSadaf Ebrahimi 
9198*22dc650dSSadaf Ebrahimi   if (crc < 0)
9199*22dc650dSSadaf Ebrahimi     {
9200*22dc650dSSadaf Ebrahimi     (void)memmove(slot + cb->name_entry_size, slot,
9201*22dc650dSSadaf Ebrahimi       CU2BYTES((tablecount - i) * cb->name_entry_size));
9202*22dc650dSSadaf Ebrahimi     break;
9203*22dc650dSSadaf Ebrahimi     }
9204*22dc650dSSadaf Ebrahimi 
9205*22dc650dSSadaf Ebrahimi   /* Continue the loop for a later or duplicate name */
9206*22dc650dSSadaf Ebrahimi 
9207*22dc650dSSadaf Ebrahimi   slot += cb->name_entry_size;
9208*22dc650dSSadaf Ebrahimi   }
9209*22dc650dSSadaf Ebrahimi 
9210*22dc650dSSadaf Ebrahimi PUT2(slot, 0, groupno);
9211*22dc650dSSadaf Ebrahimi memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
9212*22dc650dSSadaf Ebrahimi 
9213*22dc650dSSadaf Ebrahimi /* Add a terminating zero and fill the rest of the slot with zeroes so that
9214*22dc650dSSadaf Ebrahimi the memory is all initialized. Otherwise valgrind moans about uninitialized
9215*22dc650dSSadaf Ebrahimi memory when saving serialized compiled patterns. */
9216*22dc650dSSadaf Ebrahimi 
9217*22dc650dSSadaf Ebrahimi memset(slot + IMM2_SIZE + length, 0,
9218*22dc650dSSadaf Ebrahimi   CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
9219*22dc650dSSadaf Ebrahimi }
9220*22dc650dSSadaf Ebrahimi 
9221*22dc650dSSadaf Ebrahimi 
9222*22dc650dSSadaf Ebrahimi 
9223*22dc650dSSadaf Ebrahimi /*************************************************
9224*22dc650dSSadaf Ebrahimi *             Skip in parsed pattern             *
9225*22dc650dSSadaf Ebrahimi *************************************************/
9226*22dc650dSSadaf Ebrahimi 
9227*22dc650dSSadaf Ebrahimi /* This function is called to skip parts of the parsed pattern when finding the
9228*22dc650dSSadaf Ebrahimi length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9229*22dc650dSSadaf Ebrahimi the end of the branch, it is called to skip over an internal lookaround or
9230*22dc650dSSadaf Ebrahimi (DEFINE) group, and it is also called to skip to the end of a class, during
9231*22dc650dSSadaf Ebrahimi which it will never encounter nested groups (but there's no need to have
9232*22dc650dSSadaf Ebrahimi special code for that).
9233*22dc650dSSadaf Ebrahimi 
9234*22dc650dSSadaf Ebrahimi When called to find the end of a branch or group, pptr must point to the first
9235*22dc650dSSadaf Ebrahimi meta code inside the branch, not the branch-starting code. In other cases it
9236*22dc650dSSadaf Ebrahimi can point to the item that causes the function to be called.
9237*22dc650dSSadaf Ebrahimi 
9238*22dc650dSSadaf Ebrahimi Arguments:
9239*22dc650dSSadaf Ebrahimi   pptr       current pointer to skip from
9240*22dc650dSSadaf Ebrahimi   skiptype   PSKIP_CLASS when skipping to end of class
9241*22dc650dSSadaf Ebrahimi              PSKIP_ALT when META_ALT ends the skip
9242*22dc650dSSadaf Ebrahimi              PSKIP_KET when only META_KET ends the skip
9243*22dc650dSSadaf Ebrahimi 
9244*22dc650dSSadaf Ebrahimi Returns:     new value of pptr
9245*22dc650dSSadaf Ebrahimi              NULL if META_END is reached - should never occur
9246*22dc650dSSadaf Ebrahimi                or for an unknown meta value - likewise
9247*22dc650dSSadaf Ebrahimi */
9248*22dc650dSSadaf Ebrahimi 
9249*22dc650dSSadaf Ebrahimi static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)9250*22dc650dSSadaf Ebrahimi parsed_skip(uint32_t *pptr, uint32_t skiptype)
9251*22dc650dSSadaf Ebrahimi {
9252*22dc650dSSadaf Ebrahimi uint32_t nestlevel = 0;
9253*22dc650dSSadaf Ebrahimi 
9254*22dc650dSSadaf Ebrahimi for (;; pptr++)
9255*22dc650dSSadaf Ebrahimi   {
9256*22dc650dSSadaf Ebrahimi   uint32_t meta = META_CODE(*pptr);
9257*22dc650dSSadaf Ebrahimi 
9258*22dc650dSSadaf Ebrahimi   switch(meta)
9259*22dc650dSSadaf Ebrahimi     {
9260*22dc650dSSadaf Ebrahimi     default:  /* Just skip over most items */
9261*22dc650dSSadaf Ebrahimi     if (meta < META_END) continue;  /* Literal */
9262*22dc650dSSadaf Ebrahimi     break;
9263*22dc650dSSadaf Ebrahimi 
9264*22dc650dSSadaf Ebrahimi     /* This should never occur. */
9265*22dc650dSSadaf Ebrahimi 
9266*22dc650dSSadaf Ebrahimi     case META_END:
9267*22dc650dSSadaf Ebrahimi     return NULL;
9268*22dc650dSSadaf Ebrahimi 
9269*22dc650dSSadaf Ebrahimi     /* The data for these items is variable in length. */
9270*22dc650dSSadaf Ebrahimi 
9271*22dc650dSSadaf Ebrahimi     case META_BACKREF:  /* Offset is present only if group >= 10 */
9272*22dc650dSSadaf Ebrahimi     if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9273*22dc650dSSadaf Ebrahimi     break;
9274*22dc650dSSadaf Ebrahimi 
9275*22dc650dSSadaf Ebrahimi     case META_ESCAPE:   /* A few escapes are followed by data items. */
9276*22dc650dSSadaf Ebrahimi     switch (META_DATA(*pptr))
9277*22dc650dSSadaf Ebrahimi       {
9278*22dc650dSSadaf Ebrahimi       case ESC_P:
9279*22dc650dSSadaf Ebrahimi       case ESC_p:
9280*22dc650dSSadaf Ebrahimi       pptr += 1;
9281*22dc650dSSadaf Ebrahimi       break;
9282*22dc650dSSadaf Ebrahimi 
9283*22dc650dSSadaf Ebrahimi       case ESC_g:
9284*22dc650dSSadaf Ebrahimi       case ESC_k:
9285*22dc650dSSadaf Ebrahimi       pptr += 1 + SIZEOFFSET;
9286*22dc650dSSadaf Ebrahimi       break;
9287*22dc650dSSadaf Ebrahimi       }
9288*22dc650dSSadaf Ebrahimi     break;
9289*22dc650dSSadaf Ebrahimi 
9290*22dc650dSSadaf Ebrahimi     case META_MARK:     /* Add the length of the name. */
9291*22dc650dSSadaf Ebrahimi     case META_COMMIT_ARG:
9292*22dc650dSSadaf Ebrahimi     case META_PRUNE_ARG:
9293*22dc650dSSadaf Ebrahimi     case META_SKIP_ARG:
9294*22dc650dSSadaf Ebrahimi     case META_THEN_ARG:
9295*22dc650dSSadaf Ebrahimi     pptr += pptr[1];
9296*22dc650dSSadaf Ebrahimi     break;
9297*22dc650dSSadaf Ebrahimi 
9298*22dc650dSSadaf Ebrahimi     /* These are the "active" items in this loop. */
9299*22dc650dSSadaf Ebrahimi 
9300*22dc650dSSadaf Ebrahimi     case META_CLASS_END:
9301*22dc650dSSadaf Ebrahimi     if (skiptype == PSKIP_CLASS) return pptr;
9302*22dc650dSSadaf Ebrahimi     break;
9303*22dc650dSSadaf Ebrahimi 
9304*22dc650dSSadaf Ebrahimi     case META_ATOMIC:
9305*22dc650dSSadaf Ebrahimi     case META_CAPTURE:
9306*22dc650dSSadaf Ebrahimi     case META_COND_ASSERT:
9307*22dc650dSSadaf Ebrahimi     case META_COND_DEFINE:
9308*22dc650dSSadaf Ebrahimi     case META_COND_NAME:
9309*22dc650dSSadaf Ebrahimi     case META_COND_NUMBER:
9310*22dc650dSSadaf Ebrahimi     case META_COND_RNAME:
9311*22dc650dSSadaf Ebrahimi     case META_COND_RNUMBER:
9312*22dc650dSSadaf Ebrahimi     case META_COND_VERSION:
9313*22dc650dSSadaf Ebrahimi     case META_LOOKAHEAD:
9314*22dc650dSSadaf Ebrahimi     case META_LOOKAHEADNOT:
9315*22dc650dSSadaf Ebrahimi     case META_LOOKAHEAD_NA:
9316*22dc650dSSadaf Ebrahimi     case META_LOOKBEHIND:
9317*22dc650dSSadaf Ebrahimi     case META_LOOKBEHINDNOT:
9318*22dc650dSSadaf Ebrahimi     case META_LOOKBEHIND_NA:
9319*22dc650dSSadaf Ebrahimi     case META_NOCAPTURE:
9320*22dc650dSSadaf Ebrahimi     case META_SCRIPT_RUN:
9321*22dc650dSSadaf Ebrahimi     nestlevel++;
9322*22dc650dSSadaf Ebrahimi     break;
9323*22dc650dSSadaf Ebrahimi 
9324*22dc650dSSadaf Ebrahimi     case META_ALT:
9325*22dc650dSSadaf Ebrahimi     if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9326*22dc650dSSadaf Ebrahimi     break;
9327*22dc650dSSadaf Ebrahimi 
9328*22dc650dSSadaf Ebrahimi     case META_KET:
9329*22dc650dSSadaf Ebrahimi     if (nestlevel == 0) return pptr;
9330*22dc650dSSadaf Ebrahimi     nestlevel--;
9331*22dc650dSSadaf Ebrahimi     break;
9332*22dc650dSSadaf Ebrahimi     }
9333*22dc650dSSadaf Ebrahimi 
9334*22dc650dSSadaf Ebrahimi   /* The extra data item length for each meta is in a table. */
9335*22dc650dSSadaf Ebrahimi 
9336*22dc650dSSadaf Ebrahimi   meta = (meta >> 16) & 0x7fff;
9337*22dc650dSSadaf Ebrahimi   if (meta >= sizeof(meta_extra_lengths)) return NULL;
9338*22dc650dSSadaf Ebrahimi   pptr += meta_extra_lengths[meta];
9339*22dc650dSSadaf Ebrahimi   }
9340*22dc650dSSadaf Ebrahimi /* Control never reaches here */
9341*22dc650dSSadaf Ebrahimi return pptr;
9342*22dc650dSSadaf Ebrahimi }
9343*22dc650dSSadaf Ebrahimi 
9344*22dc650dSSadaf Ebrahimi 
9345*22dc650dSSadaf Ebrahimi 
9346*22dc650dSSadaf Ebrahimi /*************************************************
9347*22dc650dSSadaf Ebrahimi *       Find length of a parsed group            *
9348*22dc650dSSadaf Ebrahimi *************************************************/
9349*22dc650dSSadaf Ebrahimi 
9350*22dc650dSSadaf Ebrahimi /* This is called for nested groups within a branch of a lookbehind whose
9351*22dc650dSSadaf Ebrahimi length is being computed. On entry, the pointer must be at the first element
9352*22dc650dSSadaf Ebrahimi after the group initializing code. On exit it points to OP_KET. Caching is used
9353*22dc650dSSadaf Ebrahimi to improve processing speed when the same capturing group occurs many times.
9354*22dc650dSSadaf Ebrahimi 
9355*22dc650dSSadaf Ebrahimi Arguments:
9356*22dc650dSSadaf Ebrahimi   pptrptr     pointer to pointer in the parsed pattern
9357*22dc650dSSadaf Ebrahimi   minptr      where to return the minimum length
9358*22dc650dSSadaf Ebrahimi   isinline    FALSE if a reference or recursion; TRUE for inline group
9359*22dc650dSSadaf Ebrahimi   errcodeptr  pointer to the errorcode
9360*22dc650dSSadaf Ebrahimi   lcptr       pointer to the loop counter
9361*22dc650dSSadaf Ebrahimi   group       number of captured group or -1 for a non-capturing group
9362*22dc650dSSadaf Ebrahimi   recurses    chain of recurse_check to catch mutual recursion
9363*22dc650dSSadaf Ebrahimi   cb          pointer to the compile data
9364*22dc650dSSadaf Ebrahimi 
9365*22dc650dSSadaf Ebrahimi Returns:      the maximum group length or a negative number
9366*22dc650dSSadaf Ebrahimi */
9367*22dc650dSSadaf Ebrahimi 
9368*22dc650dSSadaf Ebrahimi static int
get_grouplength(uint32_t ** pptrptr,int * minptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)9369*22dc650dSSadaf Ebrahimi get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9370*22dc650dSSadaf Ebrahimi   int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9371*22dc650dSSadaf Ebrahimi {
9372*22dc650dSSadaf Ebrahimi uint32_t *gi = cb->groupinfo + 2 * group;
9373*22dc650dSSadaf Ebrahimi int branchlength, branchminlength;
9374*22dc650dSSadaf Ebrahimi int grouplength = -1;
9375*22dc650dSSadaf Ebrahimi int groupminlength = INT_MAX;
9376*22dc650dSSadaf Ebrahimi 
9377*22dc650dSSadaf Ebrahimi /* The cache can be used only if there is no possibility of there being two
9378*22dc650dSSadaf Ebrahimi groups with the same number. We do not need to set the end pointer for a group
9379*22dc650dSSadaf Ebrahimi that is being processed as a back reference or recursion, but we must do so for
9380*22dc650dSSadaf Ebrahimi an inline group. */
9381*22dc650dSSadaf Ebrahimi 
9382*22dc650dSSadaf Ebrahimi if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9383*22dc650dSSadaf Ebrahimi   {
9384*22dc650dSSadaf Ebrahimi   uint32_t groupinfo = gi[0];
9385*22dc650dSSadaf Ebrahimi   if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9386*22dc650dSSadaf Ebrahimi   if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9387*22dc650dSSadaf Ebrahimi     {
9388*22dc650dSSadaf Ebrahimi     if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9389*22dc650dSSadaf Ebrahimi     *minptr = gi[1];
9390*22dc650dSSadaf Ebrahimi     return groupinfo & GI_FIXED_LENGTH_MASK;
9391*22dc650dSSadaf Ebrahimi     }
9392*22dc650dSSadaf Ebrahimi   }
9393*22dc650dSSadaf Ebrahimi 
9394*22dc650dSSadaf Ebrahimi /* Scan the group. In this case we find the end pointer of necessity. */
9395*22dc650dSSadaf Ebrahimi 
9396*22dc650dSSadaf Ebrahimi for(;;)
9397*22dc650dSSadaf Ebrahimi   {
9398*22dc650dSSadaf Ebrahimi   branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9399*22dc650dSSadaf Ebrahimi     recurses, cb);
9400*22dc650dSSadaf Ebrahimi   if (branchlength < 0) goto ISNOTFIXED;
9401*22dc650dSSadaf Ebrahimi   if (branchlength > grouplength) grouplength = branchlength;
9402*22dc650dSSadaf Ebrahimi   if (branchminlength < groupminlength) groupminlength = branchminlength;
9403*22dc650dSSadaf Ebrahimi   if (**pptrptr == META_KET) break;
9404*22dc650dSSadaf Ebrahimi   *pptrptr += 1;   /* Skip META_ALT */
9405*22dc650dSSadaf Ebrahimi   }
9406*22dc650dSSadaf Ebrahimi 
9407*22dc650dSSadaf Ebrahimi if (group > 0)
9408*22dc650dSSadaf Ebrahimi   {
9409*22dc650dSSadaf Ebrahimi   gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9410*22dc650dSSadaf Ebrahimi   gi[1] = groupminlength;
9411*22dc650dSSadaf Ebrahimi   }
9412*22dc650dSSadaf Ebrahimi 
9413*22dc650dSSadaf Ebrahimi *minptr = groupminlength;
9414*22dc650dSSadaf Ebrahimi return grouplength;
9415*22dc650dSSadaf Ebrahimi 
9416*22dc650dSSadaf Ebrahimi ISNOTFIXED:
9417*22dc650dSSadaf Ebrahimi if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9418*22dc650dSSadaf Ebrahimi return -1;
9419*22dc650dSSadaf Ebrahimi }
9420*22dc650dSSadaf Ebrahimi 
9421*22dc650dSSadaf Ebrahimi 
9422*22dc650dSSadaf Ebrahimi 
9423*22dc650dSSadaf Ebrahimi /*************************************************
9424*22dc650dSSadaf Ebrahimi *        Find length of a parsed branch          *
9425*22dc650dSSadaf Ebrahimi *************************************************/
9426*22dc650dSSadaf Ebrahimi 
9427*22dc650dSSadaf Ebrahimi /* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9428*22dc650dSSadaf Ebrahimi giving an error if the length is not limited. On entry, *pptrptr points to the
9429*22dc650dSSadaf Ebrahimi first element inside the branch. On exit it is set to point to the ALT or KET.
9430*22dc650dSSadaf Ebrahimi 
9431*22dc650dSSadaf Ebrahimi Arguments:
9432*22dc650dSSadaf Ebrahimi   pptrptr     pointer to pointer in the parsed pattern
9433*22dc650dSSadaf Ebrahimi   minptr      where to return the minimum length
9434*22dc650dSSadaf Ebrahimi   errcodeptr  pointer to error code
9435*22dc650dSSadaf Ebrahimi   lcptr       pointer to loop counter
9436*22dc650dSSadaf Ebrahimi   recurses    chain of recurse_check to catch mutual recursion
9437*22dc650dSSadaf Ebrahimi   cb          pointer to compile block
9438*22dc650dSSadaf Ebrahimi 
9439*22dc650dSSadaf Ebrahimi Returns:      the maximum length, or a negative value on error
9440*22dc650dSSadaf Ebrahimi */
9441*22dc650dSSadaf Ebrahimi 
9442*22dc650dSSadaf Ebrahimi static int
get_branchlength(uint32_t ** pptrptr,int * minptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9443*22dc650dSSadaf Ebrahimi get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9444*22dc650dSSadaf Ebrahimi   parsed_recurse_check *recurses, compile_block *cb)
9445*22dc650dSSadaf Ebrahimi {
9446*22dc650dSSadaf Ebrahimi int branchlength = 0;
9447*22dc650dSSadaf Ebrahimi int branchminlength = 0;
9448*22dc650dSSadaf Ebrahimi int grouplength, groupminlength;
9449*22dc650dSSadaf Ebrahimi uint32_t lastitemlength = 0;
9450*22dc650dSSadaf Ebrahimi uint32_t lastitemminlength = 0;
9451*22dc650dSSadaf Ebrahimi uint32_t *pptr = *pptrptr;
9452*22dc650dSSadaf Ebrahimi PCRE2_SIZE offset;
9453*22dc650dSSadaf Ebrahimi parsed_recurse_check this_recurse;
9454*22dc650dSSadaf Ebrahimi 
9455*22dc650dSSadaf Ebrahimi /* A large and/or complex regex can take too long to process. This can happen
9456*22dc650dSSadaf Ebrahimi more often when (?| groups are present in the pattern because their length
9457*22dc650dSSadaf Ebrahimi cannot be cached. */
9458*22dc650dSSadaf Ebrahimi 
9459*22dc650dSSadaf Ebrahimi if ((*lcptr)++ > 2000)
9460*22dc650dSSadaf Ebrahimi   {
9461*22dc650dSSadaf Ebrahimi   *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9462*22dc650dSSadaf Ebrahimi   return -1;
9463*22dc650dSSadaf Ebrahimi   }
9464*22dc650dSSadaf Ebrahimi 
9465*22dc650dSSadaf Ebrahimi /* Scan the branch, accumulating the length. */
9466*22dc650dSSadaf Ebrahimi 
9467*22dc650dSSadaf Ebrahimi for (;; pptr++)
9468*22dc650dSSadaf Ebrahimi   {
9469*22dc650dSSadaf Ebrahimi   parsed_recurse_check *r;
9470*22dc650dSSadaf Ebrahimi   uint32_t *gptr, *gptrend;
9471*22dc650dSSadaf Ebrahimi   uint32_t escape;
9472*22dc650dSSadaf Ebrahimi   uint32_t group = 0;
9473*22dc650dSSadaf Ebrahimi   uint32_t itemlength = 0;
9474*22dc650dSSadaf Ebrahimi   uint32_t itemminlength = 0;
9475*22dc650dSSadaf Ebrahimi   uint32_t min, max;
9476*22dc650dSSadaf Ebrahimi 
9477*22dc650dSSadaf Ebrahimi   if (*pptr < META_END)
9478*22dc650dSSadaf Ebrahimi     {
9479*22dc650dSSadaf Ebrahimi     itemlength = itemminlength = 1;
9480*22dc650dSSadaf Ebrahimi     }
9481*22dc650dSSadaf Ebrahimi 
9482*22dc650dSSadaf Ebrahimi   else switch (META_CODE(*pptr))
9483*22dc650dSSadaf Ebrahimi     {
9484*22dc650dSSadaf Ebrahimi     case META_KET:
9485*22dc650dSSadaf Ebrahimi     case META_ALT:
9486*22dc650dSSadaf Ebrahimi     goto EXIT;
9487*22dc650dSSadaf Ebrahimi 
9488*22dc650dSSadaf Ebrahimi     /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9489*22dc650dSSadaf Ebrahimi     actual termination. */
9490*22dc650dSSadaf Ebrahimi 
9491*22dc650dSSadaf Ebrahimi     case META_ACCEPT:
9492*22dc650dSSadaf Ebrahimi     case META_FAIL:
9493*22dc650dSSadaf Ebrahimi     pptr = parsed_skip(pptr, PSKIP_ALT);
9494*22dc650dSSadaf Ebrahimi     if (pptr == NULL) goto PARSED_SKIP_FAILED;
9495*22dc650dSSadaf Ebrahimi     goto EXIT;
9496*22dc650dSSadaf Ebrahimi 
9497*22dc650dSSadaf Ebrahimi     case META_MARK:
9498*22dc650dSSadaf Ebrahimi     case META_COMMIT_ARG:
9499*22dc650dSSadaf Ebrahimi     case META_PRUNE_ARG:
9500*22dc650dSSadaf Ebrahimi     case META_SKIP_ARG:
9501*22dc650dSSadaf Ebrahimi     case META_THEN_ARG:
9502*22dc650dSSadaf Ebrahimi     pptr += pptr[1] + 1;
9503*22dc650dSSadaf Ebrahimi     break;
9504*22dc650dSSadaf Ebrahimi 
9505*22dc650dSSadaf Ebrahimi     case META_CIRCUMFLEX:
9506*22dc650dSSadaf Ebrahimi     case META_COMMIT:
9507*22dc650dSSadaf Ebrahimi     case META_DOLLAR:
9508*22dc650dSSadaf Ebrahimi     case META_PRUNE:
9509*22dc650dSSadaf Ebrahimi     case META_SKIP:
9510*22dc650dSSadaf Ebrahimi     case META_THEN:
9511*22dc650dSSadaf Ebrahimi     break;
9512*22dc650dSSadaf Ebrahimi 
9513*22dc650dSSadaf Ebrahimi     case META_OPTIONS:
9514*22dc650dSSadaf Ebrahimi     pptr += 2;
9515*22dc650dSSadaf Ebrahimi     break;
9516*22dc650dSSadaf Ebrahimi 
9517*22dc650dSSadaf Ebrahimi     case META_BIGVALUE:
9518*22dc650dSSadaf Ebrahimi     itemlength = itemminlength = 1;
9519*22dc650dSSadaf Ebrahimi     pptr += 1;
9520*22dc650dSSadaf Ebrahimi     break;
9521*22dc650dSSadaf Ebrahimi 
9522*22dc650dSSadaf Ebrahimi     case META_CLASS:
9523*22dc650dSSadaf Ebrahimi     case META_CLASS_NOT:
9524*22dc650dSSadaf Ebrahimi     itemlength = itemminlength = 1;
9525*22dc650dSSadaf Ebrahimi     pptr = parsed_skip(pptr, PSKIP_CLASS);
9526*22dc650dSSadaf Ebrahimi     if (pptr == NULL) goto PARSED_SKIP_FAILED;
9527*22dc650dSSadaf Ebrahimi     break;
9528*22dc650dSSadaf Ebrahimi 
9529*22dc650dSSadaf Ebrahimi     case META_CLASS_EMPTY_NOT:
9530*22dc650dSSadaf Ebrahimi     case META_DOT:
9531*22dc650dSSadaf Ebrahimi     itemlength = itemminlength = 1;
9532*22dc650dSSadaf Ebrahimi     break;
9533*22dc650dSSadaf Ebrahimi 
9534*22dc650dSSadaf Ebrahimi     case META_CALLOUT_NUMBER:
9535*22dc650dSSadaf Ebrahimi     pptr += 3;
9536*22dc650dSSadaf Ebrahimi     break;
9537*22dc650dSSadaf Ebrahimi 
9538*22dc650dSSadaf Ebrahimi     case META_CALLOUT_STRING:
9539*22dc650dSSadaf Ebrahimi     pptr += 3 + SIZEOFFSET;
9540*22dc650dSSadaf Ebrahimi     break;
9541*22dc650dSSadaf Ebrahimi 
9542*22dc650dSSadaf Ebrahimi     /* Only some escapes consume a character. Of those, \R can match one or two
9543*22dc650dSSadaf Ebrahimi     characters, but \X is never allowed because it matches an unknown number of
9544*22dc650dSSadaf Ebrahimi     characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9545*22dc650dSSadaf Ebrahimi 
9546*22dc650dSSadaf Ebrahimi     case META_ESCAPE:
9547*22dc650dSSadaf Ebrahimi     escape = META_DATA(*pptr);
9548*22dc650dSSadaf Ebrahimi     if (escape == ESC_X) return -1;
9549*22dc650dSSadaf Ebrahimi     if (escape == ESC_R)
9550*22dc650dSSadaf Ebrahimi       {
9551*22dc650dSSadaf Ebrahimi       itemminlength = 1;
9552*22dc650dSSadaf Ebrahimi       itemlength = 2;
9553*22dc650dSSadaf Ebrahimi       }
9554*22dc650dSSadaf Ebrahimi     else if (escape > ESC_b && escape < ESC_Z)
9555*22dc650dSSadaf Ebrahimi       {
9556*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 32
9557*22dc650dSSadaf Ebrahimi       if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9558*22dc650dSSadaf Ebrahimi         {
9559*22dc650dSSadaf Ebrahimi         *errcodeptr = ERR36;
9560*22dc650dSSadaf Ebrahimi         return -1;
9561*22dc650dSSadaf Ebrahimi         }
9562*22dc650dSSadaf Ebrahimi #endif
9563*22dc650dSSadaf Ebrahimi       itemlength = itemminlength = 1;
9564*22dc650dSSadaf Ebrahimi       if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9565*22dc650dSSadaf Ebrahimi       }
9566*22dc650dSSadaf Ebrahimi     break;
9567*22dc650dSSadaf Ebrahimi 
9568*22dc650dSSadaf Ebrahimi     /* Lookaheads do not contribute to the length of this branch, but they may
9569*22dc650dSSadaf Ebrahimi     contain lookbehinds within them whose lengths need to be set. */
9570*22dc650dSSadaf Ebrahimi 
9571*22dc650dSSadaf Ebrahimi     case META_LOOKAHEAD:
9572*22dc650dSSadaf Ebrahimi     case META_LOOKAHEADNOT:
9573*22dc650dSSadaf Ebrahimi     case META_LOOKAHEAD_NA:
9574*22dc650dSSadaf Ebrahimi     *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9575*22dc650dSSadaf Ebrahimi     if (*errcodeptr != 0) return -1;
9576*22dc650dSSadaf Ebrahimi 
9577*22dc650dSSadaf Ebrahimi     /* Ignore any qualifiers that follow a lookahead assertion. */
9578*22dc650dSSadaf Ebrahimi 
9579*22dc650dSSadaf Ebrahimi     switch (pptr[1])
9580*22dc650dSSadaf Ebrahimi       {
9581*22dc650dSSadaf Ebrahimi       case META_ASTERISK:
9582*22dc650dSSadaf Ebrahimi       case META_ASTERISK_PLUS:
9583*22dc650dSSadaf Ebrahimi       case META_ASTERISK_QUERY:
9584*22dc650dSSadaf Ebrahimi       case META_PLUS:
9585*22dc650dSSadaf Ebrahimi       case META_PLUS_PLUS:
9586*22dc650dSSadaf Ebrahimi       case META_PLUS_QUERY:
9587*22dc650dSSadaf Ebrahimi       case META_QUERY:
9588*22dc650dSSadaf Ebrahimi       case META_QUERY_PLUS:
9589*22dc650dSSadaf Ebrahimi       case META_QUERY_QUERY:
9590*22dc650dSSadaf Ebrahimi       pptr++;
9591*22dc650dSSadaf Ebrahimi       break;
9592*22dc650dSSadaf Ebrahimi 
9593*22dc650dSSadaf Ebrahimi       case META_MINMAX:
9594*22dc650dSSadaf Ebrahimi       case META_MINMAX_PLUS:
9595*22dc650dSSadaf Ebrahimi       case META_MINMAX_QUERY:
9596*22dc650dSSadaf Ebrahimi       pptr += 3;
9597*22dc650dSSadaf Ebrahimi       break;
9598*22dc650dSSadaf Ebrahimi 
9599*22dc650dSSadaf Ebrahimi       default:
9600*22dc650dSSadaf Ebrahimi       break;
9601*22dc650dSSadaf Ebrahimi       }
9602*22dc650dSSadaf Ebrahimi     break;
9603*22dc650dSSadaf Ebrahimi 
9604*22dc650dSSadaf Ebrahimi     /* A nested lookbehind does not contribute any length to this lookbehind,
9605*22dc650dSSadaf Ebrahimi     but must itself be checked and have its lengths set. */
9606*22dc650dSSadaf Ebrahimi 
9607*22dc650dSSadaf Ebrahimi     case META_LOOKBEHIND:
9608*22dc650dSSadaf Ebrahimi     case META_LOOKBEHINDNOT:
9609*22dc650dSSadaf Ebrahimi     case META_LOOKBEHIND_NA:
9610*22dc650dSSadaf Ebrahimi     if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9611*22dc650dSSadaf Ebrahimi       return -1;
9612*22dc650dSSadaf Ebrahimi     break;
9613*22dc650dSSadaf Ebrahimi 
9614*22dc650dSSadaf Ebrahimi     /* Back references and recursions are handled by very similar code. At this
9615*22dc650dSSadaf Ebrahimi     stage, the names generated in the parsing pass are available, but the main
9616*22dc650dSSadaf Ebrahimi     name table has not yet been created. So for the named varieties, scan the
9617*22dc650dSSadaf Ebrahimi     list of names in order to get the number of the first one in the pattern,
9618*22dc650dSSadaf Ebrahimi     and whether or not this name is duplicated. */
9619*22dc650dSSadaf Ebrahimi 
9620*22dc650dSSadaf Ebrahimi     case META_BACKREF_BYNAME:
9621*22dc650dSSadaf Ebrahimi     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9622*22dc650dSSadaf Ebrahimi       goto ISNOTFIXED;
9623*22dc650dSSadaf Ebrahimi     /* Fall through */
9624*22dc650dSSadaf Ebrahimi 
9625*22dc650dSSadaf Ebrahimi     case META_RECURSE_BYNAME:
9626*22dc650dSSadaf Ebrahimi       {
9627*22dc650dSSadaf Ebrahimi       int i;
9628*22dc650dSSadaf Ebrahimi       PCRE2_SPTR name;
9629*22dc650dSSadaf Ebrahimi       BOOL is_dupname = FALSE;
9630*22dc650dSSadaf Ebrahimi       named_group *ng = cb->named_groups;
9631*22dc650dSSadaf Ebrahimi       uint32_t meta_code = META_CODE(*pptr);
9632*22dc650dSSadaf Ebrahimi       uint32_t length = *(++pptr);
9633*22dc650dSSadaf Ebrahimi 
9634*22dc650dSSadaf Ebrahimi       GETPLUSOFFSET(offset, pptr);
9635*22dc650dSSadaf Ebrahimi       name = cb->start_pattern + offset;
9636*22dc650dSSadaf Ebrahimi       for (i = 0; i < cb->names_found; i++, ng++)
9637*22dc650dSSadaf Ebrahimi         {
9638*22dc650dSSadaf Ebrahimi         if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9639*22dc650dSSadaf Ebrahimi           {
9640*22dc650dSSadaf Ebrahimi           group = ng->number;
9641*22dc650dSSadaf Ebrahimi           is_dupname = ng->isdup;
9642*22dc650dSSadaf Ebrahimi           break;
9643*22dc650dSSadaf Ebrahimi           }
9644*22dc650dSSadaf Ebrahimi         }
9645*22dc650dSSadaf Ebrahimi 
9646*22dc650dSSadaf Ebrahimi       if (group == 0)
9647*22dc650dSSadaf Ebrahimi         {
9648*22dc650dSSadaf Ebrahimi         *errcodeptr = ERR15;  /* Non-existent subpattern */
9649*22dc650dSSadaf Ebrahimi         cb->erroroffset = offset;
9650*22dc650dSSadaf Ebrahimi         return -1;
9651*22dc650dSSadaf Ebrahimi         }
9652*22dc650dSSadaf Ebrahimi 
9653*22dc650dSSadaf Ebrahimi       /* A numerical back reference can be fixed length if duplicate capturing
9654*22dc650dSSadaf Ebrahimi       groups are not being used. A non-duplicate named back reference can also
9655*22dc650dSSadaf Ebrahimi       be handled. */
9656*22dc650dSSadaf Ebrahimi 
9657*22dc650dSSadaf Ebrahimi       if (meta_code == META_RECURSE_BYNAME ||
9658*22dc650dSSadaf Ebrahimi           (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9659*22dc650dSSadaf Ebrahimi         goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9660*22dc650dSSadaf Ebrahimi       }
9661*22dc650dSSadaf Ebrahimi     goto ISNOTFIXED;                     /* Duplicate name or number */
9662*22dc650dSSadaf Ebrahimi 
9663*22dc650dSSadaf Ebrahimi     /* The offset values for back references < 10 are in a separate vector
9664*22dc650dSSadaf Ebrahimi     because otherwise they would use more than two parsed pattern elements on
9665*22dc650dSSadaf Ebrahimi     64-bit systems. */
9666*22dc650dSSadaf Ebrahimi 
9667*22dc650dSSadaf Ebrahimi     case META_BACKREF:
9668*22dc650dSSadaf Ebrahimi     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9669*22dc650dSSadaf Ebrahimi         (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9670*22dc650dSSadaf Ebrahimi       goto ISNOTFIXED;
9671*22dc650dSSadaf Ebrahimi     group = META_DATA(*pptr);
9672*22dc650dSSadaf Ebrahimi     if (group < 10)
9673*22dc650dSSadaf Ebrahimi       {
9674*22dc650dSSadaf Ebrahimi       offset = cb->small_ref_offset[group];
9675*22dc650dSSadaf Ebrahimi       goto RECURSE_OR_BACKREF_LENGTH;
9676*22dc650dSSadaf Ebrahimi       }
9677*22dc650dSSadaf Ebrahimi 
9678*22dc650dSSadaf Ebrahimi     /* Fall through */
9679*22dc650dSSadaf Ebrahimi     /* For groups >= 10 - picking up group twice does no harm. */
9680*22dc650dSSadaf Ebrahimi 
9681*22dc650dSSadaf Ebrahimi     /* A true recursion implies not fixed length, but a subroutine call may
9682*22dc650dSSadaf Ebrahimi     be OK. Back reference "recursions" are also failed. */
9683*22dc650dSSadaf Ebrahimi 
9684*22dc650dSSadaf Ebrahimi     case META_RECURSE:
9685*22dc650dSSadaf Ebrahimi     group = META_DATA(*pptr);
9686*22dc650dSSadaf Ebrahimi     GETPLUSOFFSET(offset, pptr);
9687*22dc650dSSadaf Ebrahimi 
9688*22dc650dSSadaf Ebrahimi     RECURSE_OR_BACKREF_LENGTH:
9689*22dc650dSSadaf Ebrahimi     if (group > cb->bracount)
9690*22dc650dSSadaf Ebrahimi       {
9691*22dc650dSSadaf Ebrahimi       cb->erroroffset = offset;
9692*22dc650dSSadaf Ebrahimi       *errcodeptr = ERR15;  /* Non-existent subpattern */
9693*22dc650dSSadaf Ebrahimi       return -1;
9694*22dc650dSSadaf Ebrahimi       }
9695*22dc650dSSadaf Ebrahimi     if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9696*22dc650dSSadaf Ebrahimi     for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9697*22dc650dSSadaf Ebrahimi       {
9698*22dc650dSSadaf Ebrahimi       if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9699*22dc650dSSadaf Ebrahimi         else if (*gptr == (META_CAPTURE | group)) break;
9700*22dc650dSSadaf Ebrahimi       }
9701*22dc650dSSadaf Ebrahimi 
9702*22dc650dSSadaf Ebrahimi     /* We must start the search for the end of the group at the first meta code
9703*22dc650dSSadaf Ebrahimi     inside the group. Otherwise it will be treated as an enclosed group. */
9704*22dc650dSSadaf Ebrahimi 
9705*22dc650dSSadaf Ebrahimi     gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9706*22dc650dSSadaf Ebrahimi     if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9707*22dc650dSSadaf Ebrahimi     if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9708*22dc650dSSadaf Ebrahimi     for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9709*22dc650dSSadaf Ebrahimi     if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9710*22dc650dSSadaf Ebrahimi     this_recurse.prev = recurses;
9711*22dc650dSSadaf Ebrahimi     this_recurse.groupptr = gptr;
9712*22dc650dSSadaf Ebrahimi 
9713*22dc650dSSadaf Ebrahimi     /* We do not need to know the position of the end of the group, that is,
9714*22dc650dSSadaf Ebrahimi     gptr is not used after the call to get_grouplength(). Setting the second
9715*22dc650dSSadaf Ebrahimi     argument FALSE stops it scanning for the end when the length can be found
9716*22dc650dSSadaf Ebrahimi     in the cache. */
9717*22dc650dSSadaf Ebrahimi 
9718*22dc650dSSadaf Ebrahimi     gptr++;
9719*22dc650dSSadaf Ebrahimi     grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9720*22dc650dSSadaf Ebrahimi       lcptr, group, &this_recurse, cb);
9721*22dc650dSSadaf Ebrahimi     if (grouplength < 0)
9722*22dc650dSSadaf Ebrahimi       {
9723*22dc650dSSadaf Ebrahimi       if (*errcodeptr == 0) goto ISNOTFIXED;
9724*22dc650dSSadaf Ebrahimi       return -1;  /* Error already set */
9725*22dc650dSSadaf Ebrahimi       }
9726*22dc650dSSadaf Ebrahimi     itemlength = grouplength;
9727*22dc650dSSadaf Ebrahimi     itemminlength = groupminlength;
9728*22dc650dSSadaf Ebrahimi     break;
9729*22dc650dSSadaf Ebrahimi 
9730*22dc650dSSadaf Ebrahimi     /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9731*22dc650dSSadaf Ebrahimi     the length of this branch. Skip from the following item to the next
9732*22dc650dSSadaf Ebrahimi     unpaired ket. */
9733*22dc650dSSadaf Ebrahimi 
9734*22dc650dSSadaf Ebrahimi     case META_COND_DEFINE:
9735*22dc650dSSadaf Ebrahimi     pptr = parsed_skip(pptr + 1, PSKIP_KET);
9736*22dc650dSSadaf Ebrahimi     break;
9737*22dc650dSSadaf Ebrahimi 
9738*22dc650dSSadaf Ebrahimi     /* Check other nested groups - advance past the initial data for each type
9739*22dc650dSSadaf Ebrahimi     and then seek a fixed length with get_grouplength(). */
9740*22dc650dSSadaf Ebrahimi 
9741*22dc650dSSadaf Ebrahimi     case META_COND_NAME:
9742*22dc650dSSadaf Ebrahimi     case META_COND_NUMBER:
9743*22dc650dSSadaf Ebrahimi     case META_COND_RNAME:
9744*22dc650dSSadaf Ebrahimi     case META_COND_RNUMBER:
9745*22dc650dSSadaf Ebrahimi     pptr += 2 + SIZEOFFSET;
9746*22dc650dSSadaf Ebrahimi     goto CHECK_GROUP;
9747*22dc650dSSadaf Ebrahimi 
9748*22dc650dSSadaf Ebrahimi     case META_COND_ASSERT:
9749*22dc650dSSadaf Ebrahimi     pptr += 1;
9750*22dc650dSSadaf Ebrahimi     goto CHECK_GROUP;
9751*22dc650dSSadaf Ebrahimi 
9752*22dc650dSSadaf Ebrahimi     case META_COND_VERSION:
9753*22dc650dSSadaf Ebrahimi     pptr += 4;
9754*22dc650dSSadaf Ebrahimi     goto CHECK_GROUP;
9755*22dc650dSSadaf Ebrahimi 
9756*22dc650dSSadaf Ebrahimi     case META_CAPTURE:
9757*22dc650dSSadaf Ebrahimi     group = META_DATA(*pptr);
9758*22dc650dSSadaf Ebrahimi     /* Fall through */
9759*22dc650dSSadaf Ebrahimi 
9760*22dc650dSSadaf Ebrahimi     case META_ATOMIC:
9761*22dc650dSSadaf Ebrahimi     case META_NOCAPTURE:
9762*22dc650dSSadaf Ebrahimi     case META_SCRIPT_RUN:
9763*22dc650dSSadaf Ebrahimi     pptr++;
9764*22dc650dSSadaf Ebrahimi     CHECK_GROUP:
9765*22dc650dSSadaf Ebrahimi     grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9766*22dc650dSSadaf Ebrahimi       lcptr, group, recurses, cb);
9767*22dc650dSSadaf Ebrahimi     if (grouplength < 0) return -1;
9768*22dc650dSSadaf Ebrahimi     itemlength = grouplength;
9769*22dc650dSSadaf Ebrahimi     itemminlength = groupminlength;
9770*22dc650dSSadaf Ebrahimi     break;
9771*22dc650dSSadaf Ebrahimi 
9772*22dc650dSSadaf Ebrahimi     case META_QUERY:
9773*22dc650dSSadaf Ebrahimi     case META_QUERY_PLUS:
9774*22dc650dSSadaf Ebrahimi     case META_QUERY_QUERY:
9775*22dc650dSSadaf Ebrahimi     min = 0;
9776*22dc650dSSadaf Ebrahimi     max = 1;
9777*22dc650dSSadaf Ebrahimi     goto REPETITION;
9778*22dc650dSSadaf Ebrahimi 
9779*22dc650dSSadaf Ebrahimi     /* Exact repetition is OK; variable repetition is not. A repetition of zero
9780*22dc650dSSadaf Ebrahimi     must subtract the length that has already been added. */
9781*22dc650dSSadaf Ebrahimi 
9782*22dc650dSSadaf Ebrahimi     case META_MINMAX:
9783*22dc650dSSadaf Ebrahimi     case META_MINMAX_PLUS:
9784*22dc650dSSadaf Ebrahimi     case META_MINMAX_QUERY:
9785*22dc650dSSadaf Ebrahimi     min = pptr[1];
9786*22dc650dSSadaf Ebrahimi     max = pptr[2];
9787*22dc650dSSadaf Ebrahimi     pptr += 2;
9788*22dc650dSSadaf Ebrahimi 
9789*22dc650dSSadaf Ebrahimi     REPETITION:
9790*22dc650dSSadaf Ebrahimi     if (max != REPEAT_UNLIMITED)
9791*22dc650dSSadaf Ebrahimi       {
9792*22dc650dSSadaf Ebrahimi       if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9793*22dc650dSSadaf Ebrahimi           max != 0 &&
9794*22dc650dSSadaf Ebrahimi           (INT_MAX - branchlength)/lastitemlength < max - 1)
9795*22dc650dSSadaf Ebrahimi         {
9796*22dc650dSSadaf Ebrahimi         *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9797*22dc650dSSadaf Ebrahimi         return -1;
9798*22dc650dSSadaf Ebrahimi         }
9799*22dc650dSSadaf Ebrahimi       if (min == 0) branchminlength -= lastitemminlength;
9800*22dc650dSSadaf Ebrahimi         else itemminlength = (min - 1) * lastitemminlength;
9801*22dc650dSSadaf Ebrahimi       if (max == 0) branchlength -= lastitemlength;
9802*22dc650dSSadaf Ebrahimi         else itemlength = (max - 1) * lastitemlength;
9803*22dc650dSSadaf Ebrahimi       break;
9804*22dc650dSSadaf Ebrahimi       }
9805*22dc650dSSadaf Ebrahimi     /* Fall through */
9806*22dc650dSSadaf Ebrahimi 
9807*22dc650dSSadaf Ebrahimi     /* Any other item means this branch does not have a fixed length. */
9808*22dc650dSSadaf Ebrahimi 
9809*22dc650dSSadaf Ebrahimi     default:
9810*22dc650dSSadaf Ebrahimi     ISNOTFIXED:
9811*22dc650dSSadaf Ebrahimi     *errcodeptr = ERR25;   /* Not fixed length */
9812*22dc650dSSadaf Ebrahimi     return -1;
9813*22dc650dSSadaf Ebrahimi     }
9814*22dc650dSSadaf Ebrahimi 
9815*22dc650dSSadaf Ebrahimi   /* Add the item length to the branchlength, checking for integer overflow and
9816*22dc650dSSadaf Ebrahimi   for the branch length exceeding the overall limit. Later, if there is at
9817*22dc650dSSadaf Ebrahimi   least one variable-length branch in the group, there is a test for the
9818*22dc650dSSadaf Ebrahimi   (smaller) variable-length branch length limit. */
9819*22dc650dSSadaf Ebrahimi 
9820*22dc650dSSadaf Ebrahimi   if (INT_MAX - branchlength < (int)itemlength ||
9821*22dc650dSSadaf Ebrahimi       (branchlength += itemlength) > LOOKBEHIND_MAX)
9822*22dc650dSSadaf Ebrahimi     {
9823*22dc650dSSadaf Ebrahimi     *errcodeptr = ERR87;
9824*22dc650dSSadaf Ebrahimi     return -1;
9825*22dc650dSSadaf Ebrahimi     }
9826*22dc650dSSadaf Ebrahimi 
9827*22dc650dSSadaf Ebrahimi   branchminlength += itemminlength;
9828*22dc650dSSadaf Ebrahimi 
9829*22dc650dSSadaf Ebrahimi   /* Save this item length for use if the next item is a quantifier. */
9830*22dc650dSSadaf Ebrahimi 
9831*22dc650dSSadaf Ebrahimi   lastitemlength = itemlength;
9832*22dc650dSSadaf Ebrahimi   lastitemminlength = itemminlength;
9833*22dc650dSSadaf Ebrahimi   }
9834*22dc650dSSadaf Ebrahimi 
9835*22dc650dSSadaf Ebrahimi EXIT:
9836*22dc650dSSadaf Ebrahimi *pptrptr = pptr;
9837*22dc650dSSadaf Ebrahimi *minptr = branchminlength;
9838*22dc650dSSadaf Ebrahimi return branchlength;
9839*22dc650dSSadaf Ebrahimi 
9840*22dc650dSSadaf Ebrahimi PARSED_SKIP_FAILED:
9841*22dc650dSSadaf Ebrahimi *errcodeptr = ERR90;
9842*22dc650dSSadaf Ebrahimi return -1;
9843*22dc650dSSadaf Ebrahimi }
9844*22dc650dSSadaf Ebrahimi 
9845*22dc650dSSadaf Ebrahimi 
9846*22dc650dSSadaf Ebrahimi 
9847*22dc650dSSadaf Ebrahimi /*************************************************
9848*22dc650dSSadaf Ebrahimi *        Set lengths in a lookbehind             *
9849*22dc650dSSadaf Ebrahimi *************************************************/
9850*22dc650dSSadaf Ebrahimi 
9851*22dc650dSSadaf Ebrahimi /* This function is called for each lookbehind, to set the lengths in its
9852*22dc650dSSadaf Ebrahimi branches. An error occurs if any branch does not have a limited maximum length
9853*22dc650dSSadaf Ebrahimi that is less than the limit (65535). On exit, the pointer must be left on the
9854*22dc650dSSadaf Ebrahimi final ket.
9855*22dc650dSSadaf Ebrahimi 
9856*22dc650dSSadaf Ebrahimi The function also maintains the max_lookbehind value. Any lookbehind branch
9857*22dc650dSSadaf Ebrahimi that contains a nested lookbehind may actually look further back than the
9858*22dc650dSSadaf Ebrahimi length of the branch. The additional amount is passed back from
9859*22dc650dSSadaf Ebrahimi get_branchlength() as an "extra" value.
9860*22dc650dSSadaf Ebrahimi 
9861*22dc650dSSadaf Ebrahimi Arguments:
9862*22dc650dSSadaf Ebrahimi   pptrptr     pointer to pointer in the parsed pattern
9863*22dc650dSSadaf Ebrahimi   errcodeptr  pointer to error code
9864*22dc650dSSadaf Ebrahimi   lcptr       pointer to loop counter
9865*22dc650dSSadaf Ebrahimi   recurses    chain of recurse_check to catch mutual recursion
9866*22dc650dSSadaf Ebrahimi   cb          pointer to compile block
9867*22dc650dSSadaf Ebrahimi 
9868*22dc650dSSadaf Ebrahimi Returns:      TRUE if all is well
9869*22dc650dSSadaf Ebrahimi               FALSE otherwise, with error code and offset set
9870*22dc650dSSadaf Ebrahimi */
9871*22dc650dSSadaf Ebrahimi 
9872*22dc650dSSadaf Ebrahimi static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9873*22dc650dSSadaf Ebrahimi set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9874*22dc650dSSadaf Ebrahimi   parsed_recurse_check *recurses, compile_block *cb)
9875*22dc650dSSadaf Ebrahimi {
9876*22dc650dSSadaf Ebrahimi PCRE2_SIZE offset;
9877*22dc650dSSadaf Ebrahimi uint32_t *bptr = *pptrptr;
9878*22dc650dSSadaf Ebrahimi uint32_t *gbptr = bptr;
9879*22dc650dSSadaf Ebrahimi int maxlength = 0;
9880*22dc650dSSadaf Ebrahimi int minlength = INT_MAX;
9881*22dc650dSSadaf Ebrahimi BOOL variable = FALSE;
9882*22dc650dSSadaf Ebrahimi 
9883*22dc650dSSadaf Ebrahimi READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
9884*22dc650dSSadaf Ebrahimi *pptrptr += SIZEOFFSET;
9885*22dc650dSSadaf Ebrahimi 
9886*22dc650dSSadaf Ebrahimi /* Each branch can have a different maximum length, but we can keep only a
9887*22dc650dSSadaf Ebrahimi single minimum for the whole group, because there's nowhere to save individual
9888*22dc650dSSadaf Ebrahimi values in the META_ALT item. */
9889*22dc650dSSadaf Ebrahimi 
9890*22dc650dSSadaf Ebrahimi do
9891*22dc650dSSadaf Ebrahimi   {
9892*22dc650dSSadaf Ebrahimi   int branchlength, branchminlength;
9893*22dc650dSSadaf Ebrahimi 
9894*22dc650dSSadaf Ebrahimi   *pptrptr += 1;
9895*22dc650dSSadaf Ebrahimi   branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9896*22dc650dSSadaf Ebrahimi     recurses, cb);
9897*22dc650dSSadaf Ebrahimi 
9898*22dc650dSSadaf Ebrahimi   if (branchlength < 0)
9899*22dc650dSSadaf Ebrahimi     {
9900*22dc650dSSadaf Ebrahimi     /* The errorcode and offset may already be set from a nested lookbehind. */
9901*22dc650dSSadaf Ebrahimi     if (*errcodeptr == 0) *errcodeptr = ERR25;
9902*22dc650dSSadaf Ebrahimi     if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9903*22dc650dSSadaf Ebrahimi     return FALSE;
9904*22dc650dSSadaf Ebrahimi     }
9905*22dc650dSSadaf Ebrahimi 
9906*22dc650dSSadaf Ebrahimi   if (branchlength != branchminlength) variable = TRUE;
9907*22dc650dSSadaf Ebrahimi   if (branchminlength < minlength) minlength = branchminlength;
9908*22dc650dSSadaf Ebrahimi   if (branchlength > maxlength) maxlength = branchlength;
9909*22dc650dSSadaf Ebrahimi   if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9910*22dc650dSSadaf Ebrahimi   *bptr |= branchlength;  /* branchlength never more than 65535 */
9911*22dc650dSSadaf Ebrahimi   bptr = *pptrptr;
9912*22dc650dSSadaf Ebrahimi   }
9913*22dc650dSSadaf Ebrahimi while (META_CODE(*bptr) == META_ALT);
9914*22dc650dSSadaf Ebrahimi 
9915*22dc650dSSadaf Ebrahimi /* If any branch is of variable length, the whole lookbehind is of variable
9916*22dc650dSSadaf Ebrahimi length. If the maximum length of any branch exceeds the maximum for variable
9917*22dc650dSSadaf Ebrahimi lookbehinds, give an error. Otherwise, the minimum length is set in the word
9918*22dc650dSSadaf Ebrahimi that follows the original group META value. For a fixed-length lookbehind, this
9919*22dc650dSSadaf Ebrahimi is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
9920*22dc650dSSadaf Ebrahimi possibly different) length. */
9921*22dc650dSSadaf Ebrahimi 
9922*22dc650dSSadaf Ebrahimi if (variable)
9923*22dc650dSSadaf Ebrahimi   {
9924*22dc650dSSadaf Ebrahimi   gbptr[1] = minlength;
9925*22dc650dSSadaf Ebrahimi   if ((uint32_t)maxlength > cb->max_varlookbehind)
9926*22dc650dSSadaf Ebrahimi     {
9927*22dc650dSSadaf Ebrahimi     *errcodeptr = ERR100;
9928*22dc650dSSadaf Ebrahimi     cb->erroroffset = offset;
9929*22dc650dSSadaf Ebrahimi     return FALSE;
9930*22dc650dSSadaf Ebrahimi     }
9931*22dc650dSSadaf Ebrahimi   }
9932*22dc650dSSadaf Ebrahimi else gbptr[1] = LOOKBEHIND_MAX;
9933*22dc650dSSadaf Ebrahimi 
9934*22dc650dSSadaf Ebrahimi 
9935*22dc650dSSadaf Ebrahimi gbptr[1] = variable? minlength : LOOKBEHIND_MAX;
9936*22dc650dSSadaf Ebrahimi return TRUE;
9937*22dc650dSSadaf Ebrahimi }
9938*22dc650dSSadaf Ebrahimi 
9939*22dc650dSSadaf Ebrahimi 
9940*22dc650dSSadaf Ebrahimi 
9941*22dc650dSSadaf Ebrahimi /*************************************************
9942*22dc650dSSadaf Ebrahimi *         Check parsed pattern lookbehinds       *
9943*22dc650dSSadaf Ebrahimi *************************************************/
9944*22dc650dSSadaf Ebrahimi 
9945*22dc650dSSadaf Ebrahimi /* This function is called at the end of parsing a pattern if any lookbehinds
9946*22dc650dSSadaf Ebrahimi were encountered. It scans the parsed pattern for them, calling
9947*22dc650dSSadaf Ebrahimi set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9948*22dc650dSSadaf Ebrahimi the error offset is marked unset. The enables the functions above not to
9949*22dc650dSSadaf Ebrahimi override settings from deeper nestings.
9950*22dc650dSSadaf Ebrahimi 
9951*22dc650dSSadaf Ebrahimi This function is called recursively from get_branchlength() for lookaheads in
9952*22dc650dSSadaf Ebrahimi order to process any lookbehinds that they may contain. It stops when it hits a
9953*22dc650dSSadaf Ebrahimi non-nested closing parenthesis in this case, returning a pointer to it.
9954*22dc650dSSadaf Ebrahimi 
9955*22dc650dSSadaf Ebrahimi Arguments
9956*22dc650dSSadaf Ebrahimi   pptr      points to where to start (start of pattern or start of lookahead)
9957*22dc650dSSadaf Ebrahimi   retptr    if not NULL, return the ket pointer here
9958*22dc650dSSadaf Ebrahimi   recurses  chain of recurse_check to catch mutual recursion
9959*22dc650dSSadaf Ebrahimi   cb        points to the compile block
9960*22dc650dSSadaf Ebrahimi   lcptr     points to loop counter
9961*22dc650dSSadaf Ebrahimi 
9962*22dc650dSSadaf Ebrahimi Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
9963*22dc650dSSadaf Ebrahimi */
9964*22dc650dSSadaf Ebrahimi 
9965*22dc650dSSadaf Ebrahimi static int
check_lookbehinds(uint32_t * pptr,uint32_t ** retptr,parsed_recurse_check * recurses,compile_block * cb,int * lcptr)9966*22dc650dSSadaf Ebrahimi check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9967*22dc650dSSadaf Ebrahimi   parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9968*22dc650dSSadaf Ebrahimi {
9969*22dc650dSSadaf Ebrahimi int errorcode = 0;
9970*22dc650dSSadaf Ebrahimi int nestlevel = 0;
9971*22dc650dSSadaf Ebrahimi 
9972*22dc650dSSadaf Ebrahimi cb->erroroffset = PCRE2_UNSET;
9973*22dc650dSSadaf Ebrahimi 
9974*22dc650dSSadaf Ebrahimi for (; *pptr != META_END; pptr++)
9975*22dc650dSSadaf Ebrahimi   {
9976*22dc650dSSadaf Ebrahimi   if (*pptr < META_END) continue;  /* Literal */
9977*22dc650dSSadaf Ebrahimi 
9978*22dc650dSSadaf Ebrahimi   switch (META_CODE(*pptr))
9979*22dc650dSSadaf Ebrahimi     {
9980*22dc650dSSadaf Ebrahimi     default:
9981*22dc650dSSadaf Ebrahimi     return ERR70;  /* Unrecognized meta code */
9982*22dc650dSSadaf Ebrahimi 
9983*22dc650dSSadaf Ebrahimi     case META_ESCAPE:
9984*22dc650dSSadaf Ebrahimi     if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9985*22dc650dSSadaf Ebrahimi       pptr += 1;
9986*22dc650dSSadaf Ebrahimi     break;
9987*22dc650dSSadaf Ebrahimi 
9988*22dc650dSSadaf Ebrahimi     case META_KET:
9989*22dc650dSSadaf Ebrahimi     if (--nestlevel < 0)
9990*22dc650dSSadaf Ebrahimi       {
9991*22dc650dSSadaf Ebrahimi       if (retptr != NULL) *retptr = pptr;
9992*22dc650dSSadaf Ebrahimi       return 0;
9993*22dc650dSSadaf Ebrahimi       }
9994*22dc650dSSadaf Ebrahimi     break;
9995*22dc650dSSadaf Ebrahimi 
9996*22dc650dSSadaf Ebrahimi     case META_ATOMIC:
9997*22dc650dSSadaf Ebrahimi     case META_CAPTURE:
9998*22dc650dSSadaf Ebrahimi     case META_COND_ASSERT:
9999*22dc650dSSadaf Ebrahimi     case META_LOOKAHEAD:
10000*22dc650dSSadaf Ebrahimi     case META_LOOKAHEADNOT:
10001*22dc650dSSadaf Ebrahimi     case META_LOOKAHEAD_NA:
10002*22dc650dSSadaf Ebrahimi     case META_NOCAPTURE:
10003*22dc650dSSadaf Ebrahimi     case META_SCRIPT_RUN:
10004*22dc650dSSadaf Ebrahimi     nestlevel++;
10005*22dc650dSSadaf Ebrahimi     break;
10006*22dc650dSSadaf Ebrahimi 
10007*22dc650dSSadaf Ebrahimi     case META_ACCEPT:
10008*22dc650dSSadaf Ebrahimi     case META_ALT:
10009*22dc650dSSadaf Ebrahimi     case META_ASTERISK:
10010*22dc650dSSadaf Ebrahimi     case META_ASTERISK_PLUS:
10011*22dc650dSSadaf Ebrahimi     case META_ASTERISK_QUERY:
10012*22dc650dSSadaf Ebrahimi     case META_BACKREF:
10013*22dc650dSSadaf Ebrahimi     case META_CIRCUMFLEX:
10014*22dc650dSSadaf Ebrahimi     case META_CLASS:
10015*22dc650dSSadaf Ebrahimi     case META_CLASS_EMPTY:
10016*22dc650dSSadaf Ebrahimi     case META_CLASS_EMPTY_NOT:
10017*22dc650dSSadaf Ebrahimi     case META_CLASS_END:
10018*22dc650dSSadaf Ebrahimi     case META_CLASS_NOT:
10019*22dc650dSSadaf Ebrahimi     case META_COMMIT:
10020*22dc650dSSadaf Ebrahimi     case META_DOLLAR:
10021*22dc650dSSadaf Ebrahimi     case META_DOT:
10022*22dc650dSSadaf Ebrahimi     case META_FAIL:
10023*22dc650dSSadaf Ebrahimi     case META_PLUS:
10024*22dc650dSSadaf Ebrahimi     case META_PLUS_PLUS:
10025*22dc650dSSadaf Ebrahimi     case META_PLUS_QUERY:
10026*22dc650dSSadaf Ebrahimi     case META_PRUNE:
10027*22dc650dSSadaf Ebrahimi     case META_QUERY:
10028*22dc650dSSadaf Ebrahimi     case META_QUERY_PLUS:
10029*22dc650dSSadaf Ebrahimi     case META_QUERY_QUERY:
10030*22dc650dSSadaf Ebrahimi     case META_RANGE_ESCAPED:
10031*22dc650dSSadaf Ebrahimi     case META_RANGE_LITERAL:
10032*22dc650dSSadaf Ebrahimi     case META_SKIP:
10033*22dc650dSSadaf Ebrahimi     case META_THEN:
10034*22dc650dSSadaf Ebrahimi     break;
10035*22dc650dSSadaf Ebrahimi 
10036*22dc650dSSadaf Ebrahimi     case META_RECURSE:
10037*22dc650dSSadaf Ebrahimi     pptr += SIZEOFFSET;
10038*22dc650dSSadaf Ebrahimi     break;
10039*22dc650dSSadaf Ebrahimi 
10040*22dc650dSSadaf Ebrahimi     case META_BACKREF_BYNAME:
10041*22dc650dSSadaf Ebrahimi     case META_RECURSE_BYNAME:
10042*22dc650dSSadaf Ebrahimi     pptr += 1 + SIZEOFFSET;
10043*22dc650dSSadaf Ebrahimi     break;
10044*22dc650dSSadaf Ebrahimi 
10045*22dc650dSSadaf Ebrahimi     case META_COND_DEFINE:
10046*22dc650dSSadaf Ebrahimi     pptr += SIZEOFFSET;
10047*22dc650dSSadaf Ebrahimi     nestlevel++;
10048*22dc650dSSadaf Ebrahimi     break;
10049*22dc650dSSadaf Ebrahimi 
10050*22dc650dSSadaf Ebrahimi     case META_COND_NAME:
10051*22dc650dSSadaf Ebrahimi     case META_COND_NUMBER:
10052*22dc650dSSadaf Ebrahimi     case META_COND_RNAME:
10053*22dc650dSSadaf Ebrahimi     case META_COND_RNUMBER:
10054*22dc650dSSadaf Ebrahimi     pptr += 1 + SIZEOFFSET;
10055*22dc650dSSadaf Ebrahimi     nestlevel++;
10056*22dc650dSSadaf Ebrahimi     break;
10057*22dc650dSSadaf Ebrahimi 
10058*22dc650dSSadaf Ebrahimi     case META_COND_VERSION:
10059*22dc650dSSadaf Ebrahimi     pptr += 3;
10060*22dc650dSSadaf Ebrahimi     nestlevel++;
10061*22dc650dSSadaf Ebrahimi     break;
10062*22dc650dSSadaf Ebrahimi 
10063*22dc650dSSadaf Ebrahimi     case META_CALLOUT_STRING:
10064*22dc650dSSadaf Ebrahimi     pptr += 3 + SIZEOFFSET;
10065*22dc650dSSadaf Ebrahimi     break;
10066*22dc650dSSadaf Ebrahimi 
10067*22dc650dSSadaf Ebrahimi     case META_BIGVALUE:
10068*22dc650dSSadaf Ebrahimi     case META_POSIX:
10069*22dc650dSSadaf Ebrahimi     case META_POSIX_NEG:
10070*22dc650dSSadaf Ebrahimi     pptr += 1;
10071*22dc650dSSadaf Ebrahimi     break;
10072*22dc650dSSadaf Ebrahimi 
10073*22dc650dSSadaf Ebrahimi     case META_MINMAX:
10074*22dc650dSSadaf Ebrahimi     case META_MINMAX_QUERY:
10075*22dc650dSSadaf Ebrahimi     case META_MINMAX_PLUS:
10076*22dc650dSSadaf Ebrahimi     case META_OPTIONS:
10077*22dc650dSSadaf Ebrahimi     pptr += 2;
10078*22dc650dSSadaf Ebrahimi     break;
10079*22dc650dSSadaf Ebrahimi 
10080*22dc650dSSadaf Ebrahimi     case META_CALLOUT_NUMBER:
10081*22dc650dSSadaf Ebrahimi     pptr += 3;
10082*22dc650dSSadaf Ebrahimi     break;
10083*22dc650dSSadaf Ebrahimi 
10084*22dc650dSSadaf Ebrahimi     case META_MARK:
10085*22dc650dSSadaf Ebrahimi     case META_COMMIT_ARG:
10086*22dc650dSSadaf Ebrahimi     case META_PRUNE_ARG:
10087*22dc650dSSadaf Ebrahimi     case META_SKIP_ARG:
10088*22dc650dSSadaf Ebrahimi     case META_THEN_ARG:
10089*22dc650dSSadaf Ebrahimi     pptr += 1 + pptr[1];
10090*22dc650dSSadaf Ebrahimi     break;
10091*22dc650dSSadaf Ebrahimi 
10092*22dc650dSSadaf Ebrahimi     case META_LOOKBEHIND:
10093*22dc650dSSadaf Ebrahimi     case META_LOOKBEHINDNOT:
10094*22dc650dSSadaf Ebrahimi     case META_LOOKBEHIND_NA:
10095*22dc650dSSadaf Ebrahimi     if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10096*22dc650dSSadaf Ebrahimi       return errorcode;
10097*22dc650dSSadaf Ebrahimi     break;
10098*22dc650dSSadaf Ebrahimi     }
10099*22dc650dSSadaf Ebrahimi   }
10100*22dc650dSSadaf Ebrahimi 
10101*22dc650dSSadaf Ebrahimi return 0;
10102*22dc650dSSadaf Ebrahimi }
10103*22dc650dSSadaf Ebrahimi 
10104*22dc650dSSadaf Ebrahimi 
10105*22dc650dSSadaf Ebrahimi 
10106*22dc650dSSadaf Ebrahimi /*************************************************
10107*22dc650dSSadaf Ebrahimi *     External function to compile a pattern     *
10108*22dc650dSSadaf Ebrahimi *************************************************/
10109*22dc650dSSadaf Ebrahimi 
10110*22dc650dSSadaf Ebrahimi /* This function reads a regular expression in the form of a string and returns
10111*22dc650dSSadaf Ebrahimi a pointer to a block of store holding a compiled version of the expression.
10112*22dc650dSSadaf Ebrahimi 
10113*22dc650dSSadaf Ebrahimi Arguments:
10114*22dc650dSSadaf Ebrahimi   pattern       the regular expression
10115*22dc650dSSadaf Ebrahimi   patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
10116*22dc650dSSadaf Ebrahimi   options       option bits
10117*22dc650dSSadaf Ebrahimi   errorptr      pointer to errorcode
10118*22dc650dSSadaf Ebrahimi   erroroffset   pointer to error offset
10119*22dc650dSSadaf Ebrahimi   ccontext      points to a compile context or is NULL
10120*22dc650dSSadaf Ebrahimi 
10121*22dc650dSSadaf Ebrahimi Returns:        pointer to compiled data block, or NULL on error,
10122*22dc650dSSadaf Ebrahimi                 with errorcode and erroroffset set
10123*22dc650dSSadaf Ebrahimi */
10124*22dc650dSSadaf Ebrahimi 
10125*22dc650dSSadaf Ebrahimi PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)10126*22dc650dSSadaf Ebrahimi pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10127*22dc650dSSadaf Ebrahimi    int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10128*22dc650dSSadaf Ebrahimi {
10129*22dc650dSSadaf Ebrahimi BOOL utf;                             /* Set TRUE for UTF mode */
10130*22dc650dSSadaf Ebrahimi BOOL ucp;                             /* Set TRUE for UCP mode */
10131*22dc650dSSadaf Ebrahimi BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
10132*22dc650dSSadaf Ebrahimi BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
10133*22dc650dSSadaf Ebrahimi pcre2_real_code *re = NULL;           /* What we will return */
10134*22dc650dSSadaf Ebrahimi compile_block cb;                     /* "Static" compile-time data */
10135*22dc650dSSadaf Ebrahimi const uint8_t *tables;                /* Char tables base pointer */
10136*22dc650dSSadaf Ebrahimi 
10137*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
10138*22dc650dSSadaf Ebrahimi PCRE2_SPTR codestart;                 /* Start of compiled code */
10139*22dc650dSSadaf Ebrahimi PCRE2_SPTR ptr;                       /* Current pointer in pattern */
10140*22dc650dSSadaf Ebrahimi uint32_t *pptr;                       /* Current pointer in parsed pattern */
10141*22dc650dSSadaf Ebrahimi 
10142*22dc650dSSadaf Ebrahimi PCRE2_SIZE length = 1;                /* Allow for final END opcode */
10143*22dc650dSSadaf Ebrahimi PCRE2_SIZE usedlength;                /* Actual length used */
10144*22dc650dSSadaf Ebrahimi PCRE2_SIZE re_blocksize;              /* Size of memory block */
10145*22dc650dSSadaf Ebrahimi PCRE2_SIZE big32count = 0;            /* 32-bit literals >= 0x80000000 */
10146*22dc650dSSadaf Ebrahimi PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
10147*22dc650dSSadaf Ebrahimi 
10148*22dc650dSSadaf Ebrahimi uint32_t firstcuflags, reqcuflags;    /* Type of first/req code unit */
10149*22dc650dSSadaf Ebrahimi uint32_t firstcu, reqcu;              /* Value of first/req code unit */
10150*22dc650dSSadaf Ebrahimi uint32_t setflags = 0;                /* NL and BSR set flags */
10151*22dc650dSSadaf Ebrahimi 
10152*22dc650dSSadaf Ebrahimi uint32_t skipatstart;                 /* When checking (*UTF) etc */
10153*22dc650dSSadaf Ebrahimi uint32_t limit_heap  = UINT32_MAX;
10154*22dc650dSSadaf Ebrahimi uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
10155*22dc650dSSadaf Ebrahimi uint32_t limit_depth = UINT32_MAX;
10156*22dc650dSSadaf Ebrahimi 
10157*22dc650dSSadaf Ebrahimi int newline = 0;                      /* Unset; can be set by the pattern */
10158*22dc650dSSadaf Ebrahimi int bsr = 0;                          /* Unset; can be set by the pattern */
10159*22dc650dSSadaf Ebrahimi int errorcode = 0;                    /* Initialize to avoid compiler warn */
10160*22dc650dSSadaf Ebrahimi int regexrc;                          /* Return from compile */
10161*22dc650dSSadaf Ebrahimi 
10162*22dc650dSSadaf Ebrahimi uint32_t i;                           /* Local loop counter */
10163*22dc650dSSadaf Ebrahimi 
10164*22dc650dSSadaf Ebrahimi /* Comments at the head of this file explain about these variables. */
10165*22dc650dSSadaf Ebrahimi 
10166*22dc650dSSadaf Ebrahimi uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10167*22dc650dSSadaf Ebrahimi uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10168*22dc650dSSadaf Ebrahimi named_group named_groups[NAMED_GROUP_LIST_SIZE];
10169*22dc650dSSadaf Ebrahimi 
10170*22dc650dSSadaf Ebrahimi /* The workspace is used in different ways in the different compiling phases.
10171*22dc650dSSadaf Ebrahimi It needs to be 16-bit aligned for the preliminary parsing scan. */
10172*22dc650dSSadaf Ebrahimi 
10173*22dc650dSSadaf Ebrahimi uint32_t c16workspace[C16_WORK_SIZE];
10174*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10175*22dc650dSSadaf Ebrahimi 
10176*22dc650dSSadaf Ebrahimi 
10177*22dc650dSSadaf Ebrahimi /* -------------- Check arguments and set up the pattern ----------------- */
10178*22dc650dSSadaf Ebrahimi 
10179*22dc650dSSadaf Ebrahimi /* There must be error code and offset pointers. */
10180*22dc650dSSadaf Ebrahimi 
10181*22dc650dSSadaf Ebrahimi if (errorptr == NULL || erroroffset == NULL) return NULL;
10182*22dc650dSSadaf Ebrahimi *errorptr = ERR0;
10183*22dc650dSSadaf Ebrahimi *erroroffset = 0;
10184*22dc650dSSadaf Ebrahimi 
10185*22dc650dSSadaf Ebrahimi /* There must be a pattern, but NULL is allowed with zero length. */
10186*22dc650dSSadaf Ebrahimi 
10187*22dc650dSSadaf Ebrahimi if (pattern == NULL)
10188*22dc650dSSadaf Ebrahimi   {
10189*22dc650dSSadaf Ebrahimi   if (patlen == 0) pattern = (PCRE2_SPTR)""; else
10190*22dc650dSSadaf Ebrahimi     {
10191*22dc650dSSadaf Ebrahimi     *errorptr = ERR16;
10192*22dc650dSSadaf Ebrahimi     return NULL;
10193*22dc650dSSadaf Ebrahimi     }
10194*22dc650dSSadaf Ebrahimi   }
10195*22dc650dSSadaf Ebrahimi 
10196*22dc650dSSadaf Ebrahimi /* A NULL compile context means "use a default context" */
10197*22dc650dSSadaf Ebrahimi 
10198*22dc650dSSadaf Ebrahimi if (ccontext == NULL)
10199*22dc650dSSadaf Ebrahimi   ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10200*22dc650dSSadaf Ebrahimi 
10201*22dc650dSSadaf Ebrahimi /* PCRE2_MATCH_INVALID_UTF implies UTF */
10202*22dc650dSSadaf Ebrahimi 
10203*22dc650dSSadaf Ebrahimi if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10204*22dc650dSSadaf Ebrahimi 
10205*22dc650dSSadaf Ebrahimi /* Check that all undefined public option bits are zero. */
10206*22dc650dSSadaf Ebrahimi 
10207*22dc650dSSadaf Ebrahimi if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10208*22dc650dSSadaf Ebrahimi     (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10209*22dc650dSSadaf Ebrahimi   {
10210*22dc650dSSadaf Ebrahimi   *errorptr = ERR17;
10211*22dc650dSSadaf Ebrahimi   return NULL;
10212*22dc650dSSadaf Ebrahimi   }
10213*22dc650dSSadaf Ebrahimi 
10214*22dc650dSSadaf Ebrahimi if ((options & PCRE2_LITERAL) != 0 &&
10215*22dc650dSSadaf Ebrahimi     ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10216*22dc650dSSadaf Ebrahimi      (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10217*22dc650dSSadaf Ebrahimi   {
10218*22dc650dSSadaf Ebrahimi   *errorptr = ERR92;
10219*22dc650dSSadaf Ebrahimi   return NULL;
10220*22dc650dSSadaf Ebrahimi   }
10221*22dc650dSSadaf Ebrahimi 
10222*22dc650dSSadaf Ebrahimi /* A zero-terminated pattern is indicated by the special length value
10223*22dc650dSSadaf Ebrahimi PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10224*22dc650dSSadaf Ebrahimi 
10225*22dc650dSSadaf Ebrahimi if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10226*22dc650dSSadaf Ebrahimi   patlen = PRIV(strlen)(pattern);
10227*22dc650dSSadaf Ebrahimi 
10228*22dc650dSSadaf Ebrahimi if (patlen > ccontext->max_pattern_length)
10229*22dc650dSSadaf Ebrahimi   {
10230*22dc650dSSadaf Ebrahimi   *errorptr = ERR88;
10231*22dc650dSSadaf Ebrahimi   return NULL;
10232*22dc650dSSadaf Ebrahimi   }
10233*22dc650dSSadaf Ebrahimi 
10234*22dc650dSSadaf Ebrahimi /* From here on, all returns from this function should end up going via the
10235*22dc650dSSadaf Ebrahimi EXIT label. */
10236*22dc650dSSadaf Ebrahimi 
10237*22dc650dSSadaf Ebrahimi 
10238*22dc650dSSadaf Ebrahimi /* ------------ Initialize the "static" compile data -------------- */
10239*22dc650dSSadaf Ebrahimi 
10240*22dc650dSSadaf Ebrahimi tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10241*22dc650dSSadaf Ebrahimi 
10242*22dc650dSSadaf Ebrahimi cb.lcc = tables + lcc_offset;          /* Individual */
10243*22dc650dSSadaf Ebrahimi cb.fcc = tables + fcc_offset;          /*   character */
10244*22dc650dSSadaf Ebrahimi cb.cbits = tables + cbits_offset;      /*      tables */
10245*22dc650dSSadaf Ebrahimi cb.ctypes = tables + ctypes_offset;
10246*22dc650dSSadaf Ebrahimi 
10247*22dc650dSSadaf Ebrahimi cb.assert_depth = 0;
10248*22dc650dSSadaf Ebrahimi cb.bracount = 0;
10249*22dc650dSSadaf Ebrahimi cb.cx = ccontext;
10250*22dc650dSSadaf Ebrahimi cb.dupnames = FALSE;
10251*22dc650dSSadaf Ebrahimi cb.end_pattern = pattern + patlen;
10252*22dc650dSSadaf Ebrahimi cb.erroroffset = 0;
10253*22dc650dSSadaf Ebrahimi cb.external_flags = 0;
10254*22dc650dSSadaf Ebrahimi cb.external_options = options;
10255*22dc650dSSadaf Ebrahimi cb.groupinfo = stack_groupinfo;
10256*22dc650dSSadaf Ebrahimi cb.had_recurse = FALSE;
10257*22dc650dSSadaf Ebrahimi cb.lastcapture = 0;
10258*22dc650dSSadaf Ebrahimi cb.max_lookbehind = 0;                               /* Max encountered */
10259*22dc650dSSadaf Ebrahimi cb.max_varlookbehind = ccontext->max_varlookbehind;  /* Limit */
10260*22dc650dSSadaf Ebrahimi cb.name_entry_size = 0;
10261*22dc650dSSadaf Ebrahimi cb.name_table = NULL;
10262*22dc650dSSadaf Ebrahimi cb.named_groups = named_groups;
10263*22dc650dSSadaf Ebrahimi cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10264*22dc650dSSadaf Ebrahimi cb.names_found = 0;
10265*22dc650dSSadaf Ebrahimi cb.parens_depth = 0;
10266*22dc650dSSadaf Ebrahimi cb.parsed_pattern = stack_parsed_pattern;
10267*22dc650dSSadaf Ebrahimi cb.req_varyopt = 0;
10268*22dc650dSSadaf Ebrahimi cb.start_code = cworkspace;
10269*22dc650dSSadaf Ebrahimi cb.start_pattern = pattern;
10270*22dc650dSSadaf Ebrahimi cb.start_workspace = cworkspace;
10271*22dc650dSSadaf Ebrahimi cb.workspace_size = COMPILE_WORK_SIZE;
10272*22dc650dSSadaf Ebrahimi 
10273*22dc650dSSadaf Ebrahimi /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10274*22dc650dSSadaf Ebrahimi references to help in deciding whether (.*) can be treated as anchored or not.
10275*22dc650dSSadaf Ebrahimi */
10276*22dc650dSSadaf Ebrahimi 
10277*22dc650dSSadaf Ebrahimi cb.top_backref = 0;
10278*22dc650dSSadaf Ebrahimi cb.backref_map = 0;
10279*22dc650dSSadaf Ebrahimi 
10280*22dc650dSSadaf Ebrahimi /* Escape sequences \1 to \9 are always back references, but as they are only
10281*22dc650dSSadaf Ebrahimi two characters long, only two elements can be used in the parsed_pattern
10282*22dc650dSSadaf Ebrahimi vector. The first contains the reference, and we'd like to use the second to
10283*22dc650dSSadaf Ebrahimi record the offset in the pattern, so that forward references to non-existent
10284*22dc650dSSadaf Ebrahimi groups can be diagnosed later with an offset. However, on 64-bit systems,
10285*22dc650dSSadaf Ebrahimi PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10286*22dc650dSSadaf Ebrahimi occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10287*22dc650dSSadaf Ebrahimi references have enough space for the offset to be put into the parsed pattern.
10288*22dc650dSSadaf Ebrahimi */
10289*22dc650dSSadaf Ebrahimi 
10290*22dc650dSSadaf Ebrahimi for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10291*22dc650dSSadaf Ebrahimi 
10292*22dc650dSSadaf Ebrahimi 
10293*22dc650dSSadaf Ebrahimi /* --------------- Start looking at the pattern --------------- */
10294*22dc650dSSadaf Ebrahimi 
10295*22dc650dSSadaf Ebrahimi /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10296*22dc650dSSadaf Ebrahimi the start of the pattern, and remember the offset to the actual regex. With
10297*22dc650dSSadaf Ebrahimi valgrind support, make the terminator of a zero-terminated pattern
10298*22dc650dSSadaf Ebrahimi inaccessible. This catches bugs that would otherwise only show up for
10299*22dc650dSSadaf Ebrahimi non-zero-terminated patterns. */
10300*22dc650dSSadaf Ebrahimi 
10301*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_VALGRIND
10302*22dc650dSSadaf Ebrahimi if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10303*22dc650dSSadaf Ebrahimi #endif
10304*22dc650dSSadaf Ebrahimi 
10305*22dc650dSSadaf Ebrahimi ptr = pattern;
10306*22dc650dSSadaf Ebrahimi skipatstart = 0;
10307*22dc650dSSadaf Ebrahimi 
10308*22dc650dSSadaf Ebrahimi if ((options & PCRE2_LITERAL) == 0)
10309*22dc650dSSadaf Ebrahimi   {
10310*22dc650dSSadaf Ebrahimi   while (patlen - skipatstart >= 2 &&
10311*22dc650dSSadaf Ebrahimi          ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10312*22dc650dSSadaf Ebrahimi          ptr[skipatstart+1] == CHAR_ASTERISK)
10313*22dc650dSSadaf Ebrahimi     {
10314*22dc650dSSadaf Ebrahimi     for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10315*22dc650dSSadaf Ebrahimi       {
10316*22dc650dSSadaf Ebrahimi       uint32_t c, pp;
10317*22dc650dSSadaf Ebrahimi       const pso *p = pso_list + i;
10318*22dc650dSSadaf Ebrahimi 
10319*22dc650dSSadaf Ebrahimi       if (patlen - skipatstart - 2 >= p->length &&
10320*22dc650dSSadaf Ebrahimi           PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
10321*22dc650dSSadaf Ebrahimi             p->length) == 0)
10322*22dc650dSSadaf Ebrahimi         {
10323*22dc650dSSadaf Ebrahimi         skipatstart += p->length + 2;
10324*22dc650dSSadaf Ebrahimi         switch(p->type)
10325*22dc650dSSadaf Ebrahimi           {
10326*22dc650dSSadaf Ebrahimi           case PSO_OPT:
10327*22dc650dSSadaf Ebrahimi           cb.external_options |= p->value;
10328*22dc650dSSadaf Ebrahimi           break;
10329*22dc650dSSadaf Ebrahimi 
10330*22dc650dSSadaf Ebrahimi           case PSO_FLG:
10331*22dc650dSSadaf Ebrahimi           setflags |= p->value;
10332*22dc650dSSadaf Ebrahimi           break;
10333*22dc650dSSadaf Ebrahimi 
10334*22dc650dSSadaf Ebrahimi           case PSO_NL:
10335*22dc650dSSadaf Ebrahimi           newline = p->value;
10336*22dc650dSSadaf Ebrahimi           setflags |= PCRE2_NL_SET;
10337*22dc650dSSadaf Ebrahimi           break;
10338*22dc650dSSadaf Ebrahimi 
10339*22dc650dSSadaf Ebrahimi           case PSO_BSR:
10340*22dc650dSSadaf Ebrahimi           bsr = p->value;
10341*22dc650dSSadaf Ebrahimi           setflags |= PCRE2_BSR_SET;
10342*22dc650dSSadaf Ebrahimi           break;
10343*22dc650dSSadaf Ebrahimi 
10344*22dc650dSSadaf Ebrahimi           case PSO_LIMM:
10345*22dc650dSSadaf Ebrahimi           case PSO_LIMD:
10346*22dc650dSSadaf Ebrahimi           case PSO_LIMH:
10347*22dc650dSSadaf Ebrahimi           c = 0;
10348*22dc650dSSadaf Ebrahimi           pp = skipatstart;
10349*22dc650dSSadaf Ebrahimi           if (!IS_DIGIT(ptr[pp]))
10350*22dc650dSSadaf Ebrahimi             {
10351*22dc650dSSadaf Ebrahimi             errorcode = ERR60;
10352*22dc650dSSadaf Ebrahimi             ptr += pp;
10353*22dc650dSSadaf Ebrahimi             goto HAD_EARLY_ERROR;
10354*22dc650dSSadaf Ebrahimi             }
10355*22dc650dSSadaf Ebrahimi           while (IS_DIGIT(ptr[pp]))
10356*22dc650dSSadaf Ebrahimi             {
10357*22dc650dSSadaf Ebrahimi             if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
10358*22dc650dSSadaf Ebrahimi             c = c*10 + (ptr[pp++] - CHAR_0);
10359*22dc650dSSadaf Ebrahimi             }
10360*22dc650dSSadaf Ebrahimi           if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
10361*22dc650dSSadaf Ebrahimi             {
10362*22dc650dSSadaf Ebrahimi             errorcode = ERR60;
10363*22dc650dSSadaf Ebrahimi             ptr += pp;
10364*22dc650dSSadaf Ebrahimi             goto HAD_EARLY_ERROR;
10365*22dc650dSSadaf Ebrahimi             }
10366*22dc650dSSadaf Ebrahimi           if (p->type == PSO_LIMH) limit_heap = c;
10367*22dc650dSSadaf Ebrahimi             else if (p->type == PSO_LIMM) limit_match = c;
10368*22dc650dSSadaf Ebrahimi             else limit_depth = c;
10369*22dc650dSSadaf Ebrahimi           skipatstart += pp - skipatstart;
10370*22dc650dSSadaf Ebrahimi           break;
10371*22dc650dSSadaf Ebrahimi           }
10372*22dc650dSSadaf Ebrahimi         break;   /* Out of the table scan loop */
10373*22dc650dSSadaf Ebrahimi         }
10374*22dc650dSSadaf Ebrahimi       }
10375*22dc650dSSadaf Ebrahimi     if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
10376*22dc650dSSadaf Ebrahimi     }
10377*22dc650dSSadaf Ebrahimi   }
10378*22dc650dSSadaf Ebrahimi 
10379*22dc650dSSadaf Ebrahimi /* End of pattern-start options; advance to start of real regex. */
10380*22dc650dSSadaf Ebrahimi 
10381*22dc650dSSadaf Ebrahimi ptr += skipatstart;
10382*22dc650dSSadaf Ebrahimi 
10383*22dc650dSSadaf Ebrahimi /* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10384*22dc650dSSadaf Ebrahimi 
10385*22dc650dSSadaf Ebrahimi #ifndef SUPPORT_UNICODE
10386*22dc650dSSadaf Ebrahimi if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10387*22dc650dSSadaf Ebrahimi   {
10388*22dc650dSSadaf Ebrahimi   errorcode = ERR32;
10389*22dc650dSSadaf Ebrahimi   goto HAD_EARLY_ERROR;
10390*22dc650dSSadaf Ebrahimi   }
10391*22dc650dSSadaf Ebrahimi #endif
10392*22dc650dSSadaf Ebrahimi 
10393*22dc650dSSadaf Ebrahimi /* Check UTF. We have the original options in 'options', with that value as
10394*22dc650dSSadaf Ebrahimi modified by (*UTF) etc in cb.external_options. The extra option
10395*22dc650dSSadaf Ebrahimi PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10396*22dc650dSSadaf Ebrahimi surrogate code points cannot be represented in UTF-16. */
10397*22dc650dSSadaf Ebrahimi 
10398*22dc650dSSadaf Ebrahimi utf = (cb.external_options & PCRE2_UTF) != 0;
10399*22dc650dSSadaf Ebrahimi if (utf)
10400*22dc650dSSadaf Ebrahimi   {
10401*22dc650dSSadaf Ebrahimi   if ((options & PCRE2_NEVER_UTF) != 0)
10402*22dc650dSSadaf Ebrahimi     {
10403*22dc650dSSadaf Ebrahimi     errorcode = ERR74;
10404*22dc650dSSadaf Ebrahimi     goto HAD_EARLY_ERROR;
10405*22dc650dSSadaf Ebrahimi     }
10406*22dc650dSSadaf Ebrahimi   if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10407*22dc650dSSadaf Ebrahimi        (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10408*22dc650dSSadaf Ebrahimi     goto HAD_ERROR;  /* Offset was set by valid_utf() */
10409*22dc650dSSadaf Ebrahimi 
10410*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 16
10411*22dc650dSSadaf Ebrahimi   if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10412*22dc650dSSadaf Ebrahimi     {
10413*22dc650dSSadaf Ebrahimi     errorcode = ERR91;
10414*22dc650dSSadaf Ebrahimi     goto HAD_EARLY_ERROR;
10415*22dc650dSSadaf Ebrahimi     }
10416*22dc650dSSadaf Ebrahimi #endif
10417*22dc650dSSadaf Ebrahimi   }
10418*22dc650dSSadaf Ebrahimi 
10419*22dc650dSSadaf Ebrahimi /* Check UCP lockout. */
10420*22dc650dSSadaf Ebrahimi 
10421*22dc650dSSadaf Ebrahimi ucp = (cb.external_options & PCRE2_UCP) != 0;
10422*22dc650dSSadaf Ebrahimi if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10423*22dc650dSSadaf Ebrahimi   {
10424*22dc650dSSadaf Ebrahimi   errorcode = ERR75;
10425*22dc650dSSadaf Ebrahimi   goto HAD_EARLY_ERROR;
10426*22dc650dSSadaf Ebrahimi   }
10427*22dc650dSSadaf Ebrahimi 
10428*22dc650dSSadaf Ebrahimi /* Process the BSR setting. */
10429*22dc650dSSadaf Ebrahimi 
10430*22dc650dSSadaf Ebrahimi if (bsr == 0) bsr = ccontext->bsr_convention;
10431*22dc650dSSadaf Ebrahimi 
10432*22dc650dSSadaf Ebrahimi /* Process the newline setting. */
10433*22dc650dSSadaf Ebrahimi 
10434*22dc650dSSadaf Ebrahimi if (newline == 0) newline = ccontext->newline_convention;
10435*22dc650dSSadaf Ebrahimi cb.nltype = NLTYPE_FIXED;
10436*22dc650dSSadaf Ebrahimi switch(newline)
10437*22dc650dSSadaf Ebrahimi   {
10438*22dc650dSSadaf Ebrahimi   case PCRE2_NEWLINE_CR:
10439*22dc650dSSadaf Ebrahimi   cb.nllen = 1;
10440*22dc650dSSadaf Ebrahimi   cb.nl[0] = CHAR_CR;
10441*22dc650dSSadaf Ebrahimi   break;
10442*22dc650dSSadaf Ebrahimi 
10443*22dc650dSSadaf Ebrahimi   case PCRE2_NEWLINE_LF:
10444*22dc650dSSadaf Ebrahimi   cb.nllen = 1;
10445*22dc650dSSadaf Ebrahimi   cb.nl[0] = CHAR_NL;
10446*22dc650dSSadaf Ebrahimi   break;
10447*22dc650dSSadaf Ebrahimi 
10448*22dc650dSSadaf Ebrahimi   case PCRE2_NEWLINE_NUL:
10449*22dc650dSSadaf Ebrahimi   cb.nllen = 1;
10450*22dc650dSSadaf Ebrahimi   cb.nl[0] = CHAR_NUL;
10451*22dc650dSSadaf Ebrahimi   break;
10452*22dc650dSSadaf Ebrahimi 
10453*22dc650dSSadaf Ebrahimi   case PCRE2_NEWLINE_CRLF:
10454*22dc650dSSadaf Ebrahimi   cb.nllen = 2;
10455*22dc650dSSadaf Ebrahimi   cb.nl[0] = CHAR_CR;
10456*22dc650dSSadaf Ebrahimi   cb.nl[1] = CHAR_NL;
10457*22dc650dSSadaf Ebrahimi   break;
10458*22dc650dSSadaf Ebrahimi 
10459*22dc650dSSadaf Ebrahimi   case PCRE2_NEWLINE_ANY:
10460*22dc650dSSadaf Ebrahimi   cb.nltype = NLTYPE_ANY;
10461*22dc650dSSadaf Ebrahimi   break;
10462*22dc650dSSadaf Ebrahimi 
10463*22dc650dSSadaf Ebrahimi   case PCRE2_NEWLINE_ANYCRLF:
10464*22dc650dSSadaf Ebrahimi   cb.nltype = NLTYPE_ANYCRLF;
10465*22dc650dSSadaf Ebrahimi   break;
10466*22dc650dSSadaf Ebrahimi 
10467*22dc650dSSadaf Ebrahimi   default:
10468*22dc650dSSadaf Ebrahimi   errorcode = ERR56;
10469*22dc650dSSadaf Ebrahimi   goto HAD_EARLY_ERROR;
10470*22dc650dSSadaf Ebrahimi   }
10471*22dc650dSSadaf Ebrahimi 
10472*22dc650dSSadaf Ebrahimi /* Pre-scan the pattern to do two things: (1) Discover the named groups and
10473*22dc650dSSadaf Ebrahimi their numerical equivalents, so that this information is always available for
10474*22dc650dSSadaf Ebrahimi the remaining processing. (2) At the same time, parse the pattern and put a
10475*22dc650dSSadaf Ebrahimi processed version into the parsed_pattern vector. This has escapes interpreted
10476*22dc650dSSadaf Ebrahimi and comments removed (amongst other things).
10477*22dc650dSSadaf Ebrahimi 
10478*22dc650dSSadaf Ebrahimi In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
10479*22dc650dSSadaf Ebrahimi 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
10480*22dc650dSSadaf Ebrahimi one (for the terminator) plus four if PCRE2_EXTRA_MATCH_WORD or PCRE2_EXTRA_MATCH_LINE is
10481*22dc650dSSadaf Ebrahimi set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
10482*22dc650dSSadaf Ebrahimi characters greater than META_END (0x80000000) have to be coded as two units. In
10483*22dc650dSSadaf Ebrahimi this case, therefore, we scan the pattern to check for such values. */
10484*22dc650dSSadaf Ebrahimi 
10485*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32
10486*22dc650dSSadaf Ebrahimi if (!utf)
10487*22dc650dSSadaf Ebrahimi   {
10488*22dc650dSSadaf Ebrahimi   PCRE2_SPTR p;
10489*22dc650dSSadaf Ebrahimi   for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10490*22dc650dSSadaf Ebrahimi   }
10491*22dc650dSSadaf Ebrahimi #endif
10492*22dc650dSSadaf Ebrahimi 
10493*22dc650dSSadaf Ebrahimi /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10494*22dc650dSSadaf Ebrahimi is set we have to assume a numerical callout (4 elements) for each character
10495*22dc650dSSadaf Ebrahimi plus one at the end. This is overkill, but memory is plentiful these days. For
10496*22dc650dSSadaf Ebrahimi many smaller patterns the vector on the stack (which was set up above) can be
10497*22dc650dSSadaf Ebrahimi used. */
10498*22dc650dSSadaf Ebrahimi 
10499*22dc650dSSadaf Ebrahimi parsed_size_needed = patlen - skipatstart + big32count;
10500*22dc650dSSadaf Ebrahimi 
10501*22dc650dSSadaf Ebrahimi if ((ccontext->extra_options &
10502*22dc650dSSadaf Ebrahimi      (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10503*22dc650dSSadaf Ebrahimi   parsed_size_needed += 4;
10504*22dc650dSSadaf Ebrahimi 
10505*22dc650dSSadaf Ebrahimi if ((options & PCRE2_AUTO_CALLOUT) != 0)
10506*22dc650dSSadaf Ebrahimi   parsed_size_needed = (parsed_size_needed + 1) * 5;
10507*22dc650dSSadaf Ebrahimi 
10508*22dc650dSSadaf Ebrahimi if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10509*22dc650dSSadaf Ebrahimi   {
10510*22dc650dSSadaf Ebrahimi   uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10511*22dc650dSSadaf Ebrahimi     (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10512*22dc650dSSadaf Ebrahimi   if (heap_parsed_pattern == NULL)
10513*22dc650dSSadaf Ebrahimi     {
10514*22dc650dSSadaf Ebrahimi     *errorptr = ERR21;
10515*22dc650dSSadaf Ebrahimi     goto EXIT;
10516*22dc650dSSadaf Ebrahimi     }
10517*22dc650dSSadaf Ebrahimi   cb.parsed_pattern = heap_parsed_pattern;
10518*22dc650dSSadaf Ebrahimi   }
10519*22dc650dSSadaf Ebrahimi cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10520*22dc650dSSadaf Ebrahimi 
10521*22dc650dSSadaf Ebrahimi /* Do the parsing scan. */
10522*22dc650dSSadaf Ebrahimi 
10523*22dc650dSSadaf Ebrahimi errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10524*22dc650dSSadaf Ebrahimi if (errorcode != 0) goto HAD_CB_ERROR;
10525*22dc650dSSadaf Ebrahimi 
10526*22dc650dSSadaf Ebrahimi /* If there are any lookbehinds, scan the parsed pattern to figure out their
10527*22dc650dSSadaf Ebrahimi lengths. Workspace is needed to remember whether numbered groups are or are not
10528*22dc650dSSadaf Ebrahimi of limited length, and if limited, what the minimum and maximum lengths are.
10529*22dc650dSSadaf Ebrahimi This caching saves re-computing the length of any group that is referenced more
10530*22dc650dSSadaf Ebrahimi than once, which is particularly relevant when recursion is involved.
10531*22dc650dSSadaf Ebrahimi Unnumbered groups do not have this exposure because they cannot be referenced.
10532*22dc650dSSadaf Ebrahimi If there are sufficiently few groups, the default index vector on the stack, as
10533*22dc650dSSadaf Ebrahimi set up above, can be used. Otherwise we have to get/free some heap memory. The
10534*22dc650dSSadaf Ebrahimi vector must be initialized to zero. */
10535*22dc650dSSadaf Ebrahimi 
10536*22dc650dSSadaf Ebrahimi if (has_lookbehind)
10537*22dc650dSSadaf Ebrahimi   {
10538*22dc650dSSadaf Ebrahimi   int loopcount = 0;
10539*22dc650dSSadaf Ebrahimi   if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10540*22dc650dSSadaf Ebrahimi     {
10541*22dc650dSSadaf Ebrahimi     cb.groupinfo = ccontext->memctl.malloc(
10542*22dc650dSSadaf Ebrahimi       (2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10543*22dc650dSSadaf Ebrahimi     if (cb.groupinfo == NULL)
10544*22dc650dSSadaf Ebrahimi       {
10545*22dc650dSSadaf Ebrahimi       errorcode = ERR21;
10546*22dc650dSSadaf Ebrahimi       cb.erroroffset = 0;
10547*22dc650dSSadaf Ebrahimi       goto HAD_CB_ERROR;
10548*22dc650dSSadaf Ebrahimi       }
10549*22dc650dSSadaf Ebrahimi     }
10550*22dc650dSSadaf Ebrahimi   memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10551*22dc650dSSadaf Ebrahimi   errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10552*22dc650dSSadaf Ebrahimi   if (errorcode != 0) goto HAD_CB_ERROR;
10553*22dc650dSSadaf Ebrahimi   }
10554*22dc650dSSadaf Ebrahimi 
10555*22dc650dSSadaf Ebrahimi /* For debugging, there is a function that shows the parsed pattern vector. */
10556*22dc650dSSadaf Ebrahimi 
10557*22dc650dSSadaf Ebrahimi #ifdef DEBUG_SHOW_PARSED
10558*22dc650dSSadaf Ebrahimi fprintf(stderr, "+++ Pre-scan complete:\n");
10559*22dc650dSSadaf Ebrahimi show_parsed(&cb);
10560*22dc650dSSadaf Ebrahimi #endif
10561*22dc650dSSadaf Ebrahimi 
10562*22dc650dSSadaf Ebrahimi /* For debugging capturing information this code can be enabled. */
10563*22dc650dSSadaf Ebrahimi 
10564*22dc650dSSadaf Ebrahimi #ifdef DEBUG_SHOW_CAPTURES
10565*22dc650dSSadaf Ebrahimi   {
10566*22dc650dSSadaf Ebrahimi   named_group *ng = cb.named_groups;
10567*22dc650dSSadaf Ebrahimi   fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10568*22dc650dSSadaf Ebrahimi   for (i = 0; i < cb.names_found; i++, ng++)
10569*22dc650dSSadaf Ebrahimi     {
10570*22dc650dSSadaf Ebrahimi     fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10571*22dc650dSSadaf Ebrahimi     }
10572*22dc650dSSadaf Ebrahimi   }
10573*22dc650dSSadaf Ebrahimi #endif
10574*22dc650dSSadaf Ebrahimi 
10575*22dc650dSSadaf Ebrahimi /* Pretend to compile the pattern while actually just accumulating the amount
10576*22dc650dSSadaf Ebrahimi of memory required in the 'length' variable. This behaviour is triggered by
10577*22dc650dSSadaf Ebrahimi passing a non-NULL final argument to compile_regex(). We pass a block of
10578*22dc650dSSadaf Ebrahimi workspace (cworkspace) for it to compile parts of the pattern into; the
10579*22dc650dSSadaf Ebrahimi compiled code is discarded when it is no longer needed, so hopefully this
10580*22dc650dSSadaf Ebrahimi workspace will never overflow, though there is a test for its doing so.
10581*22dc650dSSadaf Ebrahimi 
10582*22dc650dSSadaf Ebrahimi On error, errorcode will be set non-zero, so we don't need to look at the
10583*22dc650dSSadaf Ebrahimi result of the function. The initial options have been put into the cb block,
10584*22dc650dSSadaf Ebrahimi but we still have to pass a separate options variable (the first argument)
10585*22dc650dSSadaf Ebrahimi because the options may change as the pattern is processed. */
10586*22dc650dSSadaf Ebrahimi 
10587*22dc650dSSadaf Ebrahimi cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10588*22dc650dSSadaf Ebrahimi pptr = cb.parsed_pattern;
10589*22dc650dSSadaf Ebrahimi code = cworkspace;
10590*22dc650dSSadaf Ebrahimi *code = OP_BRA;
10591*22dc650dSSadaf Ebrahimi 
10592*22dc650dSSadaf Ebrahimi (void)compile_regex(cb.external_options, ccontext->extra_options, &code, &pptr,
10593*22dc650dSSadaf Ebrahimi    &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10594*22dc650dSSadaf Ebrahimi    &cb, &length);
10595*22dc650dSSadaf Ebrahimi 
10596*22dc650dSSadaf Ebrahimi if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10597*22dc650dSSadaf Ebrahimi 
10598*22dc650dSSadaf Ebrahimi /* This should be caught in compile_regex(), but just in case... */
10599*22dc650dSSadaf Ebrahimi 
10600*22dc650dSSadaf Ebrahimi if (length > MAX_PATTERN_SIZE)
10601*22dc650dSSadaf Ebrahimi   {
10602*22dc650dSSadaf Ebrahimi   errorcode = ERR20;
10603*22dc650dSSadaf Ebrahimi   goto HAD_CB_ERROR;
10604*22dc650dSSadaf Ebrahimi   }
10605*22dc650dSSadaf Ebrahimi 
10606*22dc650dSSadaf Ebrahimi /* Compute the size of, then, if not too large, get and initialize the data
10607*22dc650dSSadaf Ebrahimi block for storing the compiled pattern and names table. Integer overflow should
10608*22dc650dSSadaf Ebrahimi no longer be possible because nowadays we limit the maximum value of
10609*22dc650dSSadaf Ebrahimi cb.names_found and cb.name_entry_size. */
10610*22dc650dSSadaf Ebrahimi 
10611*22dc650dSSadaf Ebrahimi re_blocksize = sizeof(pcre2_real_code) +
10612*22dc650dSSadaf Ebrahimi   CU2BYTES(length +
10613*22dc650dSSadaf Ebrahimi   (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10614*22dc650dSSadaf Ebrahimi 
10615*22dc650dSSadaf Ebrahimi if (re_blocksize > ccontext->max_pattern_compiled_length)
10616*22dc650dSSadaf Ebrahimi   {
10617*22dc650dSSadaf Ebrahimi   errorcode = ERR101;
10618*22dc650dSSadaf Ebrahimi   goto HAD_CB_ERROR;
10619*22dc650dSSadaf Ebrahimi   }
10620*22dc650dSSadaf Ebrahimi 
10621*22dc650dSSadaf Ebrahimi re = (pcre2_real_code *)
10622*22dc650dSSadaf Ebrahimi   ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10623*22dc650dSSadaf Ebrahimi if (re == NULL)
10624*22dc650dSSadaf Ebrahimi   {
10625*22dc650dSSadaf Ebrahimi   errorcode = ERR21;
10626*22dc650dSSadaf Ebrahimi   goto HAD_CB_ERROR;
10627*22dc650dSSadaf Ebrahimi   }
10628*22dc650dSSadaf Ebrahimi 
10629*22dc650dSSadaf Ebrahimi /* The compiler may put padding at the end of the pcre2_real_code structure in
10630*22dc650dSSadaf Ebrahimi order to round it up to a multiple of 4 or 8 bytes. This means that when a
10631*22dc650dSSadaf Ebrahimi compiled pattern is copied (for example, when serialized) undefined bytes are
10632*22dc650dSSadaf Ebrahimi read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10633*22dc650dSSadaf Ebrahimi write to the last 8 bytes of the structure before setting the fields. */
10634*22dc650dSSadaf Ebrahimi 
10635*22dc650dSSadaf Ebrahimi memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10636*22dc650dSSadaf Ebrahimi re->memctl = ccontext->memctl;
10637*22dc650dSSadaf Ebrahimi re->tables = tables;
10638*22dc650dSSadaf Ebrahimi re->executable_jit = NULL;
10639*22dc650dSSadaf Ebrahimi memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10640*22dc650dSSadaf Ebrahimi re->blocksize = re_blocksize;
10641*22dc650dSSadaf Ebrahimi re->magic_number = MAGIC_NUMBER;
10642*22dc650dSSadaf Ebrahimi re->compile_options = options;
10643*22dc650dSSadaf Ebrahimi re->overall_options = cb.external_options;
10644*22dc650dSSadaf Ebrahimi re->extra_options = ccontext->extra_options;
10645*22dc650dSSadaf Ebrahimi re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10646*22dc650dSSadaf Ebrahimi re->limit_heap = limit_heap;
10647*22dc650dSSadaf Ebrahimi re->limit_match = limit_match;
10648*22dc650dSSadaf Ebrahimi re->limit_depth = limit_depth;
10649*22dc650dSSadaf Ebrahimi re->first_codeunit = 0;
10650*22dc650dSSadaf Ebrahimi re->last_codeunit = 0;
10651*22dc650dSSadaf Ebrahimi re->bsr_convention = bsr;
10652*22dc650dSSadaf Ebrahimi re->newline_convention = newline;
10653*22dc650dSSadaf Ebrahimi re->max_lookbehind = 0;
10654*22dc650dSSadaf Ebrahimi re->minlength = 0;
10655*22dc650dSSadaf Ebrahimi re->top_bracket = 0;
10656*22dc650dSSadaf Ebrahimi re->top_backref = 0;
10657*22dc650dSSadaf Ebrahimi re->name_entry_size = cb.name_entry_size;
10658*22dc650dSSadaf Ebrahimi re->name_count = cb.names_found;
10659*22dc650dSSadaf Ebrahimi 
10660*22dc650dSSadaf Ebrahimi /* The basic block is immediately followed by the name table, and the compiled
10661*22dc650dSSadaf Ebrahimi code follows after that. */
10662*22dc650dSSadaf Ebrahimi 
10663*22dc650dSSadaf Ebrahimi codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10664*22dc650dSSadaf Ebrahimi   re->name_entry_size * re->name_count;
10665*22dc650dSSadaf Ebrahimi 
10666*22dc650dSSadaf Ebrahimi /* Update the compile data block for the actual compile. The starting points of
10667*22dc650dSSadaf Ebrahimi the name/number translation table and of the code are passed around in the
10668*22dc650dSSadaf Ebrahimi compile data block. The start/end pattern and initial options are already set
10669*22dc650dSSadaf Ebrahimi from the pre-compile phase, as is the name_entry_size field. */
10670*22dc650dSSadaf Ebrahimi 
10671*22dc650dSSadaf Ebrahimi cb.parens_depth = 0;
10672*22dc650dSSadaf Ebrahimi cb.assert_depth = 0;
10673*22dc650dSSadaf Ebrahimi cb.lastcapture = 0;
10674*22dc650dSSadaf Ebrahimi cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10675*22dc650dSSadaf Ebrahimi cb.start_code = codestart;
10676*22dc650dSSadaf Ebrahimi cb.req_varyopt = 0;
10677*22dc650dSSadaf Ebrahimi cb.had_accept = FALSE;
10678*22dc650dSSadaf Ebrahimi cb.had_pruneorskip = FALSE;
10679*22dc650dSSadaf Ebrahimi 
10680*22dc650dSSadaf Ebrahimi /* If any named groups were found, create the name/number table from the list
10681*22dc650dSSadaf Ebrahimi created in the pre-pass. */
10682*22dc650dSSadaf Ebrahimi 
10683*22dc650dSSadaf Ebrahimi if (cb.names_found > 0)
10684*22dc650dSSadaf Ebrahimi   {
10685*22dc650dSSadaf Ebrahimi   named_group *ng = cb.named_groups;
10686*22dc650dSSadaf Ebrahimi   for (i = 0; i < cb.names_found; i++, ng++)
10687*22dc650dSSadaf Ebrahimi     add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10688*22dc650dSSadaf Ebrahimi   }
10689*22dc650dSSadaf Ebrahimi 
10690*22dc650dSSadaf Ebrahimi /* Set up a starting, non-extracting bracket, then compile the expression. On
10691*22dc650dSSadaf Ebrahimi error, errorcode will be set non-zero, so we don't need to look at the result
10692*22dc650dSSadaf Ebrahimi of the function here. */
10693*22dc650dSSadaf Ebrahimi 
10694*22dc650dSSadaf Ebrahimi pptr = cb.parsed_pattern;
10695*22dc650dSSadaf Ebrahimi code = (PCRE2_UCHAR *)codestart;
10696*22dc650dSSadaf Ebrahimi *code = OP_BRA;
10697*22dc650dSSadaf Ebrahimi regexrc = compile_regex(re->overall_options, ccontext->extra_options, &code,
10698*22dc650dSSadaf Ebrahimi   &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10699*22dc650dSSadaf Ebrahimi   NULL, &cb, NULL);
10700*22dc650dSSadaf Ebrahimi if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10701*22dc650dSSadaf Ebrahimi re->top_bracket = cb.bracount;
10702*22dc650dSSadaf Ebrahimi re->top_backref = cb.top_backref;
10703*22dc650dSSadaf Ebrahimi re->max_lookbehind = cb.max_lookbehind;
10704*22dc650dSSadaf Ebrahimi 
10705*22dc650dSSadaf Ebrahimi if (cb.had_accept)
10706*22dc650dSSadaf Ebrahimi   {
10707*22dc650dSSadaf Ebrahimi   reqcu = 0;                     /* Must disable after (*ACCEPT) */
10708*22dc650dSSadaf Ebrahimi   reqcuflags = REQ_NONE;
10709*22dc650dSSadaf Ebrahimi   re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10710*22dc650dSSadaf Ebrahimi   }
10711*22dc650dSSadaf Ebrahimi 
10712*22dc650dSSadaf Ebrahimi /* Fill in the final opcode and check for disastrous overflow. If no overflow,
10713*22dc650dSSadaf Ebrahimi but the estimated length exceeds the really used length, adjust the value of
10714*22dc650dSSadaf Ebrahimi re->blocksize, and if valgrind support is configured, mark the extra allocated
10715*22dc650dSSadaf Ebrahimi memory as unaddressable, so that any out-of-bound reads can be detected. */
10716*22dc650dSSadaf Ebrahimi 
10717*22dc650dSSadaf Ebrahimi *code++ = OP_END;
10718*22dc650dSSadaf Ebrahimi usedlength = code - codestart;
10719*22dc650dSSadaf Ebrahimi if (usedlength > length) errorcode = ERR23; else
10720*22dc650dSSadaf Ebrahimi   {
10721*22dc650dSSadaf Ebrahimi   re->blocksize -= CU2BYTES(length - usedlength);
10722*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_VALGRIND
10723*22dc650dSSadaf Ebrahimi   VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10724*22dc650dSSadaf Ebrahimi #endif
10725*22dc650dSSadaf Ebrahimi   }
10726*22dc650dSSadaf Ebrahimi 
10727*22dc650dSSadaf Ebrahimi /* Scan the pattern for recursion/subroutine calls and convert the group
10728*22dc650dSSadaf Ebrahimi numbers into offsets. Maintain a small cache so that repeated groups containing
10729*22dc650dSSadaf Ebrahimi recursions are efficiently handled. */
10730*22dc650dSSadaf Ebrahimi 
10731*22dc650dSSadaf Ebrahimi #define RSCAN_CACHE_SIZE 8
10732*22dc650dSSadaf Ebrahimi 
10733*22dc650dSSadaf Ebrahimi if (errorcode == 0 && cb.had_recurse)
10734*22dc650dSSadaf Ebrahimi   {
10735*22dc650dSSadaf Ebrahimi   PCRE2_UCHAR *rcode;
10736*22dc650dSSadaf Ebrahimi   PCRE2_SPTR rgroup;
10737*22dc650dSSadaf Ebrahimi   unsigned int ccount = 0;
10738*22dc650dSSadaf Ebrahimi   int start = RSCAN_CACHE_SIZE;
10739*22dc650dSSadaf Ebrahimi   recurse_cache rc[RSCAN_CACHE_SIZE];
10740*22dc650dSSadaf Ebrahimi 
10741*22dc650dSSadaf Ebrahimi   for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10742*22dc650dSSadaf Ebrahimi        rcode != NULL;
10743*22dc650dSSadaf Ebrahimi        rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10744*22dc650dSSadaf Ebrahimi     {
10745*22dc650dSSadaf Ebrahimi     int p, groupnumber;
10746*22dc650dSSadaf Ebrahimi 
10747*22dc650dSSadaf Ebrahimi     groupnumber = (int)GET(rcode, 1);
10748*22dc650dSSadaf Ebrahimi     if (groupnumber == 0) rgroup = codestart; else
10749*22dc650dSSadaf Ebrahimi       {
10750*22dc650dSSadaf Ebrahimi       PCRE2_SPTR search_from = codestart;
10751*22dc650dSSadaf Ebrahimi       rgroup = NULL;
10752*22dc650dSSadaf Ebrahimi       for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10753*22dc650dSSadaf Ebrahimi         {
10754*22dc650dSSadaf Ebrahimi         if (groupnumber == rc[p].groupnumber)
10755*22dc650dSSadaf Ebrahimi           {
10756*22dc650dSSadaf Ebrahimi           rgroup = rc[p].group;
10757*22dc650dSSadaf Ebrahimi           break;
10758*22dc650dSSadaf Ebrahimi           }
10759*22dc650dSSadaf Ebrahimi 
10760*22dc650dSSadaf Ebrahimi         /* Group n+1 must always start to the right of group n, so we can save
10761*22dc650dSSadaf Ebrahimi         search time below when the new group number is greater than any of the
10762*22dc650dSSadaf Ebrahimi         previously found groups. */
10763*22dc650dSSadaf Ebrahimi 
10764*22dc650dSSadaf Ebrahimi         if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10765*22dc650dSSadaf Ebrahimi         }
10766*22dc650dSSadaf Ebrahimi 
10767*22dc650dSSadaf Ebrahimi       if (rgroup == NULL)
10768*22dc650dSSadaf Ebrahimi         {
10769*22dc650dSSadaf Ebrahimi         rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10770*22dc650dSSadaf Ebrahimi         if (rgroup == NULL)
10771*22dc650dSSadaf Ebrahimi           {
10772*22dc650dSSadaf Ebrahimi           errorcode = ERR53;
10773*22dc650dSSadaf Ebrahimi           break;
10774*22dc650dSSadaf Ebrahimi           }
10775*22dc650dSSadaf Ebrahimi         if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10776*22dc650dSSadaf Ebrahimi         rc[start].groupnumber = groupnumber;
10777*22dc650dSSadaf Ebrahimi         rc[start].group = rgroup;
10778*22dc650dSSadaf Ebrahimi         if (ccount < RSCAN_CACHE_SIZE) ccount++;
10779*22dc650dSSadaf Ebrahimi         }
10780*22dc650dSSadaf Ebrahimi       }
10781*22dc650dSSadaf Ebrahimi 
10782*22dc650dSSadaf Ebrahimi     PUT(rcode, 1, rgroup - codestart);
10783*22dc650dSSadaf Ebrahimi     }
10784*22dc650dSSadaf Ebrahimi   }
10785*22dc650dSSadaf Ebrahimi 
10786*22dc650dSSadaf Ebrahimi /* In rare debugging situations we sometimes need to look at the compiled code
10787*22dc650dSSadaf Ebrahimi at this stage. */
10788*22dc650dSSadaf Ebrahimi 
10789*22dc650dSSadaf Ebrahimi #ifdef DEBUG_CALL_PRINTINT
10790*22dc650dSSadaf Ebrahimi pcre2_printint(re, stderr, TRUE);
10791*22dc650dSSadaf Ebrahimi fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10792*22dc650dSSadaf Ebrahimi #endif
10793*22dc650dSSadaf Ebrahimi 
10794*22dc650dSSadaf Ebrahimi /* Unless disabled, check whether any single character iterators can be
10795*22dc650dSSadaf Ebrahimi auto-possessified. The function overwrites the appropriate opcode values, so
10796*22dc650dSSadaf Ebrahimi the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10797*22dc650dSSadaf Ebrahimi used in this code because at least one compiler gives a warning about loss of
10798*22dc650dSSadaf Ebrahimi "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10799*22dc650dSSadaf Ebrahimi function call. */
10800*22dc650dSSadaf Ebrahimi 
10801*22dc650dSSadaf Ebrahimi if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10802*22dc650dSSadaf Ebrahimi   {
10803*22dc650dSSadaf Ebrahimi   PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10804*22dc650dSSadaf Ebrahimi   if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10805*22dc650dSSadaf Ebrahimi   }
10806*22dc650dSSadaf Ebrahimi 
10807*22dc650dSSadaf Ebrahimi /* Failed to compile, or error while post-processing. */
10808*22dc650dSSadaf Ebrahimi 
10809*22dc650dSSadaf Ebrahimi if (errorcode != 0) goto HAD_CB_ERROR;
10810*22dc650dSSadaf Ebrahimi 
10811*22dc650dSSadaf Ebrahimi /* Successful compile. If the anchored option was not passed, set it if
10812*22dc650dSSadaf Ebrahimi we can determine that the pattern is anchored by virtue of ^ characters or \A
10813*22dc650dSSadaf Ebrahimi or anything else, such as starting with non-atomic .* when DOTALL is set and
10814*22dc650dSSadaf Ebrahimi there are no occurrences of *PRUNE or *SKIP (though there is an option to
10815*22dc650dSSadaf Ebrahimi disable this case). */
10816*22dc650dSSadaf Ebrahimi 
10817*22dc650dSSadaf Ebrahimi if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10818*22dc650dSSadaf Ebrahimi      is_anchored(codestart, 0, &cb, 0, FALSE))
10819*22dc650dSSadaf Ebrahimi   re->overall_options |= PCRE2_ANCHORED;
10820*22dc650dSSadaf Ebrahimi 
10821*22dc650dSSadaf Ebrahimi /* Set up the first code unit or startline flag, the required code unit, and
10822*22dc650dSSadaf Ebrahimi then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10823*22dc650dSSadaf Ebrahimi is set, as the data it would create will not be used. Note that a first code
10824*22dc650dSSadaf Ebrahimi unit (but not the startline flag) is useful for anchored patterns because it
10825*22dc650dSSadaf Ebrahimi can still give a quick "no match" and also avoid searching for a last code
10826*22dc650dSSadaf Ebrahimi unit. */
10827*22dc650dSSadaf Ebrahimi 
10828*22dc650dSSadaf Ebrahimi if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10829*22dc650dSSadaf Ebrahimi   {
10830*22dc650dSSadaf Ebrahimi   int minminlength = 0;  /* For minimal minlength from first/required CU */
10831*22dc650dSSadaf Ebrahimi 
10832*22dc650dSSadaf Ebrahimi   /* If we do not have a first code unit, see if there is one that is asserted
10833*22dc650dSSadaf Ebrahimi   (these are not saved during the compile because they can cause conflicts with
10834*22dc650dSSadaf Ebrahimi   actual literals that follow). */
10835*22dc650dSSadaf Ebrahimi 
10836*22dc650dSSadaf Ebrahimi   if (firstcuflags >= REQ_NONE)
10837*22dc650dSSadaf Ebrahimi     firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10838*22dc650dSSadaf Ebrahimi 
10839*22dc650dSSadaf Ebrahimi   /* Save the data for a first code unit. The existence of one means the
10840*22dc650dSSadaf Ebrahimi   minimum length must be at least 1. */
10841*22dc650dSSadaf Ebrahimi 
10842*22dc650dSSadaf Ebrahimi   if (firstcuflags < REQ_NONE)
10843*22dc650dSSadaf Ebrahimi     {
10844*22dc650dSSadaf Ebrahimi     re->first_codeunit = firstcu;
10845*22dc650dSSadaf Ebrahimi     re->flags |= PCRE2_FIRSTSET;
10846*22dc650dSSadaf Ebrahimi     minminlength++;
10847*22dc650dSSadaf Ebrahimi 
10848*22dc650dSSadaf Ebrahimi     /* Handle caseless first code units. */
10849*22dc650dSSadaf Ebrahimi 
10850*22dc650dSSadaf Ebrahimi     if ((firstcuflags & REQ_CASELESS) != 0)
10851*22dc650dSSadaf Ebrahimi       {
10852*22dc650dSSadaf Ebrahimi       if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10853*22dc650dSSadaf Ebrahimi         {
10854*22dc650dSSadaf Ebrahimi         if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10855*22dc650dSSadaf Ebrahimi         }
10856*22dc650dSSadaf Ebrahimi 
10857*22dc650dSSadaf Ebrahimi       /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
10858*22dc650dSSadaf Ebrahimi       In 8-bit UTF mode, code units in the range 128-255 are introductory code
10859*22dc650dSSadaf Ebrahimi       units and cannot have another case, but in non-UTF UCP mode they may do. */
10860*22dc650dSSadaf Ebrahimi 
10861*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
10862*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
10863*22dc650dSSadaf Ebrahimi       else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10864*22dc650dSSadaf Ebrahimi         re->flags |= PCRE2_FIRSTCASELESS;
10865*22dc650dSSadaf Ebrahimi #else
10866*22dc650dSSadaf Ebrahimi       else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10867*22dc650dSSadaf Ebrahimi                UCD_OTHERCASE(firstcu) != firstcu)
10868*22dc650dSSadaf Ebrahimi         re->flags |= PCRE2_FIRSTCASELESS;
10869*22dc650dSSadaf Ebrahimi #endif
10870*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
10871*22dc650dSSadaf Ebrahimi       }
10872*22dc650dSSadaf Ebrahimi     }
10873*22dc650dSSadaf Ebrahimi 
10874*22dc650dSSadaf Ebrahimi   /* When there is no first code unit, for non-anchored patterns, see if we can
10875*22dc650dSSadaf Ebrahimi   set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10876*22dc650dSSadaf Ebrahimi   branches start with ^ and also when all branches start with non-atomic .* for
10877*22dc650dSSadaf Ebrahimi   non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
10878*22dc650dSSadaf Ebrahimi   that disables this case.) */
10879*22dc650dSSadaf Ebrahimi 
10880*22dc650dSSadaf Ebrahimi   else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10881*22dc650dSSadaf Ebrahimi            is_startline(codestart, 0, &cb, 0, FALSE))
10882*22dc650dSSadaf Ebrahimi     re->flags |= PCRE2_STARTLINE;
10883*22dc650dSSadaf Ebrahimi 
10884*22dc650dSSadaf Ebrahimi   /* Handle the "required code unit", if one is set. In the UTF case we can
10885*22dc650dSSadaf Ebrahimi   increment the minimum minimum length only if we are sure this really is a
10886*22dc650dSSadaf Ebrahimi   different character and not a non-starting code unit of the first character,
10887*22dc650dSSadaf Ebrahimi   because the minimum length count is in characters, not code units. */
10888*22dc650dSSadaf Ebrahimi 
10889*22dc650dSSadaf Ebrahimi   if (reqcuflags < REQ_NONE)
10890*22dc650dSSadaf Ebrahimi     {
10891*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 16
10892*22dc650dSSadaf Ebrahimi     if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10893*22dc650dSSadaf Ebrahimi         firstcuflags >= REQ_NONE ||                 /* First not set */
10894*22dc650dSSadaf Ebrahimi         (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
10895*22dc650dSSadaf Ebrahimi         (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
10896*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 8
10897*22dc650dSSadaf Ebrahimi     if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10898*22dc650dSSadaf Ebrahimi         firstcuflags >= REQ_NONE ||                 /* First not set */
10899*22dc650dSSadaf Ebrahimi         (firstcu & 0x80) == 0 ||                    /* First is ASCII */
10900*22dc650dSSadaf Ebrahimi         (reqcu & 0x80) == 0)                        /* Req is ASCII */
10901*22dc650dSSadaf Ebrahimi #endif
10902*22dc650dSSadaf Ebrahimi       {
10903*22dc650dSSadaf Ebrahimi       minminlength++;
10904*22dc650dSSadaf Ebrahimi       }
10905*22dc650dSSadaf Ebrahimi 
10906*22dc650dSSadaf Ebrahimi     /* In the case of an anchored pattern, set up the value only if it follows
10907*22dc650dSSadaf Ebrahimi     a variable length item in the pattern. */
10908*22dc650dSSadaf Ebrahimi 
10909*22dc650dSSadaf Ebrahimi     if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10910*22dc650dSSadaf Ebrahimi         (reqcuflags & REQ_VARY) != 0)
10911*22dc650dSSadaf Ebrahimi       {
10912*22dc650dSSadaf Ebrahimi       re->last_codeunit = reqcu;
10913*22dc650dSSadaf Ebrahimi       re->flags |= PCRE2_LASTSET;
10914*22dc650dSSadaf Ebrahimi 
10915*22dc650dSSadaf Ebrahimi       /* Handle caseless required code units as for first code units (above). */
10916*22dc650dSSadaf Ebrahimi 
10917*22dc650dSSadaf Ebrahimi       if ((reqcuflags & REQ_CASELESS) != 0)
10918*22dc650dSSadaf Ebrahimi         {
10919*22dc650dSSadaf Ebrahimi         if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10920*22dc650dSSadaf Ebrahimi           {
10921*22dc650dSSadaf Ebrahimi           if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10922*22dc650dSSadaf Ebrahimi           }
10923*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
10924*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
10925*22dc650dSSadaf Ebrahimi       else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10926*22dc650dSSadaf Ebrahimi         re->flags |= PCRE2_LASTCASELESS;
10927*22dc650dSSadaf Ebrahimi #else
10928*22dc650dSSadaf Ebrahimi       else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10929*22dc650dSSadaf Ebrahimi                UCD_OTHERCASE(reqcu) != reqcu)
10930*22dc650dSSadaf Ebrahimi         re->flags |= PCRE2_LASTCASELESS;
10931*22dc650dSSadaf Ebrahimi #endif
10932*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
10933*22dc650dSSadaf Ebrahimi         }
10934*22dc650dSSadaf Ebrahimi       }
10935*22dc650dSSadaf Ebrahimi     }
10936*22dc650dSSadaf Ebrahimi 
10937*22dc650dSSadaf Ebrahimi   /* Study the compiled pattern to set up information such as a bitmap of
10938*22dc650dSSadaf Ebrahimi   starting code units and a minimum matching length. */
10939*22dc650dSSadaf Ebrahimi 
10940*22dc650dSSadaf Ebrahimi   if (PRIV(study)(re) != 0)
10941*22dc650dSSadaf Ebrahimi     {
10942*22dc650dSSadaf Ebrahimi     errorcode = ERR31;
10943*22dc650dSSadaf Ebrahimi     goto HAD_CB_ERROR;
10944*22dc650dSSadaf Ebrahimi     }
10945*22dc650dSSadaf Ebrahimi 
10946*22dc650dSSadaf Ebrahimi   /* If study() set a bitmap of starting code units, it implies a minimum
10947*22dc650dSSadaf Ebrahimi   length of at least one. */
10948*22dc650dSSadaf Ebrahimi 
10949*22dc650dSSadaf Ebrahimi   if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10950*22dc650dSSadaf Ebrahimi     minminlength = 1;
10951*22dc650dSSadaf Ebrahimi 
10952*22dc650dSSadaf Ebrahimi   /* If the minimum length set (or not set) by study() is less than the minimum
10953*22dc650dSSadaf Ebrahimi   implied by required code units, override it. */
10954*22dc650dSSadaf Ebrahimi 
10955*22dc650dSSadaf Ebrahimi   if (re->minlength < minminlength) re->minlength = minminlength;
10956*22dc650dSSadaf Ebrahimi   }   /* End of start-of-match optimizations. */
10957*22dc650dSSadaf Ebrahimi 
10958*22dc650dSSadaf Ebrahimi /* Control ends up here in all cases. When running under valgrind, make a
10959*22dc650dSSadaf Ebrahimi pattern's terminating zero defined again. If memory was obtained for the parsed
10960*22dc650dSSadaf Ebrahimi version of the pattern, free it before returning. Also free the list of named
10961*22dc650dSSadaf Ebrahimi groups if a larger one had to be obtained, and likewise the group information
10962*22dc650dSSadaf Ebrahimi vector. */
10963*22dc650dSSadaf Ebrahimi 
10964*22dc650dSSadaf Ebrahimi EXIT:
10965*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_VALGRIND
10966*22dc650dSSadaf Ebrahimi if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10967*22dc650dSSadaf Ebrahimi #endif
10968*22dc650dSSadaf Ebrahimi if (cb.parsed_pattern != stack_parsed_pattern)
10969*22dc650dSSadaf Ebrahimi   ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10970*22dc650dSSadaf Ebrahimi if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10971*22dc650dSSadaf Ebrahimi   ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10972*22dc650dSSadaf Ebrahimi if (cb.groupinfo != stack_groupinfo)
10973*22dc650dSSadaf Ebrahimi   ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10974*22dc650dSSadaf Ebrahimi return re;    /* Will be NULL after an error */
10975*22dc650dSSadaf Ebrahimi 
10976*22dc650dSSadaf Ebrahimi /* Errors discovered in parse_regex() set the offset value in the compile
10977*22dc650dSSadaf Ebrahimi block. Errors discovered before it is called must compute it from the ptr
10978*22dc650dSSadaf Ebrahimi value. After parse_regex() is called, the offset in the compile block is set to
10979*22dc650dSSadaf Ebrahimi the end of the pattern, but certain errors in compile_regex() may reset it if
10980*22dc650dSSadaf Ebrahimi an offset is available in the parsed pattern. */
10981*22dc650dSSadaf Ebrahimi 
10982*22dc650dSSadaf Ebrahimi HAD_CB_ERROR:
10983*22dc650dSSadaf Ebrahimi ptr = pattern + cb.erroroffset;
10984*22dc650dSSadaf Ebrahimi 
10985*22dc650dSSadaf Ebrahimi HAD_EARLY_ERROR:
10986*22dc650dSSadaf Ebrahimi *erroroffset = ptr - pattern;
10987*22dc650dSSadaf Ebrahimi 
10988*22dc650dSSadaf Ebrahimi HAD_ERROR:
10989*22dc650dSSadaf Ebrahimi *errorptr = errorcode;
10990*22dc650dSSadaf Ebrahimi pcre2_code_free(re);
10991*22dc650dSSadaf Ebrahimi re = NULL;
10992*22dc650dSSadaf Ebrahimi goto EXIT;
10993*22dc650dSSadaf Ebrahimi }
10994*22dc650dSSadaf Ebrahimi 
10995*22dc650dSSadaf Ebrahimi /* These #undefs are here to enable unity builds with CMake. */
10996*22dc650dSSadaf Ebrahimi 
10997*22dc650dSSadaf Ebrahimi #undef NLBLOCK /* Block containing newline information */
10998*22dc650dSSadaf Ebrahimi #undef PSSTART /* Field containing processed string start */
10999*22dc650dSSadaf Ebrahimi #undef PSEND   /* Field containing processed string end */
11000*22dc650dSSadaf Ebrahimi 
11001*22dc650dSSadaf Ebrahimi /* End of pcre2_compile.c */
11002