1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi * Perl-Compatible Regular Expressions *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi
8*22dc650dSSadaf Ebrahimi Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi Original API code Copyright (c) 1997-2012 University of Cambridge
10*22dc650dSSadaf Ebrahimi New API code Copyright (c) 2016-2020 University of Cambridge
11*22dc650dSSadaf Ebrahimi
12*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
15*22dc650dSSadaf Ebrahimi
16*22dc650dSSadaf Ebrahimi * Redistributions of source code must retain the above copyright notice,
17*22dc650dSSadaf Ebrahimi this list of conditions and the following disclaimer.
18*22dc650dSSadaf Ebrahimi
19*22dc650dSSadaf Ebrahimi * Redistributions in binary form must reproduce the above copyright
20*22dc650dSSadaf Ebrahimi notice, this list of conditions and the following disclaimer in the
21*22dc650dSSadaf Ebrahimi documentation and/or other materials provided with the distribution.
22*22dc650dSSadaf Ebrahimi
23*22dc650dSSadaf Ebrahimi * Neither the name of the University of Cambridge nor the names of its
24*22dc650dSSadaf Ebrahimi contributors may be used to endorse or promote products derived from
25*22dc650dSSadaf Ebrahimi this software without specific prior written permission.
26*22dc650dSSadaf Ebrahimi
27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
38*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
39*22dc650dSSadaf Ebrahimi */
40*22dc650dSSadaf Ebrahimi
41*22dc650dSSadaf Ebrahimi
42*22dc650dSSadaf Ebrahimi /* This module contains an internal function for validating UTF character
43*22dc650dSSadaf Ebrahimi strings. This file is also #included by the pcre2test program, which uses
44*22dc650dSSadaf Ebrahimi macros to change names from _pcre2_xxx to xxxx, thereby avoiding name clashes
45*22dc650dSSadaf Ebrahimi with the library. In this case, PCRE2_PCRE2TEST is defined. */
46*22dc650dSSadaf Ebrahimi
47*22dc650dSSadaf Ebrahimi #ifndef PCRE2_PCRE2TEST /* We're compiling the library */
48*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
49*22dc650dSSadaf Ebrahimi #include "config.h"
50*22dc650dSSadaf Ebrahimi #endif
51*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"
52*22dc650dSSadaf Ebrahimi #endif /* PCRE2_PCRE2TEST */
53*22dc650dSSadaf Ebrahimi
54*22dc650dSSadaf Ebrahimi
55*22dc650dSSadaf Ebrahimi #ifndef SUPPORT_UNICODE
56*22dc650dSSadaf Ebrahimi /*************************************************
57*22dc650dSSadaf Ebrahimi * Dummy function when Unicode is not supported *
58*22dc650dSSadaf Ebrahimi *************************************************/
59*22dc650dSSadaf Ebrahimi
60*22dc650dSSadaf Ebrahimi /* This function should never be called when Unicode is not supported. */
61*22dc650dSSadaf Ebrahimi
62*22dc650dSSadaf Ebrahimi int
PRIV(valid_utf)63*22dc650dSSadaf Ebrahimi PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset)
64*22dc650dSSadaf Ebrahimi {
65*22dc650dSSadaf Ebrahimi (void)string;
66*22dc650dSSadaf Ebrahimi (void)length;
67*22dc650dSSadaf Ebrahimi (void)erroroffset;
68*22dc650dSSadaf Ebrahimi return 0;
69*22dc650dSSadaf Ebrahimi }
70*22dc650dSSadaf Ebrahimi #else /* UTF is supported */
71*22dc650dSSadaf Ebrahimi
72*22dc650dSSadaf Ebrahimi
73*22dc650dSSadaf Ebrahimi
74*22dc650dSSadaf Ebrahimi /*************************************************
75*22dc650dSSadaf Ebrahimi * Validate a UTF string *
76*22dc650dSSadaf Ebrahimi *************************************************/
77*22dc650dSSadaf Ebrahimi
/* This function is called (optionally) at the start of compile or match, to
check that a supposed UTF string is actually valid. The early check means
that subsequent code can assume it is dealing with a valid string. The check
can be turned off for maximum performance, but the consequences of supplying an
invalid string are then undefined.

Arguments:
  string       points to the string
  length       length of the string, in code units
  erroroffset  pointer to an error position offset variable

Returns:       == 0    if the string is a valid UTF string
               != 0    otherwise, setting the offset of the bad character
*/
92*22dc650dSSadaf Ebrahimi
93*22dc650dSSadaf Ebrahimi int
PRIV(valid_utf)94*22dc650dSSadaf Ebrahimi PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset)
95*22dc650dSSadaf Ebrahimi {
96*22dc650dSSadaf Ebrahimi PCRE2_SPTR p;
97*22dc650dSSadaf Ebrahimi uint32_t c;
98*22dc650dSSadaf Ebrahimi
99*22dc650dSSadaf Ebrahimi /* ----------------- Check a UTF-8 string ----------------- */
100*22dc650dSSadaf Ebrahimi
101*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
102*22dc650dSSadaf Ebrahimi
103*22dc650dSSadaf Ebrahimi /* Originally, this function checked according to RFC 2279, allowing for values
104*22dc650dSSadaf Ebrahimi in the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were
105*22dc650dSSadaf Ebrahimi in the canonical format. Once somebody had pointed out RFC 3629 to me (it
106*22dc650dSSadaf Ebrahimi obsoletes 2279), additional restrictions were applied. The values are now
107*22dc650dSSadaf Ebrahimi limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
108*22dc650dSSadaf Ebrahimi subrange 0xd000 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
109*22dc650dSSadaf Ebrahimi characters is still checked. Error returns are as follows:
110*22dc650dSSadaf Ebrahimi
111*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR1 Missing 1 byte at the end of the string
112*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR2 Missing 2 bytes at the end of the string
113*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR3 Missing 3 bytes at the end of the string
114*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR4 Missing 4 bytes at the end of the string
115*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR5 Missing 5 bytes at the end of the string
116*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR6 2nd-byte's two top bits are not 0x80
117*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR7 3rd-byte's two top bits are not 0x80
118*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR8 4th-byte's two top bits are not 0x80
119*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR9 5th-byte's two top bits are not 0x80
120*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR10 6th-byte's two top bits are not 0x80
121*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR11 5-byte character is not permitted by RFC 3629
122*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR12 6-byte character is not permitted by RFC 3629
123*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted
124*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted
125*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR15 Overlong 2-byte sequence
126*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR16 Overlong 3-byte sequence
127*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR17 Overlong 4-byte sequence
128*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur)
129*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur)
130*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character)
131*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff
132*22dc650dSSadaf Ebrahimi */
133*22dc650dSSadaf Ebrahimi
134*22dc650dSSadaf Ebrahimi for (p = string; length > 0; p++)
135*22dc650dSSadaf Ebrahimi {
136*22dc650dSSadaf Ebrahimi uint32_t ab, d;
137*22dc650dSSadaf Ebrahimi
138*22dc650dSSadaf Ebrahimi c = *p;
139*22dc650dSSadaf Ebrahimi length--;
140*22dc650dSSadaf Ebrahimi
141*22dc650dSSadaf Ebrahimi if (c < 128) continue; /* ASCII character */
142*22dc650dSSadaf Ebrahimi
143*22dc650dSSadaf Ebrahimi if (c < 0xc0) /* Isolated 10xx xxxx byte */
144*22dc650dSSadaf Ebrahimi {
145*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string);
146*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR20;
147*22dc650dSSadaf Ebrahimi }
148*22dc650dSSadaf Ebrahimi
149*22dc650dSSadaf Ebrahimi if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */
150*22dc650dSSadaf Ebrahimi {
151*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string);
152*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR21;
153*22dc650dSSadaf Ebrahimi }
154*22dc650dSSadaf Ebrahimi
155*22dc650dSSadaf Ebrahimi ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes (1-5) */
156*22dc650dSSadaf Ebrahimi if (length < ab) /* Missing bytes */
157*22dc650dSSadaf Ebrahimi {
158*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string);
159*22dc650dSSadaf Ebrahimi switch(ab - length)
160*22dc650dSSadaf Ebrahimi {
161*22dc650dSSadaf Ebrahimi case 1: return PCRE2_ERROR_UTF8_ERR1;
162*22dc650dSSadaf Ebrahimi case 2: return PCRE2_ERROR_UTF8_ERR2;
163*22dc650dSSadaf Ebrahimi case 3: return PCRE2_ERROR_UTF8_ERR3;
164*22dc650dSSadaf Ebrahimi case 4: return PCRE2_ERROR_UTF8_ERR4;
165*22dc650dSSadaf Ebrahimi case 5: return PCRE2_ERROR_UTF8_ERR5;
166*22dc650dSSadaf Ebrahimi }
167*22dc650dSSadaf Ebrahimi }
168*22dc650dSSadaf Ebrahimi length -= ab; /* Length remaining */
169*22dc650dSSadaf Ebrahimi
170*22dc650dSSadaf Ebrahimi /* Check top bits in the second byte */
171*22dc650dSSadaf Ebrahimi
172*22dc650dSSadaf Ebrahimi if (((d = *(++p)) & 0xc0) != 0x80)
173*22dc650dSSadaf Ebrahimi {
174*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 1;
175*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR6;
176*22dc650dSSadaf Ebrahimi }
177*22dc650dSSadaf Ebrahimi
178*22dc650dSSadaf Ebrahimi /* For each length, check that the remaining bytes start with the 0x80 bit
179*22dc650dSSadaf Ebrahimi set and not the 0x40 bit. Then check for an overlong sequence, and for the
180*22dc650dSSadaf Ebrahimi excluded range 0xd800 to 0xdfff. */
181*22dc650dSSadaf Ebrahimi
182*22dc650dSSadaf Ebrahimi switch (ab)
183*22dc650dSSadaf Ebrahimi {
184*22dc650dSSadaf Ebrahimi /* 2-byte character. No further bytes to check for 0x80. Check first byte
185*22dc650dSSadaf Ebrahimi for for xx00 000x (overlong sequence). */
186*22dc650dSSadaf Ebrahimi
187*22dc650dSSadaf Ebrahimi case 1: if ((c & 0x3e) == 0)
188*22dc650dSSadaf Ebrahimi {
189*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 1;
190*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR15;
191*22dc650dSSadaf Ebrahimi }
192*22dc650dSSadaf Ebrahimi break;
193*22dc650dSSadaf Ebrahimi
194*22dc650dSSadaf Ebrahimi /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes
195*22dc650dSSadaf Ebrahimi for 1110 0000, xx0x xxxx (overlong sequence) or
196*22dc650dSSadaf Ebrahimi 1110 1101, 1010 xxxx (0xd800 - 0xdfff) */
197*22dc650dSSadaf Ebrahimi
198*22dc650dSSadaf Ebrahimi case 2:
199*22dc650dSSadaf Ebrahimi if ((*(++p) & 0xc0) != 0x80) /* Third byte */
200*22dc650dSSadaf Ebrahimi {
201*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 2;
202*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR7;
203*22dc650dSSadaf Ebrahimi }
204*22dc650dSSadaf Ebrahimi if (c == 0xe0 && (d & 0x20) == 0)
205*22dc650dSSadaf Ebrahimi {
206*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 2;
207*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR16;
208*22dc650dSSadaf Ebrahimi }
209*22dc650dSSadaf Ebrahimi if (c == 0xed && d >= 0xa0)
210*22dc650dSSadaf Ebrahimi {
211*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 2;
212*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR14;
213*22dc650dSSadaf Ebrahimi }
214*22dc650dSSadaf Ebrahimi break;
215*22dc650dSSadaf Ebrahimi
216*22dc650dSSadaf Ebrahimi /* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
217*22dc650dSSadaf Ebrahimi bytes for for 1111 0000, xx00 xxxx (overlong sequence), then check for a
218*22dc650dSSadaf Ebrahimi character greater than 0x0010ffff (f4 8f bf bf) */
219*22dc650dSSadaf Ebrahimi
220*22dc650dSSadaf Ebrahimi case 3:
221*22dc650dSSadaf Ebrahimi if ((*(++p) & 0xc0) != 0x80) /* Third byte */
222*22dc650dSSadaf Ebrahimi {
223*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 2;
224*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR7;
225*22dc650dSSadaf Ebrahimi }
226*22dc650dSSadaf Ebrahimi if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
227*22dc650dSSadaf Ebrahimi {
228*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 3;
229*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR8;
230*22dc650dSSadaf Ebrahimi }
231*22dc650dSSadaf Ebrahimi if (c == 0xf0 && (d & 0x30) == 0)
232*22dc650dSSadaf Ebrahimi {
233*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 3;
234*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR17;
235*22dc650dSSadaf Ebrahimi }
236*22dc650dSSadaf Ebrahimi if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
237*22dc650dSSadaf Ebrahimi {
238*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 3;
239*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR13;
240*22dc650dSSadaf Ebrahimi }
241*22dc650dSSadaf Ebrahimi break;
242*22dc650dSSadaf Ebrahimi
243*22dc650dSSadaf Ebrahimi /* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be
244*22dc650dSSadaf Ebrahimi rejected by the length test below. However, we do the appropriate tests
245*22dc650dSSadaf Ebrahimi here so that overlong sequences get diagnosed, and also in case there is
246*22dc650dSSadaf Ebrahimi ever an option for handling these larger code points. */
247*22dc650dSSadaf Ebrahimi
248*22dc650dSSadaf Ebrahimi /* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for
249*22dc650dSSadaf Ebrahimi 1111 1000, xx00 0xxx */
250*22dc650dSSadaf Ebrahimi
251*22dc650dSSadaf Ebrahimi case 4:
252*22dc650dSSadaf Ebrahimi if ((*(++p) & 0xc0) != 0x80) /* Third byte */
253*22dc650dSSadaf Ebrahimi {
254*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 2;
255*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR7;
256*22dc650dSSadaf Ebrahimi }
257*22dc650dSSadaf Ebrahimi if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
258*22dc650dSSadaf Ebrahimi {
259*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 3;
260*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR8;
261*22dc650dSSadaf Ebrahimi }
262*22dc650dSSadaf Ebrahimi if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
263*22dc650dSSadaf Ebrahimi {
264*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 4;
265*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR9;
266*22dc650dSSadaf Ebrahimi }
267*22dc650dSSadaf Ebrahimi if (c == 0xf8 && (d & 0x38) == 0)
268*22dc650dSSadaf Ebrahimi {
269*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 4;
270*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR18;
271*22dc650dSSadaf Ebrahimi }
272*22dc650dSSadaf Ebrahimi break;
273*22dc650dSSadaf Ebrahimi
274*22dc650dSSadaf Ebrahimi /* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for
275*22dc650dSSadaf Ebrahimi 1111 1100, xx00 00xx. */
276*22dc650dSSadaf Ebrahimi
277*22dc650dSSadaf Ebrahimi case 5:
278*22dc650dSSadaf Ebrahimi if ((*(++p) & 0xc0) != 0x80) /* Third byte */
279*22dc650dSSadaf Ebrahimi {
280*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 2;
281*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR7;
282*22dc650dSSadaf Ebrahimi }
283*22dc650dSSadaf Ebrahimi if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
284*22dc650dSSadaf Ebrahimi {
285*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 3;
286*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR8;
287*22dc650dSSadaf Ebrahimi }
288*22dc650dSSadaf Ebrahimi if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
289*22dc650dSSadaf Ebrahimi {
290*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 4;
291*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR9;
292*22dc650dSSadaf Ebrahimi }
293*22dc650dSSadaf Ebrahimi if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */
294*22dc650dSSadaf Ebrahimi {
295*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 5;
296*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR10;
297*22dc650dSSadaf Ebrahimi }
298*22dc650dSSadaf Ebrahimi if (c == 0xfc && (d & 0x3c) == 0)
299*22dc650dSSadaf Ebrahimi {
300*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 5;
301*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF8_ERR19;
302*22dc650dSSadaf Ebrahimi }
303*22dc650dSSadaf Ebrahimi break;
304*22dc650dSSadaf Ebrahimi }
305*22dc650dSSadaf Ebrahimi
306*22dc650dSSadaf Ebrahimi /* Character is valid under RFC 2279, but 4-byte and 5-byte characters are
307*22dc650dSSadaf Ebrahimi excluded by RFC 3629. The pointer p is currently at the last byte of the
308*22dc650dSSadaf Ebrahimi character. */
309*22dc650dSSadaf Ebrahimi
310*22dc650dSSadaf Ebrahimi if (ab > 3)
311*22dc650dSSadaf Ebrahimi {
312*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - ab;
313*22dc650dSSadaf Ebrahimi return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12;
314*22dc650dSSadaf Ebrahimi }
315*22dc650dSSadaf Ebrahimi }
316*22dc650dSSadaf Ebrahimi return 0;
317*22dc650dSSadaf Ebrahimi
318*22dc650dSSadaf Ebrahimi
319*22dc650dSSadaf Ebrahimi /* ----------------- Check a UTF-16 string ----------------- */
320*22dc650dSSadaf Ebrahimi
321*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 16
322*22dc650dSSadaf Ebrahimi
323*22dc650dSSadaf Ebrahimi /* There's not so much work, nor so many errors, for UTF-16.
324*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at the end of the string
325*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate
326*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate
327*22dc650dSSadaf Ebrahimi */
328*22dc650dSSadaf Ebrahimi
329*22dc650dSSadaf Ebrahimi for (p = string; length > 0; p++)
330*22dc650dSSadaf Ebrahimi {
331*22dc650dSSadaf Ebrahimi c = *p;
332*22dc650dSSadaf Ebrahimi length--;
333*22dc650dSSadaf Ebrahimi
334*22dc650dSSadaf Ebrahimi if ((c & 0xf800) != 0xd800)
335*22dc650dSSadaf Ebrahimi {
336*22dc650dSSadaf Ebrahimi /* Normal UTF-16 code point. Neither high nor low surrogate. */
337*22dc650dSSadaf Ebrahimi }
338*22dc650dSSadaf Ebrahimi else if ((c & 0x0400) == 0)
339*22dc650dSSadaf Ebrahimi {
340*22dc650dSSadaf Ebrahimi /* High surrogate. Must be a followed by a low surrogate. */
341*22dc650dSSadaf Ebrahimi if (length == 0)
342*22dc650dSSadaf Ebrahimi {
343*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string);
344*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF16_ERR1;
345*22dc650dSSadaf Ebrahimi }
346*22dc650dSSadaf Ebrahimi p++;
347*22dc650dSSadaf Ebrahimi length--;
348*22dc650dSSadaf Ebrahimi if ((*p & 0xfc00) != 0xdc00)
349*22dc650dSSadaf Ebrahimi {
350*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string) - 1;
351*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF16_ERR2;
352*22dc650dSSadaf Ebrahimi }
353*22dc650dSSadaf Ebrahimi }
354*22dc650dSSadaf Ebrahimi else
355*22dc650dSSadaf Ebrahimi {
356*22dc650dSSadaf Ebrahimi /* Isolated low surrogate. Always an error. */
357*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string);
358*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF16_ERR3;
359*22dc650dSSadaf Ebrahimi }
360*22dc650dSSadaf Ebrahimi }
361*22dc650dSSadaf Ebrahimi return 0;
362*22dc650dSSadaf Ebrahimi
363*22dc650dSSadaf Ebrahimi
364*22dc650dSSadaf Ebrahimi
365*22dc650dSSadaf Ebrahimi /* ----------------- Check a UTF-32 string ----------------- */
366*22dc650dSSadaf Ebrahimi
367*22dc650dSSadaf Ebrahimi #else
368*22dc650dSSadaf Ebrahimi
369*22dc650dSSadaf Ebrahimi /* There is very little to do for a UTF-32 string.
370*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF32_ERR1 Surrogate character
371*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff
372*22dc650dSSadaf Ebrahimi */
373*22dc650dSSadaf Ebrahimi
374*22dc650dSSadaf Ebrahimi for (p = string; length > 0; length--, p++)
375*22dc650dSSadaf Ebrahimi {
376*22dc650dSSadaf Ebrahimi c = *p;
377*22dc650dSSadaf Ebrahimi if ((c & 0xfffff800u) != 0xd800u)
378*22dc650dSSadaf Ebrahimi {
379*22dc650dSSadaf Ebrahimi /* Normal UTF-32 code point. Neither high nor low surrogate. */
380*22dc650dSSadaf Ebrahimi if (c > 0x10ffffu)
381*22dc650dSSadaf Ebrahimi {
382*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string);
383*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF32_ERR2;
384*22dc650dSSadaf Ebrahimi }
385*22dc650dSSadaf Ebrahimi }
386*22dc650dSSadaf Ebrahimi else
387*22dc650dSSadaf Ebrahimi {
388*22dc650dSSadaf Ebrahimi /* A surrogate */
389*22dc650dSSadaf Ebrahimi *erroroffset = (PCRE2_SIZE)(p - string);
390*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_UTF32_ERR1;
391*22dc650dSSadaf Ebrahimi }
392*22dc650dSSadaf Ebrahimi }
393*22dc650dSSadaf Ebrahimi return 0;
394*22dc650dSSadaf Ebrahimi #endif /* CODE_UNIT_WIDTH */
395*22dc650dSSadaf Ebrahimi }
396*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
397*22dc650dSSadaf Ebrahimi
398*22dc650dSSadaf Ebrahimi /* End of pcre2_valid_utf.c */
399