xref: /aosp_15_r20/external/pcre/src/pcre2_valid_utf.c (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi *      Perl-Compatible Regular Expressions       *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi 
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi 
8*22dc650dSSadaf Ebrahimi                        Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi      Original API code Copyright (c) 1997-2012 University of Cambridge
10*22dc650dSSadaf Ebrahimi           New API code Copyright (c) 2016-2020 University of Cambridge
11*22dc650dSSadaf Ebrahimi 
12*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
15*22dc650dSSadaf Ebrahimi 
16*22dc650dSSadaf Ebrahimi     * Redistributions of source code must retain the above copyright notice,
17*22dc650dSSadaf Ebrahimi       this list of conditions and the following disclaimer.
18*22dc650dSSadaf Ebrahimi 
19*22dc650dSSadaf Ebrahimi     * Redistributions in binary form must reproduce the above copyright
20*22dc650dSSadaf Ebrahimi       notice, this list of conditions and the following disclaimer in the
21*22dc650dSSadaf Ebrahimi       documentation and/or other materials provided with the distribution.
22*22dc650dSSadaf Ebrahimi 
23*22dc650dSSadaf Ebrahimi     * Neither the name of the University of Cambridge nor the names of its
24*22dc650dSSadaf Ebrahimi       contributors may be used to endorse or promote products derived from
25*22dc650dSSadaf Ebrahimi       this software without specific prior written permission.
26*22dc650dSSadaf Ebrahimi 
27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
38*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
39*22dc650dSSadaf Ebrahimi */
40*22dc650dSSadaf Ebrahimi 
41*22dc650dSSadaf Ebrahimi 
42*22dc650dSSadaf Ebrahimi /* This module contains an internal function for validating UTF character
43*22dc650dSSadaf Ebrahimi strings. This file is also #included by the pcre2test program, which uses
44*22dc650dSSadaf Ebrahimi macros to change names from _pcre2_xxx to xxxx, thereby avoiding name clashes
45*22dc650dSSadaf Ebrahimi with the library. In this case, PCRE2_PCRE2TEST is defined. */
46*22dc650dSSadaf Ebrahimi 
47*22dc650dSSadaf Ebrahimi #ifndef PCRE2_PCRE2TEST           /* We're compiling the library */
48*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
49*22dc650dSSadaf Ebrahimi #include "config.h"
50*22dc650dSSadaf Ebrahimi #endif
51*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"
52*22dc650dSSadaf Ebrahimi #endif /* PCRE2_PCRE2TEST */
53*22dc650dSSadaf Ebrahimi 
54*22dc650dSSadaf Ebrahimi 
55*22dc650dSSadaf Ebrahimi #ifndef SUPPORT_UNICODE
56*22dc650dSSadaf Ebrahimi /*************************************************
57*22dc650dSSadaf Ebrahimi *  Dummy function when Unicode is not supported  *
58*22dc650dSSadaf Ebrahimi *************************************************/
59*22dc650dSSadaf Ebrahimi 
60*22dc650dSSadaf Ebrahimi /* This function should never be called when Unicode is not supported. */
61*22dc650dSSadaf Ebrahimi 
62*22dc650dSSadaf Ebrahimi int
PRIV(valid_utf)63*22dc650dSSadaf Ebrahimi PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset)
64*22dc650dSSadaf Ebrahimi {
65*22dc650dSSadaf Ebrahimi (void)string;
66*22dc650dSSadaf Ebrahimi (void)length;
67*22dc650dSSadaf Ebrahimi (void)erroroffset;
68*22dc650dSSadaf Ebrahimi return 0;
69*22dc650dSSadaf Ebrahimi }
70*22dc650dSSadaf Ebrahimi #else  /* UTF is supported */
71*22dc650dSSadaf Ebrahimi 
72*22dc650dSSadaf Ebrahimi 
73*22dc650dSSadaf Ebrahimi 
74*22dc650dSSadaf Ebrahimi /*************************************************
75*22dc650dSSadaf Ebrahimi *           Validate a UTF string                *
76*22dc650dSSadaf Ebrahimi *************************************************/
77*22dc650dSSadaf Ebrahimi 
/* This function is called (optionally) at the start of compile or match, to
check that a supposed UTF string is actually valid. The early check means
that subsequent code can assume it is dealing with a valid string. The check
can be turned off for maximum performance, but the consequences of supplying an
invalid string are then undefined.

Arguments:
  string       points to the string
  length       length of string
  erroroffset  pointer to an error position offset variable

Returns:       == 0    if the string is a valid UTF string
               != 0    otherwise, setting the offset of the bad character
*/
92*22dc650dSSadaf Ebrahimi 
93*22dc650dSSadaf Ebrahimi int
PRIV(valid_utf)94*22dc650dSSadaf Ebrahimi PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset)
95*22dc650dSSadaf Ebrahimi {
96*22dc650dSSadaf Ebrahimi PCRE2_SPTR p;
97*22dc650dSSadaf Ebrahimi uint32_t c;
98*22dc650dSSadaf Ebrahimi 
99*22dc650dSSadaf Ebrahimi /* ----------------- Check a UTF-8 string ----------------- */
100*22dc650dSSadaf Ebrahimi 
101*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
102*22dc650dSSadaf Ebrahimi 
103*22dc650dSSadaf Ebrahimi /* Originally, this function checked according to RFC 2279, allowing for values
104*22dc650dSSadaf Ebrahimi in the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were
105*22dc650dSSadaf Ebrahimi in the canonical format. Once somebody had pointed out RFC 3629 to me (it
106*22dc650dSSadaf Ebrahimi obsoletes 2279), additional restrictions were applied. The values are now
107*22dc650dSSadaf Ebrahimi limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
108*22dc650dSSadaf Ebrahimi subrange 0xd000 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
109*22dc650dSSadaf Ebrahimi characters is still checked. Error returns are as follows:
110*22dc650dSSadaf Ebrahimi 
111*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR1   Missing 1 byte at the end of the string
112*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR2   Missing 2 bytes at the end of the string
113*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR3   Missing 3 bytes at the end of the string
114*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR4   Missing 4 bytes at the end of the string
115*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR5   Missing 5 bytes at the end of the string
116*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR6   2nd-byte's two top bits are not 0x80
117*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR7   3rd-byte's two top bits are not 0x80
118*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR8   4th-byte's two top bits are not 0x80
119*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR9   5th-byte's two top bits are not 0x80
120*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR10  6th-byte's two top bits are not 0x80
121*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR11  5-byte character is not permitted by RFC 3629
122*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR12  6-byte character is not permitted by RFC 3629
123*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR13  4-byte character with value > 0x10ffff is not permitted
124*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR14  3-byte character with value 0xd800-0xdfff is not permitted
125*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR15  Overlong 2-byte sequence
126*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR16  Overlong 3-byte sequence
127*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR17  Overlong 4-byte sequence
128*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR18  Overlong 5-byte sequence (won't ever occur)
129*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR19  Overlong 6-byte sequence (won't ever occur)
130*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR20  Isolated 0x80 byte (not within UTF-8 character)
131*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF8_ERR21  Byte with the illegal value 0xfe or 0xff
132*22dc650dSSadaf Ebrahimi */
133*22dc650dSSadaf Ebrahimi 
134*22dc650dSSadaf Ebrahimi for (p = string; length > 0; p++)
135*22dc650dSSadaf Ebrahimi   {
136*22dc650dSSadaf Ebrahimi   uint32_t ab, d;
137*22dc650dSSadaf Ebrahimi 
138*22dc650dSSadaf Ebrahimi   c = *p;
139*22dc650dSSadaf Ebrahimi   length--;
140*22dc650dSSadaf Ebrahimi 
141*22dc650dSSadaf Ebrahimi   if (c < 128) continue;                /* ASCII character */
142*22dc650dSSadaf Ebrahimi 
143*22dc650dSSadaf Ebrahimi   if (c < 0xc0)                         /* Isolated 10xx xxxx byte */
144*22dc650dSSadaf Ebrahimi     {
145*22dc650dSSadaf Ebrahimi     *erroroffset = (PCRE2_SIZE)(p - string);
146*22dc650dSSadaf Ebrahimi     return PCRE2_ERROR_UTF8_ERR20;
147*22dc650dSSadaf Ebrahimi     }
148*22dc650dSSadaf Ebrahimi 
149*22dc650dSSadaf Ebrahimi   if (c >= 0xfe)                        /* Invalid 0xfe or 0xff bytes */
150*22dc650dSSadaf Ebrahimi     {
151*22dc650dSSadaf Ebrahimi     *erroroffset = (PCRE2_SIZE)(p - string);
152*22dc650dSSadaf Ebrahimi     return PCRE2_ERROR_UTF8_ERR21;
153*22dc650dSSadaf Ebrahimi     }
154*22dc650dSSadaf Ebrahimi 
155*22dc650dSSadaf Ebrahimi   ab = PRIV(utf8_table4)[c & 0x3f];     /* Number of additional bytes (1-5) */
156*22dc650dSSadaf Ebrahimi   if (length < ab)                      /* Missing bytes */
157*22dc650dSSadaf Ebrahimi     {
158*22dc650dSSadaf Ebrahimi     *erroroffset = (PCRE2_SIZE)(p - string);
159*22dc650dSSadaf Ebrahimi     switch(ab - length)
160*22dc650dSSadaf Ebrahimi       {
161*22dc650dSSadaf Ebrahimi       case 1: return PCRE2_ERROR_UTF8_ERR1;
162*22dc650dSSadaf Ebrahimi       case 2: return PCRE2_ERROR_UTF8_ERR2;
163*22dc650dSSadaf Ebrahimi       case 3: return PCRE2_ERROR_UTF8_ERR3;
164*22dc650dSSadaf Ebrahimi       case 4: return PCRE2_ERROR_UTF8_ERR4;
165*22dc650dSSadaf Ebrahimi       case 5: return PCRE2_ERROR_UTF8_ERR5;
166*22dc650dSSadaf Ebrahimi       }
167*22dc650dSSadaf Ebrahimi     }
168*22dc650dSSadaf Ebrahimi   length -= ab;                         /* Length remaining */
169*22dc650dSSadaf Ebrahimi 
170*22dc650dSSadaf Ebrahimi   /* Check top bits in the second byte */
171*22dc650dSSadaf Ebrahimi 
172*22dc650dSSadaf Ebrahimi   if (((d = *(++p)) & 0xc0) != 0x80)
173*22dc650dSSadaf Ebrahimi     {
174*22dc650dSSadaf Ebrahimi     *erroroffset = (PCRE2_SIZE)(p - string) - 1;
175*22dc650dSSadaf Ebrahimi     return PCRE2_ERROR_UTF8_ERR6;
176*22dc650dSSadaf Ebrahimi     }
177*22dc650dSSadaf Ebrahimi 
178*22dc650dSSadaf Ebrahimi   /* For each length, check that the remaining bytes start with the 0x80 bit
179*22dc650dSSadaf Ebrahimi   set and not the 0x40 bit. Then check for an overlong sequence, and for the
180*22dc650dSSadaf Ebrahimi   excluded range 0xd800 to 0xdfff. */
181*22dc650dSSadaf Ebrahimi 
182*22dc650dSSadaf Ebrahimi   switch (ab)
183*22dc650dSSadaf Ebrahimi     {
184*22dc650dSSadaf Ebrahimi     /* 2-byte character. No further bytes to check for 0x80. Check first byte
185*22dc650dSSadaf Ebrahimi     for for xx00 000x (overlong sequence). */
186*22dc650dSSadaf Ebrahimi 
187*22dc650dSSadaf Ebrahimi     case 1: if ((c & 0x3e) == 0)
188*22dc650dSSadaf Ebrahimi       {
189*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 1;
190*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR15;
191*22dc650dSSadaf Ebrahimi       }
192*22dc650dSSadaf Ebrahimi     break;
193*22dc650dSSadaf Ebrahimi 
194*22dc650dSSadaf Ebrahimi     /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes
195*22dc650dSSadaf Ebrahimi       for 1110 0000, xx0x xxxx (overlong sequence) or
196*22dc650dSSadaf Ebrahimi           1110 1101, 1010 xxxx (0xd800 - 0xdfff) */
197*22dc650dSSadaf Ebrahimi 
198*22dc650dSSadaf Ebrahimi     case 2:
199*22dc650dSSadaf Ebrahimi     if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
200*22dc650dSSadaf Ebrahimi       {
201*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 2;
202*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR7;
203*22dc650dSSadaf Ebrahimi       }
204*22dc650dSSadaf Ebrahimi     if (c == 0xe0 && (d & 0x20) == 0)
205*22dc650dSSadaf Ebrahimi       {
206*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 2;
207*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR16;
208*22dc650dSSadaf Ebrahimi       }
209*22dc650dSSadaf Ebrahimi     if (c == 0xed && d >= 0xa0)
210*22dc650dSSadaf Ebrahimi       {
211*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 2;
212*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR14;
213*22dc650dSSadaf Ebrahimi       }
214*22dc650dSSadaf Ebrahimi     break;
215*22dc650dSSadaf Ebrahimi 
216*22dc650dSSadaf Ebrahimi     /* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
217*22dc650dSSadaf Ebrahimi        bytes for for 1111 0000, xx00 xxxx (overlong sequence), then check for a
218*22dc650dSSadaf Ebrahimi        character greater than 0x0010ffff (f4 8f bf bf) */
219*22dc650dSSadaf Ebrahimi 
220*22dc650dSSadaf Ebrahimi     case 3:
221*22dc650dSSadaf Ebrahimi     if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
222*22dc650dSSadaf Ebrahimi       {
223*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 2;
224*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR7;
225*22dc650dSSadaf Ebrahimi       }
226*22dc650dSSadaf Ebrahimi     if ((*(++p) & 0xc0) != 0x80)     /* Fourth byte */
227*22dc650dSSadaf Ebrahimi       {
228*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 3;
229*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR8;
230*22dc650dSSadaf Ebrahimi       }
231*22dc650dSSadaf Ebrahimi     if (c == 0xf0 && (d & 0x30) == 0)
232*22dc650dSSadaf Ebrahimi       {
233*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 3;
234*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR17;
235*22dc650dSSadaf Ebrahimi       }
236*22dc650dSSadaf Ebrahimi     if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
237*22dc650dSSadaf Ebrahimi       {
238*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 3;
239*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR13;
240*22dc650dSSadaf Ebrahimi       }
241*22dc650dSSadaf Ebrahimi     break;
242*22dc650dSSadaf Ebrahimi 
243*22dc650dSSadaf Ebrahimi     /* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be
244*22dc650dSSadaf Ebrahimi     rejected by the length test below. However, we do the appropriate tests
245*22dc650dSSadaf Ebrahimi     here so that overlong sequences get diagnosed, and also in case there is
246*22dc650dSSadaf Ebrahimi     ever an option for handling these larger code points. */
247*22dc650dSSadaf Ebrahimi 
248*22dc650dSSadaf Ebrahimi     /* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for
249*22dc650dSSadaf Ebrahimi     1111 1000, xx00 0xxx */
250*22dc650dSSadaf Ebrahimi 
251*22dc650dSSadaf Ebrahimi     case 4:
252*22dc650dSSadaf Ebrahimi     if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
253*22dc650dSSadaf Ebrahimi       {
254*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 2;
255*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR7;
256*22dc650dSSadaf Ebrahimi       }
257*22dc650dSSadaf Ebrahimi     if ((*(++p) & 0xc0) != 0x80)     /* Fourth byte */
258*22dc650dSSadaf Ebrahimi       {
259*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 3;
260*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR8;
261*22dc650dSSadaf Ebrahimi       }
262*22dc650dSSadaf Ebrahimi     if ((*(++p) & 0xc0) != 0x80)     /* Fifth byte */
263*22dc650dSSadaf Ebrahimi       {
264*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 4;
265*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR9;
266*22dc650dSSadaf Ebrahimi       }
267*22dc650dSSadaf Ebrahimi     if (c == 0xf8 && (d & 0x38) == 0)
268*22dc650dSSadaf Ebrahimi       {
269*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 4;
270*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR18;
271*22dc650dSSadaf Ebrahimi       }
272*22dc650dSSadaf Ebrahimi     break;
273*22dc650dSSadaf Ebrahimi 
274*22dc650dSSadaf Ebrahimi     /* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for
275*22dc650dSSadaf Ebrahimi     1111 1100, xx00 00xx. */
276*22dc650dSSadaf Ebrahimi 
277*22dc650dSSadaf Ebrahimi     case 5:
278*22dc650dSSadaf Ebrahimi     if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
279*22dc650dSSadaf Ebrahimi       {
280*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 2;
281*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR7;
282*22dc650dSSadaf Ebrahimi       }
283*22dc650dSSadaf Ebrahimi     if ((*(++p) & 0xc0) != 0x80)     /* Fourth byte */
284*22dc650dSSadaf Ebrahimi       {
285*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 3;
286*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR8;
287*22dc650dSSadaf Ebrahimi       }
288*22dc650dSSadaf Ebrahimi     if ((*(++p) & 0xc0) != 0x80)     /* Fifth byte */
289*22dc650dSSadaf Ebrahimi       {
290*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 4;
291*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR9;
292*22dc650dSSadaf Ebrahimi       }
293*22dc650dSSadaf Ebrahimi     if ((*(++p) & 0xc0) != 0x80)     /* Sixth byte */
294*22dc650dSSadaf Ebrahimi       {
295*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 5;
296*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR10;
297*22dc650dSSadaf Ebrahimi       }
298*22dc650dSSadaf Ebrahimi     if (c == 0xfc && (d & 0x3c) == 0)
299*22dc650dSSadaf Ebrahimi       {
300*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 5;
301*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF8_ERR19;
302*22dc650dSSadaf Ebrahimi       }
303*22dc650dSSadaf Ebrahimi     break;
304*22dc650dSSadaf Ebrahimi     }
305*22dc650dSSadaf Ebrahimi 
306*22dc650dSSadaf Ebrahimi   /* Character is valid under RFC 2279, but 4-byte and 5-byte characters are
307*22dc650dSSadaf Ebrahimi   excluded by RFC 3629. The pointer p is currently at the last byte of the
308*22dc650dSSadaf Ebrahimi   character. */
309*22dc650dSSadaf Ebrahimi 
310*22dc650dSSadaf Ebrahimi   if (ab > 3)
311*22dc650dSSadaf Ebrahimi     {
312*22dc650dSSadaf Ebrahimi     *erroroffset = (PCRE2_SIZE)(p - string) - ab;
313*22dc650dSSadaf Ebrahimi     return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12;
314*22dc650dSSadaf Ebrahimi     }
315*22dc650dSSadaf Ebrahimi   }
316*22dc650dSSadaf Ebrahimi return 0;
317*22dc650dSSadaf Ebrahimi 
318*22dc650dSSadaf Ebrahimi 
319*22dc650dSSadaf Ebrahimi /* ----------------- Check a UTF-16 string ----------------- */
320*22dc650dSSadaf Ebrahimi 
321*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 16
322*22dc650dSSadaf Ebrahimi 
323*22dc650dSSadaf Ebrahimi /* There's not so much work, nor so many errors, for UTF-16.
324*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF16_ERR1  Missing low surrogate at the end of the string
325*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF16_ERR2  Invalid low surrogate
326*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF16_ERR3  Isolated low surrogate
327*22dc650dSSadaf Ebrahimi */
328*22dc650dSSadaf Ebrahimi 
329*22dc650dSSadaf Ebrahimi for (p = string; length > 0; p++)
330*22dc650dSSadaf Ebrahimi   {
331*22dc650dSSadaf Ebrahimi   c = *p;
332*22dc650dSSadaf Ebrahimi   length--;
333*22dc650dSSadaf Ebrahimi 
334*22dc650dSSadaf Ebrahimi   if ((c & 0xf800) != 0xd800)
335*22dc650dSSadaf Ebrahimi     {
336*22dc650dSSadaf Ebrahimi     /* Normal UTF-16 code point. Neither high nor low surrogate. */
337*22dc650dSSadaf Ebrahimi     }
338*22dc650dSSadaf Ebrahimi   else if ((c & 0x0400) == 0)
339*22dc650dSSadaf Ebrahimi     {
340*22dc650dSSadaf Ebrahimi     /* High surrogate. Must be a followed by a low surrogate. */
341*22dc650dSSadaf Ebrahimi     if (length == 0)
342*22dc650dSSadaf Ebrahimi       {
343*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string);
344*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF16_ERR1;
345*22dc650dSSadaf Ebrahimi       }
346*22dc650dSSadaf Ebrahimi     p++;
347*22dc650dSSadaf Ebrahimi     length--;
348*22dc650dSSadaf Ebrahimi     if ((*p & 0xfc00) != 0xdc00)
349*22dc650dSSadaf Ebrahimi       {
350*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string) - 1;
351*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF16_ERR2;
352*22dc650dSSadaf Ebrahimi       }
353*22dc650dSSadaf Ebrahimi     }
354*22dc650dSSadaf Ebrahimi   else
355*22dc650dSSadaf Ebrahimi     {
356*22dc650dSSadaf Ebrahimi     /* Isolated low surrogate. Always an error. */
357*22dc650dSSadaf Ebrahimi     *erroroffset = (PCRE2_SIZE)(p - string);
358*22dc650dSSadaf Ebrahimi     return PCRE2_ERROR_UTF16_ERR3;
359*22dc650dSSadaf Ebrahimi     }
360*22dc650dSSadaf Ebrahimi   }
361*22dc650dSSadaf Ebrahimi return 0;
362*22dc650dSSadaf Ebrahimi 
363*22dc650dSSadaf Ebrahimi 
364*22dc650dSSadaf Ebrahimi 
365*22dc650dSSadaf Ebrahimi /* ----------------- Check a UTF-32 string ----------------- */
366*22dc650dSSadaf Ebrahimi 
367*22dc650dSSadaf Ebrahimi #else
368*22dc650dSSadaf Ebrahimi 
369*22dc650dSSadaf Ebrahimi /* There is very little to do for a UTF-32 string.
370*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF32_ERR1  Surrogate character
371*22dc650dSSadaf Ebrahimi PCRE2_ERROR_UTF32_ERR2  Character > 0x10ffff
372*22dc650dSSadaf Ebrahimi */
373*22dc650dSSadaf Ebrahimi 
374*22dc650dSSadaf Ebrahimi for (p = string; length > 0; length--, p++)
375*22dc650dSSadaf Ebrahimi   {
376*22dc650dSSadaf Ebrahimi   c = *p;
377*22dc650dSSadaf Ebrahimi   if ((c & 0xfffff800u) != 0xd800u)
378*22dc650dSSadaf Ebrahimi     {
379*22dc650dSSadaf Ebrahimi     /* Normal UTF-32 code point. Neither high nor low surrogate. */
380*22dc650dSSadaf Ebrahimi     if (c > 0x10ffffu)
381*22dc650dSSadaf Ebrahimi       {
382*22dc650dSSadaf Ebrahimi       *erroroffset = (PCRE2_SIZE)(p - string);
383*22dc650dSSadaf Ebrahimi       return PCRE2_ERROR_UTF32_ERR2;
384*22dc650dSSadaf Ebrahimi       }
385*22dc650dSSadaf Ebrahimi     }
386*22dc650dSSadaf Ebrahimi   else
387*22dc650dSSadaf Ebrahimi     {
388*22dc650dSSadaf Ebrahimi     /* A surrogate */
389*22dc650dSSadaf Ebrahimi     *erroroffset = (PCRE2_SIZE)(p - string);
390*22dc650dSSadaf Ebrahimi     return PCRE2_ERROR_UTF32_ERR1;
391*22dc650dSSadaf Ebrahimi     }
392*22dc650dSSadaf Ebrahimi   }
393*22dc650dSSadaf Ebrahimi return 0;
394*22dc650dSSadaf Ebrahimi #endif  /* CODE_UNIT_WIDTH */
395*22dc650dSSadaf Ebrahimi }
396*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
397*22dc650dSSadaf Ebrahimi 
398*22dc650dSSadaf Ebrahimi /* End of pcre2_valid_utf.c */
399