xref: /aosp_15_r20/external/pcre/src/pcre2_extuni.c (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi *      Perl-Compatible Regular Expressions       *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi 
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi 
8*22dc650dSSadaf Ebrahimi                        Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi      Original API code Copyright (c) 1997-2012 University of Cambridge
10*22dc650dSSadaf Ebrahimi           New API code Copyright (c) 2016-2024 University of Cambridge
11*22dc650dSSadaf Ebrahimi 
12*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
15*22dc650dSSadaf Ebrahimi 
16*22dc650dSSadaf Ebrahimi     * Redistributions of source code must retain the above copyright notice,
17*22dc650dSSadaf Ebrahimi       this list of conditions and the following disclaimer.
18*22dc650dSSadaf Ebrahimi 
19*22dc650dSSadaf Ebrahimi     * Redistributions in binary form must reproduce the above copyright
20*22dc650dSSadaf Ebrahimi       notice, this list of conditions and the following disclaimer in the
21*22dc650dSSadaf Ebrahimi       documentation and/or other materials provided with the distribution.
22*22dc650dSSadaf Ebrahimi 
23*22dc650dSSadaf Ebrahimi     * Neither the name of the University of Cambridge nor the names of its
24*22dc650dSSadaf Ebrahimi       contributors may be used to endorse or promote products derived from
25*22dc650dSSadaf Ebrahimi       this software without specific prior written permission.
26*22dc650dSSadaf Ebrahimi 
27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
38*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
39*22dc650dSSadaf Ebrahimi */
40*22dc650dSSadaf Ebrahimi 
41*22dc650dSSadaf Ebrahimi /* This module contains an internal function that is used to match a Unicode
42*22dc650dSSadaf Ebrahimi extended grapheme sequence. It is used by both pcre2_match() and
43*22dc650dSSadaf Ebrahimi pcre2_dfa_match(). However, it is called only when Unicode support is being
44*22dc650dSSadaf Ebrahimi compiled. Nevertheless, we provide a dummy function when there is no Unicode
45*22dc650dSSadaf Ebrahimi support, because some compilers do not like functionless source files. */
46*22dc650dSSadaf Ebrahimi 
47*22dc650dSSadaf Ebrahimi 
48*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
49*22dc650dSSadaf Ebrahimi #include "config.h"
50*22dc650dSSadaf Ebrahimi #endif
51*22dc650dSSadaf Ebrahimi 
52*22dc650dSSadaf Ebrahimi 
53*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"
54*22dc650dSSadaf Ebrahimi 
55*22dc650dSSadaf Ebrahimi 
56*22dc650dSSadaf Ebrahimi /* Dummy function */
57*22dc650dSSadaf Ebrahimi 
58*22dc650dSSadaf Ebrahimi #ifndef SUPPORT_UNICODE
59*22dc650dSSadaf Ebrahimi PCRE2_SPTR
PRIV(extuni)60*22dc650dSSadaf Ebrahimi PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject,
61*22dc650dSSadaf Ebrahimi   PCRE2_SPTR end_subject, BOOL utf, int *xcount)
62*22dc650dSSadaf Ebrahimi {
63*22dc650dSSadaf Ebrahimi (void)c;
64*22dc650dSSadaf Ebrahimi (void)eptr;
65*22dc650dSSadaf Ebrahimi (void)start_subject;
66*22dc650dSSadaf Ebrahimi (void)end_subject;
67*22dc650dSSadaf Ebrahimi (void)utf;
68*22dc650dSSadaf Ebrahimi (void)xcount;
69*22dc650dSSadaf Ebrahimi return NULL;
70*22dc650dSSadaf Ebrahimi }
71*22dc650dSSadaf Ebrahimi #else
72*22dc650dSSadaf Ebrahimi 
73*22dc650dSSadaf Ebrahimi 
74*22dc650dSSadaf Ebrahimi /*************************************************
75*22dc650dSSadaf Ebrahimi *      Match an extended grapheme sequence       *
76*22dc650dSSadaf Ebrahimi *************************************************/
77*22dc650dSSadaf Ebrahimi 
78*22dc650dSSadaf Ebrahimi /* NOTE: The logic contained in this function is replicated in three special-
79*22dc650dSSadaf Ebrahimi purpose functions in the pcre2_jit_compile.c module. If the logic below is
80*22dc650dSSadaf Ebrahimi changed, they must be kept in step so that the interpreter and the JIT have the
81*22dc650dSSadaf Ebrahimi same behaviour.
82*22dc650dSSadaf Ebrahimi 
83*22dc650dSSadaf Ebrahimi Arguments:
84*22dc650dSSadaf Ebrahimi   c              the first character
85*22dc650dSSadaf Ebrahimi   eptr           pointer to next character
86*22dc650dSSadaf Ebrahimi   start_subject  pointer to start of subject
87*22dc650dSSadaf Ebrahimi   end_subject    pointer to end of subject
88*22dc650dSSadaf Ebrahimi   utf            TRUE if in UTF mode
89*22dc650dSSadaf Ebrahimi   xcount         pointer to count of additional characters,
90*22dc650dSSadaf Ebrahimi                    or NULL if count not needed
91*22dc650dSSadaf Ebrahimi 
92*22dc650dSSadaf Ebrahimi Returns:         pointer after the end of the sequence
93*22dc650dSSadaf Ebrahimi */
94*22dc650dSSadaf Ebrahimi 
95*22dc650dSSadaf Ebrahimi PCRE2_SPTR
PRIV(extuni)96*22dc650dSSadaf Ebrahimi PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject,
97*22dc650dSSadaf Ebrahimi   PCRE2_SPTR end_subject, BOOL utf, int *xcount)
98*22dc650dSSadaf Ebrahimi {
99*22dc650dSSadaf Ebrahimi BOOL was_ep_ZWJ = FALSE;
100*22dc650dSSadaf Ebrahimi int lgb = UCD_GRAPHBREAK(c);
101*22dc650dSSadaf Ebrahimi 
102*22dc650dSSadaf Ebrahimi while (eptr < end_subject)
103*22dc650dSSadaf Ebrahimi   {
104*22dc650dSSadaf Ebrahimi   int rgb;
105*22dc650dSSadaf Ebrahimi   int len = 1;
106*22dc650dSSadaf Ebrahimi   if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
107*22dc650dSSadaf Ebrahimi   rgb = UCD_GRAPHBREAK(c);
108*22dc650dSSadaf Ebrahimi   if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
109*22dc650dSSadaf Ebrahimi 
110*22dc650dSSadaf Ebrahimi   /* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was
111*22dc650dSSadaf Ebrahimi   preceded by Extended Pictographic. */
112*22dc650dSSadaf Ebrahimi 
113*22dc650dSSadaf Ebrahimi   if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ)
114*22dc650dSSadaf Ebrahimi     break;
115*22dc650dSSadaf Ebrahimi 
116*22dc650dSSadaf Ebrahimi   /* Not breaking between Regional Indicators is allowed only if there
117*22dc650dSSadaf Ebrahimi   are an even number of preceding RIs. */
118*22dc650dSSadaf Ebrahimi 
119*22dc650dSSadaf Ebrahimi   if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator)
120*22dc650dSSadaf Ebrahimi     {
121*22dc650dSSadaf Ebrahimi     int ricount = 0;
122*22dc650dSSadaf Ebrahimi     PCRE2_SPTR bptr = eptr - 1;
123*22dc650dSSadaf Ebrahimi     if (utf) BACKCHAR(bptr);
124*22dc650dSSadaf Ebrahimi 
125*22dc650dSSadaf Ebrahimi     /* bptr is pointing to the left-hand character */
126*22dc650dSSadaf Ebrahimi 
127*22dc650dSSadaf Ebrahimi     while (bptr > start_subject)
128*22dc650dSSadaf Ebrahimi       {
129*22dc650dSSadaf Ebrahimi       bptr--;
130*22dc650dSSadaf Ebrahimi       if (utf)
131*22dc650dSSadaf Ebrahimi         {
132*22dc650dSSadaf Ebrahimi         BACKCHAR(bptr);
133*22dc650dSSadaf Ebrahimi         GETCHAR(c, bptr);
134*22dc650dSSadaf Ebrahimi         }
135*22dc650dSSadaf Ebrahimi       else
136*22dc650dSSadaf Ebrahimi       c = *bptr;
137*22dc650dSSadaf Ebrahimi       if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break;
138*22dc650dSSadaf Ebrahimi       ricount++;
139*22dc650dSSadaf Ebrahimi       }
140*22dc650dSSadaf Ebrahimi     if ((ricount & 1) != 0) break;  /* Grapheme break required */
141*22dc650dSSadaf Ebrahimi     }
142*22dc650dSSadaf Ebrahimi 
143*22dc650dSSadaf Ebrahimi   /* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in
144*22dc650dSSadaf Ebrahimi   between; see next statement). */
145*22dc650dSSadaf Ebrahimi 
146*22dc650dSSadaf Ebrahimi   was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ);
147*22dc650dSSadaf Ebrahimi 
148*22dc650dSSadaf Ebrahimi   /* If Extend follows Extended_Pictographic, do not update lgb; this allows
149*22dc650dSSadaf Ebrahimi   any number of them before a following ZWJ. */
150*22dc650dSSadaf Ebrahimi 
151*22dc650dSSadaf Ebrahimi   if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic) lgb = rgb;
152*22dc650dSSadaf Ebrahimi 
153*22dc650dSSadaf Ebrahimi   eptr += len;
154*22dc650dSSadaf Ebrahimi   if (xcount != NULL) *xcount += 1;
155*22dc650dSSadaf Ebrahimi   }
156*22dc650dSSadaf Ebrahimi 
157*22dc650dSSadaf Ebrahimi return eptr;
158*22dc650dSSadaf Ebrahimi }
159*22dc650dSSadaf Ebrahimi 
160*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
161*22dc650dSSadaf Ebrahimi 
162*22dc650dSSadaf Ebrahimi /* End of pcre2_extuni.c */
163