1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi * Perl-Compatible Regular Expressions *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi
8*22dc650dSSadaf Ebrahimi Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi Original API code Copyright (c) 1997-2012 University of Cambridge
10*22dc650dSSadaf Ebrahimi New API code Copyright (c) 2016-2024 University of Cambridge
11*22dc650dSSadaf Ebrahimi
12*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
15*22dc650dSSadaf Ebrahimi
16*22dc650dSSadaf Ebrahimi * Redistributions of source code must retain the above copyright notice,
17*22dc650dSSadaf Ebrahimi this list of conditions and the following disclaimer.
18*22dc650dSSadaf Ebrahimi
19*22dc650dSSadaf Ebrahimi * Redistributions in binary form must reproduce the above copyright
20*22dc650dSSadaf Ebrahimi notice, this list of conditions and the following disclaimer in the
21*22dc650dSSadaf Ebrahimi documentation and/or other materials provided with the distribution.
22*22dc650dSSadaf Ebrahimi
23*22dc650dSSadaf Ebrahimi * Neither the name of the University of Cambridge nor the names of its
24*22dc650dSSadaf Ebrahimi contributors may be used to endorse or promote products derived from
25*22dc650dSSadaf Ebrahimi this software without specific prior written permission.
26*22dc650dSSadaf Ebrahimi
27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
38*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
39*22dc650dSSadaf Ebrahimi */
40*22dc650dSSadaf Ebrahimi
41*22dc650dSSadaf Ebrahimi /* This module contains an internal function that is used to match a Unicode
42*22dc650dSSadaf Ebrahimi extended grapheme sequence. It is used by both pcre2_match() and
pcre2_dfa_match(). However, it is called only when Unicode support is being
44*22dc650dSSadaf Ebrahimi compiled. Nevertheless, we provide a dummy function when there is no Unicode
45*22dc650dSSadaf Ebrahimi support, because some compilers do not like functionless source files. */
46*22dc650dSSadaf Ebrahimi
47*22dc650dSSadaf Ebrahimi
48*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
49*22dc650dSSadaf Ebrahimi #include "config.h"
50*22dc650dSSadaf Ebrahimi #endif
51*22dc650dSSadaf Ebrahimi
52*22dc650dSSadaf Ebrahimi
53*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"
54*22dc650dSSadaf Ebrahimi
55*22dc650dSSadaf Ebrahimi
56*22dc650dSSadaf Ebrahimi /* Dummy function */
57*22dc650dSSadaf Ebrahimi
58*22dc650dSSadaf Ebrahimi #ifndef SUPPORT_UNICODE
59*22dc650dSSadaf Ebrahimi PCRE2_SPTR
PRIV(extuni)60*22dc650dSSadaf Ebrahimi PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject,
61*22dc650dSSadaf Ebrahimi PCRE2_SPTR end_subject, BOOL utf, int *xcount)
62*22dc650dSSadaf Ebrahimi {
63*22dc650dSSadaf Ebrahimi (void)c;
64*22dc650dSSadaf Ebrahimi (void)eptr;
65*22dc650dSSadaf Ebrahimi (void)start_subject;
66*22dc650dSSadaf Ebrahimi (void)end_subject;
67*22dc650dSSadaf Ebrahimi (void)utf;
68*22dc650dSSadaf Ebrahimi (void)xcount;
69*22dc650dSSadaf Ebrahimi return NULL;
70*22dc650dSSadaf Ebrahimi }
71*22dc650dSSadaf Ebrahimi #else
72*22dc650dSSadaf Ebrahimi
73*22dc650dSSadaf Ebrahimi
74*22dc650dSSadaf Ebrahimi /*************************************************
75*22dc650dSSadaf Ebrahimi * Match an extended grapheme sequence *
76*22dc650dSSadaf Ebrahimi *************************************************/
77*22dc650dSSadaf Ebrahimi
78*22dc650dSSadaf Ebrahimi /* NOTE: The logic contained in this function is replicated in three special-
79*22dc650dSSadaf Ebrahimi purpose functions in the pcre2_jit_compile.c module. If the logic below is
80*22dc650dSSadaf Ebrahimi changed, they must be kept in step so that the interpreter and the JIT have the
81*22dc650dSSadaf Ebrahimi same behaviour.
82*22dc650dSSadaf Ebrahimi
83*22dc650dSSadaf Ebrahimi Arguments:
84*22dc650dSSadaf Ebrahimi c the first character
85*22dc650dSSadaf Ebrahimi eptr pointer to next character
86*22dc650dSSadaf Ebrahimi start_subject pointer to start of subject
87*22dc650dSSadaf Ebrahimi end_subject pointer to end of subject
88*22dc650dSSadaf Ebrahimi utf TRUE if in UTF mode
89*22dc650dSSadaf Ebrahimi xcount pointer to count of additional characters,
90*22dc650dSSadaf Ebrahimi or NULL if count not needed
91*22dc650dSSadaf Ebrahimi
92*22dc650dSSadaf Ebrahimi Returns: pointer after the end of the sequence
93*22dc650dSSadaf Ebrahimi */
94*22dc650dSSadaf Ebrahimi
95*22dc650dSSadaf Ebrahimi PCRE2_SPTR
PRIV(extuni)96*22dc650dSSadaf Ebrahimi PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject,
97*22dc650dSSadaf Ebrahimi PCRE2_SPTR end_subject, BOOL utf, int *xcount)
98*22dc650dSSadaf Ebrahimi {
99*22dc650dSSadaf Ebrahimi BOOL was_ep_ZWJ = FALSE;
100*22dc650dSSadaf Ebrahimi int lgb = UCD_GRAPHBREAK(c);
101*22dc650dSSadaf Ebrahimi
102*22dc650dSSadaf Ebrahimi while (eptr < end_subject)
103*22dc650dSSadaf Ebrahimi {
104*22dc650dSSadaf Ebrahimi int rgb;
105*22dc650dSSadaf Ebrahimi int len = 1;
106*22dc650dSSadaf Ebrahimi if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
107*22dc650dSSadaf Ebrahimi rgb = UCD_GRAPHBREAK(c);
108*22dc650dSSadaf Ebrahimi if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
109*22dc650dSSadaf Ebrahimi
110*22dc650dSSadaf Ebrahimi /* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was
111*22dc650dSSadaf Ebrahimi preceded by Extended Pictographic. */
112*22dc650dSSadaf Ebrahimi
113*22dc650dSSadaf Ebrahimi if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ)
114*22dc650dSSadaf Ebrahimi break;
115*22dc650dSSadaf Ebrahimi
116*22dc650dSSadaf Ebrahimi /* Not breaking between Regional Indicators is allowed only if there
117*22dc650dSSadaf Ebrahimi are an even number of preceding RIs. */
118*22dc650dSSadaf Ebrahimi
119*22dc650dSSadaf Ebrahimi if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator)
120*22dc650dSSadaf Ebrahimi {
121*22dc650dSSadaf Ebrahimi int ricount = 0;
122*22dc650dSSadaf Ebrahimi PCRE2_SPTR bptr = eptr - 1;
123*22dc650dSSadaf Ebrahimi if (utf) BACKCHAR(bptr);
124*22dc650dSSadaf Ebrahimi
125*22dc650dSSadaf Ebrahimi /* bptr is pointing to the left-hand character */
126*22dc650dSSadaf Ebrahimi
127*22dc650dSSadaf Ebrahimi while (bptr > start_subject)
128*22dc650dSSadaf Ebrahimi {
129*22dc650dSSadaf Ebrahimi bptr--;
130*22dc650dSSadaf Ebrahimi if (utf)
131*22dc650dSSadaf Ebrahimi {
132*22dc650dSSadaf Ebrahimi BACKCHAR(bptr);
133*22dc650dSSadaf Ebrahimi GETCHAR(c, bptr);
134*22dc650dSSadaf Ebrahimi }
135*22dc650dSSadaf Ebrahimi else
136*22dc650dSSadaf Ebrahimi c = *bptr;
137*22dc650dSSadaf Ebrahimi if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break;
138*22dc650dSSadaf Ebrahimi ricount++;
139*22dc650dSSadaf Ebrahimi }
140*22dc650dSSadaf Ebrahimi if ((ricount & 1) != 0) break; /* Grapheme break required */
141*22dc650dSSadaf Ebrahimi }
142*22dc650dSSadaf Ebrahimi
143*22dc650dSSadaf Ebrahimi /* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in
144*22dc650dSSadaf Ebrahimi between; see next statement). */
145*22dc650dSSadaf Ebrahimi
146*22dc650dSSadaf Ebrahimi was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ);
147*22dc650dSSadaf Ebrahimi
148*22dc650dSSadaf Ebrahimi /* If Extend follows Extended_Pictographic, do not update lgb; this allows
149*22dc650dSSadaf Ebrahimi any number of them before a following ZWJ. */
150*22dc650dSSadaf Ebrahimi
151*22dc650dSSadaf Ebrahimi if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic) lgb = rgb;
152*22dc650dSSadaf Ebrahimi
153*22dc650dSSadaf Ebrahimi eptr += len;
154*22dc650dSSadaf Ebrahimi if (xcount != NULL) *xcount += 1;
155*22dc650dSSadaf Ebrahimi }
156*22dc650dSSadaf Ebrahimi
157*22dc650dSSadaf Ebrahimi return eptr;
158*22dc650dSSadaf Ebrahimi }
159*22dc650dSSadaf Ebrahimi
160*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
161*22dc650dSSadaf Ebrahimi
162*22dc650dSSadaf Ebrahimi /* End of pcre2_extuni.c */
163