xref: /aosp_15_r20/external/pcre/src/pcre2_script_run.c (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi *      Perl-Compatible Regular Expressions       *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi 
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi 
8*22dc650dSSadaf Ebrahimi                        Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi      Original API code Copyright (c) 1997-2012 University of Cambridge
10*22dc650dSSadaf Ebrahimi           New API code Copyright (c) 2016-2021 University of Cambridge
11*22dc650dSSadaf Ebrahimi 
12*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
15*22dc650dSSadaf Ebrahimi 
16*22dc650dSSadaf Ebrahimi     * Redistributions of source code must retain the above copyright notice,
17*22dc650dSSadaf Ebrahimi       this list of conditions and the following disclaimer.
18*22dc650dSSadaf Ebrahimi 
19*22dc650dSSadaf Ebrahimi     * Redistributions in binary form must reproduce the above copyright
20*22dc650dSSadaf Ebrahimi       notice, this list of conditions and the following disclaimer in the
21*22dc650dSSadaf Ebrahimi       documentation and/or other materials provided with the distribution.
22*22dc650dSSadaf Ebrahimi 
23*22dc650dSSadaf Ebrahimi     * Neither the name of the University of Cambridge nor the names of its
24*22dc650dSSadaf Ebrahimi       contributors may be used to endorse or promote products derived from
25*22dc650dSSadaf Ebrahimi       this software without specific prior written permission.
26*22dc650dSSadaf Ebrahimi 
27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
38*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
39*22dc650dSSadaf Ebrahimi */
40*22dc650dSSadaf Ebrahimi 
41*22dc650dSSadaf Ebrahimi /* This module contains the function for checking a script run. */
42*22dc650dSSadaf Ebrahimi 
43*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
44*22dc650dSSadaf Ebrahimi #include "config.h"
45*22dc650dSSadaf Ebrahimi #endif
46*22dc650dSSadaf Ebrahimi 
47*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"
48*22dc650dSSadaf Ebrahimi 
49*22dc650dSSadaf Ebrahimi 
50*22dc650dSSadaf Ebrahimi /*************************************************
51*22dc650dSSadaf Ebrahimi *                Check script run                *
52*22dc650dSSadaf Ebrahimi *************************************************/
53*22dc650dSSadaf Ebrahimi 
/* A script run is conceptually a sequence of characters all in the same
Unicode script. However, it isn't quite that simple. There are special rules
for scripts that are commonly used together, and also special rules for digits.
This function implements the appropriate checks, which is possible only when
PCRE2 is compiled with Unicode support. The function returns TRUE if there is
no Unicode support; however, it should never be called in that circumstance
because an error is given by pcre2_compile() if a script run is called for in a
version of PCRE2 compiled without Unicode support.

Arguments:
  ptr       points to the first character
  endptr    points after the last character
  utf       TRUE if in UTF mode

Returns:    TRUE if this is a valid script run
*/
70*22dc650dSSadaf Ebrahimi 
71*22dc650dSSadaf Ebrahimi /* These are states in the checking process. */
72*22dc650dSSadaf Ebrahimi 
enum {
  SCRIPT_UNSET       = 0,   /* No requirement has been established yet */
  SCRIPT_MAP         = 1,   /* A bitmap holds the acceptable scripts */
  SCRIPT_HANPENDING  = 2,   /* Only Han characters have been seen so far */
  SCRIPT_HANHIRAKATA = 3,   /* Expect Han, Hiragana or Katakana */
  SCRIPT_HANBOPOMOFO = 4,   /* Expect Han or Bopomofo */
  SCRIPT_HANHANGUL   = 5    /* Expect Han or Hangul */
};
80*22dc650dSSadaf Ebrahimi 
/* Sizes, counted in 32-bit words, of the two kinds of script bitmap:
UCD_MAPSIZE words are enough for bit numbers up to ucp_Unknown, and
FULL_MAPSIZE words are enough for bit numbers up to ucp_Script_Count. */

#define UCD_MAPSIZE  ((ucp_Unknown / 32) + 1)
#define FULL_MAPSIZE ((ucp_Script_Count / 32) + 1)
83*22dc650dSSadaf Ebrahimi 
84*22dc650dSSadaf Ebrahimi BOOL
PRIV(script_run)85*22dc650dSSadaf Ebrahimi PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
86*22dc650dSSadaf Ebrahimi {
87*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
88*22dc650dSSadaf Ebrahimi uint32_t require_state = SCRIPT_UNSET;
89*22dc650dSSadaf Ebrahimi uint32_t require_map[FULL_MAPSIZE];
90*22dc650dSSadaf Ebrahimi uint32_t map[FULL_MAPSIZE];
91*22dc650dSSadaf Ebrahimi uint32_t require_digitset = 0;
92*22dc650dSSadaf Ebrahimi uint32_t c;
93*22dc650dSSadaf Ebrahimi 
94*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32
95*22dc650dSSadaf Ebrahimi (void)utf;    /* Avoid compiler warning */
96*22dc650dSSadaf Ebrahimi #endif
97*22dc650dSSadaf Ebrahimi 
98*22dc650dSSadaf Ebrahimi /* Any string containing fewer than 2 characters is a valid script run. */
99*22dc650dSSadaf Ebrahimi 
100*22dc650dSSadaf Ebrahimi if (ptr >= endptr) return TRUE;
101*22dc650dSSadaf Ebrahimi GETCHARINCTEST(c, ptr);
102*22dc650dSSadaf Ebrahimi if (ptr >= endptr) return TRUE;
103*22dc650dSSadaf Ebrahimi 
104*22dc650dSSadaf Ebrahimi /* Initialize the require map. This is a full-size bitmap that has a bit for
105*22dc650dSSadaf Ebrahimi every script, as opposed to the maps in ucd_script_sets, which only have bits
106*22dc650dSSadaf Ebrahimi for scripts less than ucp_Unknown - those that appear in script extension
107*22dc650dSSadaf Ebrahimi lists. */
108*22dc650dSSadaf Ebrahimi 
109*22dc650dSSadaf Ebrahimi for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
110*22dc650dSSadaf Ebrahimi 
111*22dc650dSSadaf Ebrahimi /* Scan strings of two or more characters, checking the Unicode characteristics
112*22dc650dSSadaf Ebrahimi of each code point. There is special code for scripts that can be combined with
113*22dc650dSSadaf Ebrahimi characters from the Han Chinese script. This may be used in conjunction with
114*22dc650dSSadaf Ebrahimi four other scripts in these combinations:
115*22dc650dSSadaf Ebrahimi 
116*22dc650dSSadaf Ebrahimi . Han with Hiragana and Katakana is allowed (for Japanese).
117*22dc650dSSadaf Ebrahimi . Han with Bopomofo is allowed (for Taiwanese Mandarin).
118*22dc650dSSadaf Ebrahimi . Han with Hangul is allowed (for Korean).
119*22dc650dSSadaf Ebrahimi 
120*22dc650dSSadaf Ebrahimi If the first significant character's script is one of the four, the required
121*22dc650dSSadaf Ebrahimi script type is immediately known. However, if the first significant
122*22dc650dSSadaf Ebrahimi character's script is Han, we have to keep checking for a non-Han character.
123*22dc650dSSadaf Ebrahimi Hence the SCRIPT_HANPENDING state. */
124*22dc650dSSadaf Ebrahimi 
125*22dc650dSSadaf Ebrahimi for (;;)
126*22dc650dSSadaf Ebrahimi   {
127*22dc650dSSadaf Ebrahimi   const ucd_record *ucd = GET_UCD(c);
128*22dc650dSSadaf Ebrahimi   uint32_t script = ucd->script;
129*22dc650dSSadaf Ebrahimi 
130*22dc650dSSadaf Ebrahimi   /* If the script is Unknown, the string is not a valid script run. Such
131*22dc650dSSadaf Ebrahimi   characters can only form script runs of length one (see test above). */
132*22dc650dSSadaf Ebrahimi 
133*22dc650dSSadaf Ebrahimi   if (script == ucp_Unknown) return FALSE;
134*22dc650dSSadaf Ebrahimi 
135*22dc650dSSadaf Ebrahimi   /* A character without any script extensions whose script is Inherited or
136*22dc650dSSadaf Ebrahimi   Common is always accepted with any script. If there are extensions, the
137*22dc650dSSadaf Ebrahimi   following processing happens for all scripts. */
138*22dc650dSSadaf Ebrahimi 
139*22dc650dSSadaf Ebrahimi   if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))
140*22dc650dSSadaf Ebrahimi     {
141*22dc650dSSadaf Ebrahimi     BOOL OK;
142*22dc650dSSadaf Ebrahimi 
143*22dc650dSSadaf Ebrahimi     /* Set up a full-sized map for this character that can include bits for all
144*22dc650dSSadaf Ebrahimi     scripts. Copy the scriptx map for this character (which covers those
145*22dc650dSSadaf Ebrahimi     scripts that appear in script extension lists), set the remaining values to
146*22dc650dSSadaf Ebrahimi     zero, and then, except for Common or Inherited, add this script's bit to
147*22dc650dSSadaf Ebrahimi     the map. */
148*22dc650dSSadaf Ebrahimi 
149*22dc650dSSadaf Ebrahimi     memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
150*22dc650dSSadaf Ebrahimi     memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
151*22dc650dSSadaf Ebrahimi     if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
152*22dc650dSSadaf Ebrahimi 
153*22dc650dSSadaf Ebrahimi     /* Handle the different checking states */
154*22dc650dSSadaf Ebrahimi 
155*22dc650dSSadaf Ebrahimi     switch(require_state)
156*22dc650dSSadaf Ebrahimi       {
157*22dc650dSSadaf Ebrahimi       /* First significant character - it might follow Common or Inherited
158*22dc650dSSadaf Ebrahimi       characters that do not have any script extensions. */
159*22dc650dSSadaf Ebrahimi 
160*22dc650dSSadaf Ebrahimi       case SCRIPT_UNSET:
161*22dc650dSSadaf Ebrahimi       switch(script)
162*22dc650dSSadaf Ebrahimi         {
163*22dc650dSSadaf Ebrahimi         case ucp_Han:
164*22dc650dSSadaf Ebrahimi         require_state = SCRIPT_HANPENDING;
165*22dc650dSSadaf Ebrahimi         break;
166*22dc650dSSadaf Ebrahimi 
167*22dc650dSSadaf Ebrahimi         case ucp_Hiragana:
168*22dc650dSSadaf Ebrahimi         case ucp_Katakana:
169*22dc650dSSadaf Ebrahimi         require_state = SCRIPT_HANHIRAKATA;
170*22dc650dSSadaf Ebrahimi         break;
171*22dc650dSSadaf Ebrahimi 
172*22dc650dSSadaf Ebrahimi         case ucp_Bopomofo:
173*22dc650dSSadaf Ebrahimi         require_state = SCRIPT_HANBOPOMOFO;
174*22dc650dSSadaf Ebrahimi         break;
175*22dc650dSSadaf Ebrahimi 
176*22dc650dSSadaf Ebrahimi         case ucp_Hangul:
177*22dc650dSSadaf Ebrahimi         require_state = SCRIPT_HANHANGUL;
178*22dc650dSSadaf Ebrahimi         break;
179*22dc650dSSadaf Ebrahimi 
180*22dc650dSSadaf Ebrahimi         default:
181*22dc650dSSadaf Ebrahimi         memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
182*22dc650dSSadaf Ebrahimi         require_state = SCRIPT_MAP;
183*22dc650dSSadaf Ebrahimi         break;
184*22dc650dSSadaf Ebrahimi         }
185*22dc650dSSadaf Ebrahimi       break;
186*22dc650dSSadaf Ebrahimi 
187*22dc650dSSadaf Ebrahimi       /* The first significant character was Han. An inspection of the Unicode
188*22dc650dSSadaf Ebrahimi       11.0.0 files shows that there are the following types of Script Extension
189*22dc650dSSadaf Ebrahimi       list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
190*22dc650dSSadaf Ebrahimi       scripts:
191*22dc650dSSadaf Ebrahimi 
192*22dc650dSSadaf Ebrahimi       . Bopomofo + Han
193*22dc650dSSadaf Ebrahimi       . Han + Hiragana + Katakana
194*22dc650dSSadaf Ebrahimi       . Hiragana + Katakana
195*22dc650dSSadaf Ebrahimi       . Bopopmofo + Hangul + Han + Hiragana + Katakana
196*22dc650dSSadaf Ebrahimi 
197*22dc650dSSadaf Ebrahimi       The following code tries to make sense of this. */
198*22dc650dSSadaf Ebrahimi 
199*22dc650dSSadaf Ebrahimi #define FOUND_BOPOMOFO 1
200*22dc650dSSadaf Ebrahimi #define FOUND_HIRAGANA 2
201*22dc650dSSadaf Ebrahimi #define FOUND_KATAKANA 4
202*22dc650dSSadaf Ebrahimi #define FOUND_HANGUL   8
203*22dc650dSSadaf Ebrahimi 
204*22dc650dSSadaf Ebrahimi       case SCRIPT_HANPENDING:
205*22dc650dSSadaf Ebrahimi       if (script != ucp_Han)   /* Another Han does nothing */
206*22dc650dSSadaf Ebrahimi         {
207*22dc650dSSadaf Ebrahimi         uint32_t chspecial = 0;
208*22dc650dSSadaf Ebrahimi 
209*22dc650dSSadaf Ebrahimi         if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
210*22dc650dSSadaf Ebrahimi         if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
211*22dc650dSSadaf Ebrahimi         if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
212*22dc650dSSadaf Ebrahimi         if (MAPBIT(map, ucp_Hangul) != 0)   chspecial |= FOUND_HANGUL;
213*22dc650dSSadaf Ebrahimi 
214*22dc650dSSadaf Ebrahimi         if (chspecial == 0) return FALSE;   /* Not allowed with Han */
215*22dc650dSSadaf Ebrahimi 
216*22dc650dSSadaf Ebrahimi         if (chspecial == FOUND_BOPOMOFO)
217*22dc650dSSadaf Ebrahimi           require_state = SCRIPT_HANBOPOMOFO;
218*22dc650dSSadaf Ebrahimi         else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
219*22dc650dSSadaf Ebrahimi           require_state = SCRIPT_HANHIRAKATA;
220*22dc650dSSadaf Ebrahimi 
221*22dc650dSSadaf Ebrahimi         /* Otherwise this character must be allowed with all of them, so remain
222*22dc650dSSadaf Ebrahimi         in the pending state. */
223*22dc650dSSadaf Ebrahimi         }
224*22dc650dSSadaf Ebrahimi       break;
225*22dc650dSSadaf Ebrahimi 
226*22dc650dSSadaf Ebrahimi       /* Previously encountered one of the "with Han" scripts. Check that
227*22dc650dSSadaf Ebrahimi       this character is appropriate. */
228*22dc650dSSadaf Ebrahimi 
229*22dc650dSSadaf Ebrahimi       case SCRIPT_HANHIRAKATA:
230*22dc650dSSadaf Ebrahimi       if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
231*22dc650dSSadaf Ebrahimi           MAPBIT(map, ucp_Katakana) == 0) return FALSE;
232*22dc650dSSadaf Ebrahimi       break;
233*22dc650dSSadaf Ebrahimi 
234*22dc650dSSadaf Ebrahimi       case SCRIPT_HANBOPOMOFO:
235*22dc650dSSadaf Ebrahimi       if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
236*22dc650dSSadaf Ebrahimi       break;
237*22dc650dSSadaf Ebrahimi 
238*22dc650dSSadaf Ebrahimi       case SCRIPT_HANHANGUL:
239*22dc650dSSadaf Ebrahimi       if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
240*22dc650dSSadaf Ebrahimi       break;
241*22dc650dSSadaf Ebrahimi 
242*22dc650dSSadaf Ebrahimi       /* Previously encountered one or more characters that are allowed with a
243*22dc650dSSadaf Ebrahimi       list of scripts. */
244*22dc650dSSadaf Ebrahimi 
245*22dc650dSSadaf Ebrahimi       case SCRIPT_MAP:
246*22dc650dSSadaf Ebrahimi       OK = FALSE;
247*22dc650dSSadaf Ebrahimi 
248*22dc650dSSadaf Ebrahimi       for (int i = 0; i < FULL_MAPSIZE; i++)
249*22dc650dSSadaf Ebrahimi         {
250*22dc650dSSadaf Ebrahimi         if ((require_map[i] & map[i]) != 0)
251*22dc650dSSadaf Ebrahimi           {
252*22dc650dSSadaf Ebrahimi           OK = TRUE;
253*22dc650dSSadaf Ebrahimi           break;
254*22dc650dSSadaf Ebrahimi           }
255*22dc650dSSadaf Ebrahimi         }
256*22dc650dSSadaf Ebrahimi 
257*22dc650dSSadaf Ebrahimi       if (!OK) return FALSE;
258*22dc650dSSadaf Ebrahimi 
259*22dc650dSSadaf Ebrahimi       /* The rest of the string must be in this script, but we have to
260*22dc650dSSadaf Ebrahimi       allow for the Han complications. */
261*22dc650dSSadaf Ebrahimi 
262*22dc650dSSadaf Ebrahimi       switch(script)
263*22dc650dSSadaf Ebrahimi         {
264*22dc650dSSadaf Ebrahimi         case ucp_Han:
265*22dc650dSSadaf Ebrahimi         require_state = SCRIPT_HANPENDING;
266*22dc650dSSadaf Ebrahimi         break;
267*22dc650dSSadaf Ebrahimi 
268*22dc650dSSadaf Ebrahimi         case ucp_Hiragana:
269*22dc650dSSadaf Ebrahimi         case ucp_Katakana:
270*22dc650dSSadaf Ebrahimi         require_state = SCRIPT_HANHIRAKATA;
271*22dc650dSSadaf Ebrahimi         break;
272*22dc650dSSadaf Ebrahimi 
273*22dc650dSSadaf Ebrahimi         case ucp_Bopomofo:
274*22dc650dSSadaf Ebrahimi         require_state = SCRIPT_HANBOPOMOFO;
275*22dc650dSSadaf Ebrahimi         break;
276*22dc650dSSadaf Ebrahimi 
277*22dc650dSSadaf Ebrahimi         case ucp_Hangul:
278*22dc650dSSadaf Ebrahimi         require_state = SCRIPT_HANHANGUL;
279*22dc650dSSadaf Ebrahimi         break;
280*22dc650dSSadaf Ebrahimi 
281*22dc650dSSadaf Ebrahimi         /* Compute the intersection of the required list of scripts and the
282*22dc650dSSadaf Ebrahimi         allowed scripts for this character. */
283*22dc650dSSadaf Ebrahimi 
284*22dc650dSSadaf Ebrahimi         default:
285*22dc650dSSadaf Ebrahimi         for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
286*22dc650dSSadaf Ebrahimi         break;
287*22dc650dSSadaf Ebrahimi         }
288*22dc650dSSadaf Ebrahimi 
289*22dc650dSSadaf Ebrahimi       break;
290*22dc650dSSadaf Ebrahimi       }
291*22dc650dSSadaf Ebrahimi     }   /* End checking character's script and extensions. */
292*22dc650dSSadaf Ebrahimi 
293*22dc650dSSadaf Ebrahimi   /* The character is in an acceptable script. We must now ensure that all
294*22dc650dSSadaf Ebrahimi   decimal digits in the string come from the same set. Some scripts (e.g.
295*22dc650dSSadaf Ebrahimi   Common, Arabic) have more than one set of decimal digits. This code does
296*22dc650dSSadaf Ebrahimi   not allow mixing sets, even within the same script. The vector called
297*22dc650dSSadaf Ebrahimi   PRIV(ucd_digit_sets)[] contains, in its first element, the number of
298*22dc650dSSadaf Ebrahimi   following elements, and then, in ascending order, the code points of the
299*22dc650dSSadaf Ebrahimi   '9' characters in every set of 10 digits. Each set is identified by the
300*22dc650dSSadaf Ebrahimi   offset in the vector of its '9' character. An initial check of the first
301*22dc650dSSadaf Ebrahimi   value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
302*22dc650dSSadaf Ebrahimi 
303*22dc650dSSadaf Ebrahimi   if (ucd->chartype == ucp_Nd)
304*22dc650dSSadaf Ebrahimi     {
305*22dc650dSSadaf Ebrahimi     uint32_t digitset;
306*22dc650dSSadaf Ebrahimi 
307*22dc650dSSadaf Ebrahimi     if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
308*22dc650dSSadaf Ebrahimi       {
309*22dc650dSSadaf Ebrahimi       int mid;
310*22dc650dSSadaf Ebrahimi       int bot = 1;
311*22dc650dSSadaf Ebrahimi       int top = PRIV(ucd_digit_sets)[0];
312*22dc650dSSadaf Ebrahimi       for (;;)
313*22dc650dSSadaf Ebrahimi         {
314*22dc650dSSadaf Ebrahimi         if (top <= bot + 1)    /* <= rather than == is paranoia */
315*22dc650dSSadaf Ebrahimi           {
316*22dc650dSSadaf Ebrahimi           digitset = top;
317*22dc650dSSadaf Ebrahimi           break;
318*22dc650dSSadaf Ebrahimi           }
319*22dc650dSSadaf Ebrahimi         mid = (top + bot) / 2;
320*22dc650dSSadaf Ebrahimi         if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
321*22dc650dSSadaf Ebrahimi         }
322*22dc650dSSadaf Ebrahimi       }
323*22dc650dSSadaf Ebrahimi 
324*22dc650dSSadaf Ebrahimi     /* A required value of 0 means "unset". */
325*22dc650dSSadaf Ebrahimi 
326*22dc650dSSadaf Ebrahimi     if (require_digitset == 0) require_digitset = digitset;
327*22dc650dSSadaf Ebrahimi       else if (digitset != require_digitset) return FALSE;
328*22dc650dSSadaf Ebrahimi     }   /* End digit handling */
329*22dc650dSSadaf Ebrahimi 
330*22dc650dSSadaf Ebrahimi   /* If we haven't yet got to the end, pick up the next character. */
331*22dc650dSSadaf Ebrahimi 
332*22dc650dSSadaf Ebrahimi   if (ptr >= endptr) return TRUE;
333*22dc650dSSadaf Ebrahimi   GETCHARINCTEST(c, ptr);
334*22dc650dSSadaf Ebrahimi   }  /* End checking loop */
335*22dc650dSSadaf Ebrahimi 
336*22dc650dSSadaf Ebrahimi #else   /* NOT SUPPORT_UNICODE */
337*22dc650dSSadaf Ebrahimi (void)ptr;
338*22dc650dSSadaf Ebrahimi (void)endptr;
339*22dc650dSSadaf Ebrahimi (void)utf;
340*22dc650dSSadaf Ebrahimi return TRUE;
341*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
342*22dc650dSSadaf Ebrahimi }
343*22dc650dSSadaf Ebrahimi 
344*22dc650dSSadaf Ebrahimi /* End of pcre2_script_run.c */
345