1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi * Perl-Compatible Regular Expressions *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi
8*22dc650dSSadaf Ebrahimi Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi Original API code Copyright (c) 1997-2012 University of Cambridge
10*22dc650dSSadaf Ebrahimi New API code Copyright (c) 2016-2021 University of Cambridge
11*22dc650dSSadaf Ebrahimi
12*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
15*22dc650dSSadaf Ebrahimi
16*22dc650dSSadaf Ebrahimi * Redistributions of source code must retain the above copyright notice,
17*22dc650dSSadaf Ebrahimi this list of conditions and the following disclaimer.
18*22dc650dSSadaf Ebrahimi
19*22dc650dSSadaf Ebrahimi * Redistributions in binary form must reproduce the above copyright
20*22dc650dSSadaf Ebrahimi notice, this list of conditions and the following disclaimer in the
21*22dc650dSSadaf Ebrahimi documentation and/or other materials provided with the distribution.
22*22dc650dSSadaf Ebrahimi
23*22dc650dSSadaf Ebrahimi * Neither the name of the University of Cambridge nor the names of its
24*22dc650dSSadaf Ebrahimi contributors may be used to endorse or promote products derived from
25*22dc650dSSadaf Ebrahimi this software without specific prior written permission.
26*22dc650dSSadaf Ebrahimi
27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
38*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
39*22dc650dSSadaf Ebrahimi */
40*22dc650dSSadaf Ebrahimi
41*22dc650dSSadaf Ebrahimi /* This module contains the function for checking a script run. */
42*22dc650dSSadaf Ebrahimi
43*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
44*22dc650dSSadaf Ebrahimi #include "config.h"
45*22dc650dSSadaf Ebrahimi #endif
46*22dc650dSSadaf Ebrahimi
47*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"
48*22dc650dSSadaf Ebrahimi
49*22dc650dSSadaf Ebrahimi
50*22dc650dSSadaf Ebrahimi /*************************************************
51*22dc650dSSadaf Ebrahimi * Check script run *
52*22dc650dSSadaf Ebrahimi *************************************************/
53*22dc650dSSadaf Ebrahimi
54*22dc650dSSadaf Ebrahimi /* A script run is conceptually a sequence of characters all in the same
55*22dc650dSSadaf Ebrahimi Unicode script. However, it isn't quite that simple. There are special rules
56*22dc650dSSadaf Ebrahimi for scripts that are commonly used together, and also special rules for digits.
57*22dc650dSSadaf Ebrahimi This function implements the appropriate checks, which is possible only when
58*22dc650dSSadaf Ebrahimi PCRE2 is compiled with Unicode support. The function returns TRUE if there is
59*22dc650dSSadaf Ebrahimi no Unicode support; however, it should never be called in that circumstance
60*22dc650dSSadaf Ebrahimi because an error is given by pcre2_compile() if a script run is called for in a
61*22dc650dSSadaf Ebrahimi version of PCRE2 compiled without Unicode support.
62*22dc650dSSadaf Ebrahimi
63*22dc650dSSadaf Ebrahimi Arguments:
  ptr       points to the first character
65*22dc650dSSadaf Ebrahimi endptr point after the last character
66*22dc650dSSadaf Ebrahimi utf TRUE if in UTF mode
67*22dc650dSSadaf Ebrahimi
68*22dc650dSSadaf Ebrahimi Returns: TRUE if this is a valid script run
69*22dc650dSSadaf Ebrahimi */
70*22dc650dSSadaf Ebrahimi
/* States of the script-run checker. Each value records what the characters
examined so far require of the remainder of the string. */

enum {
  SCRIPT_UNSET       = 0,  /* No requirement has been established yet */
  SCRIPT_MAP         = 1,  /* Acceptable scripts are held in a bitmap */
  SCRIPT_HANPENDING  = 2,  /* Only Han characters have been seen so far */
  SCRIPT_HANHIRAKATA = 3,  /* Expect Han, Hiragana, or Katakana */
  SCRIPT_HANBOPOMOFO = 4,  /* Expect Han or Bopomofo */
  SCRIPT_HANHANGUL   = 5   /* Expect Han or Hangul */
};
80*22dc650dSSadaf Ebrahimi
/* Sizes, in 32-bit words, of the two kinds of script bitmap. UCD_MAPSIZE
covers script numbers up to and including ucp_Unknown; FULL_MAPSIZE covers
script numbers up to and including ucp_Script_Count. */

#define UCD_MAPSIZE   ((ucp_Unknown / 32) + 1)
#define FULL_MAPSIZE  ((ucp_Script_Count / 32) + 1)
83*22dc650dSSadaf Ebrahimi
84*22dc650dSSadaf Ebrahimi BOOL
PRIV(script_run)85*22dc650dSSadaf Ebrahimi PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
86*22dc650dSSadaf Ebrahimi {
87*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
88*22dc650dSSadaf Ebrahimi uint32_t require_state = SCRIPT_UNSET;
89*22dc650dSSadaf Ebrahimi uint32_t require_map[FULL_MAPSIZE];
90*22dc650dSSadaf Ebrahimi uint32_t map[FULL_MAPSIZE];
91*22dc650dSSadaf Ebrahimi uint32_t require_digitset = 0;
92*22dc650dSSadaf Ebrahimi uint32_t c;
93*22dc650dSSadaf Ebrahimi
94*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32
95*22dc650dSSadaf Ebrahimi (void)utf; /* Avoid compiler warning */
96*22dc650dSSadaf Ebrahimi #endif
97*22dc650dSSadaf Ebrahimi
98*22dc650dSSadaf Ebrahimi /* Any string containing fewer than 2 characters is a valid script run. */
99*22dc650dSSadaf Ebrahimi
100*22dc650dSSadaf Ebrahimi if (ptr >= endptr) return TRUE;
101*22dc650dSSadaf Ebrahimi GETCHARINCTEST(c, ptr);
102*22dc650dSSadaf Ebrahimi if (ptr >= endptr) return TRUE;
103*22dc650dSSadaf Ebrahimi
104*22dc650dSSadaf Ebrahimi /* Initialize the require map. This is a full-size bitmap that has a bit for
105*22dc650dSSadaf Ebrahimi every script, as opposed to the maps in ucd_script_sets, which only have bits
106*22dc650dSSadaf Ebrahimi for scripts less than ucp_Unknown - those that appear in script extension
107*22dc650dSSadaf Ebrahimi lists. */
108*22dc650dSSadaf Ebrahimi
109*22dc650dSSadaf Ebrahimi for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
110*22dc650dSSadaf Ebrahimi
111*22dc650dSSadaf Ebrahimi /* Scan strings of two or more characters, checking the Unicode characteristics
112*22dc650dSSadaf Ebrahimi of each code point. There is special code for scripts that can be combined with
113*22dc650dSSadaf Ebrahimi characters from the Han Chinese script. This may be used in conjunction with
114*22dc650dSSadaf Ebrahimi four other scripts in these combinations:
115*22dc650dSSadaf Ebrahimi
116*22dc650dSSadaf Ebrahimi . Han with Hiragana and Katakana is allowed (for Japanese).
117*22dc650dSSadaf Ebrahimi . Han with Bopomofo is allowed (for Taiwanese Mandarin).
118*22dc650dSSadaf Ebrahimi . Han with Hangul is allowed (for Korean).
119*22dc650dSSadaf Ebrahimi
120*22dc650dSSadaf Ebrahimi If the first significant character's script is one of the four, the required
121*22dc650dSSadaf Ebrahimi script type is immediately known. However, if the first significant
122*22dc650dSSadaf Ebrahimi character's script is Han, we have to keep checking for a non-Han character.
123*22dc650dSSadaf Ebrahimi Hence the SCRIPT_HANPENDING state. */
124*22dc650dSSadaf Ebrahimi
125*22dc650dSSadaf Ebrahimi for (;;)
126*22dc650dSSadaf Ebrahimi {
127*22dc650dSSadaf Ebrahimi const ucd_record *ucd = GET_UCD(c);
128*22dc650dSSadaf Ebrahimi uint32_t script = ucd->script;
129*22dc650dSSadaf Ebrahimi
130*22dc650dSSadaf Ebrahimi /* If the script is Unknown, the string is not a valid script run. Such
131*22dc650dSSadaf Ebrahimi characters can only form script runs of length one (see test above). */
132*22dc650dSSadaf Ebrahimi
133*22dc650dSSadaf Ebrahimi if (script == ucp_Unknown) return FALSE;
134*22dc650dSSadaf Ebrahimi
135*22dc650dSSadaf Ebrahimi /* A character without any script extensions whose script is Inherited or
136*22dc650dSSadaf Ebrahimi Common is always accepted with any script. If there are extensions, the
137*22dc650dSSadaf Ebrahimi following processing happens for all scripts. */
138*22dc650dSSadaf Ebrahimi
139*22dc650dSSadaf Ebrahimi if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))
140*22dc650dSSadaf Ebrahimi {
141*22dc650dSSadaf Ebrahimi BOOL OK;
142*22dc650dSSadaf Ebrahimi
143*22dc650dSSadaf Ebrahimi /* Set up a full-sized map for this character that can include bits for all
144*22dc650dSSadaf Ebrahimi scripts. Copy the scriptx map for this character (which covers those
145*22dc650dSSadaf Ebrahimi scripts that appear in script extension lists), set the remaining values to
146*22dc650dSSadaf Ebrahimi zero, and then, except for Common or Inherited, add this script's bit to
147*22dc650dSSadaf Ebrahimi the map. */
148*22dc650dSSadaf Ebrahimi
149*22dc650dSSadaf Ebrahimi memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
150*22dc650dSSadaf Ebrahimi memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
151*22dc650dSSadaf Ebrahimi if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
152*22dc650dSSadaf Ebrahimi
153*22dc650dSSadaf Ebrahimi /* Handle the different checking states */
154*22dc650dSSadaf Ebrahimi
155*22dc650dSSadaf Ebrahimi switch(require_state)
156*22dc650dSSadaf Ebrahimi {
157*22dc650dSSadaf Ebrahimi /* First significant character - it might follow Common or Inherited
158*22dc650dSSadaf Ebrahimi characters that do not have any script extensions. */
159*22dc650dSSadaf Ebrahimi
160*22dc650dSSadaf Ebrahimi case SCRIPT_UNSET:
161*22dc650dSSadaf Ebrahimi switch(script)
162*22dc650dSSadaf Ebrahimi {
163*22dc650dSSadaf Ebrahimi case ucp_Han:
164*22dc650dSSadaf Ebrahimi require_state = SCRIPT_HANPENDING;
165*22dc650dSSadaf Ebrahimi break;
166*22dc650dSSadaf Ebrahimi
167*22dc650dSSadaf Ebrahimi case ucp_Hiragana:
168*22dc650dSSadaf Ebrahimi case ucp_Katakana:
169*22dc650dSSadaf Ebrahimi require_state = SCRIPT_HANHIRAKATA;
170*22dc650dSSadaf Ebrahimi break;
171*22dc650dSSadaf Ebrahimi
172*22dc650dSSadaf Ebrahimi case ucp_Bopomofo:
173*22dc650dSSadaf Ebrahimi require_state = SCRIPT_HANBOPOMOFO;
174*22dc650dSSadaf Ebrahimi break;
175*22dc650dSSadaf Ebrahimi
176*22dc650dSSadaf Ebrahimi case ucp_Hangul:
177*22dc650dSSadaf Ebrahimi require_state = SCRIPT_HANHANGUL;
178*22dc650dSSadaf Ebrahimi break;
179*22dc650dSSadaf Ebrahimi
180*22dc650dSSadaf Ebrahimi default:
181*22dc650dSSadaf Ebrahimi memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
182*22dc650dSSadaf Ebrahimi require_state = SCRIPT_MAP;
183*22dc650dSSadaf Ebrahimi break;
184*22dc650dSSadaf Ebrahimi }
185*22dc650dSSadaf Ebrahimi break;
186*22dc650dSSadaf Ebrahimi
187*22dc650dSSadaf Ebrahimi /* The first significant character was Han. An inspection of the Unicode
188*22dc650dSSadaf Ebrahimi 11.0.0 files shows that there are the following types of Script Extension
189*22dc650dSSadaf Ebrahimi list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
190*22dc650dSSadaf Ebrahimi scripts:
191*22dc650dSSadaf Ebrahimi
192*22dc650dSSadaf Ebrahimi . Bopomofo + Han
193*22dc650dSSadaf Ebrahimi . Han + Hiragana + Katakana
194*22dc650dSSadaf Ebrahimi . Hiragana + Katakana
195*22dc650dSSadaf Ebrahimi . Bopopmofo + Hangul + Han + Hiragana + Katakana
196*22dc650dSSadaf Ebrahimi
197*22dc650dSSadaf Ebrahimi The following code tries to make sense of this. */
198*22dc650dSSadaf Ebrahimi
199*22dc650dSSadaf Ebrahimi #define FOUND_BOPOMOFO 1
200*22dc650dSSadaf Ebrahimi #define FOUND_HIRAGANA 2
201*22dc650dSSadaf Ebrahimi #define FOUND_KATAKANA 4
202*22dc650dSSadaf Ebrahimi #define FOUND_HANGUL 8
203*22dc650dSSadaf Ebrahimi
204*22dc650dSSadaf Ebrahimi case SCRIPT_HANPENDING:
205*22dc650dSSadaf Ebrahimi if (script != ucp_Han) /* Another Han does nothing */
206*22dc650dSSadaf Ebrahimi {
207*22dc650dSSadaf Ebrahimi uint32_t chspecial = 0;
208*22dc650dSSadaf Ebrahimi
209*22dc650dSSadaf Ebrahimi if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
210*22dc650dSSadaf Ebrahimi if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
211*22dc650dSSadaf Ebrahimi if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
212*22dc650dSSadaf Ebrahimi if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL;
213*22dc650dSSadaf Ebrahimi
214*22dc650dSSadaf Ebrahimi if (chspecial == 0) return FALSE; /* Not allowed with Han */
215*22dc650dSSadaf Ebrahimi
216*22dc650dSSadaf Ebrahimi if (chspecial == FOUND_BOPOMOFO)
217*22dc650dSSadaf Ebrahimi require_state = SCRIPT_HANBOPOMOFO;
218*22dc650dSSadaf Ebrahimi else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
219*22dc650dSSadaf Ebrahimi require_state = SCRIPT_HANHIRAKATA;
220*22dc650dSSadaf Ebrahimi
221*22dc650dSSadaf Ebrahimi /* Otherwise this character must be allowed with all of them, so remain
222*22dc650dSSadaf Ebrahimi in the pending state. */
223*22dc650dSSadaf Ebrahimi }
224*22dc650dSSadaf Ebrahimi break;
225*22dc650dSSadaf Ebrahimi
226*22dc650dSSadaf Ebrahimi /* Previously encountered one of the "with Han" scripts. Check that
227*22dc650dSSadaf Ebrahimi this character is appropriate. */
228*22dc650dSSadaf Ebrahimi
229*22dc650dSSadaf Ebrahimi case SCRIPT_HANHIRAKATA:
230*22dc650dSSadaf Ebrahimi if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
231*22dc650dSSadaf Ebrahimi MAPBIT(map, ucp_Katakana) == 0) return FALSE;
232*22dc650dSSadaf Ebrahimi break;
233*22dc650dSSadaf Ebrahimi
234*22dc650dSSadaf Ebrahimi case SCRIPT_HANBOPOMOFO:
235*22dc650dSSadaf Ebrahimi if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
236*22dc650dSSadaf Ebrahimi break;
237*22dc650dSSadaf Ebrahimi
238*22dc650dSSadaf Ebrahimi case SCRIPT_HANHANGUL:
239*22dc650dSSadaf Ebrahimi if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
240*22dc650dSSadaf Ebrahimi break;
241*22dc650dSSadaf Ebrahimi
242*22dc650dSSadaf Ebrahimi /* Previously encountered one or more characters that are allowed with a
243*22dc650dSSadaf Ebrahimi list of scripts. */
244*22dc650dSSadaf Ebrahimi
245*22dc650dSSadaf Ebrahimi case SCRIPT_MAP:
246*22dc650dSSadaf Ebrahimi OK = FALSE;
247*22dc650dSSadaf Ebrahimi
248*22dc650dSSadaf Ebrahimi for (int i = 0; i < FULL_MAPSIZE; i++)
249*22dc650dSSadaf Ebrahimi {
250*22dc650dSSadaf Ebrahimi if ((require_map[i] & map[i]) != 0)
251*22dc650dSSadaf Ebrahimi {
252*22dc650dSSadaf Ebrahimi OK = TRUE;
253*22dc650dSSadaf Ebrahimi break;
254*22dc650dSSadaf Ebrahimi }
255*22dc650dSSadaf Ebrahimi }
256*22dc650dSSadaf Ebrahimi
257*22dc650dSSadaf Ebrahimi if (!OK) return FALSE;
258*22dc650dSSadaf Ebrahimi
259*22dc650dSSadaf Ebrahimi /* The rest of the string must be in this script, but we have to
260*22dc650dSSadaf Ebrahimi allow for the Han complications. */
261*22dc650dSSadaf Ebrahimi
262*22dc650dSSadaf Ebrahimi switch(script)
263*22dc650dSSadaf Ebrahimi {
264*22dc650dSSadaf Ebrahimi case ucp_Han:
265*22dc650dSSadaf Ebrahimi require_state = SCRIPT_HANPENDING;
266*22dc650dSSadaf Ebrahimi break;
267*22dc650dSSadaf Ebrahimi
268*22dc650dSSadaf Ebrahimi case ucp_Hiragana:
269*22dc650dSSadaf Ebrahimi case ucp_Katakana:
270*22dc650dSSadaf Ebrahimi require_state = SCRIPT_HANHIRAKATA;
271*22dc650dSSadaf Ebrahimi break;
272*22dc650dSSadaf Ebrahimi
273*22dc650dSSadaf Ebrahimi case ucp_Bopomofo:
274*22dc650dSSadaf Ebrahimi require_state = SCRIPT_HANBOPOMOFO;
275*22dc650dSSadaf Ebrahimi break;
276*22dc650dSSadaf Ebrahimi
277*22dc650dSSadaf Ebrahimi case ucp_Hangul:
278*22dc650dSSadaf Ebrahimi require_state = SCRIPT_HANHANGUL;
279*22dc650dSSadaf Ebrahimi break;
280*22dc650dSSadaf Ebrahimi
281*22dc650dSSadaf Ebrahimi /* Compute the intersection of the required list of scripts and the
282*22dc650dSSadaf Ebrahimi allowed scripts for this character. */
283*22dc650dSSadaf Ebrahimi
284*22dc650dSSadaf Ebrahimi default:
285*22dc650dSSadaf Ebrahimi for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
286*22dc650dSSadaf Ebrahimi break;
287*22dc650dSSadaf Ebrahimi }
288*22dc650dSSadaf Ebrahimi
289*22dc650dSSadaf Ebrahimi break;
290*22dc650dSSadaf Ebrahimi }
291*22dc650dSSadaf Ebrahimi } /* End checking character's script and extensions. */
292*22dc650dSSadaf Ebrahimi
293*22dc650dSSadaf Ebrahimi /* The character is in an acceptable script. We must now ensure that all
294*22dc650dSSadaf Ebrahimi decimal digits in the string come from the same set. Some scripts (e.g.
295*22dc650dSSadaf Ebrahimi Common, Arabic) have more than one set of decimal digits. This code does
296*22dc650dSSadaf Ebrahimi not allow mixing sets, even within the same script. The vector called
297*22dc650dSSadaf Ebrahimi PRIV(ucd_digit_sets)[] contains, in its first element, the number of
298*22dc650dSSadaf Ebrahimi following elements, and then, in ascending order, the code points of the
299*22dc650dSSadaf Ebrahimi '9' characters in every set of 10 digits. Each set is identified by the
300*22dc650dSSadaf Ebrahimi offset in the vector of its '9' character. An initial check of the first
301*22dc650dSSadaf Ebrahimi value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
302*22dc650dSSadaf Ebrahimi
303*22dc650dSSadaf Ebrahimi if (ucd->chartype == ucp_Nd)
304*22dc650dSSadaf Ebrahimi {
305*22dc650dSSadaf Ebrahimi uint32_t digitset;
306*22dc650dSSadaf Ebrahimi
307*22dc650dSSadaf Ebrahimi if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
308*22dc650dSSadaf Ebrahimi {
309*22dc650dSSadaf Ebrahimi int mid;
310*22dc650dSSadaf Ebrahimi int bot = 1;
311*22dc650dSSadaf Ebrahimi int top = PRIV(ucd_digit_sets)[0];
312*22dc650dSSadaf Ebrahimi for (;;)
313*22dc650dSSadaf Ebrahimi {
314*22dc650dSSadaf Ebrahimi if (top <= bot + 1) /* <= rather than == is paranoia */
315*22dc650dSSadaf Ebrahimi {
316*22dc650dSSadaf Ebrahimi digitset = top;
317*22dc650dSSadaf Ebrahimi break;
318*22dc650dSSadaf Ebrahimi }
319*22dc650dSSadaf Ebrahimi mid = (top + bot) / 2;
320*22dc650dSSadaf Ebrahimi if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
321*22dc650dSSadaf Ebrahimi }
322*22dc650dSSadaf Ebrahimi }
323*22dc650dSSadaf Ebrahimi
324*22dc650dSSadaf Ebrahimi /* A required value of 0 means "unset". */
325*22dc650dSSadaf Ebrahimi
326*22dc650dSSadaf Ebrahimi if (require_digitset == 0) require_digitset = digitset;
327*22dc650dSSadaf Ebrahimi else if (digitset != require_digitset) return FALSE;
328*22dc650dSSadaf Ebrahimi } /* End digit handling */
329*22dc650dSSadaf Ebrahimi
330*22dc650dSSadaf Ebrahimi /* If we haven't yet got to the end, pick up the next character. */
331*22dc650dSSadaf Ebrahimi
332*22dc650dSSadaf Ebrahimi if (ptr >= endptr) return TRUE;
333*22dc650dSSadaf Ebrahimi GETCHARINCTEST(c, ptr);
334*22dc650dSSadaf Ebrahimi } /* End checking loop */
335*22dc650dSSadaf Ebrahimi
336*22dc650dSSadaf Ebrahimi #else /* NOT SUPPORT_UNICODE */
337*22dc650dSSadaf Ebrahimi (void)ptr;
338*22dc650dSSadaf Ebrahimi (void)endptr;
339*22dc650dSSadaf Ebrahimi (void)utf;
340*22dc650dSSadaf Ebrahimi return TRUE;
341*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
342*22dc650dSSadaf Ebrahimi }
343*22dc650dSSadaf Ebrahimi
344*22dc650dSSadaf Ebrahimi /* End of pcre2_script_run.c */
345