xref: /aosp_15_r20/external/icu/icu4c/source/extra/scrptrun/scrptrun.cpp (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 *
 *   Copyright (C) 1999-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 *   file name:  scrptrun.cpp
 *
 *   created on: 10/17/2001
 *   created by: Eric R. Mader
 */
15*0e209d39SAndroid Build Coastguard Worker 
16*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
17*0e209d39SAndroid Build Coastguard Worker #include "unicode/uscript.h"
18*0e209d39SAndroid Build Coastguard Worker 
19*0e209d39SAndroid Build Coastguard Worker #include "cmemory.h"
20*0e209d39SAndroid Build Coastguard Worker #include "scrptrun.h"
21*0e209d39SAndroid Build Coastguard Worker 
22*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
23*0e209d39SAndroid Build Coastguard Worker 
24*0e209d39SAndroid Build Coastguard Worker const char ScriptRun::fgClassID=0;
25*0e209d39SAndroid Build Coastguard Worker 
26*0e209d39SAndroid Build Coastguard Worker UChar32 ScriptRun::pairedChars[] = {
27*0e209d39SAndroid Build Coastguard Worker     0x0028, 0x0029, // ascii paired punctuation
28*0e209d39SAndroid Build Coastguard Worker     0x003c, 0x003e,
29*0e209d39SAndroid Build Coastguard Worker     0x005b, 0x005d,
30*0e209d39SAndroid Build Coastguard Worker     0x007b, 0x007d,
31*0e209d39SAndroid Build Coastguard Worker     0x00ab, 0x00bb, // guillemets
32*0e209d39SAndroid Build Coastguard Worker     0x2018, 0x2019, // general punctuation
33*0e209d39SAndroid Build Coastguard Worker     0x201c, 0x201d,
34*0e209d39SAndroid Build Coastguard Worker     0x2039, 0x203a,
35*0e209d39SAndroid Build Coastguard Worker     0x3008, 0x3009, // chinese paired punctuation
36*0e209d39SAndroid Build Coastguard Worker     0x300a, 0x300b,
37*0e209d39SAndroid Build Coastguard Worker     0x300c, 0x300d,
38*0e209d39SAndroid Build Coastguard Worker     0x300e, 0x300f,
39*0e209d39SAndroid Build Coastguard Worker     0x3010, 0x3011,
40*0e209d39SAndroid Build Coastguard Worker     0x3014, 0x3015,
41*0e209d39SAndroid Build Coastguard Worker     0x3016, 0x3017,
42*0e209d39SAndroid Build Coastguard Worker     0x3018, 0x3019,
43*0e209d39SAndroid Build Coastguard Worker     0x301a, 0x301b
44*0e209d39SAndroid Build Coastguard Worker };
45*0e209d39SAndroid Build Coastguard Worker 
46*0e209d39SAndroid Build Coastguard Worker const int32_t ScriptRun::pairedCharCount = UPRV_LENGTHOF(pairedChars);
47*0e209d39SAndroid Build Coastguard Worker const int32_t ScriptRun::pairedCharPower = 1 << highBit(pairedCharCount);
48*0e209d39SAndroid Build Coastguard Worker const int32_t ScriptRun::pairedCharExtra = pairedCharCount - pairedCharPower;
49*0e209d39SAndroid Build Coastguard Worker 
highBit(int32_t value)50*0e209d39SAndroid Build Coastguard Worker int8_t ScriptRun::highBit(int32_t value)
51*0e209d39SAndroid Build Coastguard Worker {
52*0e209d39SAndroid Build Coastguard Worker     if (value <= 0) {
53*0e209d39SAndroid Build Coastguard Worker         return -32;
54*0e209d39SAndroid Build Coastguard Worker     }
55*0e209d39SAndroid Build Coastguard Worker 
56*0e209d39SAndroid Build Coastguard Worker     int8_t bit = 0;
57*0e209d39SAndroid Build Coastguard Worker 
58*0e209d39SAndroid Build Coastguard Worker     if (value >= 1 << 16) {
59*0e209d39SAndroid Build Coastguard Worker         value >>= 16;
60*0e209d39SAndroid Build Coastguard Worker         bit += 16;
61*0e209d39SAndroid Build Coastguard Worker     }
62*0e209d39SAndroid Build Coastguard Worker 
63*0e209d39SAndroid Build Coastguard Worker     if (value >= 1 << 8) {
64*0e209d39SAndroid Build Coastguard Worker         value >>= 8;
65*0e209d39SAndroid Build Coastguard Worker         bit += 8;
66*0e209d39SAndroid Build Coastguard Worker     }
67*0e209d39SAndroid Build Coastguard Worker 
68*0e209d39SAndroid Build Coastguard Worker     if (value >= 1 << 4) {
69*0e209d39SAndroid Build Coastguard Worker         value >>= 4;
70*0e209d39SAndroid Build Coastguard Worker         bit += 4;
71*0e209d39SAndroid Build Coastguard Worker     }
72*0e209d39SAndroid Build Coastguard Worker 
73*0e209d39SAndroid Build Coastguard Worker     if (value >= 1 << 2) {
74*0e209d39SAndroid Build Coastguard Worker         value >>= 2;
75*0e209d39SAndroid Build Coastguard Worker         bit += 2;
76*0e209d39SAndroid Build Coastguard Worker     }
77*0e209d39SAndroid Build Coastguard Worker 
78*0e209d39SAndroid Build Coastguard Worker     if (value >= 1 << 1) {
79*0e209d39SAndroid Build Coastguard Worker         value >>= 1;
80*0e209d39SAndroid Build Coastguard Worker         bit += 1;
81*0e209d39SAndroid Build Coastguard Worker     }
82*0e209d39SAndroid Build Coastguard Worker 
83*0e209d39SAndroid Build Coastguard Worker     return bit;
84*0e209d39SAndroid Build Coastguard Worker }
85*0e209d39SAndroid Build Coastguard Worker 
getPairIndex(UChar32 ch)86*0e209d39SAndroid Build Coastguard Worker int32_t ScriptRun::getPairIndex(UChar32 ch)
87*0e209d39SAndroid Build Coastguard Worker {
88*0e209d39SAndroid Build Coastguard Worker     int32_t probe = pairedCharPower;
89*0e209d39SAndroid Build Coastguard Worker     int32_t index = 0;
90*0e209d39SAndroid Build Coastguard Worker 
91*0e209d39SAndroid Build Coastguard Worker     if (ch >= pairedChars[pairedCharExtra]) {
92*0e209d39SAndroid Build Coastguard Worker         index = pairedCharExtra;
93*0e209d39SAndroid Build Coastguard Worker     }
94*0e209d39SAndroid Build Coastguard Worker 
95*0e209d39SAndroid Build Coastguard Worker     while (probe > (1 << 0)) {
96*0e209d39SAndroid Build Coastguard Worker         probe >>= 1;
97*0e209d39SAndroid Build Coastguard Worker 
98*0e209d39SAndroid Build Coastguard Worker         if (ch >= pairedChars[index + probe]) {
99*0e209d39SAndroid Build Coastguard Worker             index += probe;
100*0e209d39SAndroid Build Coastguard Worker         }
101*0e209d39SAndroid Build Coastguard Worker     }
102*0e209d39SAndroid Build Coastguard Worker 
103*0e209d39SAndroid Build Coastguard Worker     if (pairedChars[index] != ch) {
104*0e209d39SAndroid Build Coastguard Worker         index = -1;
105*0e209d39SAndroid Build Coastguard Worker     }
106*0e209d39SAndroid Build Coastguard Worker 
107*0e209d39SAndroid Build Coastguard Worker     return index;
108*0e209d39SAndroid Build Coastguard Worker }
109*0e209d39SAndroid Build Coastguard Worker 
sameScript(int32_t scriptOne,int32_t scriptTwo)110*0e209d39SAndroid Build Coastguard Worker UBool ScriptRun::sameScript(int32_t scriptOne, int32_t scriptTwo)
111*0e209d39SAndroid Build Coastguard Worker {
112*0e209d39SAndroid Build Coastguard Worker     return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo;
113*0e209d39SAndroid Build Coastguard Worker }
114*0e209d39SAndroid Build Coastguard Worker 
next()115*0e209d39SAndroid Build Coastguard Worker UBool ScriptRun::next()
116*0e209d39SAndroid Build Coastguard Worker {
117*0e209d39SAndroid Build Coastguard Worker     int32_t startSP  = parenSP;  // used to find the first new open character
118*0e209d39SAndroid Build Coastguard Worker     UErrorCode error = U_ZERO_ERROR;
119*0e209d39SAndroid Build Coastguard Worker 
120*0e209d39SAndroid Build Coastguard Worker     // if we've fallen off the end of the text, we're done
121*0e209d39SAndroid Build Coastguard Worker     if (scriptEnd >= charLimit) {
122*0e209d39SAndroid Build Coastguard Worker         return false;
123*0e209d39SAndroid Build Coastguard Worker     }
124*0e209d39SAndroid Build Coastguard Worker 
125*0e209d39SAndroid Build Coastguard Worker     scriptCode = USCRIPT_COMMON;
126*0e209d39SAndroid Build Coastguard Worker 
127*0e209d39SAndroid Build Coastguard Worker     for (scriptStart = scriptEnd; scriptEnd < charLimit; scriptEnd += 1) {
128*0e209d39SAndroid Build Coastguard Worker         char16_t   high = charArray[scriptEnd];
129*0e209d39SAndroid Build Coastguard Worker         UChar32 ch   = high;
130*0e209d39SAndroid Build Coastguard Worker 
131*0e209d39SAndroid Build Coastguard Worker         // if the character is a high surrogate and it's not the last one
132*0e209d39SAndroid Build Coastguard Worker         // in the text, see if it's followed by a low surrogate
133*0e209d39SAndroid Build Coastguard Worker         if (high >= 0xD800 && high <= 0xDBFF && scriptEnd < charLimit - 1)
134*0e209d39SAndroid Build Coastguard Worker         {
135*0e209d39SAndroid Build Coastguard Worker             char16_t low = charArray[scriptEnd + 1];
136*0e209d39SAndroid Build Coastguard Worker 
137*0e209d39SAndroid Build Coastguard Worker             // if it is followed by a low surrogate,
138*0e209d39SAndroid Build Coastguard Worker             // consume it and form the full character
139*0e209d39SAndroid Build Coastguard Worker             if (low >= 0xDC00 && low <= 0xDFFF) {
140*0e209d39SAndroid Build Coastguard Worker                 ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000;
141*0e209d39SAndroid Build Coastguard Worker                 scriptEnd += 1;
142*0e209d39SAndroid Build Coastguard Worker             }
143*0e209d39SAndroid Build Coastguard Worker         }
144*0e209d39SAndroid Build Coastguard Worker 
145*0e209d39SAndroid Build Coastguard Worker         UScriptCode sc = uscript_getScript(ch, &error);
146*0e209d39SAndroid Build Coastguard Worker         int32_t pairIndex = getPairIndex(ch);
147*0e209d39SAndroid Build Coastguard Worker 
148*0e209d39SAndroid Build Coastguard Worker         // Paired character handling:
149*0e209d39SAndroid Build Coastguard Worker         //
150*0e209d39SAndroid Build Coastguard Worker         // if it's an open character, push it onto the stack.
151*0e209d39SAndroid Build Coastguard Worker         // if it's a close character, find the matching open on the
152*0e209d39SAndroid Build Coastguard Worker         // stack, and use that script code. Any non-matching open
153*0e209d39SAndroid Build Coastguard Worker         // characters above it on the stack will be poped.
154*0e209d39SAndroid Build Coastguard Worker         if (pairIndex >= 0) {
155*0e209d39SAndroid Build Coastguard Worker             if ((pairIndex & 1) == 0) {
156*0e209d39SAndroid Build Coastguard Worker                 parenStack[++parenSP].pairIndex = pairIndex;
157*0e209d39SAndroid Build Coastguard Worker                 parenStack[parenSP].scriptCode  = scriptCode;
158*0e209d39SAndroid Build Coastguard Worker             } else if (parenSP >= 0) {
159*0e209d39SAndroid Build Coastguard Worker                 int32_t pi = pairIndex & ~1;
160*0e209d39SAndroid Build Coastguard Worker 
161*0e209d39SAndroid Build Coastguard Worker                 while (parenSP >= 0 && parenStack[parenSP].pairIndex != pi) {
162*0e209d39SAndroid Build Coastguard Worker                     parenSP -= 1;
163*0e209d39SAndroid Build Coastguard Worker                 }
164*0e209d39SAndroid Build Coastguard Worker 
165*0e209d39SAndroid Build Coastguard Worker                 if (parenSP < startSP) {
166*0e209d39SAndroid Build Coastguard Worker                     startSP = parenSP;
167*0e209d39SAndroid Build Coastguard Worker                 }
168*0e209d39SAndroid Build Coastguard Worker 
169*0e209d39SAndroid Build Coastguard Worker                 if (parenSP >= 0) {
170*0e209d39SAndroid Build Coastguard Worker                     sc = parenStack[parenSP].scriptCode;
171*0e209d39SAndroid Build Coastguard Worker                 }
172*0e209d39SAndroid Build Coastguard Worker             }
173*0e209d39SAndroid Build Coastguard Worker         }
174*0e209d39SAndroid Build Coastguard Worker 
175*0e209d39SAndroid Build Coastguard Worker         if (sameScript(scriptCode, sc)) {
176*0e209d39SAndroid Build Coastguard Worker             if (scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
177*0e209d39SAndroid Build Coastguard Worker                 scriptCode = sc;
178*0e209d39SAndroid Build Coastguard Worker 
179*0e209d39SAndroid Build Coastguard Worker                 // now that we have a final script code, fix any open
180*0e209d39SAndroid Build Coastguard Worker                 // characters we pushed before we knew the script code.
181*0e209d39SAndroid Build Coastguard Worker                 while (startSP < parenSP) {
182*0e209d39SAndroid Build Coastguard Worker                     parenStack[++startSP].scriptCode = scriptCode;
183*0e209d39SAndroid Build Coastguard Worker                 }
184*0e209d39SAndroid Build Coastguard Worker             }
185*0e209d39SAndroid Build Coastguard Worker 
186*0e209d39SAndroid Build Coastguard Worker             // if this character is a close paired character,
187*0e209d39SAndroid Build Coastguard Worker             // pop it from the stack
188*0e209d39SAndroid Build Coastguard Worker             if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) {
189*0e209d39SAndroid Build Coastguard Worker                 parenSP -= 1;
190*0e209d39SAndroid Build Coastguard Worker                 startSP -= 1;
191*0e209d39SAndroid Build Coastguard Worker             }
192*0e209d39SAndroid Build Coastguard Worker         } else {
193*0e209d39SAndroid Build Coastguard Worker             // if the run broke on a surrogate pair,
194*0e209d39SAndroid Build Coastguard Worker             // end it before the high surrogate
195*0e209d39SAndroid Build Coastguard Worker             if (ch >= 0x10000) {
196*0e209d39SAndroid Build Coastguard Worker                 scriptEnd -= 1;
197*0e209d39SAndroid Build Coastguard Worker             }
198*0e209d39SAndroid Build Coastguard Worker 
199*0e209d39SAndroid Build Coastguard Worker             break;
200*0e209d39SAndroid Build Coastguard Worker         }
201*0e209d39SAndroid Build Coastguard Worker     }
202*0e209d39SAndroid Build Coastguard Worker 
203*0e209d39SAndroid Build Coastguard Worker     return true;
204*0e209d39SAndroid Build Coastguard Worker }
205*0e209d39SAndroid Build Coastguard Worker 
206*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
207