1 /*
2 Unicode character type helpers.
3
4 Written by Marc-Andre Lemburg ([email protected]).
5 Modified for Python 2.0 by Fredrik Lundh ([email protected])
6
7 Copyright (c) Corporation for National Research Initiatives.
8
9 */
10
11 #include "Python.h"
12
/* Bit flags tested against the `flags` member of a type record.
   Bits 0x10 and 0x20 are not assigned in this list. */

/* Character classification */
#define ALPHA_MASK          0x0001
#define DECIMAL_MASK        0x0002
#define DIGIT_MASK          0x0004
#define LOWER_MASK          0x0008
#define TITLE_MASK          0x0040
#define UPPER_MASK          0x0080

/* Identifier, printing and numeric properties */
#define XID_START_MASK      0x0100
#define XID_CONTINUE_MASK   0x0200
#define PRINTABLE_MASK      0x0400
#define NUMERIC_MASK        0x0800

/* Case-mapping support */
#define CASE_IGNORABLE_MASK 0x1000
#define CASED_MASK          0x2000
#define EXTENDED_CASE_MASK  0x4000
26
/* One entry of the character type database. */
typedef struct {
    /* Case mappings.  Each value is either a delta that is added to the
       character itself, or a packed offset into _PyUnicode_ExtendedCase. */
    const int upper;
    const int lower;
    const int title;

    /* Values handed back by the decimal / digit conversions.
       Note: if more flag space is ever needed, decimal and digit could be
       unified. */
    const unsigned char decimal;
    const unsigned char digit;

    /* Combination of the *_MASK bits. */
    const unsigned short flags;
} _PyUnicode_TypeRecord;
40
41 #include "unicodetype_db.h"
42
43 static const _PyUnicode_TypeRecord *
gettyperecord(Py_UCS4 code)44 gettyperecord(Py_UCS4 code)
45 {
46 int index;
47
48 if (code >= 0x110000)
49 index = 0;
50 else
51 {
52 index = index1[(code>>SHIFT)];
53 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54 }
55
56 return &_PyUnicode_TypeRecords[index];
57 }
58
59 /* Returns the titlecase Unicode characters corresponding to ch or just
60 ch if no titlecase mapping is known. */
61
_PyUnicode_ToTitlecase(Py_UCS4 ch)62 Py_UCS4 _PyUnicode_ToTitlecase(Py_UCS4 ch)
63 {
64 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
65
66 if (ctype->flags & EXTENDED_CASE_MASK)
67 return _PyUnicode_ExtendedCase[ctype->title & 0xFFFF];
68 return ch + ctype->title;
69 }
70
71 /* Returns 1 for Unicode characters having the category 'Lt', 0
72 otherwise. */
73
_PyUnicode_IsTitlecase(Py_UCS4 ch)74 int _PyUnicode_IsTitlecase(Py_UCS4 ch)
75 {
76 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
77
78 return (ctype->flags & TITLE_MASK) != 0;
79 }
80
81 /* Returns 1 for Unicode characters having the XID_Start property, 0
82 otherwise. */
83
_PyUnicode_IsXidStart(Py_UCS4 ch)84 int _PyUnicode_IsXidStart(Py_UCS4 ch)
85 {
86 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
87
88 return (ctype->flags & XID_START_MASK) != 0;
89 }
90
91 /* Returns 1 for Unicode characters having the XID_Continue property,
92 0 otherwise. */
93
_PyUnicode_IsXidContinue(Py_UCS4 ch)94 int _PyUnicode_IsXidContinue(Py_UCS4 ch)
95 {
96 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
97
98 return (ctype->flags & XID_CONTINUE_MASK) != 0;
99 }
100
101 /* Returns the integer decimal (0-9) for Unicode characters having
102 this property, -1 otherwise. */
103
_PyUnicode_ToDecimalDigit(Py_UCS4 ch)104 int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
105 {
106 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
107
108 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
109 }
110
/* Returns 1 if _PyUnicode_ToDecimalDigit() yields a value for ch
   (i.e. a non-negative result), 0 otherwise. */

int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
{
    return _PyUnicode_ToDecimalDigit(ch) >= 0;
}
117
118 /* Returns the integer digit (0-9) for Unicode characters having
119 this property, -1 otherwise. */
120
_PyUnicode_ToDigit(Py_UCS4 ch)121 int _PyUnicode_ToDigit(Py_UCS4 ch)
122 {
123 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
124
125 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
126 }
127
/* Returns 1 if _PyUnicode_ToDigit() yields a value for ch
   (i.e. a non-negative result), 0 otherwise. */

int _PyUnicode_IsDigit(Py_UCS4 ch)
{
    return _PyUnicode_ToDigit(ch) >= 0;
}
134
135 /* Returns the numeric value as double for Unicode characters having
136 this property, -1.0 otherwise. */
137
_PyUnicode_IsNumeric(Py_UCS4 ch)138 int _PyUnicode_IsNumeric(Py_UCS4 ch)
139 {
140 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
141
142 return (ctype->flags & NUMERIC_MASK) != 0;
143 }
144
145 /* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
146 0 otherwise.
147 All characters except those characters defined in the Unicode character
148 database as following categories are considered printable.
149 * Cc (Other, Control)
150 * Cf (Other, Format)
151 * Cs (Other, Surrogate)
152 * Co (Other, Private Use)
153 * Cn (Other, Not Assigned)
154 * Zl Separator, Line ('\u2028', LINE SEPARATOR)
155 * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
156 * Zs (Separator, Space) other than ASCII space('\x20').
157 */
_PyUnicode_IsPrintable(Py_UCS4 ch)158 int _PyUnicode_IsPrintable(Py_UCS4 ch)
159 {
160 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
161
162 return (ctype->flags & PRINTABLE_MASK) != 0;
163 }
164
165 /* Returns 1 for Unicode characters having the category 'Ll', 0
166 otherwise. */
167
_PyUnicode_IsLowercase(Py_UCS4 ch)168 int _PyUnicode_IsLowercase(Py_UCS4 ch)
169 {
170 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
171
172 return (ctype->flags & LOWER_MASK) != 0;
173 }
174
175 /* Returns 1 for Unicode characters having the category 'Lu', 0
176 otherwise. */
177
_PyUnicode_IsUppercase(Py_UCS4 ch)178 int _PyUnicode_IsUppercase(Py_UCS4 ch)
179 {
180 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
181
182 return (ctype->flags & UPPER_MASK) != 0;
183 }
184
185 /* Returns the uppercase Unicode characters corresponding to ch or just
186 ch if no uppercase mapping is known. */
187
_PyUnicode_ToUppercase(Py_UCS4 ch)188 Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
189 {
190 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
191
192 if (ctype->flags & EXTENDED_CASE_MASK)
193 return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
194 return ch + ctype->upper;
195 }
196
197 /* Returns the lowercase Unicode characters corresponding to ch or just
198 ch if no lowercase mapping is known. */
199
_PyUnicode_ToLowercase(Py_UCS4 ch)200 Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
201 {
202 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
203
204 if (ctype->flags & EXTENDED_CASE_MASK)
205 return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
206 return ch + ctype->lower;
207 }
208
/* Stores the full lowercase mapping of ch into res and returns the number
   of code points written.

   Without EXTENDED_CASE_MASK the result is the single character
   ch + lower.  With it, `lower` is a packed value: the low 16 bits are the
   start index into _PyUnicode_ExtendedCase and the bits from 24 upwards
   are the length of the sequence.  The capacity of res is not checked
   here; the caller must supply a large enough buffer. */
int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int start, count, k;

    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        res[0] = ch + rec->lower;
        return 1;
    }

    start = rec->lower & 0xFFFF;
    count = rec->lower >> 24;
    for (k = 0; k < count; k++) {
        res[k] = _PyUnicode_ExtendedCase[start + k];
    }
    return count;
}
224
/* Stores the full titlecase mapping of ch into res and returns the number
   of code points written.

   Without EXTENDED_CASE_MASK the result is the single character
   ch + title.  With it, `title` is a packed value: the low 16 bits are the
   start index into _PyUnicode_ExtendedCase and the bits from 24 upwards
   are the length of the sequence.  The capacity of res is not checked
   here; the caller must supply a large enough buffer. */
int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int start, count, k;

    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        res[0] = ch + rec->title;
        return 1;
    }

    start = rec->title & 0xFFFF;
    count = rec->title >> 24;
    for (k = 0; k < count; k++) {
        res[k] = _PyUnicode_ExtendedCase[start + k];
    }
    return count;
}
240
/* Stores the full uppercase mapping of ch into res and returns the number
   of code points written.

   Without EXTENDED_CASE_MASK the result is the single character
   ch + upper.  With it, `upper` is a packed value: the low 16 bits are the
   start index into _PyUnicode_ExtendedCase and the bits from 24 upwards
   are the length of the sequence.  The capacity of res is not checked
   here; the caller must supply a large enough buffer. */
int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int start, count, k;

    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        res[0] = ch + rec->upper;
        return 1;
    }

    start = rec->upper & 0xFFFF;
    count = rec->upper >> 24;
    for (k = 0; k < count; k++) {
        res[k] = _PyUnicode_ExtendedCase[start + k];
    }
    return count;
}
256
/* Stores the case-folded form of ch into res and returns the number of
   code points written.

   A dedicated folding sequence exists only for EXTENDED_CASE_MASK records
   whose `lower` field has a non-zero 3-bit length in bits 20-22.  That
   sequence starts in _PyUnicode_ExtendedCase right after the lowercase
   sequence (base index plus the lowercase length held in bits 24 and up).
   In every other case the result is whatever _PyUnicode_ToLowerFull()
   produces. */
int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int start, fold_len, k;

    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        return _PyUnicode_ToLowerFull(ch, res);
    }

    fold_len = (rec->lower >> 20) & 7;
    if (fold_len == 0) {
        return _PyUnicode_ToLowerFull(ch, res);
    }

    start = (rec->lower & 0xFFFF) + (rec->lower >> 24);
    for (k = 0; k < fold_len; k++) {
        res[k] = _PyUnicode_ExtendedCase[start + k];
    }
    return fold_len;
}
271
/* Returns 1 if CASED_MASK is set in the type record of ch, 0 otherwise. */
int _PyUnicode_IsCased(Py_UCS4 ch)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);

    return (rec->flags & CASED_MASK) ? 1 : 0;
}
278
/* Returns 1 if CASE_IGNORABLE_MASK is set in the type record of ch,
   0 otherwise. */
int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);

    return (rec->flags & CASE_IGNORABLE_MASK) ? 1 : 0;
}
285
286 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
287 'Lo' or 'Lm', 0 otherwise. */
288
_PyUnicode_IsAlpha(Py_UCS4 ch)289 int _PyUnicode_IsAlpha(Py_UCS4 ch)
290 {
291 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
292
293 return (ctype->flags & ALPHA_MASK) != 0;
294 }
295
296