1 /*
2 * Secret Labs' Regular Expression Engine
3 *
4 * regular expression matching engine
5 *
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; re-enable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
25 * 2013-02-04 mrab added fullmatch primitive
26 *
27 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 *
29 * This version of the SRE library can be redistributed under CNRI's
30 * Python 1.6 license. For any other use, please contact Secret Labs
31 * AB ([email protected]).
32 *
33 * Portions of this engine have been developed in cooperation with
34 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
35 * other compatibility work.
36 */
37
38 static const char copyright[] =
39 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
40
41 #define PY_SSIZE_T_CLEAN
42
43 #include "Python.h"
44 #include "pycore_long.h" // _PyLong_GetZero()
45 #include "pycore_moduleobject.h" // _PyModule_GetState()
46 #include "structmember.h" // PyMemberDef
47
48 #include "sre.h"
49
50 #define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
51
52 #include <ctype.h>
53
54 /* name of this module, minus the leading underscore */
55 #if !defined(SRE_MODULE)
56 #define SRE_MODULE "sre"
57 #endif
58
59 #define SRE_PY_MODULE "re"
60
61 /* defining this one enables tracing */
62 #undef VERBOSE
63
64 /* -------------------------------------------------------------------- */
65
66 #if defined(_MSC_VER)
67 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
68 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
69 /* fastest possible local call under MSVC */
70 #define LOCAL(type) static __inline type __fastcall
71 #else
72 #define LOCAL(type) static inline type
73 #endif
74
75 /* error codes */
76 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
77 #define SRE_ERROR_STATE -2 /* illegal state */
78 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
79 #define SRE_ERROR_MEMORY -9 /* out of memory */
80 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
81
82 #if defined(VERBOSE)
83 #define TRACE(v) printf v
84 #else
85 #define TRACE(v)
86 #endif
87
88 /* -------------------------------------------------------------------- */
89 /* search engine state */
90
/* ASCII-only character predicates.  Each ranged macro first compares `ch`
   with an upper bound ('9', ' ' or 'z'), so any larger value short-circuits
   to false before the Py_IS* test is evaluated.  `ch` is expanded more than
   once, so the argument must be free of side effects. */
#define SRE_IS_DIGIT(ch)\
    ((ch) <= '9' && Py_ISDIGIT(ch))
#define SRE_IS_SPACE(ch)\
    ((ch) <= ' ' && Py_ISSPACE(ch))
/* only '\n' counts as a line break in this mode */
#define SRE_IS_LINEBREAK(ch)\
    ((ch) == '\n')
/* word character: alphanumeric or underscore */
#define SRE_IS_WORD(ch)\
    ((ch) <= 'z' && (Py_ISALNUM(ch) || (ch) == '_'))
99
/* Lowercase `ch` with ASCII-only rules: values below 128 go through
   Py_TOLOWER, everything else is returned unchanged. */
static unsigned int sre_lower_ascii(unsigned int ch)
{
    if (ch >= 128) {
        return ch;
    }
    return Py_TOLOWER(ch);
}
104
/* locale-specific character predicates */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
 * warnings when c's type supports only numbers < N+1 */
/* SRE_LOC_IS_ALNUM: a value with any bit above the low eight set yields 0
 * without calling isalnum(); otherwise the result is isalnum()'s. */
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
/* SRE_LOC_IS_WORD: locale alphanumeric, or an underscore */
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
110
/* Lowercase `ch` with the C library's tolower() (current locale).
   Values of 256 and above are outside tolower()'s domain and are
   returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return (unsigned int)tolower((int)ch);
}
115
/* Uppercase `ch` with the C library's toupper() (current locale).
   Values of 256 and above are outside toupper()'s domain and are
   returned unchanged. */
static unsigned int sre_upper_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return (unsigned int)toupper((int)ch);
}
120
/* unicode-specific character predicates */

/* Thin aliases for the Py_UNICODE_* classification macros.  Note that the
   "digit" predicate maps to Py_UNICODE_ISDECIMAL, and that a "word"
   character is an alphanumeric one or an underscore. */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
128
/* Lowercase `ch` via Py_UNICODE_TOLOWER; the result is narrowed back to
   unsigned int. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    const unsigned int lowered = (unsigned int)Py_UNICODE_TOLOWER(ch);

    return lowered;
}
133
/* Uppercase `ch` via Py_UNICODE_TOUPPER; the result is narrowed back to
   unsigned int. */
static unsigned int sre_upper_unicode(unsigned int ch)
{
    const unsigned int raised = (unsigned int)Py_UNICODE_TOUPPER(ch);

    return raised;
}
138
139 LOCAL(int)
sre_category(SRE_CODE category,unsigned int ch)140 sre_category(SRE_CODE category, unsigned int ch)
141 {
142 switch (category) {
143
144 case SRE_CATEGORY_DIGIT:
145 return SRE_IS_DIGIT(ch);
146 case SRE_CATEGORY_NOT_DIGIT:
147 return !SRE_IS_DIGIT(ch);
148 case SRE_CATEGORY_SPACE:
149 return SRE_IS_SPACE(ch);
150 case SRE_CATEGORY_NOT_SPACE:
151 return !SRE_IS_SPACE(ch);
152 case SRE_CATEGORY_WORD:
153 return SRE_IS_WORD(ch);
154 case SRE_CATEGORY_NOT_WORD:
155 return !SRE_IS_WORD(ch);
156 case SRE_CATEGORY_LINEBREAK:
157 return SRE_IS_LINEBREAK(ch);
158 case SRE_CATEGORY_NOT_LINEBREAK:
159 return !SRE_IS_LINEBREAK(ch);
160
161 case SRE_CATEGORY_LOC_WORD:
162 return SRE_LOC_IS_WORD(ch);
163 case SRE_CATEGORY_LOC_NOT_WORD:
164 return !SRE_LOC_IS_WORD(ch);
165
166 case SRE_CATEGORY_UNI_DIGIT:
167 return SRE_UNI_IS_DIGIT(ch);
168 case SRE_CATEGORY_UNI_NOT_DIGIT:
169 return !SRE_UNI_IS_DIGIT(ch);
170 case SRE_CATEGORY_UNI_SPACE:
171 return SRE_UNI_IS_SPACE(ch);
172 case SRE_CATEGORY_UNI_NOT_SPACE:
173 return !SRE_UNI_IS_SPACE(ch);
174 case SRE_CATEGORY_UNI_WORD:
175 return SRE_UNI_IS_WORD(ch);
176 case SRE_CATEGORY_UNI_NOT_WORD:
177 return !SRE_UNI_IS_WORD(ch);
178 case SRE_CATEGORY_UNI_LINEBREAK:
179 return SRE_UNI_IS_LINEBREAK(ch);
180 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
181 return !SRE_UNI_IS_LINEBREAK(ch);
182 }
183 return 0;
184 }
185
186 LOCAL(int)
char_loc_ignore(SRE_CODE pattern,SRE_CODE ch)187 char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
188 {
189 return ch == pattern
190 || (SRE_CODE) sre_lower_locale(ch) == pattern
191 || (SRE_CODE) sre_upper_locale(ch) == pattern;
192 }
193
194
195 /* helpers */
196
197 static void
data_stack_dealloc(SRE_STATE * state)198 data_stack_dealloc(SRE_STATE* state)
199 {
200 if (state->data_stack) {
201 PyMem_Free(state->data_stack);
202 state->data_stack = NULL;
203 }
204 state->data_stack_size = state->data_stack_base = 0;
205 }
206
207 static int
data_stack_grow(SRE_STATE * state,Py_ssize_t size)208 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
209 {
210 Py_ssize_t minsize, cursize;
211 minsize = state->data_stack_base+size;
212 cursize = state->data_stack_size;
213 if (cursize < minsize) {
214 void* stack;
215 cursize = minsize+minsize/4+1024;
216 TRACE(("allocate/grow stack %zd\n", cursize));
217 stack = PyMem_Realloc(state->data_stack, cursize);
218 if (!stack) {
219 data_stack_dealloc(state);
220 return SRE_ERROR_MEMORY;
221 }
222 state->data_stack = (char *)stack;
223 state->data_stack_size = cursize;
224 }
225 return 0;
226 }
227
228 /* generate 8-bit version */
229
230 #define SRE_CHAR Py_UCS1
231 #define SIZEOF_SRE_CHAR 1
232 #define SRE(F) sre_ucs1_##F
233 #include "sre_lib.h"
234
235 /* generate 16-bit unicode version */
236
237 #define SRE_CHAR Py_UCS2
238 #define SIZEOF_SRE_CHAR 2
239 #define SRE(F) sre_ucs2_##F
240 #include "sre_lib.h"
241
242 /* generate 32-bit unicode version */
243
244 #define SRE_CHAR Py_UCS4
245 #define SIZEOF_SRE_CHAR 4
246 #define SRE(F) sre_ucs4_##F
247 #include "sre_lib.h"
248
249 /* -------------------------------------------------------------------- */
250 /* factories and destructors */
251
/* module state */
/* Per-module storage for the type objects of the three classes declared in
   the clinic block below (_sre.SRE_Pattern, _sre.SRE_Match and
   _sre.SRE_Scanner).  Retrieved through get_sre_module_state(). */
typedef struct {
    PyTypeObject *Pattern_Type;   /* type of _sre.SRE_Pattern objects */
    PyTypeObject *Match_Type;     /* type of _sre.SRE_Match objects */
    PyTypeObject *Scanner_Type;   /* type of _sre.SRE_Scanner objects */
} _sremodulestate;
258
259 static _sremodulestate *
get_sre_module_state(PyObject * m)260 get_sre_module_state(PyObject *m)
261 {
262 _sremodulestate *state = (_sremodulestate *)_PyModule_GetState(m);
263 assert(state);
264 return state;
265 }
266
/* forward declaration of the module definition */
static struct PyModuleDef sremodule;
/* Fetch the module state starting from a type object: PyType_GetModule(cls)
   yields the module, which is then handed to get_sre_module_state(). */
#define get_sre_module_state_by_class(cls) \
    (get_sre_module_state(PyType_GetModule(cls)))
270
271 /* see sre.h for object declarations */
272 static PyObject*pattern_new_match(_sremodulestate *, PatternObject*, SRE_STATE*, Py_ssize_t);
273 static PyObject *pattern_scanner(_sremodulestate *, PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
274
275 /*[clinic input]
276 module _sre
277 class _sre.SRE_Pattern "PatternObject *" "get_sre_module_state_by_class(tp)->Pattern_Type"
278 class _sre.SRE_Match "MatchObject *" "get_sre_module_state_by_class(tp)->Match_Type"
279 class _sre.SRE_Scanner "ScannerObject *" "get_sre_module_state_by_class(tp)->Scanner_Type"
280 [clinic start generated code]*/
281 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=fe2966e32b66a231]*/
282
283 /*[clinic input]
284 _sre.getcodesize -> int
285 [clinic start generated code]*/
286
287 static int
_sre_getcodesize_impl(PyObject * module)288 _sre_getcodesize_impl(PyObject *module)
289 /*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
290 {
291 return sizeof(SRE_CODE);
292 }
293
294 /*[clinic input]
295 _sre.ascii_iscased -> bool
296
297 character: int
298 /
299
300 [clinic start generated code]*/
301
302 static int
_sre_ascii_iscased_impl(PyObject * module,int character)303 _sre_ascii_iscased_impl(PyObject *module, int character)
304 /*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
305 {
306 unsigned int ch = (unsigned int)character;
307 return ch < 128 && Py_ISALPHA(ch);
308 }
309
310 /*[clinic input]
311 _sre.unicode_iscased -> bool
312
313 character: int
314 /
315
316 [clinic start generated code]*/
317
318 static int
_sre_unicode_iscased_impl(PyObject * module,int character)319 _sre_unicode_iscased_impl(PyObject *module, int character)
320 /*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
321 {
322 unsigned int ch = (unsigned int)character;
323 return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
324 }
325
326 /*[clinic input]
327 _sre.ascii_tolower -> int
328
329 character: int
330 /
331
332 [clinic start generated code]*/
333
334 static int
_sre_ascii_tolower_impl(PyObject * module,int character)335 _sre_ascii_tolower_impl(PyObject *module, int character)
336 /*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
337 {
338 return sre_lower_ascii(character);
339 }
340
341 /*[clinic input]
342 _sre.unicode_tolower -> int
343
344 character: int
345 /
346
347 [clinic start generated code]*/
348
349 static int
_sre_unicode_tolower_impl(PyObject * module,int character)350 _sre_unicode_tolower_impl(PyObject *module, int character)
351 /*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
352 {
353 return sre_lower_unicode(character);
354 }
355
356 LOCAL(void)
state_reset(SRE_STATE * state)357 state_reset(SRE_STATE* state)
358 {
359 /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
360 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
361
362 state->lastmark = -1;
363 state->lastindex = -1;
364
365 state->repeat = NULL;
366
367 data_stack_dealloc(state);
368 }
369
370 static const void*
getstring(PyObject * string,Py_ssize_t * p_length,int * p_isbytes,int * p_charsize,Py_buffer * view)371 getstring(PyObject* string, Py_ssize_t* p_length,
372 int* p_isbytes, int* p_charsize,
373 Py_buffer *view)
374 {
375 /* given a python object, return a data pointer, a length (in
376 characters), and a character size. return NULL if the object
377 is not a string (or not compatible) */
378
379 /* Unicode objects do not support the buffer API. So, get the data
380 directly instead. */
381 if (PyUnicode_Check(string)) {
382 if (PyUnicode_READY(string) == -1)
383 return NULL;
384 *p_length = PyUnicode_GET_LENGTH(string);
385 *p_charsize = PyUnicode_KIND(string);
386 *p_isbytes = 0;
387 return PyUnicode_DATA(string);
388 }
389
390 /* get pointer to byte string buffer */
391 if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
392 PyErr_Format(PyExc_TypeError, "expected string or bytes-like "
393 "object, got '%.200s'", Py_TYPE(string)->tp_name);
394 return NULL;
395 }
396
397 *p_length = view->len;
398 *p_charsize = 1;
399 *p_isbytes = 1;
400
401 if (view->buf == NULL) {
402 PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
403 PyBuffer_Release(view);
404 view->buf = NULL;
405 return NULL;
406 }
407 return view->buf;
408 }
409
410 LOCAL(PyObject*)
state_init(SRE_STATE * state,PatternObject * pattern,PyObject * string,Py_ssize_t start,Py_ssize_t end)411 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
412 Py_ssize_t start, Py_ssize_t end)
413 {
414 /* prepare state object */
415
416 Py_ssize_t length;
417 int isbytes, charsize;
418 const void* ptr;
419
420 memset(state, 0, sizeof(SRE_STATE));
421
422 state->mark = PyMem_New(const void *, pattern->groups * 2);
423 if (!state->mark) {
424 PyErr_NoMemory();
425 goto err;
426 }
427 state->lastmark = -1;
428 state->lastindex = -1;
429
430 state->buffer.buf = NULL;
431 ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
432 if (!ptr)
433 goto err;
434
435 if (isbytes && pattern->isbytes == 0) {
436 PyErr_SetString(PyExc_TypeError,
437 "cannot use a string pattern on a bytes-like object");
438 goto err;
439 }
440 if (!isbytes && pattern->isbytes > 0) {
441 PyErr_SetString(PyExc_TypeError,
442 "cannot use a bytes pattern on a string-like object");
443 goto err;
444 }
445
446 /* adjust boundaries */
447 if (start < 0)
448 start = 0;
449 else if (start > length)
450 start = length;
451
452 if (end < 0)
453 end = 0;
454 else if (end > length)
455 end = length;
456
457 state->isbytes = isbytes;
458 state->charsize = charsize;
459 state->match_all = 0;
460 state->must_advance = 0;
461
462 state->beginning = ptr;
463
464 state->start = (void*) ((char*) ptr + start * state->charsize);
465 state->end = (void*) ((char*) ptr + end * state->charsize);
466
467 Py_INCREF(string);
468 state->string = string;
469 state->pos = start;
470 state->endpos = end;
471
472 return string;
473 err:
474 /* We add an explicit cast here because MSVC has a bug when
475 compiling C code where it believes that `const void**` cannot be
476 safely casted to `void*`, see bpo-39943 for details. */
477 PyMem_Free((void*) state->mark);
478 state->mark = NULL;
479 if (state->buffer.buf)
480 PyBuffer_Release(&state->buffer);
481 return NULL;
482 }
483
484 LOCAL(void)
state_fini(SRE_STATE * state)485 state_fini(SRE_STATE* state)
486 {
487 if (state->buffer.buf)
488 PyBuffer_Release(&state->buffer);
489 Py_XDECREF(state->string);
490 data_stack_dealloc(state);
491 /* See above PyMem_Del for why we explicitly cast here. */
492 PyMem_Free((void*) state->mark);
493 state->mark = NULL;
494 }
495
/* calculate offset from start of string: the byte distance between
   `member` (a pointer into the subject data) and (state)->beginning,
   divided by (state)->charsize to give a character index */
#define STATE_OFFSET(state, member)\
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
499
500 LOCAL(PyObject*)
getslice(int isbytes,const void * ptr,PyObject * string,Py_ssize_t start,Py_ssize_t end)501 getslice(int isbytes, const void *ptr,
502 PyObject* string, Py_ssize_t start, Py_ssize_t end)
503 {
504 if (isbytes) {
505 if (PyBytes_CheckExact(string) &&
506 start == 0 && end == PyBytes_GET_SIZE(string)) {
507 Py_INCREF(string);
508 return string;
509 }
510 return PyBytes_FromStringAndSize(
511 (const char *)ptr + start, end - start);
512 }
513 else {
514 return PyUnicode_Substring(string, start, end);
515 }
516 }
517
518 LOCAL(PyObject*)
state_getslice(SRE_STATE * state,Py_ssize_t index,PyObject * string,int empty)519 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
520 {
521 Py_ssize_t i, j;
522
523 index = (index - 1) * 2;
524
525 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
526 if (empty)
527 /* want empty string */
528 i = j = 0;
529 else {
530 Py_RETURN_NONE;
531 }
532 } else {
533 i = STATE_OFFSET(state, state->mark[index]);
534 j = STATE_OFFSET(state, state->mark[index+1]);
535
536 /* check wrong span */
537 if (i > j) {
538 PyErr_SetString(PyExc_SystemError,
539 "The span of capturing group is wrong,"
540 " please report a bug for the re module.");
541 return NULL;
542 }
543 }
544
545 return getslice(state->isbytes, state->beginning, string, i, j);
546 }
547
548 static void
pattern_error(Py_ssize_t status)549 pattern_error(Py_ssize_t status)
550 {
551 switch (status) {
552 case SRE_ERROR_RECURSION_LIMIT:
553 /* This error code seems to be unused. */
554 PyErr_SetString(
555 PyExc_RecursionError,
556 "maximum recursion limit exceeded"
557 );
558 break;
559 case SRE_ERROR_MEMORY:
560 PyErr_NoMemory();
561 break;
562 case SRE_ERROR_INTERRUPTED:
563 /* An exception has already been raised, so let it fly */
564 break;
565 default:
566 /* other error codes indicate compiler/engine bugs */
567 PyErr_SetString(
568 PyExc_RuntimeError,
569 "internal error in regular expression engine"
570 );
571 }
572 }
573
574 static int
pattern_traverse(PatternObject * self,visitproc visit,void * arg)575 pattern_traverse(PatternObject *self, visitproc visit, void *arg)
576 {
577 Py_VISIT(Py_TYPE(self));
578 Py_VISIT(self->groupindex);
579 Py_VISIT(self->indexgroup);
580 Py_VISIT(self->pattern);
581 return 0;
582 }
583
/* Drop the pattern's references to groupindex, indexgroup and pattern,
   leaving each field NULL.  Called from pattern_dealloc(); presumably also
   installed as the type's clear slot (not visible in this part of the
   file).  Always returns 0. */
static int
pattern_clear(PatternObject *self)
{
    Py_CLEAR(self->groupindex);
    Py_CLEAR(self->indexgroup);
    Py_CLEAR(self->pattern);
    return 0;
}
592
593 static void
pattern_dealloc(PatternObject * self)594 pattern_dealloc(PatternObject* self)
595 {
596 PyTypeObject *tp = Py_TYPE(self);
597
598 PyObject_GC_UnTrack(self);
599 if (self->weakreflist != NULL) {
600 PyObject_ClearWeakRefs((PyObject *) self);
601 }
602 (void)pattern_clear(self);
603 tp->tp_free(self);
604 Py_DECREF(tp);
605 }
606
607 LOCAL(Py_ssize_t)
sre_match(SRE_STATE * state,SRE_CODE * pattern)608 sre_match(SRE_STATE* state, SRE_CODE* pattern)
609 {
610 if (state->charsize == 1)
611 return sre_ucs1_match(state, pattern, 1);
612 if (state->charsize == 2)
613 return sre_ucs2_match(state, pattern, 1);
614 assert(state->charsize == 4);
615 return sre_ucs4_match(state, pattern, 1);
616 }
617
618 LOCAL(Py_ssize_t)
sre_search(SRE_STATE * state,SRE_CODE * pattern)619 sre_search(SRE_STATE* state, SRE_CODE* pattern)
620 {
621 if (state->charsize == 1)
622 return sre_ucs1_search(state, pattern);
623 if (state->charsize == 2)
624 return sre_ucs2_search(state, pattern);
625 assert(state->charsize == 4);
626 return sre_ucs4_search(state, pattern);
627 }
628
629 /*[clinic input]
630 _sre.SRE_Pattern.match
631
632 cls: defining_class
633 /
634 string: object
635 pos: Py_ssize_t = 0
636 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
637
638 Matches zero or more characters at the beginning of the string.
639 [clinic start generated code]*/
640
641 static PyObject *
_sre_SRE_Pattern_match_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)642 _sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls,
643 PyObject *string, Py_ssize_t pos,
644 Py_ssize_t endpos)
645 /*[clinic end generated code: output=ec6208ea58a0cca0 input=4bdb9c3e564d13ac]*/
646 {
647 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
648 SRE_STATE state;
649 Py_ssize_t status;
650 PyObject *match;
651
652 if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
653 return NULL;
654
655 state.ptr = state.start;
656
657 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
658
659 status = sre_match(&state, PatternObject_GetCode(self));
660
661 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
662 if (PyErr_Occurred()) {
663 state_fini(&state);
664 return NULL;
665 }
666
667 match = pattern_new_match(module_state, self, &state, status);
668 state_fini(&state);
669 return match;
670 }
671
672 /*[clinic input]
673 _sre.SRE_Pattern.fullmatch
674
675 cls: defining_class
676 /
677 string: object
678 pos: Py_ssize_t = 0
679 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
680
681 Matches against all of the string.
682 [clinic start generated code]*/
683
684 static PyObject *
_sre_SRE_Pattern_fullmatch_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)685 _sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyTypeObject *cls,
686 PyObject *string, Py_ssize_t pos,
687 Py_ssize_t endpos)
688 /*[clinic end generated code: output=625b75b027ef94da input=50981172ab0fcfdd]*/
689 {
690 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
691 SRE_STATE state;
692 Py_ssize_t status;
693 PyObject *match;
694
695 if (!state_init(&state, self, string, pos, endpos))
696 return NULL;
697
698 state.ptr = state.start;
699
700 TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
701
702 state.match_all = 1;
703 status = sre_match(&state, PatternObject_GetCode(self));
704
705 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
706 if (PyErr_Occurred()) {
707 state_fini(&state);
708 return NULL;
709 }
710
711 match = pattern_new_match(module_state, self, &state, status);
712 state_fini(&state);
713 return match;
714 }
715
716 /*[clinic input]
717 _sre.SRE_Pattern.search
718
719 cls: defining_class
720 /
721 string: object
722 pos: Py_ssize_t = 0
723 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
724
725 Scan through string looking for a match, and return a corresponding match object instance.
726
727 Return None if no position in the string matches.
728 [clinic start generated code]*/
729
730 static PyObject *
_sre_SRE_Pattern_search_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)731 _sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls,
732 PyObject *string, Py_ssize_t pos,
733 Py_ssize_t endpos)
734 /*[clinic end generated code: output=bd7f2d9d583e1463 input=afa9afb66a74a4b3]*/
735 {
736 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
737 SRE_STATE state;
738 Py_ssize_t status;
739 PyObject *match;
740
741 if (!state_init(&state, self, string, pos, endpos))
742 return NULL;
743
744 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
745
746 status = sre_search(&state, PatternObject_GetCode(self));
747
748 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
749
750 if (PyErr_Occurred()) {
751 state_fini(&state);
752 return NULL;
753 }
754
755 match = pattern_new_match(module_state, self, &state, status);
756 state_fini(&state);
757 return match;
758 }
759
760 static PyObject*
call(const char * module,const char * function,PyObject * args)761 call(const char* module, const char* function, PyObject* args)
762 {
763 PyObject* name;
764 PyObject* mod;
765 PyObject* func;
766 PyObject* result;
767
768 if (!args)
769 return NULL;
770 name = PyUnicode_FromString(module);
771 if (!name)
772 return NULL;
773 mod = PyImport_Import(name);
774 Py_DECREF(name);
775 if (!mod)
776 return NULL;
777 func = PyObject_GetAttrString(mod, function);
778 Py_DECREF(mod);
779 if (!func)
780 return NULL;
781 result = PyObject_CallObject(func, args);
782 Py_DECREF(func);
783 Py_DECREF(args);
784 return result;
785 }
786
787 /*[clinic input]
788 _sre.SRE_Pattern.findall
789
790 string: object
791 pos: Py_ssize_t = 0
792 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
793
794 Return a list of all non-overlapping matches of pattern in string.
795 [clinic start generated code]*/
796
797 static PyObject *
_sre_SRE_Pattern_findall_impl(PatternObject * self,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)798 _sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
799 Py_ssize_t pos, Py_ssize_t endpos)
800 /*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
801 {
802 SRE_STATE state;
803 PyObject* list;
804 Py_ssize_t status;
805 Py_ssize_t i, b, e;
806
807 if (!state_init(&state, self, string, pos, endpos))
808 return NULL;
809
810 list = PyList_New(0);
811 if (!list) {
812 state_fini(&state);
813 return NULL;
814 }
815
816 while (state.start <= state.end) {
817
818 PyObject* item;
819
820 state_reset(&state);
821
822 state.ptr = state.start;
823
824 status = sre_search(&state, PatternObject_GetCode(self));
825 if (PyErr_Occurred())
826 goto error;
827
828 if (status <= 0) {
829 if (status == 0)
830 break;
831 pattern_error(status);
832 goto error;
833 }
834
835 /* don't bother to build a match object */
836 switch (self->groups) {
837 case 0:
838 b = STATE_OFFSET(&state, state.start);
839 e = STATE_OFFSET(&state, state.ptr);
840 item = getslice(state.isbytes, state.beginning,
841 string, b, e);
842 if (!item)
843 goto error;
844 break;
845 case 1:
846 item = state_getslice(&state, 1, string, 1);
847 if (!item)
848 goto error;
849 break;
850 default:
851 item = PyTuple_New(self->groups);
852 if (!item)
853 goto error;
854 for (i = 0; i < self->groups; i++) {
855 PyObject* o = state_getslice(&state, i+1, string, 1);
856 if (!o) {
857 Py_DECREF(item);
858 goto error;
859 }
860 PyTuple_SET_ITEM(item, i, o);
861 }
862 break;
863 }
864
865 status = PyList_Append(list, item);
866 Py_DECREF(item);
867 if (status < 0)
868 goto error;
869
870 state.must_advance = (state.ptr == state.start);
871 state.start = state.ptr;
872 }
873
874 state_fini(&state);
875 return list;
876
877 error:
878 Py_DECREF(list);
879 state_fini(&state);
880 return NULL;
881
882 }
883
884 /*[clinic input]
885 _sre.SRE_Pattern.finditer
886
887 cls: defining_class
888 /
889 string: object
890 pos: Py_ssize_t = 0
891 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
892
893 Return an iterator over all non-overlapping matches for the RE pattern in string.
894
895 For each match, the iterator returns a match object.
896 [clinic start generated code]*/
897
898 static PyObject *
_sre_SRE_Pattern_finditer_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)899 _sre_SRE_Pattern_finditer_impl(PatternObject *self, PyTypeObject *cls,
900 PyObject *string, Py_ssize_t pos,
901 Py_ssize_t endpos)
902 /*[clinic end generated code: output=1791dbf3618ade56 input=812e332a4848cbaf]*/
903 {
904 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
905 PyObject* scanner;
906 PyObject* search;
907 PyObject* iterator;
908
909 scanner = pattern_scanner(module_state, self, string, pos, endpos);
910 if (!scanner)
911 return NULL;
912
913 search = PyObject_GetAttrString(scanner, "search");
914 Py_DECREF(scanner);
915 if (!search)
916 return NULL;
917
918 iterator = PyCallIter_New(search, Py_None);
919 Py_DECREF(search);
920
921 return iterator;
922 }
923
924 /*[clinic input]
925 _sre.SRE_Pattern.scanner
926
927 cls: defining_class
928 /
929 string: object
930 pos: Py_ssize_t = 0
931 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
932
933 [clinic start generated code]*/
934
935 static PyObject *
_sre_SRE_Pattern_scanner_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)936 _sre_SRE_Pattern_scanner_impl(PatternObject *self, PyTypeObject *cls,
937 PyObject *string, Py_ssize_t pos,
938 Py_ssize_t endpos)
939 /*[clinic end generated code: output=f70cd506112f1bd9 input=2e487e5151bcee4c]*/
940 {
941 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
942
943 return pattern_scanner(module_state, self, string, pos, endpos);
944 }
945
946 /*[clinic input]
947 _sre.SRE_Pattern.split
948
949 string: object
950 maxsplit: Py_ssize_t = 0
951
952 Split string by the occurrences of pattern.
953 [clinic start generated code]*/
954
955 static PyObject *
_sre_SRE_Pattern_split_impl(PatternObject * self,PyObject * string,Py_ssize_t maxsplit)956 _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
957 Py_ssize_t maxsplit)
958 /*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
959 {
960 SRE_STATE state;
961 PyObject* list;
962 PyObject* item;
963 Py_ssize_t status;
964 Py_ssize_t n;
965 Py_ssize_t i;
966 const void* last;
967
968 assert(self->codesize != 0);
969
970 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
971 return NULL;
972
973 list = PyList_New(0);
974 if (!list) {
975 state_fini(&state);
976 return NULL;
977 }
978
979 n = 0;
980 last = state.start;
981
982 while (!maxsplit || n < maxsplit) {
983
984 state_reset(&state);
985
986 state.ptr = state.start;
987
988 status = sre_search(&state, PatternObject_GetCode(self));
989 if (PyErr_Occurred())
990 goto error;
991
992 if (status <= 0) {
993 if (status == 0)
994 break;
995 pattern_error(status);
996 goto error;
997 }
998
999 /* get segment before this match */
1000 item = getslice(state.isbytes, state.beginning,
1001 string, STATE_OFFSET(&state, last),
1002 STATE_OFFSET(&state, state.start)
1003 );
1004 if (!item)
1005 goto error;
1006 status = PyList_Append(list, item);
1007 Py_DECREF(item);
1008 if (status < 0)
1009 goto error;
1010
1011 /* add groups (if any) */
1012 for (i = 0; i < self->groups; i++) {
1013 item = state_getslice(&state, i+1, string, 0);
1014 if (!item)
1015 goto error;
1016 status = PyList_Append(list, item);
1017 Py_DECREF(item);
1018 if (status < 0)
1019 goto error;
1020 }
1021
1022 n = n + 1;
1023 state.must_advance = (state.ptr == state.start);
1024 last = state.start = state.ptr;
1025
1026 }
1027
1028 /* get segment following last match (even if empty) */
1029 item = getslice(state.isbytes, state.beginning,
1030 string, STATE_OFFSET(&state, last), state.endpos
1031 );
1032 if (!item)
1033 goto error;
1034 status = PyList_Append(list, item);
1035 Py_DECREF(item);
1036 if (status < 0)
1037 goto error;
1038
1039 state_fini(&state);
1040 return list;
1041
1042 error:
1043 Py_DECREF(list);
1044 state_fini(&state);
1045 return NULL;
1046
1047 }
1048
1049 static PyObject*
pattern_subx(_sremodulestate * module_state,PatternObject * self,PyObject * ptemplate,PyObject * string,Py_ssize_t count,Py_ssize_t subn)1050 pattern_subx(_sremodulestate* module_state,
1051 PatternObject* self,
1052 PyObject* ptemplate,
1053 PyObject* string,
1054 Py_ssize_t count,
1055 Py_ssize_t subn)
1056 {
1057 SRE_STATE state;
1058 PyObject* list;
1059 PyObject* joiner;
1060 PyObject* item;
1061 PyObject* filter;
1062 PyObject* match;
1063 const void* ptr;
1064 Py_ssize_t status;
1065 Py_ssize_t n;
1066 Py_ssize_t i, b, e;
1067 int isbytes, charsize;
1068 int filter_is_callable;
1069 Py_buffer view;
1070
1071 if (PyCallable_Check(ptemplate)) {
1072 /* sub/subn takes either a function or a template */
1073 filter = ptemplate;
1074 Py_INCREF(filter);
1075 filter_is_callable = 1;
1076 } else {
1077 /* if not callable, check if it's a literal string */
1078 int literal;
1079 view.buf = NULL;
1080 ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
1081 if (ptr) {
1082 if (charsize == 1)
1083 literal = memchr(ptr, '\\', n) == NULL;
1084 else
1085 literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
1086 } else {
1087 PyErr_Clear();
1088 literal = 0;
1089 }
1090 if (view.buf)
1091 PyBuffer_Release(&view);
1092 if (literal) {
1093 filter = ptemplate;
1094 Py_INCREF(filter);
1095 filter_is_callable = 0;
1096 } else {
1097 /* not a literal; hand it over to the template compiler */
1098 filter = call(
1099 SRE_PY_MODULE, "_subx",
1100 PyTuple_Pack(2, self, ptemplate)
1101 );
1102 if (!filter)
1103 return NULL;
1104 filter_is_callable = PyCallable_Check(filter);
1105 }
1106 }
1107
1108 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
1109 Py_DECREF(filter);
1110 return NULL;
1111 }
1112
1113 list = PyList_New(0);
1114 if (!list) {
1115 Py_DECREF(filter);
1116 state_fini(&state);
1117 return NULL;
1118 }
1119
1120 n = i = 0;
1121
1122 while (!count || n < count) {
1123
1124 state_reset(&state);
1125
1126 state.ptr = state.start;
1127
1128 status = sre_search(&state, PatternObject_GetCode(self));
1129 if (PyErr_Occurred())
1130 goto error;
1131
1132 if (status <= 0) {
1133 if (status == 0)
1134 break;
1135 pattern_error(status);
1136 goto error;
1137 }
1138
1139 b = STATE_OFFSET(&state, state.start);
1140 e = STATE_OFFSET(&state, state.ptr);
1141
1142 if (i < b) {
1143 /* get segment before this match */
1144 item = getslice(state.isbytes, state.beginning,
1145 string, i, b);
1146 if (!item)
1147 goto error;
1148 status = PyList_Append(list, item);
1149 Py_DECREF(item);
1150 if (status < 0)
1151 goto error;
1152
1153 }
1154
1155 if (filter_is_callable) {
1156 /* pass match object through filter */
1157 match = pattern_new_match(module_state, self, &state, 1);
1158 if (!match)
1159 goto error;
1160 item = PyObject_CallOneArg(filter, match);
1161 Py_DECREF(match);
1162 if (!item)
1163 goto error;
1164 } else {
1165 /* filter is literal string */
1166 item = filter;
1167 Py_INCREF(item);
1168 }
1169
1170 /* add to list */
1171 if (item != Py_None) {
1172 status = PyList_Append(list, item);
1173 Py_DECREF(item);
1174 if (status < 0)
1175 goto error;
1176 }
1177
1178 i = e;
1179 n = n + 1;
1180 state.must_advance = (state.ptr == state.start);
1181 state.start = state.ptr;
1182 }
1183
1184 /* get segment following last match */
1185 if (i < state.endpos) {
1186 item = getslice(state.isbytes, state.beginning,
1187 string, i, state.endpos);
1188 if (!item)
1189 goto error;
1190 status = PyList_Append(list, item);
1191 Py_DECREF(item);
1192 if (status < 0)
1193 goto error;
1194 }
1195
1196 state_fini(&state);
1197
1198 Py_DECREF(filter);
1199
1200 /* convert list to single string (also removes list) */
1201 joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
1202 if (!joiner) {
1203 Py_DECREF(list);
1204 return NULL;
1205 }
1206 if (PyList_GET_SIZE(list) == 0) {
1207 Py_DECREF(list);
1208 item = joiner;
1209 }
1210 else {
1211 if (state.isbytes)
1212 item = _PyBytes_Join(joiner, list);
1213 else
1214 item = PyUnicode_Join(joiner, list);
1215 Py_DECREF(joiner);
1216 Py_DECREF(list);
1217 if (!item)
1218 return NULL;
1219 }
1220
1221 if (subn)
1222 return Py_BuildValue("Nn", item, n);
1223
1224 return item;
1225
1226 error:
1227 Py_DECREF(list);
1228 state_fini(&state);
1229 Py_DECREF(filter);
1230 return NULL;
1231
1232 }
1233
1234 /*[clinic input]
1235 _sre.SRE_Pattern.sub
1236
1237 cls: defining_class
1238 /
1239 repl: object
1240 string: object
1241 count: Py_ssize_t = 0
1242
1243 Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
1244 [clinic start generated code]*/
1245
1246 static PyObject *
_sre_SRE_Pattern_sub_impl(PatternObject * self,PyTypeObject * cls,PyObject * repl,PyObject * string,Py_ssize_t count)1247 _sre_SRE_Pattern_sub_impl(PatternObject *self, PyTypeObject *cls,
1248 PyObject *repl, PyObject *string, Py_ssize_t count)
1249 /*[clinic end generated code: output=4be141ab04bca60d input=d8d1d4ac2311a07c]*/
1250 {
1251 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1252
1253 return pattern_subx(module_state, self, repl, string, count, 0);
1254 }
1255
1256 /*[clinic input]
1257 _sre.SRE_Pattern.subn
1258
1259 cls: defining_class
1260 /
1261 repl: object
1262 string: object
1263 count: Py_ssize_t = 0
1264
1265 Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
1266 [clinic start generated code]*/
1267
1268 static PyObject *
_sre_SRE_Pattern_subn_impl(PatternObject * self,PyTypeObject * cls,PyObject * repl,PyObject * string,Py_ssize_t count)1269 _sre_SRE_Pattern_subn_impl(PatternObject *self, PyTypeObject *cls,
1270 PyObject *repl, PyObject *string,
1271 Py_ssize_t count)
1272 /*[clinic end generated code: output=da02fd85258b1e1f input=8b78a65b8302e58d]*/
1273 {
1274 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1275
1276 return pattern_subx(module_state, self, repl, string, count, 1);
1277 }
1278
1279 /*[clinic input]
1280 _sre.SRE_Pattern.__copy__
1281
1282 [clinic start generated code]*/
1283
1284 static PyObject *
_sre_SRE_Pattern___copy___impl(PatternObject * self)1285 _sre_SRE_Pattern___copy___impl(PatternObject *self)
1286 /*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
1287 {
1288 Py_INCREF(self);
1289 return (PyObject *)self;
1290 }
1291
1292 /*[clinic input]
1293 _sre.SRE_Pattern.__deepcopy__
1294
1295 memo: object
1296 /
1297
1298 [clinic start generated code]*/
1299
1300 static PyObject *
_sre_SRE_Pattern___deepcopy__(PatternObject * self,PyObject * memo)1301 _sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
1302 /*[clinic end generated code: output=2ad25679c1f1204a input=a465b1602f997bed]*/
1303 {
1304 Py_INCREF(self);
1305 return (PyObject *)self;
1306 }
1307
1308 static PyObject *
pattern_repr(PatternObject * obj)1309 pattern_repr(PatternObject *obj)
1310 {
1311 static const struct {
1312 const char *name;
1313 int value;
1314 } flag_names[] = {
1315 {"re.TEMPLATE", SRE_FLAG_TEMPLATE},
1316 {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
1317 {"re.LOCALE", SRE_FLAG_LOCALE},
1318 {"re.MULTILINE", SRE_FLAG_MULTILINE},
1319 {"re.DOTALL", SRE_FLAG_DOTALL},
1320 {"re.UNICODE", SRE_FLAG_UNICODE},
1321 {"re.VERBOSE", SRE_FLAG_VERBOSE},
1322 {"re.DEBUG", SRE_FLAG_DEBUG},
1323 {"re.ASCII", SRE_FLAG_ASCII},
1324 };
1325 PyObject *result = NULL;
1326 PyObject *flag_items;
1327 size_t i;
1328 int flags = obj->flags;
1329
1330 /* Omit re.UNICODE for valid string patterns. */
1331 if (obj->isbytes == 0 &&
1332 (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
1333 SRE_FLAG_UNICODE)
1334 flags &= ~SRE_FLAG_UNICODE;
1335
1336 flag_items = PyList_New(0);
1337 if (!flag_items)
1338 return NULL;
1339
1340 for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
1341 if (flags & flag_names[i].value) {
1342 PyObject *item = PyUnicode_FromString(flag_names[i].name);
1343 if (!item)
1344 goto done;
1345
1346 if (PyList_Append(flag_items, item) < 0) {
1347 Py_DECREF(item);
1348 goto done;
1349 }
1350 Py_DECREF(item);
1351 flags &= ~flag_names[i].value;
1352 }
1353 }
1354 if (flags) {
1355 PyObject *item = PyUnicode_FromFormat("0x%x", flags);
1356 if (!item)
1357 goto done;
1358
1359 if (PyList_Append(flag_items, item) < 0) {
1360 Py_DECREF(item);
1361 goto done;
1362 }
1363 Py_DECREF(item);
1364 }
1365
1366 if (PyList_Size(flag_items) > 0) {
1367 PyObject *flags_result;
1368 PyObject *sep = PyUnicode_FromString("|");
1369 if (!sep)
1370 goto done;
1371 flags_result = PyUnicode_Join(sep, flag_items);
1372 Py_DECREF(sep);
1373 if (!flags_result)
1374 goto done;
1375 result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
1376 obj->pattern, flags_result);
1377 Py_DECREF(flags_result);
1378 }
1379 else {
1380 result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
1381 }
1382
1383 done:
1384 Py_DECREF(flag_items);
1385 return result;
1386 }
1387
/* Docstring text for the compiled pattern object, stored in pattern_doc. */
PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
1389
1390 /* PatternObject's 'groupindex' method. */
1391 static PyObject *
pattern_groupindex(PatternObject * self,void * Py_UNUSED (ignored))1392 pattern_groupindex(PatternObject *self, void *Py_UNUSED(ignored))
1393 {
1394 if (self->groupindex == NULL)
1395 return PyDict_New();
1396 return PyDictProxy_New(self->groupindex);
1397 }
1398
1399 static int _validate(PatternObject *self); /* Forward */
1400
1401 /*[clinic input]
1402 _sre.compile
1403
1404 pattern: object
1405 flags: int
1406 code: object(subclass_of='&PyList_Type')
1407 groups: Py_ssize_t
1408 groupindex: object(subclass_of='&PyDict_Type')
1409 indexgroup: object(subclass_of='&PyTuple_Type')
1410
1411 [clinic start generated code]*/
1412
1413 static PyObject *
_sre_compile_impl(PyObject * module,PyObject * pattern,int flags,PyObject * code,Py_ssize_t groups,PyObject * groupindex,PyObject * indexgroup)1414 _sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
1415 PyObject *code, Py_ssize_t groups, PyObject *groupindex,
1416 PyObject *indexgroup)
1417 /*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
1418 {
1419 /* "compile" pattern descriptor to pattern object */
1420
1421 _sremodulestate *module_state = get_sre_module_state(module);
1422 PatternObject* self;
1423 Py_ssize_t i, n;
1424
1425 n = PyList_GET_SIZE(code);
1426 /* coverity[ampersand_in_size] */
1427 self = PyObject_GC_NewVar(PatternObject, module_state->Pattern_Type, n);
1428 if (!self)
1429 return NULL;
1430 self->weakreflist = NULL;
1431 self->pattern = NULL;
1432 self->groupindex = NULL;
1433 self->indexgroup = NULL;
1434
1435 self->codesize = n;
1436
1437 for (i = 0; i < n; i++) {
1438 PyObject *o = PyList_GET_ITEM(code, i);
1439 unsigned long value = PyLong_AsUnsignedLong(o);
1440 self->code[i] = (SRE_CODE) value;
1441 if ((unsigned long) self->code[i] != value) {
1442 PyErr_SetString(PyExc_OverflowError,
1443 "regular expression code size limit exceeded");
1444 break;
1445 }
1446 }
1447 PyObject_GC_Track(self);
1448
1449 if (PyErr_Occurred()) {
1450 Py_DECREF(self);
1451 return NULL;
1452 }
1453
1454 if (pattern == Py_None) {
1455 self->isbytes = -1;
1456 }
1457 else {
1458 Py_ssize_t p_length;
1459 int charsize;
1460 Py_buffer view;
1461 view.buf = NULL;
1462 if (!getstring(pattern, &p_length, &self->isbytes,
1463 &charsize, &view)) {
1464 Py_DECREF(self);
1465 return NULL;
1466 }
1467 if (view.buf)
1468 PyBuffer_Release(&view);
1469 }
1470
1471 Py_INCREF(pattern);
1472 self->pattern = pattern;
1473
1474 self->flags = flags;
1475
1476 self->groups = groups;
1477
1478 if (PyDict_GET_SIZE(groupindex) > 0) {
1479 Py_INCREF(groupindex);
1480 self->groupindex = groupindex;
1481 if (PyTuple_GET_SIZE(indexgroup) > 0) {
1482 Py_INCREF(indexgroup);
1483 self->indexgroup = indexgroup;
1484 }
1485 }
1486
1487 if (!_validate(self)) {
1488 Py_DECREF(self);
1489 return NULL;
1490 }
1491
1492 return (PyObject*) self;
1493 }
1494
1495 /* -------------------------------------------------------------------- */
1496 /* Code validation */
1497
/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type.
*/
1519
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete, parenthesized
   printf argument list, e.g. VTRACE(("x=%d\n", x)); it expands to nothing
   useful unless VVERBOSE is defined. */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: makes the *enclosing function* return -1 */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return -1; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros are not self-contained: they read and advance the local
   pointer 'code', compare it against the local 'end', and store into the
   locals 'op', 'arg' and 'skip' respectively.  Each one FAILs (returning
   from the enclosing function) when 'code' has already reached 'end'. */

/* op = *code++ */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
/* arg = *code++ */
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* skip = *code++, additionally FAILing when skip-adj is larger than the
   number of code words between the skip word itself and 'end' (the
   distance is measured before 'code' is advanced) */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-adj > (uintptr_t)(end - code))         \
            FAIL;                                       \
        code++;                                         \
    } while (0)
/* The common case: no adjustment */
#define GET_SKIP GET_SKIP_ADJ(0)
1560
1561 static int
_validate_charset(SRE_CODE * code,SRE_CODE * end)1562 _validate_charset(SRE_CODE *code, SRE_CODE *end)
1563 {
1564 /* Some variables are manipulated by the macros above */
1565 SRE_CODE op;
1566 SRE_CODE arg;
1567 SRE_CODE offset;
1568 int i;
1569
1570 while (code < end) {
1571 GET_OP;
1572 switch (op) {
1573
1574 case SRE_OP_NEGATE:
1575 break;
1576
1577 case SRE_OP_LITERAL:
1578 GET_ARG;
1579 break;
1580
1581 case SRE_OP_RANGE:
1582 case SRE_OP_RANGE_UNI_IGNORE:
1583 GET_ARG;
1584 GET_ARG;
1585 break;
1586
1587 case SRE_OP_CHARSET:
1588 offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
1589 if (offset > (uintptr_t)(end - code))
1590 FAIL;
1591 code += offset;
1592 break;
1593
1594 case SRE_OP_BIGCHARSET:
1595 GET_ARG; /* Number of blocks */
1596 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
1597 if (offset > (uintptr_t)(end - code))
1598 FAIL;
1599 /* Make sure that each byte points to a valid block */
1600 for (i = 0; i < 256; i++) {
1601 if (((unsigned char *)code)[i] >= arg)
1602 FAIL;
1603 }
1604 code += offset;
1605 offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
1606 if (offset > (uintptr_t)(end - code))
1607 FAIL;
1608 code += offset;
1609 break;
1610
1611 case SRE_OP_CATEGORY:
1612 GET_ARG;
1613 switch (arg) {
1614 case SRE_CATEGORY_DIGIT:
1615 case SRE_CATEGORY_NOT_DIGIT:
1616 case SRE_CATEGORY_SPACE:
1617 case SRE_CATEGORY_NOT_SPACE:
1618 case SRE_CATEGORY_WORD:
1619 case SRE_CATEGORY_NOT_WORD:
1620 case SRE_CATEGORY_LINEBREAK:
1621 case SRE_CATEGORY_NOT_LINEBREAK:
1622 case SRE_CATEGORY_LOC_WORD:
1623 case SRE_CATEGORY_LOC_NOT_WORD:
1624 case SRE_CATEGORY_UNI_DIGIT:
1625 case SRE_CATEGORY_UNI_NOT_DIGIT:
1626 case SRE_CATEGORY_UNI_SPACE:
1627 case SRE_CATEGORY_UNI_NOT_SPACE:
1628 case SRE_CATEGORY_UNI_WORD:
1629 case SRE_CATEGORY_UNI_NOT_WORD:
1630 case SRE_CATEGORY_UNI_LINEBREAK:
1631 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1632 break;
1633 default:
1634 FAIL;
1635 }
1636 break;
1637
1638 default:
1639 FAIL;
1640
1641 }
1642 }
1643
1644 return 0;
1645 }
1646
/* Validate the opcode sequence in the half-open range [code, end).

   code, end: bounds of the sequence to check; the bodies of nested
              constructs are checked by recursive calls on sub-ranges.
   groups:    group count, used to bound the arguments of MARK and of the
              GROUPREF family.

   Returns 0 on success, -1 on failure, and 1 if the last op is JUMP. */
static int
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
{
    /* Some variables are manipulated by the macros above */
    SRE_CODE op;
    SRE_CODE arg;
    SRE_CODE skip;

    VTRACE(("code=%p, end=%p\n", code, end));

    if (code > end)
        FAIL;

    while (code < end) {
        GET_OP;
        switch (op) {

        case SRE_OP_MARK:
            /* We don't check whether marks are properly nested; the
               sre_match() code is robust even if they aren't, and the worst
               you can get is nonsensical match results. */
            GET_ARG;
            if (arg > 2 * (size_t)groups + 1) {
                VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
                FAIL;
            }
            break;

        case SRE_OP_LITERAL:
        case SRE_OP_NOT_LITERAL:
        case SRE_OP_LITERAL_IGNORE:
        case SRE_OP_NOT_LITERAL_IGNORE:
        case SRE_OP_LITERAL_UNI_IGNORE:
        case SRE_OP_NOT_LITERAL_UNI_IGNORE:
        case SRE_OP_LITERAL_LOC_IGNORE:
        case SRE_OP_NOT_LITERAL_LOC_IGNORE:
            GET_ARG;
            /* The arg is just a character, nothing to check */
            break;

        case SRE_OP_SUCCESS:
        case SRE_OP_FAILURE:
            /* Nothing to check; these normally end the matching process */
            break;

        case SRE_OP_AT:
            /* the single operand must be one of the known position codes */
            GET_ARG;
            switch (arg) {
            case SRE_AT_BEGINNING:
            case SRE_AT_BEGINNING_STRING:
            case SRE_AT_BEGINNING_LINE:
            case SRE_AT_END:
            case SRE_AT_END_LINE:
            case SRE_AT_END_STRING:
            case SRE_AT_BOUNDARY:
            case SRE_AT_NON_BOUNDARY:
            case SRE_AT_LOC_BOUNDARY:
            case SRE_AT_LOC_NON_BOUNDARY:
            case SRE_AT_UNI_BOUNDARY:
            case SRE_AT_UNI_NON_BOUNDARY:
                break;
            default:
                FAIL;
            }
            break;

        case SRE_OP_ANY:
        case SRE_OP_ANY_ALL:
            /* These have no operands */
            break;

        case SRE_OP_IN:
        case SRE_OP_IN_IGNORE:
        case SRE_OP_IN_UNI_IGNORE:
        case SRE_OP_IN_LOC_IGNORE:
            GET_SKIP;
            /* Stop 1 before the end; we check the FAILURE below */
            if (_validate_charset(code, code+skip-2))
                FAIL;
            /* NOTE(review): GET_SKIP only bounds skip from above; a skip
               smaller than 2 does not appear to be rejected before
               code[skip-2] is read -- confirm. */
            if (code[skip-2] != SRE_OP_FAILURE)
                FAIL;
            code += skip-1;
            break;

        case SRE_OP_INFO:
            {
                /* A minimal info field is
                   <INFO> <1=skip> <2=flags> <3=min> <4=max>;
                   If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
                   more follows. */
                SRE_CODE flags, i;
                SRE_CODE *newcode;
                GET_SKIP;
                /* newcode: where the info block must end */
                newcode = code+skip-1;
                GET_ARG; flags = arg;
                /* the next two operands (min, max) are read, not checked */
                GET_ARG;
                GET_ARG;
                /* Check that only valid flags are present */
                if ((flags & ~(SRE_INFO_PREFIX |
                               SRE_INFO_LITERAL |
                               SRE_INFO_CHARSET)) != 0)
                    FAIL;
                /* PREFIX and CHARSET are mutually exclusive */
                if ((flags & SRE_INFO_PREFIX) &&
                    (flags & SRE_INFO_CHARSET))
                    FAIL;
                /* LITERAL implies PREFIX */
                if ((flags & SRE_INFO_LITERAL) &&
                    !(flags & SRE_INFO_PREFIX))
                    FAIL;
                /* Validate the prefix */
                if (flags & SRE_INFO_PREFIX) {
                    SRE_CODE prefix_len;
                    GET_ARG; prefix_len = arg;
                    /* one more operand is read here, not checked */
                    GET_ARG;
                    /* Here comes the prefix string */
                    if (prefix_len > (uintptr_t)(newcode - code))
                        FAIL;
                    code += prefix_len;
                    /* And here comes the overlap table */
                    if (prefix_len > (uintptr_t)(newcode - code))
                        FAIL;
                    /* Each overlap value should be < prefix_len */
                    for (i = 0; i < prefix_len; i++) {
                        if (code[i] >= prefix_len)
                            FAIL;
                    }
                    code += prefix_len;
                }
                /* Validate the charset */
                if (flags & SRE_INFO_CHARSET) {
                    if (_validate_charset(code, newcode-1))
                        FAIL;
                    if (newcode[-1] != SRE_OP_FAILURE)
                        FAIL;
                    code = newcode;
                }
                else if (code != newcode) {
                    /* without a charset, the block must end exactly here */
                  VTRACE(("code=%p, newcode=%p\n", code, newcode));
                    FAIL;
                }
            }
            break;

        case SRE_OP_BRANCH:
            {
                /* common jump target of all alternatives, set by the
                   first one */
                SRE_CODE *target = NULL;
                for (;;) {
                    GET_SKIP;
                    /* a zero skip terminates the list of alternatives */
                    if (skip == 0)
                        break;
                    /* Stop 2 before the end; we check the JUMP below */
                    if (_validate_inner(code, code+skip-3, groups))
                        FAIL;
                    code += skip-3;
                    /* Check that it ends with a JUMP, and that each JUMP
                       has the same target */
                    GET_OP;
                    if (op != SRE_OP_JUMP)
                        FAIL;
                    GET_SKIP;
                    if (target == NULL)
                        target = code+skip-1;
                    else if (code+skip-1 != target)
                        FAIL;
                }
                /* the jumps must land right after the terminating skip */
                if (code != target)
                    FAIL;
            }
            break;

        case SRE_OP_REPEAT_ONE:
        case SRE_OP_MIN_REPEAT_ONE:
        case SRE_OP_POSSESSIVE_REPEAT_ONE:
            {
                /* <op> <skip> <min> <max> ...body... SUCCESS */
                SRE_CODE min, max;
                GET_SKIP;
                GET_ARG; min = arg;
                GET_ARG; max = arg;
                if (min > max)
                    FAIL;
                if (max > SRE_MAXREPEAT)
                    FAIL;
                if (_validate_inner(code, code+skip-4, groups))
                    FAIL;
                code += skip-4;
                GET_OP;
                if (op != SRE_OP_SUCCESS)
                    FAIL;
            }
            break;

        case SRE_OP_REPEAT:
        case SRE_OP_POSSESSIVE_REPEAT:
            {
                /* op1 remembers the opening opcode; 'op' is overwritten by
                   the GET_OP that reads the closing opcode */
                SRE_CODE op1 = op, min, max;
                GET_SKIP;
                GET_ARG; min = arg;
                GET_ARG; max = arg;
                if (min > max)
                    FAIL;
                if (max > SRE_MAXREPEAT)
                    FAIL;
                if (_validate_inner(code, code+skip-3, groups))
                    FAIL;
                code += skip-3;
                GET_OP;
                /* a possessive repeat is closed by SUCCESS, a plain one by
                   MAX_UNTIL or MIN_UNTIL */
                if (op1 == SRE_OP_POSSESSIVE_REPEAT) {
                    if (op != SRE_OP_SUCCESS)
                        FAIL;
                }
                else {
                    if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
                        FAIL;
                }
            }
            break;

        case SRE_OP_ATOMIC_GROUP:
            {
                /* <op> <skip> ...body... SUCCESS */
                GET_SKIP;
                if (_validate_inner(code, code+skip-2, groups))
                    FAIL;
                code += skip-2;
                GET_OP;
                if (op != SRE_OP_SUCCESS)
                    FAIL;
            }
            break;

        case SRE_OP_GROUPREF:
        case SRE_OP_GROUPREF_IGNORE:
        case SRE_OP_GROUPREF_UNI_IGNORE:
        case SRE_OP_GROUPREF_LOC_IGNORE:
            /* the referenced group number must be below 'groups' */
            GET_ARG;
            if (arg >= (size_t)groups)
                FAIL;
            break;

        case SRE_OP_GROUPREF_EXISTS:
            /* The regex syntax for this is: '(?(group)then|else)', where
               'group' is either an integer group number or a group name,
               'then' and 'else' are sub-regexes, and 'else' is optional. */
            GET_ARG;
            if (arg >= (size_t)groups)
                FAIL;
            GET_SKIP_ADJ(1);
            code--; /* The skip is relative to the first arg! */
            /* There are two possibilities here: if there is both a 'then'
               part and an 'else' part, the generated code looks like:

               GROUPREF_EXISTS
               <group>
               <skipyes>
               ...then part...
               JUMP
               <skipno>
               (<skipyes> jumps here)
               ...else part...
               (<skipno> jumps here)

               If there is only a 'then' part, it looks like:

               GROUPREF_EXISTS
               <group>
               <skip>
               ...then part...
               (<skip> jumps here)

               There is no direct way to decide which it is, and we don't want
               to allow arbitrary jumps anywhere in the code; so we just look
               for a JUMP opcode preceding our skip target.
            */
            VTRACE(("then part:\n"));
            /* rc == 1 means the 'then' part ended in a JUMP, i.e. an
               'else' part follows */
            int rc = _validate_inner(code+1, code+skip-1, groups);
            if (rc == 1) {
                VTRACE(("else part:\n"));
                code += skip-2; /* Position after JUMP, at <skipno> */
                GET_SKIP;
                rc = _validate_inner(code, code+skip-1, groups);
            }
            if (rc)
                FAIL;
            code += skip-1;
            break;

        case SRE_OP_ASSERT:
        case SRE_OP_ASSERT_NOT:
            GET_SKIP;
            GET_ARG; /* 0 for lookahead, width for lookbehind */
            code--; /* Back up over arg to simplify math below */
            if (arg & 0x80000000)
                FAIL; /* Width too large */
            /* Stop 1 before the end; we check the SUCCESS below */
            if (_validate_inner(code+1, code+skip-2, groups))
                FAIL;
            code += skip-2;
            GET_OP;
            if (op != SRE_OP_SUCCESS)
                FAIL;
            break;

        case SRE_OP_JUMP:
            /* a JUMP is accepted only when exactly one code word (its
               operand) remains in the range; callers see the result 1 */
            if (code + 1 != end)
                FAIL;
            VTRACE(("JUMP: %d\n", __LINE__));
            return 1;

        default:
            FAIL;

        }
    }

    VTRACE(("okay\n"));
    return 0;
}
1965
1966 static int
_validate_outer(SRE_CODE * code,SRE_CODE * end,Py_ssize_t groups)1967 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1968 {
1969 if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
1970 code >= end || end[-1] != SRE_OP_SUCCESS)
1971 FAIL;
1972 return _validate_inner(code, end-1, groups);
1973 }
1974
1975 static int
_validate(PatternObject * self)1976 _validate(PatternObject *self)
1977 {
1978 if (_validate_outer(self->code, self->code+self->codesize, self->groups))
1979 {
1980 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
1981 return 0;
1982 }
1983 else
1984 VTRACE(("Success!\n"));
1985 return 1;
1986 }
1987
1988 /* -------------------------------------------------------------------- */
1989 /* match methods */
1990
/* Traversal function for match objects: reports the object's type and the
   string, regs and pattern members to 'visit'.  Py_VISIT returns early with
   the callback's result when that result is nonzero; otherwise 0 is
   returned. */
static int
match_traverse(MatchObject *self, visitproc visit, void *arg)
{
    Py_VISIT(Py_TYPE(self));
    Py_VISIT(self->string);
    Py_VISIT(self->regs);
    Py_VISIT(self->pattern);
    return 0;
}
2000
/* Drop the references held in the string, regs and pattern members,
   leaving each member NULL (Py_CLEAR).  Always returns 0. */
static int
match_clear(MatchObject *self)
{
    Py_CLEAR(self->string);
    Py_CLEAR(self->regs);
    Py_CLEAR(self->pattern);
    return 0;
}
2009
/* Deallocator for match objects: stop GC tracking, release the member
   references, free the object's memory, and finally drop a reference to
   the object's type. */
static void
match_dealloc(MatchObject* self)
{
    /* fetch the type first: self must not be touched after tp_free() */
    PyTypeObject *tp = Py_TYPE(self);

    PyObject_GC_UnTrack(self);
    (void)match_clear(self);
    tp->tp_free(self);
    /* presumably the instance owns a reference to its (heap) type --
       TODO confirm against the type's creation code */
    Py_DECREF(tp);
}
2020
2021 static PyObject*
match_getslice_by_index(MatchObject * self,Py_ssize_t index,PyObject * def)2022 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
2023 {
2024 Py_ssize_t length;
2025 int isbytes, charsize;
2026 Py_buffer view;
2027 PyObject *result;
2028 const void* ptr;
2029 Py_ssize_t i, j;
2030
2031 assert(0 <= index && index < self->groups);
2032 index *= 2;
2033
2034 if (self->string == Py_None || self->mark[index] < 0) {
2035 /* return default value if the string or group is undefined */
2036 Py_INCREF(def);
2037 return def;
2038 }
2039
2040 ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
2041 if (ptr == NULL)
2042 return NULL;
2043
2044 i = self->mark[index];
2045 j = self->mark[index+1];
2046 i = Py_MIN(i, length);
2047 j = Py_MIN(j, length);
2048 result = getslice(isbytes, ptr, self->string, i, j);
2049 if (isbytes && view.buf != NULL)
2050 PyBuffer_Release(&view);
2051 return result;
2052 }
2053
2054 static Py_ssize_t
match_getindex(MatchObject * self,PyObject * index)2055 match_getindex(MatchObject* self, PyObject* index)
2056 {
2057 Py_ssize_t i;
2058
2059 if (index == NULL)
2060 /* Default value */
2061 return 0;
2062
2063 if (PyIndex_Check(index)) {
2064 i = PyNumber_AsSsize_t(index, NULL);
2065 }
2066 else {
2067 i = -1;
2068
2069 if (self->pattern->groupindex) {
2070 index = PyDict_GetItemWithError(self->pattern->groupindex, index);
2071 if (index && PyLong_Check(index)) {
2072 i = PyLong_AsSsize_t(index);
2073 }
2074 }
2075 }
2076 if (i < 0 || i >= self->groups) {
2077 /* raise IndexError if we were given a bad group number */
2078 if (!PyErr_Occurred()) {
2079 PyErr_SetString(PyExc_IndexError, "no such group");
2080 }
2081 return -1;
2082 }
2083
2084 return i;
2085 }
2086
2087 static PyObject*
match_getslice(MatchObject * self,PyObject * index,PyObject * def)2088 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2089 {
2090 Py_ssize_t i = match_getindex(self, index);
2091
2092 if (i < 0) {
2093 return NULL;
2094 }
2095
2096 return match_getslice_by_index(self, i, def);
2097 }
2098
2099 /*[clinic input]
2100 _sre.SRE_Match.expand
2101
2102 template: object
2103
2104 Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
2105 [clinic start generated code]*/
2106
2107 static PyObject *
_sre_SRE_Match_expand_impl(MatchObject * self,PyObject * template)2108 _sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
2109 /*[clinic end generated code: output=931b58ccc323c3a1 input=4bfdb22c2f8b146a]*/
2110 {
2111 /* delegate to Python code */
2112 return call(
2113 SRE_PY_MODULE, "_expand",
2114 PyTuple_Pack(3, self->pattern, self, template)
2115 );
2116 }
2117
2118 static PyObject*
match_group(MatchObject * self,PyObject * args)2119 match_group(MatchObject* self, PyObject* args)
2120 {
2121 PyObject* result;
2122 Py_ssize_t i, size;
2123
2124 size = PyTuple_GET_SIZE(args);
2125
2126 switch (size) {
2127 case 0:
2128 result = match_getslice(self, _PyLong_GetZero(), Py_None);
2129 break;
2130 case 1:
2131 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2132 break;
2133 default:
2134 /* fetch multiple items */
2135 result = PyTuple_New(size);
2136 if (!result)
2137 return NULL;
2138 for (i = 0; i < size; i++) {
2139 PyObject* item = match_getslice(
2140 self, PyTuple_GET_ITEM(args, i), Py_None
2141 );
2142 if (!item) {
2143 Py_DECREF(result);
2144 return NULL;
2145 }
2146 PyTuple_SET_ITEM(result, i, item);
2147 }
2148 break;
2149 }
2150 return result;
2151 }
2152
2153 static PyObject*
match_getitem(MatchObject * self,PyObject * name)2154 match_getitem(MatchObject* self, PyObject* name)
2155 {
2156 return match_getslice(self, name, Py_None);
2157 }
2158
2159 /*[clinic input]
2160 _sre.SRE_Match.groups
2161
2162 default: object = None
2163 Is used for groups that did not participate in the match.
2164
2165 Return a tuple containing all the subgroups of the match, from 1.
2166 [clinic start generated code]*/
2167
2168 static PyObject *
_sre_SRE_Match_groups_impl(MatchObject * self,PyObject * default_value)2169 _sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
2170 /*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
2171 {
2172 PyObject* result;
2173 Py_ssize_t index;
2174
2175 result = PyTuple_New(self->groups-1);
2176 if (!result)
2177 return NULL;
2178
2179 for (index = 1; index < self->groups; index++) {
2180 PyObject* item;
2181 item = match_getslice_by_index(self, index, default_value);
2182 if (!item) {
2183 Py_DECREF(result);
2184 return NULL;
2185 }
2186 PyTuple_SET_ITEM(result, index-1, item);
2187 }
2188
2189 return result;
2190 }
2191
2192 /*[clinic input]
2193 _sre.SRE_Match.groupdict
2194
2195 default: object = None
2196 Is used for groups that did not participate in the match.
2197
2198 Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
2199 [clinic start generated code]*/
2200
2201 static PyObject *
_sre_SRE_Match_groupdict_impl(MatchObject * self,PyObject * default_value)2202 _sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
2203 /*[clinic end generated code: output=29917c9073e41757 input=0ded7960b23780aa]*/
2204 {
2205 PyObject *result;
2206 PyObject *key;
2207 PyObject *value;
2208 Py_ssize_t pos = 0;
2209 Py_hash_t hash;
2210
2211 result = PyDict_New();
2212 if (!result || !self->pattern->groupindex)
2213 return result;
2214
2215 while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
2216 int status;
2217 Py_INCREF(key);
2218 value = match_getslice(self, key, default_value);
2219 if (!value) {
2220 Py_DECREF(key);
2221 goto failed;
2222 }
2223 status = _PyDict_SetItem_KnownHash(result, key, value, hash);
2224 Py_DECREF(value);
2225 Py_DECREF(key);
2226 if (status < 0)
2227 goto failed;
2228 }
2229
2230 return result;
2231
2232 failed:
2233 Py_DECREF(result);
2234 return NULL;
2235 }
2236
2237 /*[clinic input]
2238 _sre.SRE_Match.start -> Py_ssize_t
2239
2240 group: object(c_default="NULL") = 0
2241 /
2242
2243 Return index of the start of the substring matched by group.
2244 [clinic start generated code]*/
2245
2246 static Py_ssize_t
_sre_SRE_Match_start_impl(MatchObject * self,PyObject * group)2247 _sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
2248 /*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
2249 {
2250 Py_ssize_t index = match_getindex(self, group);
2251
2252 if (index < 0) {
2253 return -1;
2254 }
2255
2256 /* mark is -1 if group is undefined */
2257 return self->mark[index*2];
2258 }
2259
2260 /*[clinic input]
2261 _sre.SRE_Match.end -> Py_ssize_t
2262
2263 group: object(c_default="NULL") = 0
2264 /
2265
2266 Return index of the end of the substring matched by group.
2267 [clinic start generated code]*/
2268
2269 static Py_ssize_t
_sre_SRE_Match_end_impl(MatchObject * self,PyObject * group)2270 _sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
2271 /*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
2272 {
2273 Py_ssize_t index = match_getindex(self, group);
2274
2275 if (index < 0) {
2276 return -1;
2277 }
2278
2279 /* mark is -1 if group is undefined */
2280 return self->mark[index*2+1];
2281 }
2282
2283 LOCAL(PyObject*)
_pair(Py_ssize_t i1,Py_ssize_t i2)2284 _pair(Py_ssize_t i1, Py_ssize_t i2)
2285 {
2286 PyObject* pair;
2287 PyObject* item;
2288
2289 pair = PyTuple_New(2);
2290 if (!pair)
2291 return NULL;
2292
2293 item = PyLong_FromSsize_t(i1);
2294 if (!item)
2295 goto error;
2296 PyTuple_SET_ITEM(pair, 0, item);
2297
2298 item = PyLong_FromSsize_t(i2);
2299 if (!item)
2300 goto error;
2301 PyTuple_SET_ITEM(pair, 1, item);
2302
2303 return pair;
2304
2305 error:
2306 Py_DECREF(pair);
2307 return NULL;
2308 }
2309
2310 /*[clinic input]
2311 _sre.SRE_Match.span
2312
2313 group: object(c_default="NULL") = 0
2314 /
2315
2316 For match object m, return the 2-tuple (m.start(group), m.end(group)).
2317 [clinic start generated code]*/
2318
2319 static PyObject *
_sre_SRE_Match_span_impl(MatchObject * self,PyObject * group)2320 _sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
2321 /*[clinic end generated code: output=f02ae40594d14fe6 input=8fa6014e982d71d4]*/
2322 {
2323 Py_ssize_t index = match_getindex(self, group);
2324
2325 if (index < 0) {
2326 return NULL;
2327 }
2328
2329 /* marks are -1 if group is undefined */
2330 return _pair(self->mark[index*2], self->mark[index*2+1]);
2331 }
2332
2333 static PyObject*
match_regs(MatchObject * self)2334 match_regs(MatchObject* self)
2335 {
2336 PyObject* regs;
2337 PyObject* item;
2338 Py_ssize_t index;
2339
2340 regs = PyTuple_New(self->groups);
2341 if (!regs)
2342 return NULL;
2343
2344 for (index = 0; index < self->groups; index++) {
2345 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2346 if (!item) {
2347 Py_DECREF(regs);
2348 return NULL;
2349 }
2350 PyTuple_SET_ITEM(regs, index, item);
2351 }
2352
2353 Py_INCREF(regs);
2354 self->regs = regs;
2355
2356 return regs;
2357 }
2358
2359 /*[clinic input]
2360 _sre.SRE_Match.__copy__
2361
2362 [clinic start generated code]*/
2363
2364 static PyObject *
_sre_SRE_Match___copy___impl(MatchObject * self)2365 _sre_SRE_Match___copy___impl(MatchObject *self)
2366 /*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
2367 {
2368 Py_INCREF(self);
2369 return (PyObject *)self;
2370 }
2371
2372 /*[clinic input]
2373 _sre.SRE_Match.__deepcopy__
2374
2375 memo: object
2376 /
2377
2378 [clinic start generated code]*/
2379
2380 static PyObject *
_sre_SRE_Match___deepcopy__(MatchObject * self,PyObject * memo)2381 _sre_SRE_Match___deepcopy__(MatchObject *self, PyObject *memo)
2382 /*[clinic end generated code: output=ba7cb46d655e4ee2 input=779d12a31c2c325e]*/
2383 {
2384 Py_INCREF(self);
2385 return (PyObject *)self;
2386 }
2387
/* Docstring of the match type; referenced by the Py_tp_doc entry of
   match_slots.  The literal is continued with backslash-newline, so any
   leading whitespace on the continuation line would become part of the
   string. */
PyDoc_STRVAR(match_doc,
"The result of re.match() and re.search().\n\
Match objects always have a boolean value of True.");
2391
/* Docstring attached to the "group" entry of match_methods.  The literal
   is continued with backslash-newline, so any leading whitespace on the
   continuation lines would become part of the string.
   NOTE(review): confirm the continuation-line indentation against the
   checked-in file before applying. */
PyDoc_STRVAR(match_group_doc,
"group([group1, ...]) -> str or tuple.\n\
Return subgroup(s) of the match by indices or names.\n\
For 0 returns the entire match.");
2396
2397 static PyObject *
match_lastindex_get(MatchObject * self,void * Py_UNUSED (ignored))2398 match_lastindex_get(MatchObject *self, void *Py_UNUSED(ignored))
2399 {
2400 if (self->lastindex >= 0)
2401 return PyLong_FromSsize_t(self->lastindex);
2402 Py_RETURN_NONE;
2403 }
2404
2405 static PyObject *
match_lastgroup_get(MatchObject * self,void * Py_UNUSED (ignored))2406 match_lastgroup_get(MatchObject *self, void *Py_UNUSED(ignored))
2407 {
2408 if (self->pattern->indexgroup &&
2409 self->lastindex >= 0 &&
2410 self->lastindex < PyTuple_GET_SIZE(self->pattern->indexgroup))
2411 {
2412 PyObject *result = PyTuple_GET_ITEM(self->pattern->indexgroup,
2413 self->lastindex);
2414 Py_INCREF(result);
2415 return result;
2416 }
2417 Py_RETURN_NONE;
2418 }
2419
2420 static PyObject *
match_regs_get(MatchObject * self,void * Py_UNUSED (ignored))2421 match_regs_get(MatchObject *self, void *Py_UNUSED(ignored))
2422 {
2423 if (self->regs) {
2424 Py_INCREF(self->regs);
2425 return self->regs;
2426 } else
2427 return match_regs(self);
2428 }
2429
2430 static PyObject *
match_repr(MatchObject * self)2431 match_repr(MatchObject *self)
2432 {
2433 PyObject *result;
2434 PyObject *group0 = match_getslice_by_index(self, 0, Py_None);
2435 if (group0 == NULL)
2436 return NULL;
2437 result = PyUnicode_FromFormat(
2438 "<%s object; span=(%zd, %zd), match=%.50R>",
2439 Py_TYPE(self)->tp_name,
2440 self->mark[0], self->mark[1], group0);
2441 Py_DECREF(group0);
2442 return result;
2443 }
2444
2445
2446 static PyObject*
pattern_new_match(_sremodulestate * module_state,PatternObject * pattern,SRE_STATE * state,Py_ssize_t status)2447 pattern_new_match(_sremodulestate* module_state,
2448 PatternObject* pattern,
2449 SRE_STATE* state,
2450 Py_ssize_t status)
2451 {
2452 /* create match object (from state object) */
2453
2454 MatchObject* match;
2455 Py_ssize_t i, j;
2456 char* base;
2457 int n;
2458
2459 if (status > 0) {
2460
2461 /* create match object (with room for extra group marks) */
2462 /* coverity[ampersand_in_size] */
2463 match = PyObject_GC_NewVar(MatchObject,
2464 module_state->Match_Type,
2465 2*(pattern->groups+1));
2466 if (!match)
2467 return NULL;
2468
2469 Py_INCREF(pattern);
2470 match->pattern = pattern;
2471
2472 Py_INCREF(state->string);
2473 match->string = state->string;
2474
2475 match->regs = NULL;
2476 match->groups = pattern->groups+1;
2477
2478 /* fill in group slices */
2479
2480 base = (char*) state->beginning;
2481 n = state->charsize;
2482
2483 match->mark[0] = ((char*) state->start - base) / n;
2484 match->mark[1] = ((char*) state->ptr - base) / n;
2485
2486 for (i = j = 0; i < pattern->groups; i++, j+=2)
2487 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2488 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2489 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2490
2491 /* check wrong span */
2492 if (match->mark[j+2] > match->mark[j+3]) {
2493 PyErr_SetString(PyExc_SystemError,
2494 "The span of capturing group is wrong,"
2495 " please report a bug for the re module.");
2496 Py_DECREF(match);
2497 return NULL;
2498 }
2499 } else
2500 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2501
2502 match->pos = state->pos;
2503 match->endpos = state->endpos;
2504
2505 match->lastindex = state->lastindex;
2506
2507 PyObject_GC_Track(match);
2508 return (PyObject*) match;
2509
2510 } else if (status == 0) {
2511
2512 /* no match */
2513 Py_RETURN_NONE;
2514
2515 }
2516
2517 /* internal error */
2518 pattern_error(status);
2519 return NULL;
2520 }
2521
2522
2523 /* -------------------------------------------------------------------- */
2524 /* scanner methods (experimental) */
2525
static int
scanner_traverse(ScannerObject *self, visitproc visit, void *arg)
{
    /* GC traverse callback (Py_tp_traverse in scanner_slots): reports
       the scanner's type and the pattern object it references. */
    Py_VISIT(Py_TYPE(self));
    Py_VISIT(self->pattern);
    return 0;
}
2533
static int
scanner_clear(ScannerObject *self)
{
    /* GC clear callback (Py_tp_clear in scanner_slots), also called from
       scanner_dealloc(): drops the reference held in self->pattern. */
    Py_CLEAR(self->pattern);
    return 0;
}
2540
static void
scanner_dealloc(ScannerObject* self)
{
    /* Destructor (Py_tp_dealloc in scanner_slots). */

    /* fetched up front because it is still needed after self is freed */
    PyTypeObject *tp = Py_TYPE(self);

    PyObject_GC_UnTrack(self);
    state_fini(&self->state);
    (void)scanner_clear(self);  /* drops the reference to self->pattern */
    tp->tp_free(self);
    /* NOTE(review): presumably balances the type reference owned by an
       instance of a heap type -- confirm against the C API docs */
    Py_DECREF(tp);
}
2552
2553 static int
scanner_begin(ScannerObject * self)2554 scanner_begin(ScannerObject* self)
2555 {
2556 if (self->executing) {
2557 PyErr_SetString(PyExc_ValueError,
2558 "regular expression scanner already executing");
2559 return 0;
2560 }
2561 self->executing = 1;
2562 return 1;
2563 }
2564
static void
scanner_end(ScannerObject* self)
{
    /* Counterpart of scanner_begin(): clears the busy flag.  The assert
       states that the flag is expected to be set on entry. */
    assert(self->executing);
    self->executing = 0;
}
2571
2572 /*[clinic input]
2573 _sre.SRE_Scanner.match
2574
2575 cls: defining_class
2576 /
2577
2578 [clinic start generated code]*/
2579
2580 static PyObject *
_sre_SRE_Scanner_match_impl(ScannerObject * self,PyTypeObject * cls)2581 _sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls)
2582 /*[clinic end generated code: output=6e22c149dc0f0325 input=b5146e1f30278cb7]*/
2583 {
2584 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2585 SRE_STATE* state = &self->state;
2586 PyObject* match;
2587 Py_ssize_t status;
2588
2589 if (!scanner_begin(self)) {
2590 return NULL;
2591 }
2592 if (state->start == NULL) {
2593 scanner_end(self);
2594 Py_RETURN_NONE;
2595 }
2596
2597 state_reset(state);
2598
2599 state->ptr = state->start;
2600
2601 status = sre_match(state, PatternObject_GetCode(self->pattern));
2602 if (PyErr_Occurred()) {
2603 scanner_end(self);
2604 return NULL;
2605 }
2606
2607 match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2608 state, status);
2609
2610 if (status == 0)
2611 state->start = NULL;
2612 else {
2613 state->must_advance = (state->ptr == state->start);
2614 state->start = state->ptr;
2615 }
2616
2617 scanner_end(self);
2618 return match;
2619 }
2620
2621
2622 /*[clinic input]
2623 _sre.SRE_Scanner.search
2624
2625 cls: defining_class
2626 /
2627
2628 [clinic start generated code]*/
2629
2630 static PyObject *
_sre_SRE_Scanner_search_impl(ScannerObject * self,PyTypeObject * cls)2631 _sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
2632 /*[clinic end generated code: output=23e8fc78013f9161 input=056c2d37171d0bf2]*/
2633 {
2634 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2635 SRE_STATE* state = &self->state;
2636 PyObject* match;
2637 Py_ssize_t status;
2638
2639 if (!scanner_begin(self)) {
2640 return NULL;
2641 }
2642 if (state->start == NULL) {
2643 scanner_end(self);
2644 Py_RETURN_NONE;
2645 }
2646
2647 state_reset(state);
2648
2649 state->ptr = state->start;
2650
2651 status = sre_search(state, PatternObject_GetCode(self->pattern));
2652 if (PyErr_Occurred()) {
2653 scanner_end(self);
2654 return NULL;
2655 }
2656
2657 match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2658 state, status);
2659
2660 if (status == 0)
2661 state->start = NULL;
2662 else {
2663 state->must_advance = (state->ptr == state->start);
2664 state->start = state->ptr;
2665 }
2666
2667 scanner_end(self);
2668 return match;
2669 }
2670
2671 static PyObject *
pattern_scanner(_sremodulestate * module_state,PatternObject * self,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)2672 pattern_scanner(_sremodulestate *module_state,
2673 PatternObject *self,
2674 PyObject *string,
2675 Py_ssize_t pos,
2676 Py_ssize_t endpos)
2677 {
2678 ScannerObject* scanner;
2679
2680 /* create scanner object */
2681 scanner = PyObject_GC_New(ScannerObject, module_state->Scanner_Type);
2682 if (!scanner)
2683 return NULL;
2684 scanner->pattern = NULL;
2685 scanner->executing = 0;
2686
2687 /* create search state object */
2688 if (!state_init(&scanner->state, self, string, pos, endpos)) {
2689 Py_DECREF(scanner);
2690 return NULL;
2691 }
2692
2693 Py_INCREF(self);
2694 scanner->pattern = (PyObject*) self;
2695
2696 PyObject_GC_Track(scanner);
2697 return (PyObject*) scanner;
2698 }
2699
2700 static Py_hash_t
pattern_hash(PatternObject * self)2701 pattern_hash(PatternObject *self)
2702 {
2703 Py_hash_t hash, hash2;
2704
2705 hash = PyObject_Hash(self->pattern);
2706 if (hash == -1) {
2707 return -1;
2708 }
2709
2710 hash2 = _Py_HashBytes(self->code, sizeof(self->code[0]) * self->codesize);
2711 hash ^= hash2;
2712
2713 hash ^= self->flags;
2714 hash ^= self->isbytes;
2715 hash ^= self->codesize;
2716
2717 if (hash == -1) {
2718 hash = -2;
2719 }
2720 return hash;
2721 }
2722
2723 static PyObject*
pattern_richcompare(PyObject * lefto,PyObject * righto,int op)2724 pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
2725 {
2726 PyTypeObject *tp = Py_TYPE(lefto);
2727 _sremodulestate *module_state = get_sre_module_state_by_class(tp);
2728 PatternObject *left, *right;
2729 int cmp;
2730
2731 if (op != Py_EQ && op != Py_NE) {
2732 Py_RETURN_NOTIMPLEMENTED;
2733 }
2734
2735 if (!Py_IS_TYPE(righto, module_state->Pattern_Type))
2736 {
2737 Py_RETURN_NOTIMPLEMENTED;
2738 }
2739
2740 if (lefto == righto) {
2741 /* a pattern is equal to itself */
2742 return PyBool_FromLong(op == Py_EQ);
2743 }
2744
2745 left = (PatternObject *)lefto;
2746 right = (PatternObject *)righto;
2747
2748 cmp = (left->flags == right->flags
2749 && left->isbytes == right->isbytes
2750 && left->codesize == right->codesize);
2751 if (cmp) {
2752 /* Compare the code and the pattern because the same pattern can
2753 produce different codes depending on the locale used to compile the
2754 pattern when the re.LOCALE flag is used. Don't compare groups,
2755 indexgroup nor groupindex: they are derivated from the pattern. */
2756 cmp = (memcmp(left->code, right->code,
2757 sizeof(left->code[0]) * left->codesize) == 0);
2758 }
2759 if (cmp) {
2760 cmp = PyObject_RichCompareBool(left->pattern, right->pattern,
2761 Py_EQ);
2762 if (cmp < 0) {
2763 return NULL;
2764 }
2765 }
2766 if (op == Py_NE) {
2767 cmp = !cmp;
2768 }
2769 return PyBool_FromLong(cmp);
2770 }
2771
2772 #include "clinic/sre.c.h"
2773
/* Method table of the pattern type (Py_tp_methods in pattern_slots).
   No comma is written after the *_METHODDEF macros, so each of them has
   to expand to a complete entry including its trailing comma; they
   presumably come from "clinic/sre.c.h", which is included just before
   this table. */
static PyMethodDef pattern_methods[] = {
    _SRE_SRE_PATTERN_MATCH_METHODDEF
    _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
    _SRE_SRE_PATTERN_SEARCH_METHODDEF
    _SRE_SRE_PATTERN_SUB_METHODDEF
    _SRE_SRE_PATTERN_SUBN_METHODDEF
    _SRE_SRE_PATTERN_FINDALL_METHODDEF
    _SRE_SRE_PATTERN_SPLIT_METHODDEF
    _SRE_SRE_PATTERN_FINDITER_METHODDEF
    _SRE_SRE_PATTERN_SCANNER_METHODDEF
    _SRE_SRE_PATTERN___COPY___METHODDEF
    _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
    /* class method; subscription of the class is delegated to
       Py_GenericAlias */
    {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
     PyDoc_STR("See PEP 585")},
    {NULL, NULL}  /* Sentinel */
};
2790
2791 static PyGetSetDef pattern_getset[] = {
2792 {"groupindex", (getter)pattern_groupindex, (setter)NULL,
2793 "A dictionary mapping group names to group numbers."},
2794 {NULL} /* Sentinel */
2795 };
2796
2797 #define PAT_OFF(x) offsetof(PatternObject, x)
2798 static PyMemberDef pattern_members[] = {
2799 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY,
2800 "The pattern string from which the RE object was compiled."},
2801 {"flags", T_INT, PAT_OFF(flags), READONLY,
2802 "The regex matching flags."},
2803 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY,
2804 "The number of capturing groups in the pattern."},
2805 {"__weaklistoffset__", T_PYSSIZET, offsetof(PatternObject, weakreflist), READONLY},
2806 {NULL} /* Sentinel */
2807 };
2808
/* Slot table of the pattern type; pattern_spec refers to it through its
   .slots member.  The {0, NULL} entry terminates the table. */
static PyType_Slot pattern_slots[] = {
    {Py_tp_dealloc, (destructor)pattern_dealloc},
    {Py_tp_repr, (reprfunc)pattern_repr},
    {Py_tp_hash, (hashfunc)pattern_hash},
    {Py_tp_doc, (void *)pattern_doc},
    {Py_tp_richcompare, pattern_richcompare},
    {Py_tp_methods, pattern_methods},
    {Py_tp_members, pattern_members},
    {Py_tp_getset, pattern_getset},
    /* garbage-collector support */
    {Py_tp_traverse, pattern_traverse},
    {Py_tp_clear, pattern_clear},
    {0, NULL},
};
2822
/* Spec from which sre_exec() creates the "re.Pattern" heap type. */
static PyType_Spec pattern_spec = {
    .name = "re.Pattern",
    .basicsize = sizeof(PatternObject),
    /* size of one item of the variable-length part of an instance */
    .itemsize = sizeof(SRE_CODE),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
    .slots = pattern_slots,
};
2831
/* Method table of the match type (Py_tp_methods in match_slots).
   "group" is registered by hand with METH_VARARGS.  No comma is written
   after the *_METHODDEF macros, so each of them has to expand to a
   complete entry including its trailing comma. */
static PyMethodDef match_methods[] = {
    {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
    _SRE_SRE_MATCH_START_METHODDEF
    _SRE_SRE_MATCH_END_METHODDEF
    _SRE_SRE_MATCH_SPAN_METHODDEF
    _SRE_SRE_MATCH_GROUPS_METHODDEF
    _SRE_SRE_MATCH_GROUPDICT_METHODDEF
    _SRE_SRE_MATCH_EXPAND_METHODDEF
    _SRE_SRE_MATCH___COPY___METHODDEF
    _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
    /* class method; subscription of the class is delegated to
       Py_GenericAlias */
    {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
     PyDoc_STR("See PEP 585")},
    {NULL, NULL}  /* Sentinel */
};
2846
2847 static PyGetSetDef match_getset[] = {
2848 {"lastindex", (getter)match_lastindex_get, (setter)NULL,
2849 "The integer index of the last matched capturing group."},
2850 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL,
2851 "The name of the last matched capturing group."},
2852 {"regs", (getter)match_regs_get, (setter)NULL},
2853 {NULL}
2854 };
2855
2856 #define MATCH_OFF(x) offsetof(MatchObject, x)
2857 static PyMemberDef match_members[] = {
2858 {"string", T_OBJECT, MATCH_OFF(string), READONLY,
2859 "The string passed to match() or search()."},
2860 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY,
2861 "The regular expression object."},
2862 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY,
2863 "The index into the string at which the RE engine started looking for a match."},
2864 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY,
2865 "The index into the string beyond which the RE engine will not go."},
2866 {NULL}
2867 };
2868
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
/* Slot table of the match type; match_spec refers to it through its
   .slots member.  The {0, NULL} entry terminates the table. */
static PyType_Slot match_slots[] = {
    {Py_tp_dealloc, match_dealloc},
    {Py_tp_repr, match_repr},
    {Py_tp_doc, (void *)match_doc},
    {Py_tp_methods, match_methods},
    {Py_tp_members, match_members},
    {Py_tp_getset, match_getset},
    /* garbage-collector support */
    {Py_tp_traverse, match_traverse},
    {Py_tp_clear, match_clear},

    /* As mapping.
     *
     * Match objects do not support length or assignment, but do support
     * __getitem__.
     */
    {Py_mp_subscript, match_getitem},

    {0, NULL},
};
2890
/* Spec from which sre_exec() creates the "re.Match" heap type. */
static PyType_Spec match_spec = {
    .name = "re.Match",
    .basicsize = sizeof(MatchObject),
    /* size of one item of the variable-length part of an instance;
       pattern_new_match() allocates 2*(groups+1) of them */
    .itemsize = sizeof(Py_ssize_t),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
    .slots = match_slots,
};
2899
/* Method table of the scanner type (Py_tp_methods in scanner_slots).
   No comma is written after the *_METHODDEF macros, so each of them has
   to expand to a complete entry including its trailing comma. */
static PyMethodDef scanner_methods[] = {
    _SRE_SRE_SCANNER_MATCH_METHODDEF
    _SRE_SRE_SCANNER_SEARCH_METHODDEF
    {NULL, NULL}  /* Sentinel */
};
2905
2906 #define SCAN_OFF(x) offsetof(ScannerObject, x)
2907 static PyMemberDef scanner_members[] = {
2908 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
2909 {NULL} /* Sentinel */
2910 };
2911
/* Slot table of the scanner type; scanner_spec refers to it through its
   .slots member.  The {0, NULL} entry terminates the table. */
static PyType_Slot scanner_slots[] = {
    {Py_tp_dealloc, scanner_dealloc},
    {Py_tp_methods, scanner_methods},
    {Py_tp_members, scanner_members},
    /* garbage-collector support */
    {Py_tp_traverse, scanner_traverse},
    {Py_tp_clear, scanner_clear},
    {0, NULL},
};
2920
/* Spec from which sre_exec() creates the scanner heap type.  The type
   name is assembled from the SRE_MODULE macro; no .itemsize is given,
   unlike pattern_spec and match_spec. */
static PyType_Spec scanner_spec = {
    .name = "_" SRE_MODULE ".SRE_Scanner",
    .basicsize = sizeof(ScannerObject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
    .slots = scanner_slots,
};
2928
/* Module-level function table (sremodule.m_methods).  No comma is
   written after the *_METHODDEF macros, so each of them has to expand to
   a complete entry including its trailing comma. */
static PyMethodDef _functions[] = {
    _SRE_COMPILE_METHODDEF
    _SRE_GETCODESIZE_METHODDEF
    _SRE_ASCII_ISCASED_METHODDEF
    _SRE_UNICODE_ISCASED_METHODDEF
    _SRE_ASCII_TOLOWER_METHODDEF
    _SRE_UNICODE_TOLOWER_METHODDEF
    {NULL, NULL}  /* Sentinel */
};
2938
static int
sre_traverse(PyObject *module, visitproc visit, void *arg)
{
    /* m_traverse hook of sremodule: reports the three type objects kept
       in the module state to the garbage collector. */
    _sremodulestate *state = get_sre_module_state(module);

    Py_VISIT(state->Pattern_Type);
    Py_VISIT(state->Match_Type);
    Py_VISIT(state->Scanner_Type);

    return 0;
}
2950
static int
sre_clear(PyObject *module)
{
    /* m_clear hook of sremodule, also called from sre_free(): drops the
       references to the three type objects kept in the module state. */
    _sremodulestate *state = get_sre_module_state(module);

    Py_CLEAR(state->Pattern_Type);
    Py_CLEAR(state->Match_Type);
    Py_CLEAR(state->Scanner_Type);

    return 0;
}
2962
static void
sre_free(void *module)
{
    /* m_free hook of sremodule: releases the module state by delegating
       to sre_clear(); its return value is not used. */
    sre_clear((PyObject *)module);
}
2968
/* Create a type from `spec` with PyType_FromModuleAndSpec(m, spec, NULL)
   and store it in `type`.  On failure control jumps to a label named
   `error`, which the function using the macro has to provide. */
#define CREATE_TYPE(m, type, spec)                                  \
do {                                                                \
    type = (PyTypeObject *)PyType_FromModuleAndSpec(m, spec, NULL); \
    if (type == NULL) {                                             \
        goto error;                                                 \
    }                                                               \
} while (0)
2976
/* Add `value` to `module` under `name` as an int object built with
   PyLong_FromUnsignedLong().  The temporary object is released whether
   or not PyModule_AddObjectRef() succeeds.  On failure control jumps to
   a label named `error`, which the function using the macro has to
   provide. */
#define ADD_ULONG_CONSTANT(module, name, value)           \
    do {                                                  \
        PyObject *o = PyLong_FromUnsignedLong(value);     \
        if (!o)                                           \
            goto error;                                   \
        int res = PyModule_AddObjectRef(module, name, o); \
        Py_DECREF(o);                                     \
        if (res < 0) {                                    \
            goto error;                                   \
        }                                                 \
} while (0)
2988
2989 static int
sre_exec(PyObject * m)2990 sre_exec(PyObject *m)
2991 {
2992 _sremodulestate *state;
2993
2994 /* Create heap types */
2995 state = get_sre_module_state(m);
2996 CREATE_TYPE(m, state->Pattern_Type, &pattern_spec);
2997 CREATE_TYPE(m, state->Match_Type, &match_spec);
2998 CREATE_TYPE(m, state->Scanner_Type, &scanner_spec);
2999
3000 if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) {
3001 goto error;
3002 }
3003
3004 if (PyModule_AddIntConstant(m, "CODESIZE", sizeof(SRE_CODE)) < 0) {
3005 goto error;
3006 }
3007
3008 ADD_ULONG_CONSTANT(m, "MAXREPEAT", SRE_MAXREPEAT);
3009 ADD_ULONG_CONSTANT(m, "MAXGROUPS", SRE_MAXGROUPS);
3010
3011 if (PyModule_AddStringConstant(m, "copyright", copyright) < 0) {
3012 goto error;
3013 }
3014
3015 return 0;
3016
3017 error:
3018 return -1;
3019 }
3020
/* Module slots (sremodule.m_slots): registers sre_exec() as the
   Py_mod_exec handler.  The {0, NULL} entry terminates the table. */
static PyModuleDef_Slot sre_slots[] = {
    {Py_mod_exec, sre_exec},
    {0, NULL},
};
3025
/* Module definition handed to PyModuleDef_Init() by PyInit__sre(). */
static struct PyModuleDef sremodule = {
    .m_base = PyModuleDef_HEAD_INIT,
    .m_name = "_" SRE_MODULE,
    /* per-module state: the struct accessed via get_sre_module_state() */
    .m_size = sizeof(_sremodulestate),
    .m_methods = _functions,
    .m_slots = sre_slots,
    /* hooks that visit and release the type objects in the module state */
    .m_traverse = sre_traverse,
    .m_free = sre_free,
    .m_clear = sre_clear,
};
3036
/* Module entry point: returns the result of PyModuleDef_Init() for
   sremodule.  The types and constants are set up by sre_exec(), which is
   registered through the Py_mod_exec slot in sre_slots. */
PyMODINIT_FUNC
PyInit__sre(void)
{
    return PyModuleDef_Init(&sremodule);
}
3042
3043 /* vim:ts=4:sw=4:et
3044 */
3045