1 /*
2  * Secret Labs' Regular Expression Engine
3  *
4  * regular expression matching engine
5  *
6  * partial history:
7  * 1999-10-24 fl   created (based on existing template matcher code)
8  * 2000-03-06 fl   first alpha, sort of
9  * 2000-08-01 fl   fixes for 1.6b1
10  * 2000-08-07 fl   use PyOS_CheckStack() if available
11  * 2000-09-20 fl   added expand method
12  * 2001-03-20 fl   lots of fixes for 2.1b2
13  * 2001-04-15 fl   export copyright as Python attribute, not global
14  * 2001-04-28 fl   added __copy__ methods (work in progress)
15  * 2001-05-14 fl   fixes for 1.5.2 compatibility
16  * 2001-07-01 fl   added BIGCHARSET support (from Martin von Loewis)
17  * 2001-10-18 fl   fixed group reset issue (from Matthew Mueller)
18  * 2001-10-20 fl   added split primitive; re-enable unicode for 1.6/2.0/2.1
19  * 2001-10-21 fl   added sub/subn primitive
20  * 2001-10-24 fl   added finditer primitive (for 2.2 only)
21  * 2001-12-07 fl   fixed memory leak in sub/subn (Guido van Rossum)
22  * 2002-11-09 fl   fixed empty sub/subn return type
23  * 2003-04-18 mvl  fully support 4-byte codes
24  * 2003-10-17 gn   implemented non recursive scheme
25  * 2013-02-04 mrab added fullmatch primitive
26  *
27  * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
28  *
29  * This version of the SRE library can be redistributed under CNRI's
30  * Python 1.6 license.  For any other use, please contact Secret Labs
31  * AB ([email protected]).
32  *
33  * Portions of this engine have been developed in cooperation with
34  * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
35  * other compatibility work.
36  */
37 
38 static const char copyright[] =
39     " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
40 
41 #define PY_SSIZE_T_CLEAN
42 
43 #include "Python.h"
44 #include "pycore_long.h"          // _PyLong_GetZero()
45 #include "pycore_moduleobject.h"  // _PyModule_GetState()
46 #include "structmember.h"         // PyMemberDef
47 
48 #include "sre.h"
49 
50 #define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
51 
52 #include <ctype.h>
53 
54 /* name of this module, minus the leading underscore */
55 #if !defined(SRE_MODULE)
56 #define SRE_MODULE "sre"
57 #endif
58 
59 #define SRE_PY_MODULE "re"
60 
61 /* defining this one enables tracing */
62 #undef VERBOSE
63 
64 /* -------------------------------------------------------------------- */
65 
/* LOCAL(type) introduces a file-local inline function returning `type`.
   Under MSVC the __fastcall calling convention is requested as well and
   the "function not inlined" warning (4710) is silenced. */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#else
#define LOCAL(type) static inline type
#endif
74 
/* error codes */
/* All codes are negative.  pattern_error() turns RECURSION_LIMIT into
   RecursionError and MEMORY into MemoryError, leaves the pending
   exception alone for INTERRUPTED, and reports every other code as a
   RuntimeError ("internal error in regular expression engine"). */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_STATE -2 /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
#define SRE_ERROR_MEMORY -9 /* out of memory */
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */

/* TRACE takes a single, parenthesised printf argument list, e.g.
   TRACE(("fmt %d\n", x)).  It expands to a printf call when VERBOSE is
   defined and to nothing otherwise. */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
87 
88 /* -------------------------------------------------------------------- */
89 /* search engine state */
90 
/* ASCII-only character predicates.  Every macro except SRE_IS_LINEBREAK
   first bounds `ch` by the largest ASCII character that could satisfy
   the test and only then consults the Py_IS* classifier, so larger code
   points are rejected without reaching the classifier.
   Note: `ch` is evaluated more than once; pass side-effect-free
   expressions only. */
#define SRE_IS_DIGIT(ch)\
    ((ch) <= '9' && Py_ISDIGIT(ch))
#define SRE_IS_SPACE(ch)\
    ((ch) <= ' ' && Py_ISSPACE(ch))
/* only '\n' counts as a line break in this mode */
#define SRE_IS_LINEBREAK(ch)\
    ((ch) == '\n')
/* word characters: ASCII alphanumerics plus underscore */
#define SRE_IS_WORD(ch)\
    ((ch) <= 'z' && (Py_ISALNUM(ch) || (ch) == '_'))
99 
/* Lower-case `ch` using ASCII rules only.  Values of 128 and above are
   handed back unchanged. */
static unsigned int sre_lower_ascii(unsigned int ch)
{
    if (ch >= 128) {
        return ch;
    }
    return Py_TOLOWER(ch);
}
104 
/* locale-specific character predicates */
/* For an unsigned c, !(c & ~N) is the same test as (c < N+1) when N+1 is
 * a power of two (here N == 255).  The bit-mask form is used because it
 * avoids compiler warnings when c's type cannot hold values >= N+1.
 * Characters outside 0..255 are never alphanumeric in locale mode. */
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
/* locale word characters: locale alphanumerics plus underscore */
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
110 
/* Lower-case `ch` with the C library's tolower(), which follows the
   current locale.  Only values below 256 are converted; anything larger
   is returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return (unsigned int)tolower(ch);
}
115 
/* Upper-case `ch` with the C library's toupper(), which follows the
   current locale.  Only values below 256 are converted; anything larger
   is returned unchanged. */
static unsigned int sre_upper_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return (unsigned int)toupper(ch);
}
120 
/* unicode-specific character predicates */
/* Thin aliases for the Py_UNICODE_* classification macros; only the
   word predicate adds logic of its own. */

#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
/* unicode word characters: unicode alphanumerics plus underscore */
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
128 
/* Lower-case `ch` via Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    const unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);

    return lowered;
}
133 
/* Upper-case `ch` via Py_UNICODE_TOUPPER. */
static unsigned int sre_upper_unicode(unsigned int ch)
{
    const unsigned int raised = (unsigned int) Py_UNICODE_TOUPPER(ch);

    return raised;
}
138 
139 LOCAL(int)
sre_category(SRE_CODE category,unsigned int ch)140 sre_category(SRE_CODE category, unsigned int ch)
141 {
142     switch (category) {
143 
144     case SRE_CATEGORY_DIGIT:
145         return SRE_IS_DIGIT(ch);
146     case SRE_CATEGORY_NOT_DIGIT:
147         return !SRE_IS_DIGIT(ch);
148     case SRE_CATEGORY_SPACE:
149         return SRE_IS_SPACE(ch);
150     case SRE_CATEGORY_NOT_SPACE:
151         return !SRE_IS_SPACE(ch);
152     case SRE_CATEGORY_WORD:
153         return SRE_IS_WORD(ch);
154     case SRE_CATEGORY_NOT_WORD:
155         return !SRE_IS_WORD(ch);
156     case SRE_CATEGORY_LINEBREAK:
157         return SRE_IS_LINEBREAK(ch);
158     case SRE_CATEGORY_NOT_LINEBREAK:
159         return !SRE_IS_LINEBREAK(ch);
160 
161     case SRE_CATEGORY_LOC_WORD:
162         return SRE_LOC_IS_WORD(ch);
163     case SRE_CATEGORY_LOC_NOT_WORD:
164         return !SRE_LOC_IS_WORD(ch);
165 
166     case SRE_CATEGORY_UNI_DIGIT:
167         return SRE_UNI_IS_DIGIT(ch);
168     case SRE_CATEGORY_UNI_NOT_DIGIT:
169         return !SRE_UNI_IS_DIGIT(ch);
170     case SRE_CATEGORY_UNI_SPACE:
171         return SRE_UNI_IS_SPACE(ch);
172     case SRE_CATEGORY_UNI_NOT_SPACE:
173         return !SRE_UNI_IS_SPACE(ch);
174     case SRE_CATEGORY_UNI_WORD:
175         return SRE_UNI_IS_WORD(ch);
176     case SRE_CATEGORY_UNI_NOT_WORD:
177         return !SRE_UNI_IS_WORD(ch);
178     case SRE_CATEGORY_UNI_LINEBREAK:
179         return SRE_UNI_IS_LINEBREAK(ch);
180     case SRE_CATEGORY_UNI_NOT_LINEBREAK:
181         return !SRE_UNI_IS_LINEBREAK(ch);
182     }
183     return 0;
184 }
185 
186 LOCAL(int)
char_loc_ignore(SRE_CODE pattern,SRE_CODE ch)187 char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
188 {
189     return ch == pattern
190         || (SRE_CODE) sre_lower_locale(ch) == pattern
191         || (SRE_CODE) sre_upper_locale(ch) == pattern;
192 }
193 
194 
195 /* helpers */
196 
197 static void
data_stack_dealloc(SRE_STATE * state)198 data_stack_dealloc(SRE_STATE* state)
199 {
200     if (state->data_stack) {
201         PyMem_Free(state->data_stack);
202         state->data_stack = NULL;
203     }
204     state->data_stack_size = state->data_stack_base = 0;
205 }
206 
207 static int
data_stack_grow(SRE_STATE * state,Py_ssize_t size)208 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
209 {
210     Py_ssize_t minsize, cursize;
211     minsize = state->data_stack_base+size;
212     cursize = state->data_stack_size;
213     if (cursize < minsize) {
214         void* stack;
215         cursize = minsize+minsize/4+1024;
216         TRACE(("allocate/grow stack %zd\n", cursize));
217         stack = PyMem_Realloc(state->data_stack, cursize);
218         if (!stack) {
219             data_stack_dealloc(state);
220             return SRE_ERROR_MEMORY;
221         }
222         state->data_stack = (char *)stack;
223         state->data_stack_size = cursize;
224     }
225     return 0;
226 }
227 
/* "sre_lib.h" is included three times below, each time with a different
   SRE_CHAR / SIZEOF_SRE_CHAR / SRE(F) configuration, so that each
   inclusion yields its own family of functions: sre_ucs1_*, sre_ucs2_*
   and sre_ucs4_*.
   NOTE(review): the macros are redefined without an intervening #undef
   here, so sre_lib.h presumably undefines them itself -- confirm. */

/* generate 8-bit version */

#define SRE_CHAR Py_UCS1
#define SIZEOF_SRE_CHAR 1
#define SRE(F) sre_ucs1_##F
#include "sre_lib.h"

/* generate 16-bit unicode version */

#define SRE_CHAR Py_UCS2
#define SIZEOF_SRE_CHAR 2
#define SRE(F) sre_ucs2_##F
#include "sre_lib.h"

/* generate 32-bit unicode version */

#define SRE_CHAR Py_UCS4
#define SIZEOF_SRE_CHAR 4
#define SRE(F) sre_ucs4_##F
#include "sre_lib.h"
248 
249 /* -------------------------------------------------------------------- */
250 /* factories and destructors */
251 
252 /* module state */
/* Per-module state: the three type objects that the Argument Clinic
   class declarations in this file look up through
   get_sre_module_state_by_class(). */
typedef struct {
    PyTypeObject *Pattern_Type;   /* type of _sre.SRE_Pattern */
    PyTypeObject *Match_Type;     /* type of _sre.SRE_Match */
    PyTypeObject *Scanner_Type;   /* type of _sre.SRE_Scanner */
} _sremodulestate;
258 
259 static _sremodulestate *
get_sre_module_state(PyObject * m)260 get_sre_module_state(PyObject *m)
261 {
262     _sremodulestate *state = (_sremodulestate *)_PyModule_GetState(m);
263     assert(state);
264     return state;
265 }
266 
/* forward declaration of the module definition */
static struct PyModuleDef sremodule;
/* Fetch the module state starting from a type object: the type's module
   is obtained with PyType_GetModule() and passed on to
   get_sre_module_state(). */
#define get_sre_module_state_by_class(cls) \
    (get_sre_module_state(PyType_GetModule(cls)))

/* see sre.h for object declarations */
/* forward declarations of helpers that are used before their definitions */
static PyObject*pattern_new_match(_sremodulestate *, PatternObject*, SRE_STATE*, Py_ssize_t);
static PyObject *pattern_scanner(_sremodulestate *, PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
274 
275 /*[clinic input]
276 module _sre
277 class _sre.SRE_Pattern "PatternObject *" "get_sre_module_state_by_class(tp)->Pattern_Type"
278 class _sre.SRE_Match "MatchObject *" "get_sre_module_state_by_class(tp)->Match_Type"
279 class _sre.SRE_Scanner "ScannerObject *" "get_sre_module_state_by_class(tp)->Scanner_Type"
280 [clinic start generated code]*/
281 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=fe2966e32b66a231]*/
282 
283 /*[clinic input]
284 _sre.getcodesize -> int
285 [clinic start generated code]*/
286 
287 static int
_sre_getcodesize_impl(PyObject * module)288 _sre_getcodesize_impl(PyObject *module)
289 /*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
290 {
291     return sizeof(SRE_CODE);
292 }
293 
294 /*[clinic input]
295 _sre.ascii_iscased -> bool
296 
297     character: int
298     /
299 
300 [clinic start generated code]*/
301 
302 static int
_sre_ascii_iscased_impl(PyObject * module,int character)303 _sre_ascii_iscased_impl(PyObject *module, int character)
304 /*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
305 {
306     unsigned int ch = (unsigned int)character;
307     return ch < 128 && Py_ISALPHA(ch);
308 }
309 
310 /*[clinic input]
311 _sre.unicode_iscased -> bool
312 
313     character: int
314     /
315 
316 [clinic start generated code]*/
317 
318 static int
_sre_unicode_iscased_impl(PyObject * module,int character)319 _sre_unicode_iscased_impl(PyObject *module, int character)
320 /*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
321 {
322     unsigned int ch = (unsigned int)character;
323     return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
324 }
325 
326 /*[clinic input]
327 _sre.ascii_tolower -> int
328 
329     character: int
330     /
331 
332 [clinic start generated code]*/
333 
334 static int
_sre_ascii_tolower_impl(PyObject * module,int character)335 _sre_ascii_tolower_impl(PyObject *module, int character)
336 /*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
337 {
338     return sre_lower_ascii(character);
339 }
340 
341 /*[clinic input]
342 _sre.unicode_tolower -> int
343 
344     character: int
345     /
346 
347 [clinic start generated code]*/
348 
349 static int
_sre_unicode_tolower_impl(PyObject * module,int character)350 _sre_unicode_tolower_impl(PyObject *module, int character)
351 /*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
352 {
353     return sre_lower_unicode(character);
354 }
355 
356 LOCAL(void)
state_reset(SRE_STATE * state)357 state_reset(SRE_STATE* state)
358 {
359     /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
360     /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
361 
362     state->lastmark = -1;
363     state->lastindex = -1;
364 
365     state->repeat = NULL;
366 
367     data_stack_dealloc(state);
368 }
369 
370 static const void*
getstring(PyObject * string,Py_ssize_t * p_length,int * p_isbytes,int * p_charsize,Py_buffer * view)371 getstring(PyObject* string, Py_ssize_t* p_length,
372           int* p_isbytes, int* p_charsize,
373           Py_buffer *view)
374 {
375     /* given a python object, return a data pointer, a length (in
376        characters), and a character size.  return NULL if the object
377        is not a string (or not compatible) */
378 
379     /* Unicode objects do not support the buffer API. So, get the data
380        directly instead. */
381     if (PyUnicode_Check(string)) {
382         if (PyUnicode_READY(string) == -1)
383             return NULL;
384         *p_length = PyUnicode_GET_LENGTH(string);
385         *p_charsize = PyUnicode_KIND(string);
386         *p_isbytes = 0;
387         return PyUnicode_DATA(string);
388     }
389 
390     /* get pointer to byte string buffer */
391     if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
392         PyErr_Format(PyExc_TypeError, "expected string or bytes-like "
393                      "object, got '%.200s'", Py_TYPE(string)->tp_name);
394         return NULL;
395     }
396 
397     *p_length = view->len;
398     *p_charsize = 1;
399     *p_isbytes = 1;
400 
401     if (view->buf == NULL) {
402         PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
403         PyBuffer_Release(view);
404         view->buf = NULL;
405         return NULL;
406     }
407     return view->buf;
408 }
409 
410 LOCAL(PyObject*)
state_init(SRE_STATE * state,PatternObject * pattern,PyObject * string,Py_ssize_t start,Py_ssize_t end)411 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
412            Py_ssize_t start, Py_ssize_t end)
413 {
414     /* prepare state object */
415 
416     Py_ssize_t length;
417     int isbytes, charsize;
418     const void* ptr;
419 
420     memset(state, 0, sizeof(SRE_STATE));
421 
422     state->mark = PyMem_New(const void *, pattern->groups * 2);
423     if (!state->mark) {
424         PyErr_NoMemory();
425         goto err;
426     }
427     state->lastmark = -1;
428     state->lastindex = -1;
429 
430     state->buffer.buf = NULL;
431     ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
432     if (!ptr)
433         goto err;
434 
435     if (isbytes && pattern->isbytes == 0) {
436         PyErr_SetString(PyExc_TypeError,
437                         "cannot use a string pattern on a bytes-like object");
438         goto err;
439     }
440     if (!isbytes && pattern->isbytes > 0) {
441         PyErr_SetString(PyExc_TypeError,
442                         "cannot use a bytes pattern on a string-like object");
443         goto err;
444     }
445 
446     /* adjust boundaries */
447     if (start < 0)
448         start = 0;
449     else if (start > length)
450         start = length;
451 
452     if (end < 0)
453         end = 0;
454     else if (end > length)
455         end = length;
456 
457     state->isbytes = isbytes;
458     state->charsize = charsize;
459     state->match_all = 0;
460     state->must_advance = 0;
461 
462     state->beginning = ptr;
463 
464     state->start = (void*) ((char*) ptr + start * state->charsize);
465     state->end = (void*) ((char*) ptr + end * state->charsize);
466 
467     Py_INCREF(string);
468     state->string = string;
469     state->pos = start;
470     state->endpos = end;
471 
472     return string;
473   err:
474     /* We add an explicit cast here because MSVC has a bug when
475        compiling C code where it believes that `const void**` cannot be
476        safely casted to `void*`, see bpo-39943 for details. */
477     PyMem_Free((void*) state->mark);
478     state->mark = NULL;
479     if (state->buffer.buf)
480         PyBuffer_Release(&state->buffer);
481     return NULL;
482 }
483 
484 LOCAL(void)
state_fini(SRE_STATE * state)485 state_fini(SRE_STATE* state)
486 {
487     if (state->buffer.buf)
488         PyBuffer_Release(&state->buffer);
489     Py_XDECREF(state->string);
490     data_stack_dealloc(state);
491     /* See above PyMem_Del for why we explicitly cast here. */
492     PyMem_Free((void*) state->mark);
493     state->mark = NULL;
494 }
495 
/* calculate offset from start of string */
/* `member` is a pointer into the subject; the byte distance from
   state->beginning is divided by the character size, so the result is
   an index in characters rather than bytes. */
#define STATE_OFFSET(state, member)\
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
499 
500 LOCAL(PyObject*)
getslice(int isbytes,const void * ptr,PyObject * string,Py_ssize_t start,Py_ssize_t end)501 getslice(int isbytes, const void *ptr,
502          PyObject* string, Py_ssize_t start, Py_ssize_t end)
503 {
504     if (isbytes) {
505         if (PyBytes_CheckExact(string) &&
506             start == 0 && end == PyBytes_GET_SIZE(string)) {
507             Py_INCREF(string);
508             return string;
509         }
510         return PyBytes_FromStringAndSize(
511                 (const char *)ptr + start, end - start);
512     }
513     else {
514         return PyUnicode_Substring(string, start, end);
515     }
516 }
517 
518 LOCAL(PyObject*)
state_getslice(SRE_STATE * state,Py_ssize_t index,PyObject * string,int empty)519 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
520 {
521     Py_ssize_t i, j;
522 
523     index = (index - 1) * 2;
524 
525     if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
526         if (empty)
527             /* want empty string */
528             i = j = 0;
529         else {
530             Py_RETURN_NONE;
531         }
532     } else {
533         i = STATE_OFFSET(state, state->mark[index]);
534         j = STATE_OFFSET(state, state->mark[index+1]);
535 
536         /* check wrong span */
537         if (i > j) {
538             PyErr_SetString(PyExc_SystemError,
539                             "The span of capturing group is wrong,"
540                             " please report a bug for the re module.");
541             return NULL;
542         }
543     }
544 
545     return getslice(state->isbytes, state->beginning, string, i, j);
546 }
547 
548 static void
pattern_error(Py_ssize_t status)549 pattern_error(Py_ssize_t status)
550 {
551     switch (status) {
552     case SRE_ERROR_RECURSION_LIMIT:
553         /* This error code seems to be unused. */
554         PyErr_SetString(
555             PyExc_RecursionError,
556             "maximum recursion limit exceeded"
557             );
558         break;
559     case SRE_ERROR_MEMORY:
560         PyErr_NoMemory();
561         break;
562     case SRE_ERROR_INTERRUPTED:
563     /* An exception has already been raised, so let it fly */
564         break;
565     default:
566         /* other error codes indicate compiler/engine bugs */
567         PyErr_SetString(
568             PyExc_RuntimeError,
569             "internal error in regular expression engine"
570             );
571     }
572 }
573 
574 static int
pattern_traverse(PatternObject * self,visitproc visit,void * arg)575 pattern_traverse(PatternObject *self, visitproc visit, void *arg)
576 {
577     Py_VISIT(Py_TYPE(self));
578     Py_VISIT(self->groupindex);
579     Py_VISIT(self->indexgroup);
580     Py_VISIT(self->pattern);
581     return 0;
582 }
583 
584 static int
pattern_clear(PatternObject * self)585 pattern_clear(PatternObject *self)
586 {
587     Py_CLEAR(self->groupindex);
588     Py_CLEAR(self->indexgroup);
589     Py_CLEAR(self->pattern);
590     return 0;
591 }
592 
593 static void
pattern_dealloc(PatternObject * self)594 pattern_dealloc(PatternObject* self)
595 {
596     PyTypeObject *tp = Py_TYPE(self);
597 
598     PyObject_GC_UnTrack(self);
599     if (self->weakreflist != NULL) {
600         PyObject_ClearWeakRefs((PyObject *) self);
601     }
602     (void)pattern_clear(self);
603     tp->tp_free(self);
604     Py_DECREF(tp);
605 }
606 
607 LOCAL(Py_ssize_t)
sre_match(SRE_STATE * state,SRE_CODE * pattern)608 sre_match(SRE_STATE* state, SRE_CODE* pattern)
609 {
610     if (state->charsize == 1)
611         return sre_ucs1_match(state, pattern, 1);
612     if (state->charsize == 2)
613         return sre_ucs2_match(state, pattern, 1);
614     assert(state->charsize == 4);
615     return sre_ucs4_match(state, pattern, 1);
616 }
617 
618 LOCAL(Py_ssize_t)
sre_search(SRE_STATE * state,SRE_CODE * pattern)619 sre_search(SRE_STATE* state, SRE_CODE* pattern)
620 {
621     if (state->charsize == 1)
622         return sre_ucs1_search(state, pattern);
623     if (state->charsize == 2)
624         return sre_ucs2_search(state, pattern);
625     assert(state->charsize == 4);
626     return sre_ucs4_search(state, pattern);
627 }
628 
629 /*[clinic input]
630 _sre.SRE_Pattern.match
631 
632     cls: defining_class
633     /
634     string: object
635     pos: Py_ssize_t = 0
636     endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
637 
638 Matches zero or more characters at the beginning of the string.
639 [clinic start generated code]*/
640 
641 static PyObject *
_sre_SRE_Pattern_match_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)642 _sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls,
643                             PyObject *string, Py_ssize_t pos,
644                             Py_ssize_t endpos)
645 /*[clinic end generated code: output=ec6208ea58a0cca0 input=4bdb9c3e564d13ac]*/
646 {
647     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
648     SRE_STATE state;
649     Py_ssize_t status;
650     PyObject *match;
651 
652     if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
653         return NULL;
654 
655     state.ptr = state.start;
656 
657     TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
658 
659     status = sre_match(&state, PatternObject_GetCode(self));
660 
661     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
662     if (PyErr_Occurred()) {
663         state_fini(&state);
664         return NULL;
665     }
666 
667     match = pattern_new_match(module_state, self, &state, status);
668     state_fini(&state);
669     return match;
670 }
671 
672 /*[clinic input]
673 _sre.SRE_Pattern.fullmatch
674 
675     cls: defining_class
676     /
677     string: object
678     pos: Py_ssize_t = 0
679     endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
680 
681 Matches against all of the string.
682 [clinic start generated code]*/
683 
684 static PyObject *
_sre_SRE_Pattern_fullmatch_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)685 _sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyTypeObject *cls,
686                                 PyObject *string, Py_ssize_t pos,
687                                 Py_ssize_t endpos)
688 /*[clinic end generated code: output=625b75b027ef94da input=50981172ab0fcfdd]*/
689 {
690     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
691     SRE_STATE state;
692     Py_ssize_t status;
693     PyObject *match;
694 
695     if (!state_init(&state, self, string, pos, endpos))
696         return NULL;
697 
698     state.ptr = state.start;
699 
700     TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
701 
702     state.match_all = 1;
703     status = sre_match(&state, PatternObject_GetCode(self));
704 
705     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
706     if (PyErr_Occurred()) {
707         state_fini(&state);
708         return NULL;
709     }
710 
711     match = pattern_new_match(module_state, self, &state, status);
712     state_fini(&state);
713     return match;
714 }
715 
716 /*[clinic input]
717 _sre.SRE_Pattern.search
718 
719     cls: defining_class
720     /
721     string: object
722     pos: Py_ssize_t = 0
723     endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
724 
725 Scan through string looking for a match, and return a corresponding match object instance.
726 
727 Return None if no position in the string matches.
728 [clinic start generated code]*/
729 
730 static PyObject *
_sre_SRE_Pattern_search_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)731 _sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls,
732                              PyObject *string, Py_ssize_t pos,
733                              Py_ssize_t endpos)
734 /*[clinic end generated code: output=bd7f2d9d583e1463 input=afa9afb66a74a4b3]*/
735 {
736     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
737     SRE_STATE state;
738     Py_ssize_t status;
739     PyObject *match;
740 
741     if (!state_init(&state, self, string, pos, endpos))
742         return NULL;
743 
744     TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
745 
746     status = sre_search(&state, PatternObject_GetCode(self));
747 
748     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
749 
750     if (PyErr_Occurred()) {
751         state_fini(&state);
752         return NULL;
753     }
754 
755     match = pattern_new_match(module_state, self, &state, status);
756     state_fini(&state);
757     return match;
758 }
759 
760 static PyObject*
call(const char * module,const char * function,PyObject * args)761 call(const char* module, const char* function, PyObject* args)
762 {
763     PyObject* name;
764     PyObject* mod;
765     PyObject* func;
766     PyObject* result;
767 
768     if (!args)
769         return NULL;
770     name = PyUnicode_FromString(module);
771     if (!name)
772         return NULL;
773     mod = PyImport_Import(name);
774     Py_DECREF(name);
775     if (!mod)
776         return NULL;
777     func = PyObject_GetAttrString(mod, function);
778     Py_DECREF(mod);
779     if (!func)
780         return NULL;
781     result = PyObject_CallObject(func, args);
782     Py_DECREF(func);
783     Py_DECREF(args);
784     return result;
785 }
786 
787 /*[clinic input]
788 _sre.SRE_Pattern.findall
789 
790     string: object
791     pos: Py_ssize_t = 0
792     endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
793 
794 Return a list of all non-overlapping matches of pattern in string.
795 [clinic start generated code]*/
796 
797 static PyObject *
_sre_SRE_Pattern_findall_impl(PatternObject * self,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)798 _sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
799                               Py_ssize_t pos, Py_ssize_t endpos)
800 /*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
801 {
802     SRE_STATE state;
803     PyObject* list;
804     Py_ssize_t status;
805     Py_ssize_t i, b, e;
806 
807     if (!state_init(&state, self, string, pos, endpos))
808         return NULL;
809 
810     list = PyList_New(0);
811     if (!list) {
812         state_fini(&state);
813         return NULL;
814     }
815 
816     while (state.start <= state.end) {
817 
818         PyObject* item;
819 
820         state_reset(&state);
821 
822         state.ptr = state.start;
823 
824         status = sre_search(&state, PatternObject_GetCode(self));
825         if (PyErr_Occurred())
826             goto error;
827 
828         if (status <= 0) {
829             if (status == 0)
830                 break;
831             pattern_error(status);
832             goto error;
833         }
834 
835         /* don't bother to build a match object */
836         switch (self->groups) {
837         case 0:
838             b = STATE_OFFSET(&state, state.start);
839             e = STATE_OFFSET(&state, state.ptr);
840             item = getslice(state.isbytes, state.beginning,
841                             string, b, e);
842             if (!item)
843                 goto error;
844             break;
845         case 1:
846             item = state_getslice(&state, 1, string, 1);
847             if (!item)
848                 goto error;
849             break;
850         default:
851             item = PyTuple_New(self->groups);
852             if (!item)
853                 goto error;
854             for (i = 0; i < self->groups; i++) {
855                 PyObject* o = state_getslice(&state, i+1, string, 1);
856                 if (!o) {
857                     Py_DECREF(item);
858                     goto error;
859                 }
860                 PyTuple_SET_ITEM(item, i, o);
861             }
862             break;
863         }
864 
865         status = PyList_Append(list, item);
866         Py_DECREF(item);
867         if (status < 0)
868             goto error;
869 
870         state.must_advance = (state.ptr == state.start);
871         state.start = state.ptr;
872     }
873 
874     state_fini(&state);
875     return list;
876 
877 error:
878     Py_DECREF(list);
879     state_fini(&state);
880     return NULL;
881 
882 }
883 
884 /*[clinic input]
885 _sre.SRE_Pattern.finditer
886 
887     cls: defining_class
888     /
889     string: object
890     pos: Py_ssize_t = 0
891     endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
892 
893 Return an iterator over all non-overlapping matches for the RE pattern in string.
894 
895 For each match, the iterator returns a match object.
896 [clinic start generated code]*/
897 
898 static PyObject *
_sre_SRE_Pattern_finditer_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)899 _sre_SRE_Pattern_finditer_impl(PatternObject *self, PyTypeObject *cls,
900                                PyObject *string, Py_ssize_t pos,
901                                Py_ssize_t endpos)
902 /*[clinic end generated code: output=1791dbf3618ade56 input=812e332a4848cbaf]*/
903 {
904     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
905     PyObject* scanner;
906     PyObject* search;
907     PyObject* iterator;
908 
909     scanner = pattern_scanner(module_state, self, string, pos, endpos);
910     if (!scanner)
911         return NULL;
912 
913     search = PyObject_GetAttrString(scanner, "search");
914     Py_DECREF(scanner);
915     if (!search)
916         return NULL;
917 
918     iterator = PyCallIter_New(search, Py_None);
919     Py_DECREF(search);
920 
921     return iterator;
922 }
923 
924 /*[clinic input]
925 _sre.SRE_Pattern.scanner
926 
927     cls: defining_class
928     /
929     string: object
930     pos: Py_ssize_t = 0
931     endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
932 
933 [clinic start generated code]*/
934 
935 static PyObject *
_sre_SRE_Pattern_scanner_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)936 _sre_SRE_Pattern_scanner_impl(PatternObject *self, PyTypeObject *cls,
937                               PyObject *string, Py_ssize_t pos,
938                               Py_ssize_t endpos)
939 /*[clinic end generated code: output=f70cd506112f1bd9 input=2e487e5151bcee4c]*/
940 {
941     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
942 
943     return pattern_scanner(module_state, self, string, pos, endpos);
944 }
945 
946 /*[clinic input]
947 _sre.SRE_Pattern.split
948 
949     string: object
950     maxsplit: Py_ssize_t = 0
951 
952 Split string by the occurrences of pattern.
953 [clinic start generated code]*/
954 
955 static PyObject *
_sre_SRE_Pattern_split_impl(PatternObject * self,PyObject * string,Py_ssize_t maxsplit)956 _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
957                             Py_ssize_t maxsplit)
958 /*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
959 {
960     SRE_STATE state;
961     PyObject* list;
962     PyObject* item;
963     Py_ssize_t status;
964     Py_ssize_t n;
965     Py_ssize_t i;
966     const void* last;
967 
968     assert(self->codesize != 0);
969 
970     if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
971         return NULL;
972 
973     list = PyList_New(0);
974     if (!list) {
975         state_fini(&state);
976         return NULL;
977     }
978 
979     n = 0;
980     last = state.start;
981 
982     while (!maxsplit || n < maxsplit) {
983 
984         state_reset(&state);
985 
986         state.ptr = state.start;
987 
988         status = sre_search(&state, PatternObject_GetCode(self));
989         if (PyErr_Occurred())
990             goto error;
991 
992         if (status <= 0) {
993             if (status == 0)
994                 break;
995             pattern_error(status);
996             goto error;
997         }
998 
999         /* get segment before this match */
1000         item = getslice(state.isbytes, state.beginning,
1001             string, STATE_OFFSET(&state, last),
1002             STATE_OFFSET(&state, state.start)
1003             );
1004         if (!item)
1005             goto error;
1006         status = PyList_Append(list, item);
1007         Py_DECREF(item);
1008         if (status < 0)
1009             goto error;
1010 
1011         /* add groups (if any) */
1012         for (i = 0; i < self->groups; i++) {
1013             item = state_getslice(&state, i+1, string, 0);
1014             if (!item)
1015                 goto error;
1016             status = PyList_Append(list, item);
1017             Py_DECREF(item);
1018             if (status < 0)
1019                 goto error;
1020         }
1021 
1022         n = n + 1;
1023         state.must_advance = (state.ptr == state.start);
1024         last = state.start = state.ptr;
1025 
1026     }
1027 
1028     /* get segment following last match (even if empty) */
1029     item = getslice(state.isbytes, state.beginning,
1030         string, STATE_OFFSET(&state, last), state.endpos
1031         );
1032     if (!item)
1033         goto error;
1034     status = PyList_Append(list, item);
1035     Py_DECREF(item);
1036     if (status < 0)
1037         goto error;
1038 
1039     state_fini(&state);
1040     return list;
1041 
1042 error:
1043     Py_DECREF(list);
1044     state_fini(&state);
1045     return NULL;
1046 
1047 }
1048 
1049 static PyObject*
pattern_subx(_sremodulestate * module_state,PatternObject * self,PyObject * ptemplate,PyObject * string,Py_ssize_t count,Py_ssize_t subn)1050 pattern_subx(_sremodulestate* module_state,
1051              PatternObject* self,
1052              PyObject* ptemplate,
1053              PyObject* string,
1054              Py_ssize_t count,
1055              Py_ssize_t subn)
1056 {
1057     SRE_STATE state;
1058     PyObject* list;
1059     PyObject* joiner;
1060     PyObject* item;
1061     PyObject* filter;
1062     PyObject* match;
1063     const void* ptr;
1064     Py_ssize_t status;
1065     Py_ssize_t n;
1066     Py_ssize_t i, b, e;
1067     int isbytes, charsize;
1068     int filter_is_callable;
1069     Py_buffer view;
1070 
1071     if (PyCallable_Check(ptemplate)) {
1072         /* sub/subn takes either a function or a template */
1073         filter = ptemplate;
1074         Py_INCREF(filter);
1075         filter_is_callable = 1;
1076     } else {
1077         /* if not callable, check if it's a literal string */
1078         int literal;
1079         view.buf = NULL;
1080         ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
1081         if (ptr) {
1082             if (charsize == 1)
1083                 literal = memchr(ptr, '\\', n) == NULL;
1084             else
1085                 literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
1086         } else {
1087             PyErr_Clear();
1088             literal = 0;
1089         }
1090         if (view.buf)
1091             PyBuffer_Release(&view);
1092         if (literal) {
1093             filter = ptemplate;
1094             Py_INCREF(filter);
1095             filter_is_callable = 0;
1096         } else {
1097             /* not a literal; hand it over to the template compiler */
1098             filter = call(
1099                 SRE_PY_MODULE, "_subx",
1100                 PyTuple_Pack(2, self, ptemplate)
1101                 );
1102             if (!filter)
1103                 return NULL;
1104             filter_is_callable = PyCallable_Check(filter);
1105         }
1106     }
1107 
1108     if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
1109         Py_DECREF(filter);
1110         return NULL;
1111     }
1112 
1113     list = PyList_New(0);
1114     if (!list) {
1115         Py_DECREF(filter);
1116         state_fini(&state);
1117         return NULL;
1118     }
1119 
1120     n = i = 0;
1121 
1122     while (!count || n < count) {
1123 
1124         state_reset(&state);
1125 
1126         state.ptr = state.start;
1127 
1128         status = sre_search(&state, PatternObject_GetCode(self));
1129         if (PyErr_Occurred())
1130             goto error;
1131 
1132         if (status <= 0) {
1133             if (status == 0)
1134                 break;
1135             pattern_error(status);
1136             goto error;
1137         }
1138 
1139         b = STATE_OFFSET(&state, state.start);
1140         e = STATE_OFFSET(&state, state.ptr);
1141 
1142         if (i < b) {
1143             /* get segment before this match */
1144             item = getslice(state.isbytes, state.beginning,
1145                 string, i, b);
1146             if (!item)
1147                 goto error;
1148             status = PyList_Append(list, item);
1149             Py_DECREF(item);
1150             if (status < 0)
1151                 goto error;
1152 
1153         }
1154 
1155         if (filter_is_callable) {
1156             /* pass match object through filter */
1157             match = pattern_new_match(module_state, self, &state, 1);
1158             if (!match)
1159                 goto error;
1160             item = PyObject_CallOneArg(filter, match);
1161             Py_DECREF(match);
1162             if (!item)
1163                 goto error;
1164         } else {
1165             /* filter is literal string */
1166             item = filter;
1167             Py_INCREF(item);
1168         }
1169 
1170         /* add to list */
1171         if (item != Py_None) {
1172             status = PyList_Append(list, item);
1173             Py_DECREF(item);
1174             if (status < 0)
1175                 goto error;
1176         }
1177 
1178         i = e;
1179         n = n + 1;
1180         state.must_advance = (state.ptr == state.start);
1181         state.start = state.ptr;
1182     }
1183 
1184     /* get segment following last match */
1185     if (i < state.endpos) {
1186         item = getslice(state.isbytes, state.beginning,
1187                         string, i, state.endpos);
1188         if (!item)
1189             goto error;
1190         status = PyList_Append(list, item);
1191         Py_DECREF(item);
1192         if (status < 0)
1193             goto error;
1194     }
1195 
1196     state_fini(&state);
1197 
1198     Py_DECREF(filter);
1199 
1200     /* convert list to single string (also removes list) */
1201     joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
1202     if (!joiner) {
1203         Py_DECREF(list);
1204         return NULL;
1205     }
1206     if (PyList_GET_SIZE(list) == 0) {
1207         Py_DECREF(list);
1208         item = joiner;
1209     }
1210     else {
1211         if (state.isbytes)
1212             item = _PyBytes_Join(joiner, list);
1213         else
1214             item = PyUnicode_Join(joiner, list);
1215         Py_DECREF(joiner);
1216         Py_DECREF(list);
1217         if (!item)
1218             return NULL;
1219     }
1220 
1221     if (subn)
1222         return Py_BuildValue("Nn", item, n);
1223 
1224     return item;
1225 
1226 error:
1227     Py_DECREF(list);
1228     state_fini(&state);
1229     Py_DECREF(filter);
1230     return NULL;
1231 
1232 }
1233 
1234 /*[clinic input]
1235 _sre.SRE_Pattern.sub
1236 
1237     cls: defining_class
1238     /
1239     repl: object
1240     string: object
1241     count: Py_ssize_t = 0
1242 
1243 Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
1244 [clinic start generated code]*/
1245 
1246 static PyObject *
_sre_SRE_Pattern_sub_impl(PatternObject * self,PyTypeObject * cls,PyObject * repl,PyObject * string,Py_ssize_t count)1247 _sre_SRE_Pattern_sub_impl(PatternObject *self, PyTypeObject *cls,
1248                           PyObject *repl, PyObject *string, Py_ssize_t count)
1249 /*[clinic end generated code: output=4be141ab04bca60d input=d8d1d4ac2311a07c]*/
1250 {
1251     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1252 
1253     return pattern_subx(module_state, self, repl, string, count, 0);
1254 }
1255 
1256 /*[clinic input]
1257 _sre.SRE_Pattern.subn
1258 
1259     cls: defining_class
1260     /
1261     repl: object
1262     string: object
1263     count: Py_ssize_t = 0
1264 
1265 Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
1266 [clinic start generated code]*/
1267 
1268 static PyObject *
_sre_SRE_Pattern_subn_impl(PatternObject * self,PyTypeObject * cls,PyObject * repl,PyObject * string,Py_ssize_t count)1269 _sre_SRE_Pattern_subn_impl(PatternObject *self, PyTypeObject *cls,
1270                            PyObject *repl, PyObject *string,
1271                            Py_ssize_t count)
1272 /*[clinic end generated code: output=da02fd85258b1e1f input=8b78a65b8302e58d]*/
1273 {
1274     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1275 
1276     return pattern_subx(module_state, self, repl, string, count, 1);
1277 }
1278 
1279 /*[clinic input]
1280 _sre.SRE_Pattern.__copy__
1281 
1282 [clinic start generated code]*/
1283 
1284 static PyObject *
_sre_SRE_Pattern___copy___impl(PatternObject * self)1285 _sre_SRE_Pattern___copy___impl(PatternObject *self)
1286 /*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
1287 {
1288     Py_INCREF(self);
1289     return (PyObject *)self;
1290 }
1291 
1292 /*[clinic input]
1293 _sre.SRE_Pattern.__deepcopy__
1294 
1295     memo: object
1296     /
1297 
1298 [clinic start generated code]*/
1299 
1300 static PyObject *
_sre_SRE_Pattern___deepcopy__(PatternObject * self,PyObject * memo)1301 _sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
1302 /*[clinic end generated code: output=2ad25679c1f1204a input=a465b1602f997bed]*/
1303 {
1304     Py_INCREF(self);
1305     return (PyObject *)self;
1306 }
1307 
1308 static PyObject *
pattern_repr(PatternObject * obj)1309 pattern_repr(PatternObject *obj)
1310 {
1311     static const struct {
1312         const char *name;
1313         int value;
1314     } flag_names[] = {
1315         {"re.TEMPLATE", SRE_FLAG_TEMPLATE},
1316         {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
1317         {"re.LOCALE", SRE_FLAG_LOCALE},
1318         {"re.MULTILINE", SRE_FLAG_MULTILINE},
1319         {"re.DOTALL", SRE_FLAG_DOTALL},
1320         {"re.UNICODE", SRE_FLAG_UNICODE},
1321         {"re.VERBOSE", SRE_FLAG_VERBOSE},
1322         {"re.DEBUG", SRE_FLAG_DEBUG},
1323         {"re.ASCII", SRE_FLAG_ASCII},
1324     };
1325     PyObject *result = NULL;
1326     PyObject *flag_items;
1327     size_t i;
1328     int flags = obj->flags;
1329 
1330     /* Omit re.UNICODE for valid string patterns. */
1331     if (obj->isbytes == 0 &&
1332         (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
1333          SRE_FLAG_UNICODE)
1334         flags &= ~SRE_FLAG_UNICODE;
1335 
1336     flag_items = PyList_New(0);
1337     if (!flag_items)
1338         return NULL;
1339 
1340     for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
1341         if (flags & flag_names[i].value) {
1342             PyObject *item = PyUnicode_FromString(flag_names[i].name);
1343             if (!item)
1344                 goto done;
1345 
1346             if (PyList_Append(flag_items, item) < 0) {
1347                 Py_DECREF(item);
1348                 goto done;
1349             }
1350             Py_DECREF(item);
1351             flags &= ~flag_names[i].value;
1352         }
1353     }
1354     if (flags) {
1355         PyObject *item = PyUnicode_FromFormat("0x%x", flags);
1356         if (!item)
1357             goto done;
1358 
1359         if (PyList_Append(flag_items, item) < 0) {
1360             Py_DECREF(item);
1361             goto done;
1362         }
1363         Py_DECREF(item);
1364     }
1365 
1366     if (PyList_Size(flag_items) > 0) {
1367         PyObject *flags_result;
1368         PyObject *sep = PyUnicode_FromString("|");
1369         if (!sep)
1370             goto done;
1371         flags_result = PyUnicode_Join(sep, flag_items);
1372         Py_DECREF(sep);
1373         if (!flags_result)
1374             goto done;
1375         result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
1376                                       obj->pattern, flags_result);
1377         Py_DECREF(flags_result);
1378     }
1379     else {
1380         result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
1381     }
1382 
1383 done:
1384     Py_DECREF(flag_items);
1385     return result;
1386 }
1387 
1388 PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
1389 
1390 /* PatternObject's 'groupindex' method. */
1391 static PyObject *
pattern_groupindex(PatternObject * self,void * Py_UNUSED (ignored))1392 pattern_groupindex(PatternObject *self, void *Py_UNUSED(ignored))
1393 {
1394     if (self->groupindex == NULL)
1395         return PyDict_New();
1396     return PyDictProxy_New(self->groupindex);
1397 }
1398 
1399 static int _validate(PatternObject *self); /* Forward */
1400 
1401 /*[clinic input]
1402 _sre.compile
1403 
1404     pattern: object
1405     flags: int
1406     code: object(subclass_of='&PyList_Type')
1407     groups: Py_ssize_t
1408     groupindex: object(subclass_of='&PyDict_Type')
1409     indexgroup: object(subclass_of='&PyTuple_Type')
1410 
1411 [clinic start generated code]*/
1412 
1413 static PyObject *
_sre_compile_impl(PyObject * module,PyObject * pattern,int flags,PyObject * code,Py_ssize_t groups,PyObject * groupindex,PyObject * indexgroup)1414 _sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
1415                   PyObject *code, Py_ssize_t groups, PyObject *groupindex,
1416                   PyObject *indexgroup)
1417 /*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
1418 {
1419     /* "compile" pattern descriptor to pattern object */
1420 
1421     _sremodulestate *module_state = get_sre_module_state(module);
1422     PatternObject* self;
1423     Py_ssize_t i, n;
1424 
1425     n = PyList_GET_SIZE(code);
1426     /* coverity[ampersand_in_size] */
1427     self = PyObject_GC_NewVar(PatternObject, module_state->Pattern_Type, n);
1428     if (!self)
1429         return NULL;
1430     self->weakreflist = NULL;
1431     self->pattern = NULL;
1432     self->groupindex = NULL;
1433     self->indexgroup = NULL;
1434 
1435     self->codesize = n;
1436 
1437     for (i = 0; i < n; i++) {
1438         PyObject *o = PyList_GET_ITEM(code, i);
1439         unsigned long value = PyLong_AsUnsignedLong(o);
1440         self->code[i] = (SRE_CODE) value;
1441         if ((unsigned long) self->code[i] != value) {
1442             PyErr_SetString(PyExc_OverflowError,
1443                             "regular expression code size limit exceeded");
1444             break;
1445         }
1446     }
1447     PyObject_GC_Track(self);
1448 
1449     if (PyErr_Occurred()) {
1450         Py_DECREF(self);
1451         return NULL;
1452     }
1453 
1454     if (pattern == Py_None) {
1455         self->isbytes = -1;
1456     }
1457     else {
1458         Py_ssize_t p_length;
1459         int charsize;
1460         Py_buffer view;
1461         view.buf = NULL;
1462         if (!getstring(pattern, &p_length, &self->isbytes,
1463                        &charsize, &view)) {
1464             Py_DECREF(self);
1465             return NULL;
1466         }
1467         if (view.buf)
1468             PyBuffer_Release(&view);
1469     }
1470 
1471     Py_INCREF(pattern);
1472     self->pattern = pattern;
1473 
1474     self->flags = flags;
1475 
1476     self->groups = groups;
1477 
1478     if (PyDict_GET_SIZE(groupindex) > 0) {
1479         Py_INCREF(groupindex);
1480         self->groupindex = groupindex;
1481         if (PyTuple_GET_SIZE(indexgroup) > 0) {
1482             Py_INCREF(indexgroup);
1483             self->indexgroup = indexgroup;
1484         }
1485     }
1486 
1487     if (!_validate(self)) {
1488         Py_DECREF(self);
1489         return NULL;
1490     }
1491 
1492     return (PyObject*) self;
1493 }
1494 
1495 /* -------------------------------------------------------------------- */
1496 /* Code validation */
1497 
1498 /* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
1500    for conformance with the code patterns generated there.
1501 
1502    The nice thing about the generated code is that it is position-independent:
1503    all jumps are relative jumps forward.  Also, jumps don't cross each other:
1504    the target of a later jump is always earlier than the target of an earlier
1505    jump.  IOW, this is okay:
1506 
1507    J---------J-------T--------T
1508     \         \_____/        /
1509      \______________________/
1510 
1511    but this is not:
1512 
1513    J---------J-------T--------T
1514     \_________\_____/        /
1515                \____________/
1516 
1517    It also helps that SRE_CODE is always an unsigned type.
1518 */
1519 
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete,
   parenthesized printf() argument list: VTRACE(("x=%d\n", x)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: trace the source line and return -1 from the function
   in which the macro is expanded. */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return -1; } while (0)

/* Extract opcode, argument, or skip count from code array.

   All of these expect the enclosing function to have `code` (the read
   position) and `end` (one past the last valid word) in scope, plus the
   variable they store into: `op`, `arg` or `skip`.  Each one FAILs when
   `code` has already reached `end`, and otherwise reads one word and
   advances `code` past it. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* GET_SKIP_ADJ additionally FAILs when skip-(adj) is larger than the
   number of words between the skip word itself and `end`.  `adj` is
   parenthesized so that an expression argument is subtracted as a
   whole. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > (uintptr_t)(end - code))       \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
1560 
1561 static int
_validate_charset(SRE_CODE * code,SRE_CODE * end)1562 _validate_charset(SRE_CODE *code, SRE_CODE *end)
1563 {
1564     /* Some variables are manipulated by the macros above */
1565     SRE_CODE op;
1566     SRE_CODE arg;
1567     SRE_CODE offset;
1568     int i;
1569 
1570     while (code < end) {
1571         GET_OP;
1572         switch (op) {
1573 
1574         case SRE_OP_NEGATE:
1575             break;
1576 
1577         case SRE_OP_LITERAL:
1578             GET_ARG;
1579             break;
1580 
1581         case SRE_OP_RANGE:
1582         case SRE_OP_RANGE_UNI_IGNORE:
1583             GET_ARG;
1584             GET_ARG;
1585             break;
1586 
1587         case SRE_OP_CHARSET:
1588             offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
1589             if (offset > (uintptr_t)(end - code))
1590                 FAIL;
1591             code += offset;
1592             break;
1593 
1594         case SRE_OP_BIGCHARSET:
1595             GET_ARG; /* Number of blocks */
1596             offset = 256/sizeof(SRE_CODE); /* 256-byte table */
1597             if (offset > (uintptr_t)(end - code))
1598                 FAIL;
1599             /* Make sure that each byte points to a valid block */
1600             for (i = 0; i < 256; i++) {
1601                 if (((unsigned char *)code)[i] >= arg)
1602                     FAIL;
1603             }
1604             code += offset;
1605             offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
1606             if (offset > (uintptr_t)(end - code))
1607                 FAIL;
1608             code += offset;
1609             break;
1610 
1611         case SRE_OP_CATEGORY:
1612             GET_ARG;
1613             switch (arg) {
1614             case SRE_CATEGORY_DIGIT:
1615             case SRE_CATEGORY_NOT_DIGIT:
1616             case SRE_CATEGORY_SPACE:
1617             case SRE_CATEGORY_NOT_SPACE:
1618             case SRE_CATEGORY_WORD:
1619             case SRE_CATEGORY_NOT_WORD:
1620             case SRE_CATEGORY_LINEBREAK:
1621             case SRE_CATEGORY_NOT_LINEBREAK:
1622             case SRE_CATEGORY_LOC_WORD:
1623             case SRE_CATEGORY_LOC_NOT_WORD:
1624             case SRE_CATEGORY_UNI_DIGIT:
1625             case SRE_CATEGORY_UNI_NOT_DIGIT:
1626             case SRE_CATEGORY_UNI_SPACE:
1627             case SRE_CATEGORY_UNI_NOT_SPACE:
1628             case SRE_CATEGORY_UNI_WORD:
1629             case SRE_CATEGORY_UNI_NOT_WORD:
1630             case SRE_CATEGORY_UNI_LINEBREAK:
1631             case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1632                 break;
1633             default:
1634                 FAIL;
1635             }
1636             break;
1637 
1638         default:
1639             FAIL;
1640 
1641         }
1642     }
1643 
1644     return 0;
1645 }
1646 
1647 /* Returns 0 on success, -1 on failure, and 1 if the last op is JUMP. */
1648 static int
_validate_inner(SRE_CODE * code,SRE_CODE * end,Py_ssize_t groups)1649 _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1650 {
1651     /* Some variables are manipulated by the macros above */
1652     SRE_CODE op;
1653     SRE_CODE arg;
1654     SRE_CODE skip;
1655 
1656     VTRACE(("code=%p, end=%p\n", code, end));
1657 
1658     if (code > end)
1659         FAIL;
1660 
1661     while (code < end) {
1662         GET_OP;
1663         switch (op) {
1664 
1665         case SRE_OP_MARK:
1666             /* We don't check whether marks are properly nested; the
1667                sre_match() code is robust even if they don't, and the worst
1668                you can get is nonsensical match results. */
1669             GET_ARG;
1670             if (arg > 2 * (size_t)groups + 1) {
1671                 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1672                 FAIL;
1673             }
1674             break;
1675 
1676         case SRE_OP_LITERAL:
1677         case SRE_OP_NOT_LITERAL:
1678         case SRE_OP_LITERAL_IGNORE:
1679         case SRE_OP_NOT_LITERAL_IGNORE:
1680         case SRE_OP_LITERAL_UNI_IGNORE:
1681         case SRE_OP_NOT_LITERAL_UNI_IGNORE:
1682         case SRE_OP_LITERAL_LOC_IGNORE:
1683         case SRE_OP_NOT_LITERAL_LOC_IGNORE:
1684             GET_ARG;
1685             /* The arg is just a character, nothing to check */
1686             break;
1687 
1688         case SRE_OP_SUCCESS:
1689         case SRE_OP_FAILURE:
1690             /* Nothing to check; these normally end the matching process */
1691             break;
1692 
1693         case SRE_OP_AT:
1694             GET_ARG;
1695             switch (arg) {
1696             case SRE_AT_BEGINNING:
1697             case SRE_AT_BEGINNING_STRING:
1698             case SRE_AT_BEGINNING_LINE:
1699             case SRE_AT_END:
1700             case SRE_AT_END_LINE:
1701             case SRE_AT_END_STRING:
1702             case SRE_AT_BOUNDARY:
1703             case SRE_AT_NON_BOUNDARY:
1704             case SRE_AT_LOC_BOUNDARY:
1705             case SRE_AT_LOC_NON_BOUNDARY:
1706             case SRE_AT_UNI_BOUNDARY:
1707             case SRE_AT_UNI_NON_BOUNDARY:
1708                 break;
1709             default:
1710                 FAIL;
1711             }
1712             break;
1713 
1714         case SRE_OP_ANY:
1715         case SRE_OP_ANY_ALL:
1716             /* These have no operands */
1717             break;
1718 
1719         case SRE_OP_IN:
1720         case SRE_OP_IN_IGNORE:
1721         case SRE_OP_IN_UNI_IGNORE:
1722         case SRE_OP_IN_LOC_IGNORE:
1723             GET_SKIP;
1724             /* Stop 1 before the end; we check the FAILURE below */
1725             if (_validate_charset(code, code+skip-2))
1726                 FAIL;
1727             if (code[skip-2] != SRE_OP_FAILURE)
1728                 FAIL;
1729             code += skip-1;
1730             break;
1731 
1732         case SRE_OP_INFO:
1733             {
1734                 /* A minimal info field is
1735                    <INFO> <1=skip> <2=flags> <3=min> <4=max>;
1736                    If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
1737                    more follows. */
1738                 SRE_CODE flags, i;
1739                 SRE_CODE *newcode;
1740                 GET_SKIP;
1741                 newcode = code+skip-1;
1742                 GET_ARG; flags = arg;
1743                 GET_ARG;
1744                 GET_ARG;
1745                 /* Check that only valid flags are present */
1746                 if ((flags & ~(SRE_INFO_PREFIX |
1747                                SRE_INFO_LITERAL |
1748                                SRE_INFO_CHARSET)) != 0)
1749                     FAIL;
1750                 /* PREFIX and CHARSET are mutually exclusive */
1751                 if ((flags & SRE_INFO_PREFIX) &&
1752                     (flags & SRE_INFO_CHARSET))
1753                     FAIL;
1754                 /* LITERAL implies PREFIX */
1755                 if ((flags & SRE_INFO_LITERAL) &&
1756                     !(flags & SRE_INFO_PREFIX))
1757                     FAIL;
1758                 /* Validate the prefix */
1759                 if (flags & SRE_INFO_PREFIX) {
1760                     SRE_CODE prefix_len;
1761                     GET_ARG; prefix_len = arg;
1762                     GET_ARG;
1763                     /* Here comes the prefix string */
1764                     if (prefix_len > (uintptr_t)(newcode - code))
1765                         FAIL;
1766                     code += prefix_len;
1767                     /* And here comes the overlap table */
1768                     if (prefix_len > (uintptr_t)(newcode - code))
1769                         FAIL;
1770                     /* Each overlap value should be < prefix_len */
1771                     for (i = 0; i < prefix_len; i++) {
1772                         if (code[i] >= prefix_len)
1773                             FAIL;
1774                     }
1775                     code += prefix_len;
1776                 }
1777                 /* Validate the charset */
1778                 if (flags & SRE_INFO_CHARSET) {
1779                     if (_validate_charset(code, newcode-1))
1780                         FAIL;
1781                     if (newcode[-1] != SRE_OP_FAILURE)
1782                         FAIL;
1783                     code = newcode;
1784                 }
1785                 else if (code != newcode) {
1786                   VTRACE(("code=%p, newcode=%p\n", code, newcode));
1787                     FAIL;
1788                 }
1789             }
1790             break;
1791 
1792         case SRE_OP_BRANCH:
1793             {
1794                 SRE_CODE *target = NULL;
1795                 for (;;) {
1796                     GET_SKIP;
1797                     if (skip == 0)
1798                         break;
1799                     /* Stop 2 before the end; we check the JUMP below */
1800                     if (_validate_inner(code, code+skip-3, groups))
1801                         FAIL;
1802                     code += skip-3;
1803                     /* Check that it ends with a JUMP, and that each JUMP
1804                        has the same target */
1805                     GET_OP;
1806                     if (op != SRE_OP_JUMP)
1807                         FAIL;
1808                     GET_SKIP;
1809                     if (target == NULL)
1810                         target = code+skip-1;
1811                     else if (code+skip-1 != target)
1812                         FAIL;
1813                 }
1814                 if (code != target)
1815                     FAIL;
1816             }
1817             break;
1818 
1819         case SRE_OP_REPEAT_ONE:
1820         case SRE_OP_MIN_REPEAT_ONE:
1821         case SRE_OP_POSSESSIVE_REPEAT_ONE:
1822             {
1823                 SRE_CODE min, max;
1824                 GET_SKIP;
1825                 GET_ARG; min = arg;
1826                 GET_ARG; max = arg;
1827                 if (min > max)
1828                     FAIL;
1829                 if (max > SRE_MAXREPEAT)
1830                     FAIL;
1831                 if (_validate_inner(code, code+skip-4, groups))
1832                     FAIL;
1833                 code += skip-4;
1834                 GET_OP;
1835                 if (op != SRE_OP_SUCCESS)
1836                     FAIL;
1837             }
1838             break;
1839 
1840         case SRE_OP_REPEAT:
1841         case SRE_OP_POSSESSIVE_REPEAT:
1842             {
1843                 SRE_CODE op1 = op, min, max;
1844                 GET_SKIP;
1845                 GET_ARG; min = arg;
1846                 GET_ARG; max = arg;
1847                 if (min > max)
1848                     FAIL;
1849                 if (max > SRE_MAXREPEAT)
1850                     FAIL;
1851                 if (_validate_inner(code, code+skip-3, groups))
1852                     FAIL;
1853                 code += skip-3;
1854                 GET_OP;
1855                 if (op1 == SRE_OP_POSSESSIVE_REPEAT) {
1856                     if (op != SRE_OP_SUCCESS)
1857                         FAIL;
1858                 }
1859                 else {
1860                     if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
1861                         FAIL;
1862                 }
1863             }
1864             break;
1865 
1866         case SRE_OP_ATOMIC_GROUP:
1867             {
1868                 GET_SKIP;
1869                 if (_validate_inner(code, code+skip-2, groups))
1870                     FAIL;
1871                 code += skip-2;
1872                 GET_OP;
1873                 if (op != SRE_OP_SUCCESS)
1874                     FAIL;
1875             }
1876             break;
1877 
1878         case SRE_OP_GROUPREF:
1879         case SRE_OP_GROUPREF_IGNORE:
1880         case SRE_OP_GROUPREF_UNI_IGNORE:
1881         case SRE_OP_GROUPREF_LOC_IGNORE:
1882             GET_ARG;
1883             if (arg >= (size_t)groups)
1884                 FAIL;
1885             break;
1886 
1887         case SRE_OP_GROUPREF_EXISTS:
1888             /* The regex syntax for this is: '(?(group)then|else)', where
1889                'group' is either an integer group number or a group name,
1890                'then' and 'else' are sub-regexes, and 'else' is optional. */
1891             GET_ARG;
1892             if (arg >= (size_t)groups)
1893                 FAIL;
1894             GET_SKIP_ADJ(1);
1895             code--; /* The skip is relative to the first arg! */
1896             /* There are two possibilities here: if there is both a 'then'
1897                part and an 'else' part, the generated code looks like:
1898 
1899                GROUPREF_EXISTS
1900                <group>
1901                <skipyes>
1902                ...then part...
1903                JUMP
1904                <skipno>
1905                (<skipyes> jumps here)
1906                ...else part...
1907                (<skipno> jumps here)
1908 
1909                If there is only a 'then' part, it looks like:
1910 
1911                GROUPREF_EXISTS
1912                <group>
1913                <skip>
1914                ...then part...
1915                (<skip> jumps here)
1916 
1917                There is no direct way to decide which it is, and we don't want
1918                to allow arbitrary jumps anywhere in the code; so we just look
1919                for a JUMP opcode preceding our skip target.
1920             */
1921             VTRACE(("then part:\n"));
1922             int rc = _validate_inner(code+1, code+skip-1, groups);
1923             if (rc == 1) {
1924                 VTRACE(("else part:\n"));
1925                 code += skip-2; /* Position after JUMP, at <skipno> */
1926                 GET_SKIP;
1927                 rc = _validate_inner(code, code+skip-1, groups);
1928             }
1929             if (rc)
1930                 FAIL;
1931             code += skip-1;
1932             break;
1933 
1934         case SRE_OP_ASSERT:
1935         case SRE_OP_ASSERT_NOT:
1936             GET_SKIP;
1937             GET_ARG; /* 0 for lookahead, width for lookbehind */
1938             code--; /* Back up over arg to simplify math below */
1939             if (arg & 0x80000000)
1940                 FAIL; /* Width too large */
1941             /* Stop 1 before the end; we check the SUCCESS below */
1942             if (_validate_inner(code+1, code+skip-2, groups))
1943                 FAIL;
1944             code += skip-2;
1945             GET_OP;
1946             if (op != SRE_OP_SUCCESS)
1947                 FAIL;
1948             break;
1949 
1950         case SRE_OP_JUMP:
1951             if (code + 1 != end)
1952                 FAIL;
1953             VTRACE(("JUMP: %d\n", __LINE__));
1954             return 1;
1955 
1956         default:
1957             FAIL;
1958 
1959         }
1960     }
1961 
1962     VTRACE(("okay\n"));
1963     return 0;
1964 }
1965 
1966 static int
_validate_outer(SRE_CODE * code,SRE_CODE * end,Py_ssize_t groups)1967 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1968 {
1969     if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
1970         code >= end || end[-1] != SRE_OP_SUCCESS)
1971         FAIL;
1972     return _validate_inner(code, end-1, groups);
1973 }
1974 
1975 static int
_validate(PatternObject * self)1976 _validate(PatternObject *self)
1977 {
1978     if (_validate_outer(self->code, self->code+self->codesize, self->groups))
1979     {
1980         PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
1981         return 0;
1982     }
1983     else
1984         VTRACE(("Success!\n"));
1985     return 1;
1986 }
1987 
1988 /* -------------------------------------------------------------------- */
1989 /* match methods */
1990 
/* Traversal function for match objects (installed as Py_tp_traverse in
   match_slots).  Visits the object's type and the three object references
   it holds: string, regs and pattern.  Py_VISIT relies on the parameter
   names `visit` and `arg`. */
static int
match_traverse(MatchObject *self, visitproc visit, void *arg)
{
    Py_VISIT(Py_TYPE(self));
    Py_VISIT(self->string);
    Py_VISIT(self->regs);
    Py_VISIT(self->pattern);
    return 0;
}
2000 
/* Clear function for match objects (installed as Py_tp_clear in
   match_slots, and also called from match_dealloc).  Releases the string,
   regs and pattern references and always returns 0. */
static int
match_clear(MatchObject *self)
{
    Py_CLEAR(self->string);
    Py_CLEAR(self->regs);
    Py_CLEAR(self->pattern);
    return 0;
}
2009 
2010 static void
match_dealloc(MatchObject * self)2011 match_dealloc(MatchObject* self)
2012 {
2013     PyTypeObject *tp = Py_TYPE(self);
2014 
2015     PyObject_GC_UnTrack(self);
2016     (void)match_clear(self);
2017     tp->tp_free(self);
2018     Py_DECREF(tp);
2019 }
2020 
2021 static PyObject*
match_getslice_by_index(MatchObject * self,Py_ssize_t index,PyObject * def)2022 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
2023 {
2024     Py_ssize_t length;
2025     int isbytes, charsize;
2026     Py_buffer view;
2027     PyObject *result;
2028     const void* ptr;
2029     Py_ssize_t i, j;
2030 
2031     assert(0 <= index && index < self->groups);
2032     index *= 2;
2033 
2034     if (self->string == Py_None || self->mark[index] < 0) {
2035         /* return default value if the string or group is undefined */
2036         Py_INCREF(def);
2037         return def;
2038     }
2039 
2040     ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
2041     if (ptr == NULL)
2042         return NULL;
2043 
2044     i = self->mark[index];
2045     j = self->mark[index+1];
2046     i = Py_MIN(i, length);
2047     j = Py_MIN(j, length);
2048     result = getslice(isbytes, ptr, self->string, i, j);
2049     if (isbytes && view.buf != NULL)
2050         PyBuffer_Release(&view);
2051     return result;
2052 }
2053 
2054 static Py_ssize_t
match_getindex(MatchObject * self,PyObject * index)2055 match_getindex(MatchObject* self, PyObject* index)
2056 {
2057     Py_ssize_t i;
2058 
2059     if (index == NULL)
2060         /* Default value */
2061         return 0;
2062 
2063     if (PyIndex_Check(index)) {
2064         i = PyNumber_AsSsize_t(index, NULL);
2065     }
2066     else {
2067         i = -1;
2068 
2069         if (self->pattern->groupindex) {
2070             index = PyDict_GetItemWithError(self->pattern->groupindex, index);
2071             if (index && PyLong_Check(index)) {
2072                 i = PyLong_AsSsize_t(index);
2073             }
2074         }
2075     }
2076     if (i < 0 || i >= self->groups) {
2077         /* raise IndexError if we were given a bad group number */
2078         if (!PyErr_Occurred()) {
2079             PyErr_SetString(PyExc_IndexError, "no such group");
2080         }
2081         return -1;
2082     }
2083 
2084     return i;
2085 }
2086 
2087 static PyObject*
match_getslice(MatchObject * self,PyObject * index,PyObject * def)2088 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2089 {
2090     Py_ssize_t i = match_getindex(self, index);
2091 
2092     if (i < 0) {
2093         return NULL;
2094     }
2095 
2096     return match_getslice_by_index(self, i, def);
2097 }
2098 
2099 /*[clinic input]
2100 _sre.SRE_Match.expand
2101 
2102     template: object
2103 
2104 Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
2105 [clinic start generated code]*/
2106 
2107 static PyObject *
_sre_SRE_Match_expand_impl(MatchObject * self,PyObject * template)2108 _sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
2109 /*[clinic end generated code: output=931b58ccc323c3a1 input=4bfdb22c2f8b146a]*/
2110 {
2111     /* delegate to Python code */
2112     return call(
2113         SRE_PY_MODULE, "_expand",
2114         PyTuple_Pack(3, self->pattern, self, template)
2115         );
2116 }
2117 
2118 static PyObject*
match_group(MatchObject * self,PyObject * args)2119 match_group(MatchObject* self, PyObject* args)
2120 {
2121     PyObject* result;
2122     Py_ssize_t i, size;
2123 
2124     size = PyTuple_GET_SIZE(args);
2125 
2126     switch (size) {
2127     case 0:
2128         result = match_getslice(self, _PyLong_GetZero(), Py_None);
2129         break;
2130     case 1:
2131         result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2132         break;
2133     default:
2134         /* fetch multiple items */
2135         result = PyTuple_New(size);
2136         if (!result)
2137             return NULL;
2138         for (i = 0; i < size; i++) {
2139             PyObject* item = match_getslice(
2140                 self, PyTuple_GET_ITEM(args, i), Py_None
2141                 );
2142             if (!item) {
2143                 Py_DECREF(result);
2144                 return NULL;
2145             }
2146             PyTuple_SET_ITEM(result, i, item);
2147         }
2148         break;
2149     }
2150     return result;
2151 }
2152 
/* Subscript handler for match objects (installed as Py_mp_subscript in
   match_slots): m[name] is looked up through match_getslice() with None as
   the default for groups that did not participate. */
static PyObject*
match_getitem(MatchObject* self, PyObject* name)
{
    return match_getslice(self, name, Py_None);
}
2158 
2159 /*[clinic input]
2160 _sre.SRE_Match.groups
2161 
2162     default: object = None
2163         Is used for groups that did not participate in the match.
2164 
2165 Return a tuple containing all the subgroups of the match, from 1.
2166 [clinic start generated code]*/
2167 
2168 static PyObject *
_sre_SRE_Match_groups_impl(MatchObject * self,PyObject * default_value)2169 _sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
2170 /*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
2171 {
2172     PyObject* result;
2173     Py_ssize_t index;
2174 
2175     result = PyTuple_New(self->groups-1);
2176     if (!result)
2177         return NULL;
2178 
2179     for (index = 1; index < self->groups; index++) {
2180         PyObject* item;
2181         item = match_getslice_by_index(self, index, default_value);
2182         if (!item) {
2183             Py_DECREF(result);
2184             return NULL;
2185         }
2186         PyTuple_SET_ITEM(result, index-1, item);
2187     }
2188 
2189     return result;
2190 }
2191 
2192 /*[clinic input]
2193 _sre.SRE_Match.groupdict
2194 
2195     default: object = None
2196         Is used for groups that did not participate in the match.
2197 
2198 Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
2199 [clinic start generated code]*/
2200 
2201 static PyObject *
_sre_SRE_Match_groupdict_impl(MatchObject * self,PyObject * default_value)2202 _sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
2203 /*[clinic end generated code: output=29917c9073e41757 input=0ded7960b23780aa]*/
2204 {
2205     PyObject *result;
2206     PyObject *key;
2207     PyObject *value;
2208     Py_ssize_t pos = 0;
2209     Py_hash_t hash;
2210 
2211     result = PyDict_New();
2212     if (!result || !self->pattern->groupindex)
2213         return result;
2214 
2215     while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
2216         int status;
2217         Py_INCREF(key);
2218         value = match_getslice(self, key, default_value);
2219         if (!value) {
2220             Py_DECREF(key);
2221             goto failed;
2222         }
2223         status = _PyDict_SetItem_KnownHash(result, key, value, hash);
2224         Py_DECREF(value);
2225         Py_DECREF(key);
2226         if (status < 0)
2227             goto failed;
2228     }
2229 
2230     return result;
2231 
2232 failed:
2233     Py_DECREF(result);
2234     return NULL;
2235 }
2236 
2237 /*[clinic input]
2238 _sre.SRE_Match.start -> Py_ssize_t
2239 
2240     group: object(c_default="NULL") = 0
2241     /
2242 
2243 Return index of the start of the substring matched by group.
2244 [clinic start generated code]*/
2245 
2246 static Py_ssize_t
_sre_SRE_Match_start_impl(MatchObject * self,PyObject * group)2247 _sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
2248 /*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
2249 {
2250     Py_ssize_t index = match_getindex(self, group);
2251 
2252     if (index < 0) {
2253         return -1;
2254     }
2255 
2256     /* mark is -1 if group is undefined */
2257     return self->mark[index*2];
2258 }
2259 
2260 /*[clinic input]
2261 _sre.SRE_Match.end -> Py_ssize_t
2262 
2263     group: object(c_default="NULL") = 0
2264     /
2265 
2266 Return index of the end of the substring matched by group.
2267 [clinic start generated code]*/
2268 
2269 static Py_ssize_t
_sre_SRE_Match_end_impl(MatchObject * self,PyObject * group)2270 _sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
2271 /*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
2272 {
2273     Py_ssize_t index = match_getindex(self, group);
2274 
2275     if (index < 0) {
2276         return -1;
2277     }
2278 
2279     /* mark is -1 if group is undefined */
2280     return self->mark[index*2+1];
2281 }
2282 
2283 LOCAL(PyObject*)
_pair(Py_ssize_t i1,Py_ssize_t i2)2284 _pair(Py_ssize_t i1, Py_ssize_t i2)
2285 {
2286     PyObject* pair;
2287     PyObject* item;
2288 
2289     pair = PyTuple_New(2);
2290     if (!pair)
2291         return NULL;
2292 
2293     item = PyLong_FromSsize_t(i1);
2294     if (!item)
2295         goto error;
2296     PyTuple_SET_ITEM(pair, 0, item);
2297 
2298     item = PyLong_FromSsize_t(i2);
2299     if (!item)
2300         goto error;
2301     PyTuple_SET_ITEM(pair, 1, item);
2302 
2303     return pair;
2304 
2305   error:
2306     Py_DECREF(pair);
2307     return NULL;
2308 }
2309 
2310 /*[clinic input]
2311 _sre.SRE_Match.span
2312 
2313     group: object(c_default="NULL") = 0
2314     /
2315 
2316 For match object m, return the 2-tuple (m.start(group), m.end(group)).
2317 [clinic start generated code]*/
2318 
2319 static PyObject *
_sre_SRE_Match_span_impl(MatchObject * self,PyObject * group)2320 _sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
2321 /*[clinic end generated code: output=f02ae40594d14fe6 input=8fa6014e982d71d4]*/
2322 {
2323     Py_ssize_t index = match_getindex(self, group);
2324 
2325     if (index < 0) {
2326         return NULL;
2327     }
2328 
2329     /* marks are -1 if group is undefined */
2330     return _pair(self->mark[index*2], self->mark[index*2+1]);
2331 }
2332 
2333 static PyObject*
match_regs(MatchObject * self)2334 match_regs(MatchObject* self)
2335 {
2336     PyObject* regs;
2337     PyObject* item;
2338     Py_ssize_t index;
2339 
2340     regs = PyTuple_New(self->groups);
2341     if (!regs)
2342         return NULL;
2343 
2344     for (index = 0; index < self->groups; index++) {
2345         item = _pair(self->mark[index*2], self->mark[index*2+1]);
2346         if (!item) {
2347             Py_DECREF(regs);
2348             return NULL;
2349         }
2350         PyTuple_SET_ITEM(regs, index, item);
2351     }
2352 
2353     Py_INCREF(regs);
2354     self->regs = regs;
2355 
2356     return regs;
2357 }
2358 
2359 /*[clinic input]
2360 _sre.SRE_Match.__copy__
2361 
2362 [clinic start generated code]*/
2363 
2364 static PyObject *
_sre_SRE_Match___copy___impl(MatchObject * self)2365 _sre_SRE_Match___copy___impl(MatchObject *self)
2366 /*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
2367 {
2368     Py_INCREF(self);
2369     return (PyObject *)self;
2370 }
2371 
2372 /*[clinic input]
2373 _sre.SRE_Match.__deepcopy__
2374 
2375     memo: object
2376     /
2377 
2378 [clinic start generated code]*/
2379 
2380 static PyObject *
_sre_SRE_Match___deepcopy__(MatchObject * self,PyObject * memo)2381 _sre_SRE_Match___deepcopy__(MatchObject *self, PyObject *memo)
2382 /*[clinic end generated code: output=ba7cb46d655e4ee2 input=779d12a31c2c325e]*/
2383 {
2384     Py_INCREF(self);
2385     return (PyObject *)self;
2386 }
2387 
/* Type docstring; referenced from match_slots as Py_tp_doc. */
PyDoc_STRVAR(match_doc,
"The result of re.match() and re.search().\n\
Match objects always have a boolean value of True.");

/* Docstring for the hand-written "group" entry in match_methods. */
PyDoc_STRVAR(match_group_doc,
"group([group1, ...]) -> str or tuple.\n\
    Return subgroup(s) of the match by indices or names.\n\
    For 0 returns the entire match.");
2396 
2397 static PyObject *
match_lastindex_get(MatchObject * self,void * Py_UNUSED (ignored))2398 match_lastindex_get(MatchObject *self, void *Py_UNUSED(ignored))
2399 {
2400     if (self->lastindex >= 0)
2401         return PyLong_FromSsize_t(self->lastindex);
2402     Py_RETURN_NONE;
2403 }
2404 
2405 static PyObject *
match_lastgroup_get(MatchObject * self,void * Py_UNUSED (ignored))2406 match_lastgroup_get(MatchObject *self, void *Py_UNUSED(ignored))
2407 {
2408     if (self->pattern->indexgroup &&
2409         self->lastindex >= 0 &&
2410         self->lastindex < PyTuple_GET_SIZE(self->pattern->indexgroup))
2411     {
2412         PyObject *result = PyTuple_GET_ITEM(self->pattern->indexgroup,
2413                                             self->lastindex);
2414         Py_INCREF(result);
2415         return result;
2416     }
2417     Py_RETURN_NONE;
2418 }
2419 
2420 static PyObject *
match_regs_get(MatchObject * self,void * Py_UNUSED (ignored))2421 match_regs_get(MatchObject *self, void *Py_UNUSED(ignored))
2422 {
2423     if (self->regs) {
2424         Py_INCREF(self->regs);
2425         return self->regs;
2426     } else
2427         return match_regs(self);
2428 }
2429 
2430 static PyObject *
match_repr(MatchObject * self)2431 match_repr(MatchObject *self)
2432 {
2433     PyObject *result;
2434     PyObject *group0 = match_getslice_by_index(self, 0, Py_None);
2435     if (group0 == NULL)
2436         return NULL;
2437     result = PyUnicode_FromFormat(
2438             "<%s object; span=(%zd, %zd), match=%.50R>",
2439             Py_TYPE(self)->tp_name,
2440             self->mark[0], self->mark[1], group0);
2441     Py_DECREF(group0);
2442     return result;
2443 }
2444 
2445 
2446 static PyObject*
pattern_new_match(_sremodulestate * module_state,PatternObject * pattern,SRE_STATE * state,Py_ssize_t status)2447 pattern_new_match(_sremodulestate* module_state,
2448                   PatternObject* pattern,
2449                   SRE_STATE* state,
2450                   Py_ssize_t status)
2451 {
2452     /* create match object (from state object) */
2453 
2454     MatchObject* match;
2455     Py_ssize_t i, j;
2456     char* base;
2457     int n;
2458 
2459     if (status > 0) {
2460 
2461         /* create match object (with room for extra group marks) */
2462         /* coverity[ampersand_in_size] */
2463         match = PyObject_GC_NewVar(MatchObject,
2464                                    module_state->Match_Type,
2465                                    2*(pattern->groups+1));
2466         if (!match)
2467             return NULL;
2468 
2469         Py_INCREF(pattern);
2470         match->pattern = pattern;
2471 
2472         Py_INCREF(state->string);
2473         match->string = state->string;
2474 
2475         match->regs = NULL;
2476         match->groups = pattern->groups+1;
2477 
2478         /* fill in group slices */
2479 
2480         base = (char*) state->beginning;
2481         n = state->charsize;
2482 
2483         match->mark[0] = ((char*) state->start - base) / n;
2484         match->mark[1] = ((char*) state->ptr - base) / n;
2485 
2486         for (i = j = 0; i < pattern->groups; i++, j+=2)
2487             if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2488                 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2489                 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2490 
2491                 /* check wrong span */
2492                 if (match->mark[j+2] > match->mark[j+3]) {
2493                     PyErr_SetString(PyExc_SystemError,
2494                                     "The span of capturing group is wrong,"
2495                                     " please report a bug for the re module.");
2496                     Py_DECREF(match);
2497                     return NULL;
2498                 }
2499             } else
2500                 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2501 
2502         match->pos = state->pos;
2503         match->endpos = state->endpos;
2504 
2505         match->lastindex = state->lastindex;
2506 
2507         PyObject_GC_Track(match);
2508         return (PyObject*) match;
2509 
2510     } else if (status == 0) {
2511 
2512         /* no match */
2513         Py_RETURN_NONE;
2514 
2515     }
2516 
2517     /* internal error */
2518     pattern_error(status);
2519     return NULL;
2520 }
2521 
2522 
2523 /* -------------------------------------------------------------------- */
2524 /* scanner methods (experimental) */
2525 
/* Traversal function for scanner objects: visits the object's type and the
   pattern reference.  Py_VISIT relies on the parameter names `visit` and
   `arg`. */
static int
scanner_traverse(ScannerObject *self, visitproc visit, void *arg)
{
    Py_VISIT(Py_TYPE(self));
    Py_VISIT(self->pattern);
    return 0;
}
2533 
/* Clear function for scanner objects (also called from scanner_dealloc):
   releases the pattern reference and always returns 0. */
static int
scanner_clear(ScannerObject *self)
{
    Py_CLEAR(self->pattern);
    return 0;
}
2540 
2541 static void
scanner_dealloc(ScannerObject * self)2542 scanner_dealloc(ScannerObject* self)
2543 {
2544     PyTypeObject *tp = Py_TYPE(self);
2545 
2546     PyObject_GC_UnTrack(self);
2547     state_fini(&self->state);
2548     (void)scanner_clear(self);
2549     tp->tp_free(self);
2550     Py_DECREF(tp);
2551 }
2552 
2553 static int
scanner_begin(ScannerObject * self)2554 scanner_begin(ScannerObject* self)
2555 {
2556     if (self->executing) {
2557         PyErr_SetString(PyExc_ValueError,
2558                         "regular expression scanner already executing");
2559         return 0;
2560     }
2561     self->executing = 1;
2562     return 1;
2563 }
2564 
/* Counterpart of scanner_begin(): clears the executing flag.  The flag is
   asserted to be set on entry. */
static void
scanner_end(ScannerObject* self)
{
    assert(self->executing);
    self->executing = 0;
}
2571 
2572 /*[clinic input]
2573 _sre.SRE_Scanner.match
2574 
2575     cls: defining_class
2576     /
2577 
2578 [clinic start generated code]*/
2579 
2580 static PyObject *
_sre_SRE_Scanner_match_impl(ScannerObject * self,PyTypeObject * cls)2581 _sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls)
2582 /*[clinic end generated code: output=6e22c149dc0f0325 input=b5146e1f30278cb7]*/
2583 {
2584     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2585     SRE_STATE* state = &self->state;
2586     PyObject* match;
2587     Py_ssize_t status;
2588 
2589     if (!scanner_begin(self)) {
2590         return NULL;
2591     }
2592     if (state->start == NULL) {
2593         scanner_end(self);
2594         Py_RETURN_NONE;
2595     }
2596 
2597     state_reset(state);
2598 
2599     state->ptr = state->start;
2600 
2601     status = sre_match(state, PatternObject_GetCode(self->pattern));
2602     if (PyErr_Occurred()) {
2603         scanner_end(self);
2604         return NULL;
2605     }
2606 
2607     match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2608                               state, status);
2609 
2610     if (status == 0)
2611         state->start = NULL;
2612     else {
2613         state->must_advance = (state->ptr == state->start);
2614         state->start = state->ptr;
2615     }
2616 
2617     scanner_end(self);
2618     return match;
2619 }
2620 
2621 
2622 /*[clinic input]
2623 _sre.SRE_Scanner.search
2624 
2625     cls: defining_class
2626     /
2627 
2628 [clinic start generated code]*/
2629 
2630 static PyObject *
_sre_SRE_Scanner_search_impl(ScannerObject * self,PyTypeObject * cls)2631 _sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
2632 /*[clinic end generated code: output=23e8fc78013f9161 input=056c2d37171d0bf2]*/
2633 {
2634     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2635     SRE_STATE* state = &self->state;
2636     PyObject* match;
2637     Py_ssize_t status;
2638 
2639     if (!scanner_begin(self)) {
2640         return NULL;
2641     }
2642     if (state->start == NULL) {
2643         scanner_end(self);
2644         Py_RETURN_NONE;
2645     }
2646 
2647     state_reset(state);
2648 
2649     state->ptr = state->start;
2650 
2651     status = sre_search(state, PatternObject_GetCode(self->pattern));
2652     if (PyErr_Occurred()) {
2653         scanner_end(self);
2654         return NULL;
2655     }
2656 
2657     match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2658                               state, status);
2659 
2660     if (status == 0)
2661         state->start = NULL;
2662     else {
2663         state->must_advance = (state->ptr == state->start);
2664         state->start = state->ptr;
2665     }
2666 
2667     scanner_end(self);
2668     return match;
2669 }
2670 
2671 static PyObject *
pattern_scanner(_sremodulestate * module_state,PatternObject * self,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)2672 pattern_scanner(_sremodulestate *module_state,
2673                 PatternObject *self,
2674                 PyObject *string,
2675                 Py_ssize_t pos,
2676                 Py_ssize_t endpos)
2677 {
2678     ScannerObject* scanner;
2679 
2680     /* create scanner object */
2681     scanner = PyObject_GC_New(ScannerObject, module_state->Scanner_Type);
2682     if (!scanner)
2683         return NULL;
2684     scanner->pattern = NULL;
2685     scanner->executing = 0;
2686 
2687     /* create search state object */
2688     if (!state_init(&scanner->state, self, string, pos, endpos)) {
2689         Py_DECREF(scanner);
2690         return NULL;
2691     }
2692 
2693     Py_INCREF(self);
2694     scanner->pattern = (PyObject*) self;
2695 
2696     PyObject_GC_Track(scanner);
2697     return (PyObject*) scanner;
2698 }
2699 
2700 static Py_hash_t
pattern_hash(PatternObject * self)2701 pattern_hash(PatternObject *self)
2702 {
2703     Py_hash_t hash, hash2;
2704 
2705     hash = PyObject_Hash(self->pattern);
2706     if (hash == -1) {
2707         return -1;
2708     }
2709 
2710     hash2 = _Py_HashBytes(self->code, sizeof(self->code[0]) * self->codesize);
2711     hash ^= hash2;
2712 
2713     hash ^= self->flags;
2714     hash ^= self->isbytes;
2715     hash ^= self->codesize;
2716 
2717     if (hash == -1) {
2718         hash = -2;
2719     }
2720     return hash;
2721 }
2722 
2723 static PyObject*
pattern_richcompare(PyObject * lefto,PyObject * righto,int op)2724 pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
2725 {
2726     PyTypeObject *tp = Py_TYPE(lefto);
2727     _sremodulestate *module_state = get_sre_module_state_by_class(tp);
2728     PatternObject *left, *right;
2729     int cmp;
2730 
2731     if (op != Py_EQ && op != Py_NE) {
2732         Py_RETURN_NOTIMPLEMENTED;
2733     }
2734 
2735     if (!Py_IS_TYPE(righto, module_state->Pattern_Type))
2736     {
2737         Py_RETURN_NOTIMPLEMENTED;
2738     }
2739 
2740     if (lefto == righto) {
2741         /* a pattern is equal to itself */
2742         return PyBool_FromLong(op == Py_EQ);
2743     }
2744 
2745     left = (PatternObject *)lefto;
2746     right = (PatternObject *)righto;
2747 
2748     cmp = (left->flags == right->flags
2749            && left->isbytes == right->isbytes
2750            && left->codesize == right->codesize);
2751     if (cmp) {
2752         /* Compare the code and the pattern because the same pattern can
2753            produce different codes depending on the locale used to compile the
2754            pattern when the re.LOCALE flag is used. Don't compare groups,
2755            indexgroup nor groupindex: they are derivated from the pattern. */
2756         cmp = (memcmp(left->code, right->code,
2757                       sizeof(left->code[0]) * left->codesize) == 0);
2758     }
2759     if (cmp) {
2760         cmp = PyObject_RichCompareBool(left->pattern, right->pattern,
2761                                        Py_EQ);
2762         if (cmp < 0) {
2763             return NULL;
2764         }
2765     }
2766     if (op == Py_NE) {
2767         cmp = !cmp;
2768     }
2769     return PyBool_FromLong(cmp);
2770 }
2771 
2772 #include "clinic/sre.c.h"
2773 
2774 static PyMethodDef pattern_methods[] = {
2775     _SRE_SRE_PATTERN_MATCH_METHODDEF
2776     _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
2777     _SRE_SRE_PATTERN_SEARCH_METHODDEF
2778     _SRE_SRE_PATTERN_SUB_METHODDEF
2779     _SRE_SRE_PATTERN_SUBN_METHODDEF
2780     _SRE_SRE_PATTERN_FINDALL_METHODDEF
2781     _SRE_SRE_PATTERN_SPLIT_METHODDEF
2782     _SRE_SRE_PATTERN_FINDITER_METHODDEF
2783     _SRE_SRE_PATTERN_SCANNER_METHODDEF
2784     _SRE_SRE_PATTERN___COPY___METHODDEF
2785     _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
2786     {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
2787      PyDoc_STR("See PEP 585")},
2788     {NULL, NULL}
2789 };
2790 
2791 static PyGetSetDef pattern_getset[] = {
2792     {"groupindex", (getter)pattern_groupindex, (setter)NULL,
2793       "A dictionary mapping group names to group numbers."},
2794     {NULL}  /* Sentinel */
2795 };
2796 
2797 #define PAT_OFF(x) offsetof(PatternObject, x)
2798 static PyMemberDef pattern_members[] = {
2799     {"pattern",    T_OBJECT,    PAT_OFF(pattern),       READONLY,
2800      "The pattern string from which the RE object was compiled."},
2801     {"flags",      T_INT,       PAT_OFF(flags),         READONLY,
2802      "The regex matching flags."},
2803     {"groups",     T_PYSSIZET,  PAT_OFF(groups),        READONLY,
2804      "The number of capturing groups in the pattern."},
2805     {"__weaklistoffset__", T_PYSSIZET, offsetof(PatternObject, weakreflist), READONLY},
2806     {NULL}  /* Sentinel */
2807 };
2808 
2809 static PyType_Slot pattern_slots[] = {
2810     {Py_tp_dealloc, (destructor)pattern_dealloc},
2811     {Py_tp_repr, (reprfunc)pattern_repr},
2812     {Py_tp_hash, (hashfunc)pattern_hash},
2813     {Py_tp_doc, (void *)pattern_doc},
2814     {Py_tp_richcompare, pattern_richcompare},
2815     {Py_tp_methods, pattern_methods},
2816     {Py_tp_members, pattern_members},
2817     {Py_tp_getset, pattern_getset},
2818     {Py_tp_traverse, pattern_traverse},
2819     {Py_tp_clear, pattern_clear},
2820     {0, NULL},
2821 };
2822 
2823 static PyType_Spec pattern_spec = {
2824     .name = "re.Pattern",
2825     .basicsize = sizeof(PatternObject),
2826     .itemsize = sizeof(SRE_CODE),
2827     .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
2828               Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
2829     .slots = pattern_slots,
2830 };
2831 
2832 static PyMethodDef match_methods[] = {
2833     {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
2834     _SRE_SRE_MATCH_START_METHODDEF
2835     _SRE_SRE_MATCH_END_METHODDEF
2836     _SRE_SRE_MATCH_SPAN_METHODDEF
2837     _SRE_SRE_MATCH_GROUPS_METHODDEF
2838     _SRE_SRE_MATCH_GROUPDICT_METHODDEF
2839     _SRE_SRE_MATCH_EXPAND_METHODDEF
2840     _SRE_SRE_MATCH___COPY___METHODDEF
2841     _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
2842     {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
2843      PyDoc_STR("See PEP 585")},
2844     {NULL, NULL}
2845 };
2846 
2847 static PyGetSetDef match_getset[] = {
2848     {"lastindex", (getter)match_lastindex_get, (setter)NULL,
2849      "The integer index of the last matched capturing group."},
2850     {"lastgroup", (getter)match_lastgroup_get, (setter)NULL,
2851      "The name of the last matched capturing group."},
2852     {"regs",      (getter)match_regs_get,      (setter)NULL},
2853     {NULL}
2854 };
2855 
2856 #define MATCH_OFF(x) offsetof(MatchObject, x)
2857 static PyMemberDef match_members[] = {
2858     {"string",  T_OBJECT,   MATCH_OFF(string),  READONLY,
2859      "The string passed to match() or search()."},
2860     {"re",      T_OBJECT,   MATCH_OFF(pattern), READONLY,
2861      "The regular expression object."},
2862     {"pos",     T_PYSSIZET, MATCH_OFF(pos),     READONLY,
2863      "The index into the string at which the RE engine started looking for a match."},
2864     {"endpos",  T_PYSSIZET, MATCH_OFF(endpos),  READONLY,
2865      "The index into the string beyond which the RE engine will not go."},
2866     {NULL}
2867 };
2868 
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
2871 static PyType_Slot match_slots[] = {
2872     {Py_tp_dealloc, match_dealloc},
2873     {Py_tp_repr, match_repr},
2874     {Py_tp_doc, (void *)match_doc},
2875     {Py_tp_methods, match_methods},
2876     {Py_tp_members, match_members},
2877     {Py_tp_getset, match_getset},
2878     {Py_tp_traverse, match_traverse},
2879     {Py_tp_clear, match_clear},
2880 
2881     /* As mapping.
2882      *
2883      * Match objects do not support length or assignment, but do support
2884      * __getitem__.
2885      */
2886     {Py_mp_subscript, match_getitem},
2887 
2888     {0, NULL},
2889 };
2890 
2891 static PyType_Spec match_spec = {
2892     .name = "re.Match",
2893     .basicsize = sizeof(MatchObject),
2894     .itemsize = sizeof(Py_ssize_t),
2895     .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
2896               Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
2897     .slots = match_slots,
2898 };
2899 
2900 static PyMethodDef scanner_methods[] = {
2901     _SRE_SRE_SCANNER_MATCH_METHODDEF
2902     _SRE_SRE_SCANNER_SEARCH_METHODDEF
2903     {NULL, NULL}
2904 };
2905 
2906 #define SCAN_OFF(x) offsetof(ScannerObject, x)
2907 static PyMemberDef scanner_members[] = {
2908     {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
2909     {NULL}  /* Sentinel */
2910 };
2911 
2912 static PyType_Slot scanner_slots[] = {
2913     {Py_tp_dealloc, scanner_dealloc},
2914     {Py_tp_methods, scanner_methods},
2915     {Py_tp_members, scanner_members},
2916     {Py_tp_traverse, scanner_traverse},
2917     {Py_tp_clear, scanner_clear},
2918     {0, NULL},
2919 };
2920 
2921 static PyType_Spec scanner_spec = {
2922     .name = "_" SRE_MODULE ".SRE_Scanner",
2923     .basicsize = sizeof(ScannerObject),
2924     .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
2925               Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
2926     .slots = scanner_slots,
2927 };
2928 
2929 static PyMethodDef _functions[] = {
2930     _SRE_COMPILE_METHODDEF
2931     _SRE_GETCODESIZE_METHODDEF
2932     _SRE_ASCII_ISCASED_METHODDEF
2933     _SRE_UNICODE_ISCASED_METHODDEF
2934     _SRE_ASCII_TOLOWER_METHODDEF
2935     _SRE_UNICODE_TOLOWER_METHODDEF
2936     {NULL, NULL}
2937 };
2938 
2939 static int
sre_traverse(PyObject * module,visitproc visit,void * arg)2940 sre_traverse(PyObject *module, visitproc visit, void *arg)
2941 {
2942     _sremodulestate *state = get_sre_module_state(module);
2943 
2944     Py_VISIT(state->Pattern_Type);
2945     Py_VISIT(state->Match_Type);
2946     Py_VISIT(state->Scanner_Type);
2947 
2948     return 0;
2949 }
2950 
2951 static int
sre_clear(PyObject * module)2952 sre_clear(PyObject *module)
2953 {
2954     _sremodulestate *state = get_sre_module_state(module);
2955 
2956     Py_CLEAR(state->Pattern_Type);
2957     Py_CLEAR(state->Match_Type);
2958     Py_CLEAR(state->Scanner_Type);
2959 
2960     return 0;
2961 }
2962 
2963 static void
sre_free(void * module)2964 sre_free(void *module)
2965 {
2966     sre_clear((PyObject *)module);
2967 }
2968 
/* Create a type from `spec` with PyType_FromModuleAndSpec(m, spec, NULL)
 * and store it in `type`.  If the result is NULL, jump to the `error`
 * label, which the enclosing function must provide. */
#define CREATE_TYPE(m, type, spec)                                          \
    do {                                                                    \
        type = (PyTypeObject *)PyType_FromModuleAndSpec(m, spec, NULL);     \
        if (!type) {                                                        \
            goto error;                                                     \
        }                                                                   \
    } while (0)
2976 
/* Add `value` to `module` under `name` as an int object built with
 * PyLong_FromUnsignedLong().  The temporary object is released after
 * PyModule_AddObjectRef(), whether or not that call succeeded.  On any
 * failure, jump to the `error` label, which the enclosing function must
 * provide. */
#define ADD_ULONG_CONSTANT(module, name, value)                         \
    do {                                                                \
        PyObject *value_obj = PyLong_FromUnsignedLong(value);           \
        if (value_obj == NULL) {                                        \
            goto error;                                                 \
        }                                                               \
        int status = PyModule_AddObjectRef(module, name, value_obj);    \
        Py_DECREF(value_obj);                                           \
        if (status < 0) {                                               \
            goto error;                                                 \
        }                                                               \
    } while (0)
2988 
2989 static int
sre_exec(PyObject * m)2990 sre_exec(PyObject *m)
2991 {
2992     _sremodulestate *state;
2993 
2994     /* Create heap types */
2995     state = get_sre_module_state(m);
2996     CREATE_TYPE(m, state->Pattern_Type, &pattern_spec);
2997     CREATE_TYPE(m, state->Match_Type, &match_spec);
2998     CREATE_TYPE(m, state->Scanner_Type, &scanner_spec);
2999 
3000     if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) {
3001         goto error;
3002     }
3003 
3004     if (PyModule_AddIntConstant(m, "CODESIZE", sizeof(SRE_CODE)) < 0) {
3005         goto error;
3006     }
3007 
3008     ADD_ULONG_CONSTANT(m, "MAXREPEAT", SRE_MAXREPEAT);
3009     ADD_ULONG_CONSTANT(m, "MAXGROUPS", SRE_MAXGROUPS);
3010 
3011     if (PyModule_AddStringConstant(m, "copyright", copyright) < 0) {
3012         goto error;
3013     }
3014 
3015     return 0;
3016 
3017 error:
3018     return -1;
3019 }
3020 
3021 static PyModuleDef_Slot sre_slots[] = {
3022     {Py_mod_exec, sre_exec},
3023     {0, NULL},
3024 };
3025 
3026 static struct PyModuleDef sremodule = {
3027     .m_base = PyModuleDef_HEAD_INIT,
3028     .m_name = "_" SRE_MODULE,
3029     .m_size = sizeof(_sremodulestate),
3030     .m_methods = _functions,
3031     .m_slots = sre_slots,
3032     .m_traverse = sre_traverse,
3033     .m_free = sre_free,
3034     .m_clear = sre_clear,
3035 };
3036 
3037 PyMODINIT_FUNC
PyInit__sre(void)3038 PyInit__sre(void)
3039 {
3040     return PyModuleDef_Init(&sremodule);
3041 }
3042 
3043 /* vim:ts=4:sw=4:et
3044 */
3045