]> git.proxmox.com Git - mirror_edk2.git/blob - AppPkg/Applications/Python/PyMod-2.7.2/Modules/_sre.c
ec723cc675c5074dea7cff16e5b4455442866fe2
[mirror_edk2.git] / AppPkg / Applications / Python / PyMod-2.7.2 / Modules / _sre.c
1 /*
2 * Secret Labs' Regular Expression Engine
3 *
4 * regular expression matching engine
5 *
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
25 *
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
27 *
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
35 */
36
37 /* Get rid of these macros to prevent collisions between EFI and Python in this file. */
38 #undef RETURN_ERROR
39 #undef RETURN_SUCCESS
40
41 #ifndef SRE_RECURSIVE
42
43 static char copyright[] =
44 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
45
46 #define PY_SSIZE_T_CLEAN
47
48 #include "Python.h"
49 #include "structmember.h" /* offsetof */
50
51 #include "sre.h"
52
53 #include <ctype.h>
54
55 /* name of this module, minus the leading underscore */
56 #if !defined(SRE_MODULE)
57 #define SRE_MODULE "sre"
58 #endif
59
60 #define SRE_PY_MODULE "re"
61
62 /* defining this one enables tracing */
63 #undef VERBOSE
64
65 #if PY_VERSION_HEX >= 0x01060000
66 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
67 /* defining this enables unicode support (default under 1.6a1 and later) */
68 #define HAVE_UNICODE
69 #endif
70 #endif
71
72 /* -------------------------------------------------------------------- */
73 /* optional features */
74
75 /* enables fast searching */
76 #define USE_FAST_SEARCH
77
78 /* enables aggressive inlining (always on for Visual C) */
79 #undef USE_INLINE
80
81 /* enables copy/deepcopy handling (work in progress) */
82 #undef USE_BUILTIN_COPY
83
84 #if PY_VERSION_HEX < 0x01060000
85 #define PyObject_DEL(op) PyMem_DEL((op))
86 #endif
87
88 /* -------------------------------------------------------------------- */
89
90 #if defined(_MSC_VER)
91 #pragma optimize("gt", on) /* doesn't seem to make much difference... */
92 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
93 /* fastest possible local call under MSVC */
94 #define LOCAL(type) static __inline type __fastcall
95 #elif defined(USE_INLINE)
96 #define LOCAL(type) static inline type
97 #else
98 #define LOCAL(type) static type
99 #endif
100
101 /* error codes */
102 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
103 #define SRE_ERROR_STATE -2 /* illegal state */
104 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
105 #define SRE_ERROR_MEMORY -9 /* out of memory */
106 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
107
108 #if defined(VERBOSE)
109 #define TRACE(v) printf v
110 #else
111 #define TRACE(v)
112 #endif
113
114 /* -------------------------------------------------------------------- */
115 /* search engine state */
116
117 /* default character predicates (run sre_chars.py to regenerate tables) */
118
119 #define SRE_DIGIT_MASK 1
120 #define SRE_SPACE_MASK 2
121 #define SRE_LINEBREAK_MASK 4
122 #define SRE_ALNUM_MASK 8
123 #define SRE_WORD_MASK 16
124
125 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
126
127 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
128 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
130 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
131 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
132 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
133 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
134
135 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
136 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
137 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
138 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
139 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
140 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
141 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
142 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
143 120, 121, 122, 123, 124, 125, 126, 127 };
144
145 #define SRE_IS_DIGIT(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
147 #define SRE_IS_SPACE(ch)\
148 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
149 #define SRE_IS_LINEBREAK(ch)\
150 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
151 #define SRE_IS_ALNUM(ch)\
152 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
153 #define SRE_IS_WORD(ch)\
154 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
155
156 static unsigned int sre_lower(unsigned int ch)
157 {
158 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
159 }
160
161 /* locale-specific character predicates */
162 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
163 * warnings when c's type supports only numbers < N+1 */
164 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
165 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
166 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
167 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
168 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
169
170 static unsigned int sre_lower_locale(unsigned int ch)
171 {
172 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
173 }
174
175 /* unicode-specific character predicates */
176
177 #if defined(HAVE_UNICODE)
178
179 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
180 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
181 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
182 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
183 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
184
185 static unsigned int sre_lower_unicode(unsigned int ch)
186 {
187 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
188 }
189
190 #endif
191
192 LOCAL(int)
193 sre_category(SRE_CODE category, unsigned int ch)
194 {
195 switch (category) {
196
197 case SRE_CATEGORY_DIGIT:
198 return SRE_IS_DIGIT(ch);
199 case SRE_CATEGORY_NOT_DIGIT:
200 return !SRE_IS_DIGIT(ch);
201 case SRE_CATEGORY_SPACE:
202 return SRE_IS_SPACE(ch);
203 case SRE_CATEGORY_NOT_SPACE:
204 return !SRE_IS_SPACE(ch);
205 case SRE_CATEGORY_WORD:
206 return SRE_IS_WORD(ch);
207 case SRE_CATEGORY_NOT_WORD:
208 return !SRE_IS_WORD(ch);
209 case SRE_CATEGORY_LINEBREAK:
210 return SRE_IS_LINEBREAK(ch);
211 case SRE_CATEGORY_NOT_LINEBREAK:
212 return !SRE_IS_LINEBREAK(ch);
213
214 case SRE_CATEGORY_LOC_WORD:
215 return SRE_LOC_IS_WORD(ch);
216 case SRE_CATEGORY_LOC_NOT_WORD:
217 return !SRE_LOC_IS_WORD(ch);
218
219 #if defined(HAVE_UNICODE)
220 case SRE_CATEGORY_UNI_DIGIT:
221 return SRE_UNI_IS_DIGIT(ch);
222 case SRE_CATEGORY_UNI_NOT_DIGIT:
223 return !SRE_UNI_IS_DIGIT(ch);
224 case SRE_CATEGORY_UNI_SPACE:
225 return SRE_UNI_IS_SPACE(ch);
226 case SRE_CATEGORY_UNI_NOT_SPACE:
227 return !SRE_UNI_IS_SPACE(ch);
228 case SRE_CATEGORY_UNI_WORD:
229 return SRE_UNI_IS_WORD(ch);
230 case SRE_CATEGORY_UNI_NOT_WORD:
231 return !SRE_UNI_IS_WORD(ch);
232 case SRE_CATEGORY_UNI_LINEBREAK:
233 return SRE_UNI_IS_LINEBREAK(ch);
234 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
235 return !SRE_UNI_IS_LINEBREAK(ch);
236 #else
237 case SRE_CATEGORY_UNI_DIGIT:
238 return SRE_IS_DIGIT(ch);
239 case SRE_CATEGORY_UNI_NOT_DIGIT:
240 return !SRE_IS_DIGIT(ch);
241 case SRE_CATEGORY_UNI_SPACE:
242 return SRE_IS_SPACE(ch);
243 case SRE_CATEGORY_UNI_NOT_SPACE:
244 return !SRE_IS_SPACE(ch);
245 case SRE_CATEGORY_UNI_WORD:
246 return SRE_LOC_IS_WORD(ch);
247 case SRE_CATEGORY_UNI_NOT_WORD:
248 return !SRE_LOC_IS_WORD(ch);
249 case SRE_CATEGORY_UNI_LINEBREAK:
250 return SRE_IS_LINEBREAK(ch);
251 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
252 return !SRE_IS_LINEBREAK(ch);
253 #endif
254 }
255 return 0;
256 }
257
258 /* helpers */
259
260 static void
261 data_stack_dealloc(SRE_STATE* state)
262 {
263 if (state->data_stack) {
264 PyMem_FREE(state->data_stack);
265 state->data_stack = NULL;
266 }
267 state->data_stack_size = state->data_stack_base = 0;
268 }
269
270 static int
271 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
272 {
273 Py_ssize_t minsize, cursize;
274 minsize = state->data_stack_base+size;
275 cursize = state->data_stack_size;
276 if (cursize < minsize) {
277 void* stack;
278 cursize = minsize+minsize/4+1024;
279 TRACE(("allocate/grow stack %d\n", cursize));
280 stack = PyMem_REALLOC(state->data_stack, cursize);
281 if (!stack) {
282 data_stack_dealloc(state);
283 return SRE_ERROR_MEMORY;
284 }
285 state->data_stack = (char *)stack;
286 state->data_stack_size = cursize;
287 }
288 return 0;
289 }
290
291 /* generate 8-bit version */
292
293 #define SRE_CHAR unsigned char
294 #define SRE_AT sre_at
295 #define SRE_COUNT sre_count
296 #define SRE_CHARSET sre_charset
297 #define SRE_INFO sre_info
298 #define SRE_MATCH sre_match
299 #define SRE_MATCH_CONTEXT sre_match_context
300 #define SRE_SEARCH sre_search
301 #define SRE_LITERAL_TEMPLATE sre_literal_template
302
303 #if defined(HAVE_UNICODE)
304
305 #define SRE_RECURSIVE
306 #include "_sre.c"
307 #undef SRE_RECURSIVE
308
309 #undef SRE_LITERAL_TEMPLATE
310 #undef SRE_SEARCH
311 #undef SRE_MATCH
312 #undef SRE_MATCH_CONTEXT
313 #undef SRE_INFO
314 #undef SRE_CHARSET
315 #undef SRE_COUNT
316 #undef SRE_AT
317 #undef SRE_CHAR
318
319 /* generate 16-bit unicode version */
320
321 #define SRE_CHAR Py_UNICODE
322 #define SRE_AT sre_uat
323 #define SRE_COUNT sre_ucount
324 #define SRE_CHARSET sre_ucharset
325 #define SRE_INFO sre_uinfo
326 #define SRE_MATCH sre_umatch
327 #define SRE_MATCH_CONTEXT sre_umatch_context
328 #define SRE_SEARCH sre_usearch
329 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
330 #endif
331
332 #endif /* SRE_RECURSIVE */
333
334 /* -------------------------------------------------------------------- */
335 /* String matching engine */
336
337 /* the following section is compiled twice, with different character
338 settings */
339
340 LOCAL(int)
341 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
342 {
343 /* check if pointer is at given position */
344
345 Py_ssize_t thisp, thatp;
346
347 switch (at) {
348
349 case SRE_AT_BEGINNING:
350 case SRE_AT_BEGINNING_STRING:
351 return ((void*) ptr == state->beginning);
352
353 case SRE_AT_BEGINNING_LINE:
354 return ((void*) ptr == state->beginning ||
355 SRE_IS_LINEBREAK((int) ptr[-1]));
356
357 case SRE_AT_END:
358 return (((void*) (ptr+1) == state->end &&
359 SRE_IS_LINEBREAK((int) ptr[0])) ||
360 ((void*) ptr == state->end));
361
362 case SRE_AT_END_LINE:
363 return ((void*) ptr == state->end ||
364 SRE_IS_LINEBREAK((int) ptr[0]));
365
366 case SRE_AT_END_STRING:
367 return ((void*) ptr == state->end);
368
369 case SRE_AT_BOUNDARY:
370 if (state->beginning == state->end)
371 return 0;
372 thatp = ((void*) ptr > state->beginning) ?
373 SRE_IS_WORD((int) ptr[-1]) : 0;
374 thisp = ((void*) ptr < state->end) ?
375 SRE_IS_WORD((int) ptr[0]) : 0;
376 return thisp != thatp;
377
378 case SRE_AT_NON_BOUNDARY:
379 if (state->beginning == state->end)
380 return 0;
381 thatp = ((void*) ptr > state->beginning) ?
382 SRE_IS_WORD((int) ptr[-1]) : 0;
383 thisp = ((void*) ptr < state->end) ?
384 SRE_IS_WORD((int) ptr[0]) : 0;
385 return thisp == thatp;
386
387 case SRE_AT_LOC_BOUNDARY:
388 if (state->beginning == state->end)
389 return 0;
390 thatp = ((void*) ptr > state->beginning) ?
391 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
392 thisp = ((void*) ptr < state->end) ?
393 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
394 return thisp != thatp;
395
396 case SRE_AT_LOC_NON_BOUNDARY:
397 if (state->beginning == state->end)
398 return 0;
399 thatp = ((void*) ptr > state->beginning) ?
400 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
401 thisp = ((void*) ptr < state->end) ?
402 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
403 return thisp == thatp;
404
405 #if defined(HAVE_UNICODE)
406 case SRE_AT_UNI_BOUNDARY:
407 if (state->beginning == state->end)
408 return 0;
409 thatp = ((void*) ptr > state->beginning) ?
410 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
411 thisp = ((void*) ptr < state->end) ?
412 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
413 return thisp != thatp;
414
415 case SRE_AT_UNI_NON_BOUNDARY:
416 if (state->beginning == state->end)
417 return 0;
418 thatp = ((void*) ptr > state->beginning) ?
419 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
420 thisp = ((void*) ptr < state->end) ?
421 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
422 return thisp == thatp;
423 #endif
424
425 }
426
427 return 0;
428 }
429
430 LOCAL(int)
431 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
432 {
433 /* check if character is a member of the given set */
434
435 int ok = 1;
436
437 for (;;) {
438 switch (*set++) {
439
440 case SRE_OP_FAILURE:
441 return !ok;
442
443 case SRE_OP_LITERAL:
444 /* <LITERAL> <code> */
445 if (ch == set[0])
446 return ok;
447 set++;
448 break;
449
450 case SRE_OP_CATEGORY:
451 /* <CATEGORY> <code> */
452 if (sre_category(set[0], (int) ch))
453 return ok;
454 set += 1;
455 break;
456
457 case SRE_OP_CHARSET:
458 if (sizeof(SRE_CODE) == 2) {
459 /* <CHARSET> <bitmap> (16 bits per code word) */
460 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
461 return ok;
462 set += 16;
463 }
464 else {
465 /* <CHARSET> <bitmap> (32 bits per code word) */
466 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
467 return ok;
468 set += 8;
469 }
470 break;
471
472 case SRE_OP_RANGE:
473 /* <RANGE> <lower> <upper> */
474 if (set[0] <= ch && ch <= set[1])
475 return ok;
476 set += 2;
477 break;
478
479 case SRE_OP_NEGATE:
480 ok = !ok;
481 break;
482
483 case SRE_OP_BIGCHARSET:
484 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
485 {
486 Py_ssize_t count, block;
487 count = *(set++);
488
489 if (sizeof(SRE_CODE) == 2) {
490 block = ((unsigned char*)set)[ch >> 8];
491 set += 128;
492 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
493 return ok;
494 set += count*16;
495 }
496 else {
497 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
498 * warnings when c's type supports only numbers < N+1 */
499 if (!(ch & ~65535))
500 block = ((unsigned char*)set)[ch >> 8];
501 else
502 block = -1;
503 set += 64;
504 if (block >=0 &&
505 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
506 return ok;
507 set += count*8;
508 }
509 break;
510 }
511
512 default:
513 /* internal error -- there's not much we can do about it
514 here, so let's just pretend it didn't match... */
515 return 0;
516 }
517 }
518 }
519
520 LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
521
522 LOCAL(Py_ssize_t)
523 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
524 {
525 SRE_CODE chr;
526 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
527 SRE_CHAR* end = (SRE_CHAR *)state->end;
528 Py_ssize_t i;
529
530 /* adjust end */
531 if (maxcount < end - ptr && maxcount != 65535)
532 end = ptr + maxcount;
533
534 switch (pattern[0]) {
535
536 case SRE_OP_IN:
537 /* repeated set */
538 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
539 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
540 ptr++;
541 break;
542
543 case SRE_OP_ANY:
544 /* repeated dot wildcard. */
545 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
546 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
547 ptr++;
548 break;
549
550 case SRE_OP_ANY_ALL:
551 /* repeated dot wildcard. skip to the end of the target
552 string, and backtrack from there */
553 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
554 ptr = end;
555 break;
556
557 case SRE_OP_LITERAL:
558 /* repeated literal */
559 chr = pattern[1];
560 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
561 while (ptr < end && (SRE_CODE) *ptr == chr)
562 ptr++;
563 break;
564
565 case SRE_OP_LITERAL_IGNORE:
566 /* repeated literal */
567 chr = pattern[1];
568 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
569 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
570 ptr++;
571 break;
572
573 case SRE_OP_NOT_LITERAL:
574 /* repeated non-literal */
575 chr = pattern[1];
576 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
577 while (ptr < end && (SRE_CODE) *ptr != chr)
578 ptr++;
579 break;
580
581 case SRE_OP_NOT_LITERAL_IGNORE:
582 /* repeated non-literal */
583 chr = pattern[1];
584 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
585 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
586 ptr++;
587 break;
588
589 default:
590 /* repeated single character pattern */
591 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
592 while ((SRE_CHAR*) state->ptr < end) {
593 i = SRE_MATCH(state, pattern);
594 if (i < 0)
595 return i;
596 if (!i)
597 break;
598 }
599 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
600 (SRE_CHAR*) state->ptr - ptr));
601 return (SRE_CHAR*) state->ptr - ptr;
602 }
603
604 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
605 return ptr - (SRE_CHAR*) state->ptr;
606 }
607
608 #if 0 /* not used in this release */
609 LOCAL(int)
610 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
611 {
612 /* check if an SRE_OP_INFO block matches at the current position.
613 returns the number of SRE_CODE objects to skip if successful, 0
614 if no match */
615
616 SRE_CHAR* end = state->end;
617 SRE_CHAR* ptr = state->ptr;
618 Py_ssize_t i;
619
620 /* check minimal length */
621 if (pattern[3] && (end - ptr) < pattern[3])
622 return 0;
623
624 /* check known prefix */
625 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
626 /* <length> <skip> <prefix data> <overlap data> */
627 for (i = 0; i < pattern[5]; i++)
628 if ((SRE_CODE) ptr[i] != pattern[7 + i])
629 return 0;
630 return pattern[0] + 2 * pattern[6];
631 }
632 return pattern[0];
633 }
634 #endif
635
636 /* The macros below should be used to protect recursive SRE_MATCH()
637 * calls that *failed* and do *not* return immediately (IOW, those
638 * that will backtrack). Explaining:
639 *
640 * - Recursive SRE_MATCH() returned true: that's usually a success
641 * (besides atypical cases like ASSERT_NOT), therefore there's no
642 * reason to restore lastmark;
643 *
644 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
645 * is returning to the caller: If the current SRE_MATCH() is the
646 * top function of the recursion, returning false will be a matching
647 * failure, and it doesn't matter where lastmark is pointing to.
648 * If it's *not* the top function, it will be a recursive SRE_MATCH()
649 * failure by itself, and the calling SRE_MATCH() will have to deal
650 * with the failure by the same rules explained here (it will restore
651 * lastmark by itself if necessary);
652 *
653 * - Recursive SRE_MATCH() returned false, and will continue the
654 * outside 'for' loop: must be protected when breaking, since the next
655 * OP could potentially depend on lastmark;
656 *
657 * - Recursive SRE_MATCH() returned false, and will be called again
658 * inside a local for/while loop: must be protected between each
659 * loop iteration, since the recursive SRE_MATCH() could do anything,
660 * and could potentially depend on lastmark.
661 *
662 * For more information, check the discussion at SF patch #712900.
663 */
664 #define LASTMARK_SAVE() \
665 do { \
666 ctx->lastmark = state->lastmark; \
667 ctx->lastindex = state->lastindex; \
668 } while (0)
669 #define LASTMARK_RESTORE() \
670 do { \
671 state->lastmark = ctx->lastmark; \
672 state->lastindex = ctx->lastindex; \
673 } while (0)
674
675 #define RETURN_ERROR(i) do { return i; } while(0)
676 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
677 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
678
679 #define RETURN_ON_ERROR(i) \
680 do { if (i < 0) RETURN_ERROR(i); } while (0)
681 #define RETURN_ON_SUCCESS(i) \
682 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
683 #define RETURN_ON_FAILURE(i) \
684 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
685
686 #define SFY(x) #x
687
688 #define DATA_STACK_ALLOC(state, type, ptr) \
689 do { \
690 alloc_pos = state->data_stack_base; \
691 TRACE(("allocating %s in %d (%d)\n", \
692 SFY(type), alloc_pos, sizeof(type))); \
693 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
694 int j = data_stack_grow(state, sizeof(type)); \
695 if (j < 0) return j; \
696 if (ctx_pos != -1) \
697 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
698 } \
699 ptr = (type*)(state->data_stack+alloc_pos); \
700 state->data_stack_base += sizeof(type); \
701 } while (0)
702
703 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
704 do { \
705 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
706 ptr = (type*)(state->data_stack+pos); \
707 } while (0)
708
709 #define DATA_STACK_PUSH(state, data, size) \
710 do { \
711 TRACE(("copy data in %p to %d (%d)\n", \
712 data, state->data_stack_base, size)); \
713 if (state->data_stack_size < state->data_stack_base+size) { \
714 int j = data_stack_grow(state, size); \
715 if (j < 0) return j; \
716 if (ctx_pos != -1) \
717 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
718 } \
719 memcpy(state->data_stack+state->data_stack_base, data, size); \
720 state->data_stack_base += size; \
721 } while (0)
722
723 #define DATA_STACK_POP(state, data, size, discard) \
724 do { \
725 TRACE(("copy data to %p from %d (%d)\n", \
726 data, state->data_stack_base-size, size)); \
727 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
728 if (discard) \
729 state->data_stack_base -= size; \
730 } while (0)
731
732 #define DATA_STACK_POP_DISCARD(state, size) \
733 do { \
734 TRACE(("discard data from %d (%d)\n", \
735 state->data_stack_base-size, size)); \
736 state->data_stack_base -= size; \
737 } while(0)
738
739 #define DATA_PUSH(x) \
740 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
741 #define DATA_POP(x) \
742 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
743 #define DATA_POP_DISCARD(x) \
744 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
745 #define DATA_ALLOC(t,p) \
746 DATA_STACK_ALLOC(state, t, p)
747 #define DATA_LOOKUP_AT(t,p,pos) \
748 DATA_STACK_LOOKUP_AT(state,t,p,pos)
749
750 #define MARK_PUSH(lastmark) \
751 do if (lastmark > 0) { \
752 i = lastmark; /* ctx->lastmark may change if reallocated */ \
753 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
754 } while (0)
755 #define MARK_POP(lastmark) \
756 do if (lastmark > 0) { \
757 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
758 } while (0)
759 #define MARK_POP_KEEP(lastmark) \
760 do if (lastmark > 0) { \
761 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
762 } while (0)
763 #define MARK_POP_DISCARD(lastmark) \
764 do if (lastmark > 0) { \
765 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
766 } while (0)
767
768 #define JUMP_NONE 0
769 #define JUMP_MAX_UNTIL_1 1
770 #define JUMP_MAX_UNTIL_2 2
771 #define JUMP_MAX_UNTIL_3 3
772 #define JUMP_MIN_UNTIL_1 4
773 #define JUMP_MIN_UNTIL_2 5
774 #define JUMP_MIN_UNTIL_3 6
775 #define JUMP_REPEAT 7
776 #define JUMP_REPEAT_ONE_1 8
777 #define JUMP_REPEAT_ONE_2 9
778 #define JUMP_MIN_REPEAT_ONE 10
779 #define JUMP_BRANCH 11
780 #define JUMP_ASSERT 12
781 #define JUMP_ASSERT_NOT 13
782
783 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
784 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
785 nextctx->last_ctx_pos = ctx_pos; \
786 nextctx->jump = jumpvalue; \
787 nextctx->pattern = nextpattern; \
788 ctx_pos = alloc_pos; \
789 ctx = nextctx; \
790 goto entrance; \
791 jumplabel: \
792 while (0) /* gcc doesn't like labels at end of scopes */ \
793
794 typedef struct {
795 Py_ssize_t last_ctx_pos;
796 Py_ssize_t jump;
797 SRE_CHAR* ptr;
798 SRE_CODE* pattern;
799 Py_ssize_t count;
800 Py_ssize_t lastmark;
801 Py_ssize_t lastindex;
802 union {
803 SRE_CODE chr;
804 SRE_REPEAT* rep;
805 } u;
806 } SRE_MATCH_CONTEXT;
807
808 /* check if string matches the given pattern. returns <0 for
809 error, 0 for failure, and 1 for success */
810 LOCAL(Py_ssize_t)
811 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
812 {
813 SRE_CHAR* end = (SRE_CHAR *)state->end;
814 Py_ssize_t alloc_pos, ctx_pos = -1;
815 Py_ssize_t i, ret = 0;
816 Py_ssize_t jump;
817 unsigned int sigcount=0;
818
819 SRE_MATCH_CONTEXT* ctx;
820 SRE_MATCH_CONTEXT* nextctx;
821
822 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
823
824 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
825 ctx->last_ctx_pos = -1;
826 ctx->jump = JUMP_NONE;
827 ctx->pattern = pattern;
828 ctx_pos = alloc_pos;
829
830 entrance:
831
832 ctx->ptr = (SRE_CHAR *)state->ptr;
833
834 if (ctx->pattern[0] == SRE_OP_INFO) {
835 /* optimization info block */
836 /* <INFO> <1=skip> <2=flags> <3=min> ... */
837 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
838 TRACE(("reject (got %d chars, need %d)\n",
839 (end - ctx->ptr), ctx->pattern[3]));
840 RETURN_FAILURE;
841 }
842 ctx->pattern += ctx->pattern[1] + 1;
843 }
844
845 for (;;) {
846 ++sigcount;
847 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
848 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
849
850 switch (*ctx->pattern++) {
851
852 case SRE_OP_MARK:
853 /* set mark */
854 /* <MARK> <gid> */
855 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
856 ctx->ptr, ctx->pattern[0]));
857 i = ctx->pattern[0];
858 if (i & 1)
859 state->lastindex = i/2 + 1;
860 if (i > state->lastmark) {
861 /* state->lastmark is the highest valid index in the
862 state->mark array. If it is increased by more than 1,
863 the intervening marks must be set to NULL to signal
864 that these marks have not been encountered. */
865 Py_ssize_t j = state->lastmark + 1;
866 while (j < i)
867 state->mark[j++] = NULL;
868 state->lastmark = i;
869 }
870 state->mark[i] = ctx->ptr;
871 ctx->pattern++;
872 break;
873
874 case SRE_OP_LITERAL:
875 /* match literal string */
876 /* <LITERAL> <code> */
877 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
878 ctx->ptr, *ctx->pattern));
879 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
880 RETURN_FAILURE;
881 ctx->pattern++;
882 ctx->ptr++;
883 break;
884
885 case SRE_OP_NOT_LITERAL:
886 /* match anything that is not literal character */
887 /* <NOT_LITERAL> <code> */
888 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
889 ctx->ptr, *ctx->pattern));
890 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
891 RETURN_FAILURE;
892 ctx->pattern++;
893 ctx->ptr++;
894 break;
895
896 case SRE_OP_SUCCESS:
897 /* end of pattern */
898 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
899 state->ptr = ctx->ptr;
900 RETURN_SUCCESS;
901
902 case SRE_OP_AT:
903 /* match at given position */
904 /* <AT> <code> */
905 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
906 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
907 RETURN_FAILURE;
908 ctx->pattern++;
909 break;
910
911 case SRE_OP_CATEGORY:
912 /* match at given category */
913 /* <CATEGORY> <code> */
914 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
915 ctx->ptr, *ctx->pattern));
916 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
917 RETURN_FAILURE;
918 ctx->pattern++;
919 ctx->ptr++;
920 break;
921
922 case SRE_OP_ANY:
923 /* match anything (except a newline) */
924 /* <ANY> */
925 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
926 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
927 RETURN_FAILURE;
928 ctx->ptr++;
929 break;
930
931 case SRE_OP_ANY_ALL:
932 /* match anything */
933 /* <ANY_ALL> */
934 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
935 if (ctx->ptr >= end)
936 RETURN_FAILURE;
937 ctx->ptr++;
938 break;
939
940 case SRE_OP_IN:
941 /* match set member (or non_member) */
942 /* <IN> <skip> <set> */
943 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
944 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
945 RETURN_FAILURE;
946 ctx->pattern += ctx->pattern[0];
947 ctx->ptr++;
948 break;
949
950 case SRE_OP_LITERAL_IGNORE:
951 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
952 ctx->pattern, ctx->ptr, ctx->pattern[0]));
953 if (ctx->ptr >= end ||
954 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
955 RETURN_FAILURE;
956 ctx->pattern++;
957 ctx->ptr++;
958 break;
959
960 case SRE_OP_NOT_LITERAL_IGNORE:
961 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
962 ctx->pattern, ctx->ptr, *ctx->pattern));
963 if (ctx->ptr >= end ||
964 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
965 RETURN_FAILURE;
966 ctx->pattern++;
967 ctx->ptr++;
968 break;
969
970 case SRE_OP_IN_IGNORE:
971 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
972 if (ctx->ptr >= end
973 || !SRE_CHARSET(ctx->pattern+1,
974 (SRE_CODE)state->lower(*ctx->ptr)))
975 RETURN_FAILURE;
976 ctx->pattern += ctx->pattern[0];
977 ctx->ptr++;
978 break;
979
980 case SRE_OP_JUMP:
981 case SRE_OP_INFO:
982 /* jump forward */
983 /* <JUMP> <offset> */
984 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
985 ctx->ptr, ctx->pattern[0]));
986 ctx->pattern += ctx->pattern[0];
987 break;
988
989 case SRE_OP_BRANCH:
990 /* alternation */
991 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
992 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
993 LASTMARK_SAVE();
994 ctx->u.rep = state->repeat;
995 if (ctx->u.rep)
996 MARK_PUSH(ctx->lastmark);
997 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
998 if (ctx->pattern[1] == SRE_OP_LITERAL &&
999 (ctx->ptr >= end ||
1000 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
1001 continue;
1002 if (ctx->pattern[1] == SRE_OP_IN &&
1003 (ctx->ptr >= end ||
1004 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
1005 continue;
1006 state->ptr = ctx->ptr;
1007 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1008 if (ret) {
1009 if (ctx->u.rep)
1010 MARK_POP_DISCARD(ctx->lastmark);
1011 RETURN_ON_ERROR(ret);
1012 RETURN_SUCCESS;
1013 }
1014 if (ctx->u.rep)
1015 MARK_POP_KEEP(ctx->lastmark);
1016 LASTMARK_RESTORE();
1017 }
1018 if (ctx->u.rep)
1019 MARK_POP_DISCARD(ctx->lastmark);
1020 RETURN_FAILURE;
1021
1022 case SRE_OP_REPEAT_ONE:
1023 /* match repeated sequence (maximizing regexp) */
1024
1025 /* this operator only works if the repeated item is
1026 exactly one character wide, and we're not already
1027 collecting backtracking points. for other cases,
1028 use the MAX_REPEAT operator */
1029
1030 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1031
1032 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1033 ctx->pattern[1], ctx->pattern[2]));
1034
1035 if (ctx->ptr + ctx->pattern[1] > end)
1036 RETURN_FAILURE; /* cannot match */
1037
1038 state->ptr = ctx->ptr;
1039
1040 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1041 RETURN_ON_ERROR(ret);
1042 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1043 ctx->count = ret;
1044 ctx->ptr += ctx->count;
1045
1046 /* when we arrive here, count contains the number of
1047 matches, and ctx->ptr points to the tail of the target
1048 string. check if the rest of the pattern matches,
1049 and backtrack if not. */
1050
1051 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1052 RETURN_FAILURE;
1053
1054 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1055 /* tail is empty. we're finished */
1056 state->ptr = ctx->ptr;
1057 RETURN_SUCCESS;
1058 }
1059
1060 LASTMARK_SAVE();
1061
1062 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1063 /* tail starts with a literal. skip positions where
1064 the rest of the pattern cannot possibly match */
1065 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1066 for (;;) {
1067 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1068 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1069 ctx->ptr--;
1070 ctx->count--;
1071 }
1072 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1073 break;
1074 state->ptr = ctx->ptr;
1075 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1076 ctx->pattern+ctx->pattern[0]);
1077 if (ret) {
1078 RETURN_ON_ERROR(ret);
1079 RETURN_SUCCESS;
1080 }
1081
1082 LASTMARK_RESTORE();
1083
1084 ctx->ptr--;
1085 ctx->count--;
1086 }
1087
1088 } else {
1089 /* general case */
1090 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1091 state->ptr = ctx->ptr;
1092 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1093 ctx->pattern+ctx->pattern[0]);
1094 if (ret) {
1095 RETURN_ON_ERROR(ret);
1096 RETURN_SUCCESS;
1097 }
1098 ctx->ptr--;
1099 ctx->count--;
1100 LASTMARK_RESTORE();
1101 }
1102 }
1103 RETURN_FAILURE;
1104
1105 case SRE_OP_MIN_REPEAT_ONE:
1106 /* match repeated sequence (minimizing regexp) */
1107
1108 /* this operator only works if the repeated item is
1109 exactly one character wide, and we're not already
1110 collecting backtracking points. for other cases,
1111 use the MIN_REPEAT operator */
1112
1113 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1114
1115 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1116 ctx->pattern[1], ctx->pattern[2]));
1117
1118 if (ctx->ptr + ctx->pattern[1] > end)
1119 RETURN_FAILURE; /* cannot match */
1120
1121 state->ptr = ctx->ptr;
1122
1123 if (ctx->pattern[1] == 0)
1124 ctx->count = 0;
1125 else {
1126 /* count using pattern min as the maximum */
1127 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1128 RETURN_ON_ERROR(ret);
1129 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1130 if (ret < (Py_ssize_t) ctx->pattern[1])
1131 /* didn't match minimum number of times */
1132 RETURN_FAILURE;
1133 /* advance past minimum matches of repeat */
1134 ctx->count = ret;
1135 ctx->ptr += ctx->count;
1136 }
1137
1138 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1139 /* tail is empty. we're finished */
1140 state->ptr = ctx->ptr;
1141 RETURN_SUCCESS;
1142
1143 } else {
1144 /* general case */
1145 LASTMARK_SAVE();
1146 while ((Py_ssize_t)ctx->pattern[2] == 65535
1147 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1148 state->ptr = ctx->ptr;
1149 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1150 ctx->pattern+ctx->pattern[0]);
1151 if (ret) {
1152 RETURN_ON_ERROR(ret);
1153 RETURN_SUCCESS;
1154 }
1155 state->ptr = ctx->ptr;
1156 ret = SRE_COUNT(state, ctx->pattern+3, 1);
1157 RETURN_ON_ERROR(ret);
1158 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1159 if (ret == 0)
1160 break;
1161 assert(ret == 1);
1162 ctx->ptr++;
1163 ctx->count++;
1164 LASTMARK_RESTORE();
1165 }
1166 }
1167 RETURN_FAILURE;
1168
1169 case SRE_OP_REPEAT:
1170 /* create repeat context. all the hard work is done
1171 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1172 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1173 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1174 ctx->pattern[1], ctx->pattern[2]));
1175
1176 /* install new repeat context */
1177 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1178 if (!ctx->u.rep) {
1179 PyErr_NoMemory();
1180 RETURN_FAILURE;
1181 }
1182 ctx->u.rep->count = -1;
1183 ctx->u.rep->pattern = ctx->pattern;
1184 ctx->u.rep->prev = state->repeat;
1185 ctx->u.rep->last_ptr = NULL;
1186 state->repeat = ctx->u.rep;
1187
1188 state->ptr = ctx->ptr;
1189 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1190 state->repeat = ctx->u.rep->prev;
1191 PyObject_FREE(ctx->u.rep);
1192
1193 if (ret) {
1194 RETURN_ON_ERROR(ret);
1195 RETURN_SUCCESS;
1196 }
1197 RETURN_FAILURE;
1198
1199 case SRE_OP_MAX_UNTIL:
1200 /* maximizing repeat */
1201 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1202
1203 /* FIXME: we probably need to deal with zero-width
1204 matches in here... */
1205
1206 ctx->u.rep = state->repeat;
1207 if (!ctx->u.rep)
1208 RETURN_ERROR(SRE_ERROR_STATE);
1209
1210 state->ptr = ctx->ptr;
1211
1212 ctx->count = ctx->u.rep->count+1;
1213
1214 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1215 ctx->ptr, ctx->count));
1216
1217 if (ctx->count < ctx->u.rep->pattern[1]) {
1218 /* not enough matches */
1219 ctx->u.rep->count = ctx->count;
1220 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1221 ctx->u.rep->pattern+3);
1222 if (ret) {
1223 RETURN_ON_ERROR(ret);
1224 RETURN_SUCCESS;
1225 }
1226 ctx->u.rep->count = ctx->count-1;
1227 state->ptr = ctx->ptr;
1228 RETURN_FAILURE;
1229 }
1230
1231 if ((ctx->count < ctx->u.rep->pattern[2] ||
1232 ctx->u.rep->pattern[2] == 65535) &&
1233 state->ptr != ctx->u.rep->last_ptr) {
1234 /* we may have enough matches, but if we can
1235 match another item, do so */
1236 ctx->u.rep->count = ctx->count;
1237 LASTMARK_SAVE();
1238 MARK_PUSH(ctx->lastmark);
1239 /* zero-width match protection */
1240 DATA_PUSH(&ctx->u.rep->last_ptr);
1241 ctx->u.rep->last_ptr = state->ptr;
1242 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1243 ctx->u.rep->pattern+3);
1244 DATA_POP(&ctx->u.rep->last_ptr);
1245 if (ret) {
1246 MARK_POP_DISCARD(ctx->lastmark);
1247 RETURN_ON_ERROR(ret);
1248 RETURN_SUCCESS;
1249 }
1250 MARK_POP(ctx->lastmark);
1251 LASTMARK_RESTORE();
1252 ctx->u.rep->count = ctx->count-1;
1253 state->ptr = ctx->ptr;
1254 }
1255
1256 /* cannot match more repeated items here. make sure the
1257 tail matches */
1258 state->repeat = ctx->u.rep->prev;
1259 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1260 RETURN_ON_SUCCESS(ret);
1261 state->repeat = ctx->u.rep;
1262 state->ptr = ctx->ptr;
1263 RETURN_FAILURE;
1264
1265 case SRE_OP_MIN_UNTIL:
1266 /* minimizing repeat */
1267 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1268
1269 ctx->u.rep = state->repeat;
1270 if (!ctx->u.rep)
1271 RETURN_ERROR(SRE_ERROR_STATE);
1272
1273 state->ptr = ctx->ptr;
1274
1275 ctx->count = ctx->u.rep->count+1;
1276
1277 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1278 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1279
1280 if (ctx->count < ctx->u.rep->pattern[1]) {
1281 /* not enough matches */
1282 ctx->u.rep->count = ctx->count;
1283 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1284 ctx->u.rep->pattern+3);
1285 if (ret) {
1286 RETURN_ON_ERROR(ret);
1287 RETURN_SUCCESS;
1288 }
1289 ctx->u.rep->count = ctx->count-1;
1290 state->ptr = ctx->ptr;
1291 RETURN_FAILURE;
1292 }
1293
1294 LASTMARK_SAVE();
1295
1296 /* see if the tail matches */
1297 state->repeat = ctx->u.rep->prev;
1298 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1299 if (ret) {
1300 RETURN_ON_ERROR(ret);
1301 RETURN_SUCCESS;
1302 }
1303
1304 state->repeat = ctx->u.rep;
1305 state->ptr = ctx->ptr;
1306
1307 LASTMARK_RESTORE();
1308
1309 if (ctx->count >= ctx->u.rep->pattern[2]
1310 && ctx->u.rep->pattern[2] != 65535)
1311 RETURN_FAILURE;
1312
1313 ctx->u.rep->count = ctx->count;
1314 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1315 ctx->u.rep->pattern+3);
1316 if (ret) {
1317 RETURN_ON_ERROR(ret);
1318 RETURN_SUCCESS;
1319 }
1320 ctx->u.rep->count = ctx->count-1;
1321 state->ptr = ctx->ptr;
1322 RETURN_FAILURE;
1323
1324 case SRE_OP_GROUPREF:
1325 /* match backreference */
1326 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1327 ctx->ptr, ctx->pattern[0]));
1328 i = ctx->pattern[0];
1329 {
1330 Py_ssize_t groupref = i+i;
1331 if (groupref >= state->lastmark) {
1332 RETURN_FAILURE;
1333 } else {
1334 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1335 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1336 if (!p || !e || e < p)
1337 RETURN_FAILURE;
1338 while (p < e) {
1339 if (ctx->ptr >= end || *ctx->ptr != *p)
1340 RETURN_FAILURE;
1341 p++; ctx->ptr++;
1342 }
1343 }
1344 }
1345 ctx->pattern++;
1346 break;
1347
1348 case SRE_OP_GROUPREF_IGNORE:
1349 /* match backreference */
1350 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1351 ctx->ptr, ctx->pattern[0]));
1352 i = ctx->pattern[0];
1353 {
1354 Py_ssize_t groupref = i+i;
1355 if (groupref >= state->lastmark) {
1356 RETURN_FAILURE;
1357 } else {
1358 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1359 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1360 if (!p || !e || e < p)
1361 RETURN_FAILURE;
1362 while (p < e) {
1363 if (ctx->ptr >= end ||
1364 state->lower(*ctx->ptr) != state->lower(*p))
1365 RETURN_FAILURE;
1366 p++; ctx->ptr++;
1367 }
1368 }
1369 }
1370 ctx->pattern++;
1371 break;
1372
1373 case SRE_OP_GROUPREF_EXISTS:
1374 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1375 ctx->ptr, ctx->pattern[0]));
1376 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1377 i = ctx->pattern[0];
1378 {
1379 Py_ssize_t groupref = i+i;
1380 if (groupref >= state->lastmark) {
1381 ctx->pattern += ctx->pattern[1];
1382 break;
1383 } else {
1384 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1385 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1386 if (!p || !e || e < p) {
1387 ctx->pattern += ctx->pattern[1];
1388 break;
1389 }
1390 }
1391 }
1392 ctx->pattern += 2;
1393 break;
1394
1395 case SRE_OP_ASSERT:
1396 /* assert subpattern */
1397 /* <ASSERT> <skip> <back> <pattern> */
1398 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1399 ctx->ptr, ctx->pattern[1]));
1400 state->ptr = ctx->ptr - ctx->pattern[1];
1401 if (state->ptr < state->beginning)
1402 RETURN_FAILURE;
1403 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1404 RETURN_ON_FAILURE(ret);
1405 ctx->pattern += ctx->pattern[0];
1406 break;
1407
1408 case SRE_OP_ASSERT_NOT:
1409 /* assert not subpattern */
1410 /* <ASSERT_NOT> <skip> <back> <pattern> */
1411 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1412 ctx->ptr, ctx->pattern[1]));
1413 state->ptr = ctx->ptr - ctx->pattern[1];
1414 if (state->ptr >= state->beginning) {
1415 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1416 if (ret) {
1417 RETURN_ON_ERROR(ret);
1418 RETURN_FAILURE;
1419 }
1420 }
1421 ctx->pattern += ctx->pattern[0];
1422 break;
1423
1424 case SRE_OP_FAILURE:
1425 /* immediate failure */
1426 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1427 RETURN_FAILURE;
1428
1429 default:
1430 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1431 ctx->pattern[-1]));
1432 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1433 }
1434 }
1435
1436 exit:
1437 ctx_pos = ctx->last_ctx_pos;
1438 jump = ctx->jump;
1439 DATA_POP_DISCARD(ctx);
1440 if (ctx_pos == -1)
1441 return ret;
1442 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1443
1444 switch (jump) {
1445 case JUMP_MAX_UNTIL_2:
1446 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1447 goto jump_max_until_2;
1448 case JUMP_MAX_UNTIL_3:
1449 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1450 goto jump_max_until_3;
1451 case JUMP_MIN_UNTIL_2:
1452 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1453 goto jump_min_until_2;
1454 case JUMP_MIN_UNTIL_3:
1455 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1456 goto jump_min_until_3;
1457 case JUMP_BRANCH:
1458 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1459 goto jump_branch;
1460 case JUMP_MAX_UNTIL_1:
1461 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1462 goto jump_max_until_1;
1463 case JUMP_MIN_UNTIL_1:
1464 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1465 goto jump_min_until_1;
1466 case JUMP_REPEAT:
1467 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1468 goto jump_repeat;
1469 case JUMP_REPEAT_ONE_1:
1470 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1471 goto jump_repeat_one_1;
1472 case JUMP_REPEAT_ONE_2:
1473 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1474 goto jump_repeat_one_2;
1475 case JUMP_MIN_REPEAT_ONE:
1476 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1477 goto jump_min_repeat_one;
1478 case JUMP_ASSERT:
1479 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1480 goto jump_assert;
1481 case JUMP_ASSERT_NOT:
1482 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1483 goto jump_assert_not;
1484 case JUMP_NONE:
1485 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1486 break;
1487 }
1488
1489 return ret; /* should never get here */
1490 }
1491
1492 LOCAL(Py_ssize_t)
1493 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1494 {
1495 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1496 SRE_CHAR* end = (SRE_CHAR *)state->end;
1497 Py_ssize_t status = 0;
1498 Py_ssize_t prefix_len = 0;
1499 Py_ssize_t prefix_skip = 0;
1500 SRE_CODE* prefix = NULL;
1501 SRE_CODE* charset = NULL;
1502 SRE_CODE* overlap = NULL;
1503 int flags = 0;
1504
1505 if (pattern[0] == SRE_OP_INFO) {
1506 /* optimization info block */
1507 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1508
1509 flags = pattern[2];
1510
1511 if (pattern[3] > 1) {
1512 /* adjust end point (but make sure we leave at least one
1513 character in there, so literal search will work) */
1514 end -= pattern[3]-1;
1515 if (end <= ptr)
1516 end = ptr+1;
1517 }
1518
1519 if (flags & SRE_INFO_PREFIX) {
1520 /* pattern starts with a known prefix */
1521 /* <length> <skip> <prefix data> <overlap data> */
1522 prefix_len = pattern[5];
1523 prefix_skip = pattern[6];
1524 prefix = pattern + 7;
1525 overlap = prefix + prefix_len - 1;
1526 } else if (flags & SRE_INFO_CHARSET)
1527 /* pattern starts with a character from a known set */
1528 /* <charset> */
1529 charset = pattern + 5;
1530
1531 pattern += 1 + pattern[1];
1532 }
1533
1534 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1535 TRACE(("charset = %p\n", charset));
1536
1537 #if defined(USE_FAST_SEARCH)
1538 if (prefix_len > 1) {
1539 /* pattern starts with a known prefix. use the overlap
1540 table to skip forward as fast as we possibly can */
1541 Py_ssize_t i = 0;
1542 end = (SRE_CHAR *)state->end;
1543 while (ptr < end) {
1544 for (;;) {
1545 if ((SRE_CODE) ptr[0] != prefix[i]) {
1546 if (!i)
1547 break;
1548 else
1549 i = overlap[i];
1550 } else {
1551 if (++i == prefix_len) {
1552 /* found a potential match */
1553 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1554 state->start = ptr + 1 - prefix_len;
1555 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1556 if (flags & SRE_INFO_LITERAL)
1557 return 1; /* we got all of it */
1558 status = SRE_MATCH(state, pattern + 2*prefix_skip);
1559 if (status != 0)
1560 return status;
1561 /* close but no cigar -- try again */
1562 i = overlap[i];
1563 }
1564 break;
1565 }
1566 }
1567 ptr++;
1568 }
1569 return 0;
1570 }
1571 #endif
1572
1573 if (pattern[0] == SRE_OP_LITERAL) {
1574 /* pattern starts with a literal character. this is used
1575 for short prefixes, and if fast search is disabled */
1576 SRE_CODE chr = pattern[1];
1577 end = (SRE_CHAR *)state->end;
1578 for (;;) {
1579 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1580 ptr++;
1581 if (ptr >= end)
1582 return 0;
1583 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1584 state->start = ptr;
1585 state->ptr = ++ptr;
1586 if (flags & SRE_INFO_LITERAL)
1587 return 1; /* we got all of it */
1588 status = SRE_MATCH(state, pattern + 2);
1589 if (status != 0)
1590 break;
1591 }
1592 } else if (charset) {
1593 /* pattern starts with a character from a known set */
1594 end = (SRE_CHAR *)state->end;
1595 for (;;) {
1596 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1597 ptr++;
1598 if (ptr >= end)
1599 return 0;
1600 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1601 state->start = ptr;
1602 state->ptr = ptr;
1603 status = SRE_MATCH(state, pattern);
1604 if (status != 0)
1605 break;
1606 ptr++;
1607 }
1608 } else
1609 /* general case */
1610 while (ptr <= end) {
1611 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1612 state->start = state->ptr = ptr++;
1613 status = SRE_MATCH(state, pattern);
1614 if (status != 0)
1615 break;
1616 }
1617
1618 return status;
1619 }
1620
1621 LOCAL(int)
1622 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1623 {
1624 /* check if given string is a literal template (i.e. no escapes) */
1625 while (len-- > 0)
1626 if (*ptr++ == '\\')
1627 return 0;
1628 return 1;
1629 }
1630
1631 #if !defined(SRE_RECURSIVE)
1632
1633 /* -------------------------------------------------------------------- */
1634 /* factories and destructors */
1635
1636 /* see sre.h for object declarations */
1637 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1638 static PyObject*pattern_scanner(PatternObject*, PyObject*);
1639
1640 static PyObject *
1641 sre_codesize(PyObject* self, PyObject *unused)
1642 {
1643 return Py_BuildValue("l", sizeof(SRE_CODE));
1644 }
1645
1646 static PyObject *
1647 sre_getlower(PyObject* self, PyObject* args)
1648 {
1649 int character, flags;
1650 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1651 return NULL;
1652 if (flags & SRE_FLAG_LOCALE)
1653 return Py_BuildValue("i", sre_lower_locale(character));
1654 if (flags & SRE_FLAG_UNICODE)
1655 #if defined(HAVE_UNICODE)
1656 return Py_BuildValue("i", sre_lower_unicode(character));
1657 #else
1658 return Py_BuildValue("i", sre_lower_locale(character));
1659 #endif
1660 return Py_BuildValue("i", sre_lower(character));
1661 }
1662
1663 LOCAL(void)
1664 state_reset(SRE_STATE* state)
1665 {
1666 /* FIXME: dynamic! */
1667 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1668
1669 state->lastmark = -1;
1670 state->lastindex = -1;
1671
1672 state->repeat = NULL;
1673
1674 data_stack_dealloc(state);
1675 }
1676
1677 static void*
1678 getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1679 {
1680 /* given a python object, return a data pointer, a length (in
1681 characters), and a character size. return NULL if the object
1682 is not a string (or not compatible) */
1683
1684 PyBufferProcs *buffer;
1685 Py_ssize_t size, bytes;
1686 int charsize;
1687 void* ptr;
1688
1689 #if defined(HAVE_UNICODE)
1690 if (PyUnicode_Check(string)) {
1691 /* unicode strings doesn't always support the buffer interface */
1692 ptr = (void*) PyUnicode_AS_DATA(string);
1693 /* bytes = PyUnicode_GET_DATA_SIZE(string); */
1694 size = PyUnicode_GET_SIZE(string);
1695 charsize = sizeof(Py_UNICODE);
1696
1697 } else {
1698 #endif
1699
1700 /* get pointer to string buffer */
1701 buffer = Py_TYPE(string)->tp_as_buffer;
1702 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1703 buffer->bf_getsegcount(string, NULL) != 1) {
1704 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1705 return NULL;
1706 }
1707
1708 /* determine buffer size */
1709 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1710 if (bytes < 0) {
1711 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1712 return NULL;
1713 }
1714
1715 /* determine character size */
1716 #if PY_VERSION_HEX >= 0x01060000
1717 size = PyObject_Size(string);
1718 #else
1719 size = PyObject_Length(string);
1720 #endif
1721
1722 if (PyString_Check(string) || bytes == size)
1723 charsize = 1;
1724 #if defined(HAVE_UNICODE)
1725 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1726 charsize = sizeof(Py_UNICODE);
1727 #endif
1728 else {
1729 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1730 return NULL;
1731 }
1732
1733 #if defined(HAVE_UNICODE)
1734 }
1735 #endif
1736
1737 *p_length = size;
1738 *p_charsize = charsize;
1739
1740 return ptr;
1741 }
1742
1743 LOCAL(PyObject*)
1744 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1745 Py_ssize_t start, Py_ssize_t end)
1746 {
1747 /* prepare state object */
1748
1749 Py_ssize_t length;
1750 int charsize;
1751 void* ptr;
1752
1753 memset(state, 0, sizeof(SRE_STATE));
1754
1755 state->lastmark = -1;
1756 state->lastindex = -1;
1757
1758 ptr = getstring(string, &length, &charsize);
1759 if (!ptr)
1760 return NULL;
1761
1762 /* adjust boundaries */
1763 if (start < 0)
1764 start = 0;
1765 else if (start > length)
1766 start = length;
1767
1768 if (end < 0)
1769 end = 0;
1770 else if (end > length)
1771 end = length;
1772
1773 state->charsize = charsize;
1774
1775 state->beginning = ptr;
1776
1777 state->start = (void*) ((char*) ptr + start * state->charsize);
1778 state->end = (void*) ((char*) ptr + end * state->charsize);
1779
1780 Py_INCREF(string);
1781 state->string = string;
1782 state->pos = start;
1783 state->endpos = end;
1784
1785 if (pattern->flags & SRE_FLAG_LOCALE)
1786 state->lower = sre_lower_locale;
1787 else if (pattern->flags & SRE_FLAG_UNICODE)
1788 #if defined(HAVE_UNICODE)
1789 state->lower = sre_lower_unicode;
1790 #else
1791 state->lower = sre_lower_locale;
1792 #endif
1793 else
1794 state->lower = sre_lower;
1795
1796 return string;
1797 }
1798
1799 LOCAL(void)
1800 state_fini(SRE_STATE* state)
1801 {
1802 Py_XDECREF(state->string);
1803 data_stack_dealloc(state);
1804 }
1805
1806 /* calculate offset from start of string */
1807 #define STATE_OFFSET(state, member)\
1808 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1809
1810 LOCAL(PyObject*)
1811 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1812 {
1813 Py_ssize_t i, j;
1814
1815 index = (index - 1) * 2;
1816
1817 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1818 if (empty)
1819 /* want empty string */
1820 i = j = 0;
1821 else {
1822 Py_INCREF(Py_None);
1823 return Py_None;
1824 }
1825 } else {
1826 i = STATE_OFFSET(state, state->mark[index]);
1827 j = STATE_OFFSET(state, state->mark[index+1]);
1828 }
1829
1830 return PySequence_GetSlice(string, i, j);
1831 }
1832
1833 static void
1834 pattern_error(int status)
1835 {
1836 switch (status) {
1837 case SRE_ERROR_RECURSION_LIMIT:
1838 PyErr_SetString(
1839 PyExc_RuntimeError,
1840 "maximum recursion limit exceeded"
1841 );
1842 break;
1843 case SRE_ERROR_MEMORY:
1844 PyErr_NoMemory();
1845 break;
1846 case SRE_ERROR_INTERRUPTED:
1847 /* An exception has already been raised, so let it fly */
1848 break;
1849 default:
1850 /* other error codes indicate compiler/engine bugs */
1851 PyErr_SetString(
1852 PyExc_RuntimeError,
1853 "internal error in regular expression engine"
1854 );
1855 }
1856 }
1857
1858 static void
1859 pattern_dealloc(PatternObject* self)
1860 {
1861 if (self->weakreflist != NULL)
1862 PyObject_ClearWeakRefs((PyObject *) self);
1863 Py_XDECREF(self->pattern);
1864 Py_XDECREF(self->groupindex);
1865 Py_XDECREF(self->indexgroup);
1866 PyObject_DEL(self);
1867 }
1868
1869 static PyObject*
1870 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1871 {
1872 SRE_STATE state;
1873 int status;
1874
1875 PyObject* string;
1876 Py_ssize_t start = 0;
1877 Py_ssize_t end = PY_SSIZE_T_MAX;
1878 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1879 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
1880 &string, &start, &end))
1881 return NULL;
1882
1883 string = state_init(&state, self, string, start, end);
1884 if (!string)
1885 return NULL;
1886
1887 state.ptr = state.start;
1888
1889 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1890
1891 if (state.charsize == 1) {
1892 status = sre_match(&state, PatternObject_GetCode(self));
1893 } else {
1894 #if defined(HAVE_UNICODE)
1895 status = sre_umatch(&state, PatternObject_GetCode(self));
1896 #endif
1897 }
1898
1899 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1900 if (PyErr_Occurred())
1901 return NULL;
1902
1903 state_fini(&state);
1904
1905 return pattern_new_match(self, &state, status);
1906 }
1907
1908 static PyObject*
1909 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1910 {
1911 SRE_STATE state;
1912 int status;
1913
1914 PyObject* string;
1915 Py_ssize_t start = 0;
1916 Py_ssize_t end = PY_SSIZE_T_MAX;
1917 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1918 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
1919 &string, &start, &end))
1920 return NULL;
1921
1922 string = state_init(&state, self, string, start, end);
1923 if (!string)
1924 return NULL;
1925
1926 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1927
1928 if (state.charsize == 1) {
1929 status = sre_search(&state, PatternObject_GetCode(self));
1930 } else {
1931 #if defined(HAVE_UNICODE)
1932 status = sre_usearch(&state, PatternObject_GetCode(self));
1933 #endif
1934 }
1935
1936 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1937
1938 state_fini(&state);
1939
1940 if (PyErr_Occurred())
1941 return NULL;
1942
1943 return pattern_new_match(self, &state, status);
1944 }
1945
1946 static PyObject*
1947 call(char* module, char* function, PyObject* args)
1948 {
1949 PyObject* name;
1950 PyObject* mod;
1951 PyObject* func;
1952 PyObject* result;
1953
1954 if (!args)
1955 return NULL;
1956 name = PyString_FromString(module);
1957 if (!name)
1958 return NULL;
1959 mod = PyImport_Import(name);
1960 Py_DECREF(name);
1961 if (!mod)
1962 return NULL;
1963 func = PyObject_GetAttrString(mod, function);
1964 Py_DECREF(mod);
1965 if (!func)
1966 return NULL;
1967 result = PyObject_CallObject(func, args);
1968 Py_DECREF(func);
1969 Py_DECREF(args);
1970 return result;
1971 }
1972
1973 #ifdef USE_BUILTIN_COPY
1974 static int
1975 deepcopy(PyObject** object, PyObject* memo)
1976 {
1977 PyObject* copy;
1978
1979 copy = call(
1980 "copy", "deepcopy",
1981 PyTuple_Pack(2, *object, memo)
1982 );
1983 if (!copy)
1984 return 0;
1985
1986 Py_DECREF(*object);
1987 *object = copy;
1988
1989 return 1; /* success */
1990 }
1991 #endif
1992
1993 static PyObject*
1994 join_list(PyObject* list, PyObject* string)
1995 {
1996 /* join list elements */
1997
1998 PyObject* joiner;
1999 #if PY_VERSION_HEX >= 0x01060000
2000 PyObject* function;
2001 PyObject* args;
2002 #endif
2003 PyObject* result;
2004
2005 joiner = PySequence_GetSlice(string, 0, 0);
2006 if (!joiner)
2007 return NULL;
2008
2009 if (PyList_GET_SIZE(list) == 0) {
2010 Py_DECREF(list);
2011 return joiner;
2012 }
2013
2014 #if PY_VERSION_HEX >= 0x01060000
2015 function = PyObject_GetAttrString(joiner, "join");
2016 if (!function) {
2017 Py_DECREF(joiner);
2018 return NULL;
2019 }
2020 args = PyTuple_New(1);
2021 if (!args) {
2022 Py_DECREF(function);
2023 Py_DECREF(joiner);
2024 return NULL;
2025 }
2026 PyTuple_SET_ITEM(args, 0, list);
2027 result = PyObject_CallObject(function, args);
2028 Py_DECREF(args); /* also removes list */
2029 Py_DECREF(function);
2030 #else
2031 result = call(
2032 "string", "join",
2033 PyTuple_Pack(2, list, joiner)
2034 );
2035 #endif
2036 Py_DECREF(joiner);
2037
2038 return result;
2039 }
2040
2041 static PyObject*
2042 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2043 {
2044 SRE_STATE state;
2045 PyObject* list;
2046 int status;
2047 Py_ssize_t i, b, e;
2048
2049 PyObject* string;
2050 Py_ssize_t start = 0;
2051 Py_ssize_t end = PY_SSIZE_T_MAX;
2052 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2053 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
2054 &string, &start, &end))
2055 return NULL;
2056
2057 string = state_init(&state, self, string, start, end);
2058 if (!string)
2059 return NULL;
2060
2061 list = PyList_New(0);
2062 if (!list) {
2063 state_fini(&state);
2064 return NULL;
2065 }
2066
2067 while (state.start <= state.end) {
2068
2069 PyObject* item;
2070
2071 state_reset(&state);
2072
2073 state.ptr = state.start;
2074
2075 if (state.charsize == 1) {
2076 status = sre_search(&state, PatternObject_GetCode(self));
2077 } else {
2078 #if defined(HAVE_UNICODE)
2079 status = sre_usearch(&state, PatternObject_GetCode(self));
2080 #endif
2081 }
2082
2083 if (PyErr_Occurred())
2084 goto error;
2085
2086 if (status <= 0) {
2087 if (status == 0)
2088 break;
2089 pattern_error(status);
2090 goto error;
2091 }
2092
2093 /* don't bother to build a match object */
2094 switch (self->groups) {
2095 case 0:
2096 b = STATE_OFFSET(&state, state.start);
2097 e = STATE_OFFSET(&state, state.ptr);
2098 item = PySequence_GetSlice(string, b, e);
2099 if (!item)
2100 goto error;
2101 break;
2102 case 1:
2103 item = state_getslice(&state, 1, string, 1);
2104 if (!item)
2105 goto error;
2106 break;
2107 default:
2108 item = PyTuple_New(self->groups);
2109 if (!item)
2110 goto error;
2111 for (i = 0; i < self->groups; i++) {
2112 PyObject* o = state_getslice(&state, i+1, string, 1);
2113 if (!o) {
2114 Py_DECREF(item);
2115 goto error;
2116 }
2117 PyTuple_SET_ITEM(item, i, o);
2118 }
2119 break;
2120 }
2121
2122 status = PyList_Append(list, item);
2123 Py_DECREF(item);
2124 if (status < 0)
2125 goto error;
2126
2127 if (state.ptr == state.start)
2128 state.start = (void*) ((char*) state.ptr + state.charsize);
2129 else
2130 state.start = state.ptr;
2131
2132 }
2133
2134 state_fini(&state);
2135 return list;
2136
2137 error:
2138 Py_DECREF(list);
2139 state_fini(&state);
2140 return NULL;
2141
2142 }
2143
2144 #if PY_VERSION_HEX >= 0x02020000
2145 static PyObject*
2146 pattern_finditer(PatternObject* pattern, PyObject* args)
2147 {
2148 PyObject* scanner;
2149 PyObject* search;
2150 PyObject* iterator;
2151
2152 scanner = pattern_scanner(pattern, args);
2153 if (!scanner)
2154 return NULL;
2155
2156 search = PyObject_GetAttrString(scanner, "search");
2157 Py_DECREF(scanner);
2158 if (!search)
2159 return NULL;
2160
2161 iterator = PyCallIter_New(search, Py_None);
2162 Py_DECREF(search);
2163
2164 return iterator;
2165 }
2166 #endif
2167
2168 static PyObject*
2169 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2170 {
2171 SRE_STATE state;
2172 PyObject* list;
2173 PyObject* item;
2174 int status;
2175 Py_ssize_t n;
2176 Py_ssize_t i;
2177 void* last;
2178
2179 PyObject* string;
2180 Py_ssize_t maxsplit = 0;
2181 static char* kwlist[] = { "source", "maxsplit", NULL };
2182 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
2183 &string, &maxsplit))
2184 return NULL;
2185
2186 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2187 if (!string)
2188 return NULL;
2189
2190 list = PyList_New(0);
2191 if (!list) {
2192 state_fini(&state);
2193 return NULL;
2194 }
2195
2196 n = 0;
2197 last = state.start;
2198
2199 while (!maxsplit || n < maxsplit) {
2200
2201 state_reset(&state);
2202
2203 state.ptr = state.start;
2204
2205 if (state.charsize == 1) {
2206 status = sre_search(&state, PatternObject_GetCode(self));
2207 } else {
2208 #if defined(HAVE_UNICODE)
2209 status = sre_usearch(&state, PatternObject_GetCode(self));
2210 #endif
2211 }
2212
2213 if (PyErr_Occurred())
2214 goto error;
2215
2216 if (status <= 0) {
2217 if (status == 0)
2218 break;
2219 pattern_error(status);
2220 goto error;
2221 }
2222
2223 if (state.start == state.ptr) {
2224 if (last == state.end)
2225 break;
2226 /* skip one character */
2227 state.start = (void*) ((char*) state.ptr + state.charsize);
2228 continue;
2229 }
2230
2231 /* get segment before this match */
2232 item = PySequence_GetSlice(
2233 string, STATE_OFFSET(&state, last),
2234 STATE_OFFSET(&state, state.start)
2235 );
2236 if (!item)
2237 goto error;
2238 status = PyList_Append(list, item);
2239 Py_DECREF(item);
2240 if (status < 0)
2241 goto error;
2242
2243 /* add groups (if any) */
2244 for (i = 0; i < self->groups; i++) {
2245 item = state_getslice(&state, i+1, string, 0);
2246 if (!item)
2247 goto error;
2248 status = PyList_Append(list, item);
2249 Py_DECREF(item);
2250 if (status < 0)
2251 goto error;
2252 }
2253
2254 n = n + 1;
2255
2256 last = state.start = state.ptr;
2257
2258 }
2259
2260 /* get segment following last match (even if empty) */
2261 item = PySequence_GetSlice(
2262 string, STATE_OFFSET(&state, last), state.endpos
2263 );
2264 if (!item)
2265 goto error;
2266 status = PyList_Append(list, item);
2267 Py_DECREF(item);
2268 if (status < 0)
2269 goto error;
2270
2271 state_fini(&state);
2272 return list;
2273
2274 error:
2275 Py_DECREF(list);
2276 state_fini(&state);
2277 return NULL;
2278
2279 }
2280
2281 static PyObject*
2282 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2283 Py_ssize_t count, Py_ssize_t subn)
2284 {
2285 SRE_STATE state;
2286 PyObject* list;
2287 PyObject* item;
2288 PyObject* filter;
2289 PyObject* args;
2290 PyObject* match;
2291 void* ptr;
2292 int status;
2293 Py_ssize_t n;
2294 Py_ssize_t i, b, e;
2295 int bint;
2296 int filter_is_callable;
2297
2298 if (PyCallable_Check(ptemplate)) {
2299 /* sub/subn takes either a function or a template */
2300 filter = ptemplate;
2301 Py_INCREF(filter);
2302 filter_is_callable = 1;
2303 } else {
2304 /* if not callable, check if it's a literal string */
2305 int literal;
2306 ptr = getstring(ptemplate, &n, &bint);
2307 b = bint;
2308 if (ptr) {
2309 if (b == 1) {
2310 literal = sre_literal_template((unsigned char *)ptr, n);
2311 } else {
2312 #if defined(HAVE_UNICODE)
2313 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2314 #endif
2315 }
2316 } else {
2317 PyErr_Clear();
2318 literal = 0;
2319 }
2320 if (literal) {
2321 filter = ptemplate;
2322 Py_INCREF(filter);
2323 filter_is_callable = 0;
2324 } else {
2325 /* not a literal; hand it over to the template compiler */
2326 filter = call(
2327 SRE_PY_MODULE, "_subx",
2328 PyTuple_Pack(2, self, ptemplate)
2329 );
2330 if (!filter)
2331 return NULL;
2332 filter_is_callable = PyCallable_Check(filter);
2333 }
2334 }
2335
2336 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2337 if (!string) {
2338 Py_DECREF(filter);
2339 return NULL;
2340 }
2341
2342 list = PyList_New(0);
2343 if (!list) {
2344 Py_DECREF(filter);
2345 state_fini(&state);
2346 return NULL;
2347 }
2348
2349 n = i = 0;
2350
2351 while (!count || n < count) {
2352
2353 state_reset(&state);
2354
2355 state.ptr = state.start;
2356
2357 if (state.charsize == 1) {
2358 status = sre_search(&state, PatternObject_GetCode(self));
2359 } else {
2360 #if defined(HAVE_UNICODE)
2361 status = sre_usearch(&state, PatternObject_GetCode(self));
2362 #endif
2363 }
2364
2365 if (PyErr_Occurred())
2366 goto error;
2367
2368 if (status <= 0) {
2369 if (status == 0)
2370 break;
2371 pattern_error(status);
2372 goto error;
2373 }
2374
2375 b = STATE_OFFSET(&state, state.start);
2376 e = STATE_OFFSET(&state, state.ptr);
2377
2378 if (i < b) {
2379 /* get segment before this match */
2380 item = PySequence_GetSlice(string, i, b);
2381 if (!item)
2382 goto error;
2383 status = PyList_Append(list, item);
2384 Py_DECREF(item);
2385 if (status < 0)
2386 goto error;
2387
2388 } else if (i == b && i == e && n > 0)
2389 /* ignore empty match on latest position */
2390 goto next;
2391
2392 if (filter_is_callable) {
2393 /* pass match object through filter */
2394 match = pattern_new_match(self, &state, 1);
2395 if (!match)
2396 goto error;
2397 args = PyTuple_Pack(1, match);
2398 if (!args) {
2399 Py_DECREF(match);
2400 goto error;
2401 }
2402 item = PyObject_CallObject(filter, args);
2403 Py_DECREF(args);
2404 Py_DECREF(match);
2405 if (!item)
2406 goto error;
2407 } else {
2408 /* filter is literal string */
2409 item = filter;
2410 Py_INCREF(item);
2411 }
2412
2413 /* add to list */
2414 if (item != Py_None) {
2415 status = PyList_Append(list, item);
2416 Py_DECREF(item);
2417 if (status < 0)
2418 goto error;
2419 }
2420
2421 i = e;
2422 n = n + 1;
2423
2424 next:
2425 /* move on */
2426 if (state.ptr == state.start)
2427 state.start = (void*) ((char*) state.ptr + state.charsize);
2428 else
2429 state.start = state.ptr;
2430
2431 }
2432
2433 /* get segment following last match */
2434 if (i < state.endpos) {
2435 item = PySequence_GetSlice(string, i, state.endpos);
2436 if (!item)
2437 goto error;
2438 status = PyList_Append(list, item);
2439 Py_DECREF(item);
2440 if (status < 0)
2441 goto error;
2442 }
2443
2444 state_fini(&state);
2445
2446 Py_DECREF(filter);
2447
2448 /* convert list to single string (also removes list) */
2449 item = join_list(list, string);
2450
2451 if (!item)
2452 return NULL;
2453
2454 if (subn)
2455 return Py_BuildValue("Ni", item, n);
2456
2457 return item;
2458
2459 error:
2460 Py_DECREF(list);
2461 state_fini(&state);
2462 Py_DECREF(filter);
2463 return NULL;
2464
2465 }
2466
2467 static PyObject*
2468 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2469 {
2470 PyObject* ptemplate;
2471 PyObject* string;
2472 Py_ssize_t count = 0;
2473 static char* kwlist[] = { "repl", "string", "count", NULL };
2474 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2475 &ptemplate, &string, &count))
2476 return NULL;
2477
2478 return pattern_subx(self, ptemplate, string, count, 0);
2479 }
2480
2481 static PyObject*
2482 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2483 {
2484 PyObject* ptemplate;
2485 PyObject* string;
2486 Py_ssize_t count = 0;
2487 static char* kwlist[] = { "repl", "string", "count", NULL };
2488 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2489 &ptemplate, &string, &count))
2490 return NULL;
2491
2492 return pattern_subx(self, ptemplate, string, count, 1);
2493 }
2494
2495 static PyObject*
2496 pattern_copy(PatternObject* self, PyObject *unused)
2497 {
2498 #ifdef USE_BUILTIN_COPY
2499 PatternObject* copy;
2500 int offset;
2501
2502 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2503 if (!copy)
2504 return NULL;
2505
2506 offset = offsetof(PatternObject, groups);
2507
2508 Py_XINCREF(self->groupindex);
2509 Py_XINCREF(self->indexgroup);
2510 Py_XINCREF(self->pattern);
2511
2512 memcpy((char*) copy + offset, (char*) self + offset,
2513 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2514 copy->weakreflist = NULL;
2515
2516 return (PyObject*) copy;
2517 #else
2518 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2519 return NULL;
2520 #endif
2521 }
2522
2523 static PyObject*
2524 pattern_deepcopy(PatternObject* self, PyObject* memo)
2525 {
2526 #ifdef USE_BUILTIN_COPY
2527 PatternObject* copy;
2528
2529 copy = (PatternObject*) pattern_copy(self);
2530 if (!copy)
2531 return NULL;
2532
2533 if (!deepcopy(&copy->groupindex, memo) ||
2534 !deepcopy(&copy->indexgroup, memo) ||
2535 !deepcopy(&copy->pattern, memo)) {
2536 Py_DECREF(copy);
2537 return NULL;
2538 }
2539
2540 #else
2541 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2542 return NULL;
2543 #endif
2544 }
2545
2546 PyDoc_STRVAR(pattern_match_doc,
2547 "match(string[, pos[, endpos]]) --> match object or None.\n\
2548 Matches zero or more characters at the beginning of the string");
2549
2550 PyDoc_STRVAR(pattern_search_doc,
2551 "search(string[, pos[, endpos]]) --> match object or None.\n\
2552 Scan through string looking for a match, and return a corresponding\n\
2553 MatchObject instance. Return None if no position in the string matches.");
2554
2555 PyDoc_STRVAR(pattern_split_doc,
2556 "split(string[, maxsplit = 0]) --> list.\n\
2557 Split string by the occurrences of pattern.");
2558
2559 PyDoc_STRVAR(pattern_findall_doc,
2560 "findall(string[, pos[, endpos]]) --> list.\n\
2561 Return a list of all non-overlapping matches of pattern in string.");
2562
2563 PyDoc_STRVAR(pattern_finditer_doc,
2564 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2565 Return an iterator over all non-overlapping matches for the \n\
2566 RE pattern in string. For each match, the iterator returns a\n\
2567 match object.");
2568
2569 PyDoc_STRVAR(pattern_sub_doc,
2570 "sub(repl, string[, count = 0]) --> newstring\n\
2571 Return the string obtained by replacing the leftmost non-overlapping\n\
2572 occurrences of pattern in string by the replacement repl.");
2573
2574 PyDoc_STRVAR(pattern_subn_doc,
2575 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2576 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2577 the leftmost non-overlapping occurrences of pattern with the\n\
2578 replacement repl.");
2579
2580 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2581
2582 static PyMethodDef pattern_methods[] = {
2583 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2584 pattern_match_doc},
2585 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2586 pattern_search_doc},
2587 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2588 pattern_sub_doc},
2589 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2590 pattern_subn_doc},
2591 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2592 pattern_split_doc},
2593 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2594 pattern_findall_doc},
2595 #if PY_VERSION_HEX >= 0x02020000
2596 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2597 pattern_finditer_doc},
2598 #endif
2599 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2600 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2601 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2602 {NULL, NULL}
2603 };
2604
2605 #define PAT_OFF(x) offsetof(PatternObject, x)
2606 static PyMemberDef pattern_members[] = {
2607 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2608 {"flags", T_INT, PAT_OFF(flags), READONLY},
2609 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2610 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2611 {NULL} /* Sentinel */
2612 };
2613
2614 statichere PyTypeObject Pattern_Type = {
2615 PyObject_HEAD_INIT(NULL)
2616 0, "_" SRE_MODULE ".SRE_Pattern",
2617 sizeof(PatternObject), sizeof(SRE_CODE),
2618 (destructor)pattern_dealloc, /*tp_dealloc*/
2619 0, /* tp_print */
2620 0, /* tp_getattrn */
2621 0, /* tp_setattr */
2622 0, /* tp_compare */
2623 0, /* tp_repr */
2624 0, /* tp_as_number */
2625 0, /* tp_as_sequence */
2626 0, /* tp_as_mapping */
2627 0, /* tp_hash */
2628 0, /* tp_call */
2629 0, /* tp_str */
2630 0, /* tp_getattro */
2631 0, /* tp_setattro */
2632 0, /* tp_as_buffer */
2633 Py_TPFLAGS_DEFAULT, /* tp_flags */
2634 pattern_doc, /* tp_doc */
2635 0, /* tp_traverse */
2636 0, /* tp_clear */
2637 0, /* tp_richcompare */
2638 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2639 0, /* tp_iter */
2640 0, /* tp_iternext */
2641 pattern_methods, /* tp_methods */
2642 pattern_members, /* tp_members */
2643 };
2644
2645 static int _validate(PatternObject *self); /* Forward */
2646
2647 static PyObject *
2648 _compile(PyObject* self_, PyObject* args)
2649 {
2650 /* "compile" pattern descriptor to pattern object */
2651
2652 PatternObject* self;
2653 Py_ssize_t i, n;
2654
2655 PyObject* pattern;
2656 int flags = 0;
2657 PyObject* code;
2658 Py_ssize_t groups = 0;
2659 PyObject* groupindex = NULL;
2660 PyObject* indexgroup = NULL;
2661 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2662 &PyList_Type, &code, &groups,
2663 &groupindex, &indexgroup))
2664 return NULL;
2665
2666 n = PyList_GET_SIZE(code);
2667 /* coverity[ampersand_in_size] */
2668 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2669 if (!self)
2670 return NULL;
2671 self->weakreflist = NULL;
2672 self->pattern = NULL;
2673 self->groupindex = NULL;
2674 self->indexgroup = NULL;
2675
2676 self->codesize = n;
2677
2678 for (i = 0; i < n; i++) {
2679 PyObject *o = PyList_GET_ITEM(code, i);
2680 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2681 : PyLong_AsUnsignedLong(o);
2682 self->code[i] = (SRE_CODE) value;
2683 if ((unsigned long) self->code[i] != value) {
2684 PyErr_SetString(PyExc_OverflowError,
2685 "regular expression code size limit exceeded");
2686 break;
2687 }
2688 }
2689
2690 if (PyErr_Occurred()) {
2691 Py_DECREF(self);
2692 return NULL;
2693 }
2694
2695 Py_INCREF(pattern);
2696 self->pattern = pattern;
2697
2698 self->flags = flags;
2699
2700 self->groups = groups;
2701
2702 Py_XINCREF(groupindex);
2703 self->groupindex = groupindex;
2704
2705 Py_XINCREF(indexgroup);
2706 self->indexgroup = indexgroup;
2707
2708 self->weakreflist = NULL;
2709
2710 if (!_validate(self)) {
2711 Py_DECREF(self);
2712 return NULL;
2713 }
2714
2715 return (PyObject*) self;
2716 }
2717
2718 /* -------------------------------------------------------------------- */
2719 /* Code validation */
2720
2721 /* To learn more about this code, have a look at the _compile() function in
2722 Lib/sre_compile.py. The validation functions below checks the code array
2723 for conformance with the code patterns generated there.
2724
2725 The nice thing about the generated code is that it is position-independent:
2726 all jumps are relative jumps forward. Also, jumps don't cross each other:
2727 the target of a later jump is always earlier than the target of an earlier
2728 jump. IOW, this is okay:
2729
2730 J---------J-------T--------T
2731 \ \_____/ /
2732 \______________________/
2733
2734 but this is not:
2735
2736 J---------J-------T--------T
2737 \_________\_____/ /
2738 \____________/
2739
2740 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2741 bytes wide (the latter if Python is compiled for "wide" unicode support).
2742 */
2743
2744 /* Defining this one enables tracing of the validator */
2745 #undef VVERBOSE
2746
2747 /* Trace macro for the validator */
2748 #if defined(VVERBOSE)
2749 #define VTRACE(v) printf v
2750 #else
2751 #define VTRACE(v)
2752 #endif
2753
2754 /* Report failure */
2755 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2756
2757 /* Extract opcode, argument, or skip count from code array */
2758 #define GET_OP \
2759 do { \
2760 VTRACE(("%p: ", code)); \
2761 if (code >= end) FAIL; \
2762 op = *code++; \
2763 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2764 } while (0)
2765 #define GET_ARG \
2766 do { \
2767 VTRACE(("%p= ", code)); \
2768 if (code >= end) FAIL; \
2769 arg = *code++; \
2770 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2771 } while (0)
2772 #define GET_SKIP_ADJ(adj) \
2773 do { \
2774 VTRACE(("%p= ", code)); \
2775 if (code >= end) FAIL; \
2776 skip = *code; \
2777 VTRACE(("%lu (skip to %p)\n", \
2778 (unsigned long)skip, code+skip)); \
2779 if (code+skip-adj < code || code+skip-adj > end)\
2780 FAIL; \
2781 code++; \
2782 } while (0)
2783 #define GET_SKIP GET_SKIP_ADJ(0)
2784
2785 static int
2786 _validate_charset(SRE_CODE *code, SRE_CODE *end)
2787 {
2788 /* Some variables are manipulated by the macros above */
2789 SRE_CODE op;
2790 SRE_CODE arg;
2791 SRE_CODE offset;
2792 int i;
2793
2794 while (code < end) {
2795 GET_OP;
2796 switch (op) {
2797
2798 case SRE_OP_NEGATE:
2799 break;
2800
2801 case SRE_OP_LITERAL:
2802 GET_ARG;
2803 break;
2804
2805 case SRE_OP_RANGE:
2806 GET_ARG;
2807 GET_ARG;
2808 break;
2809
2810 case SRE_OP_CHARSET:
2811 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2812 if (code+offset < code || code+offset > end)
2813 FAIL;
2814 code += offset;
2815 break;
2816
2817 case SRE_OP_BIGCHARSET:
2818 GET_ARG; /* Number of blocks */
2819 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2820 if (code+offset < code || code+offset > end)
2821 FAIL;
2822 /* Make sure that each byte points to a valid block */
2823 for (i = 0; i < 256; i++) {
2824 if (((unsigned char *)code)[i] >= arg)
2825 FAIL;
2826 }
2827 code += offset;
2828 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2829 if (code+offset < code || code+offset > end)
2830 FAIL;
2831 code += offset;
2832 break;
2833
2834 case SRE_OP_CATEGORY:
2835 GET_ARG;
2836 switch (arg) {
2837 case SRE_CATEGORY_DIGIT:
2838 case SRE_CATEGORY_NOT_DIGIT:
2839 case SRE_CATEGORY_SPACE:
2840 case SRE_CATEGORY_NOT_SPACE:
2841 case SRE_CATEGORY_WORD:
2842 case SRE_CATEGORY_NOT_WORD:
2843 case SRE_CATEGORY_LINEBREAK:
2844 case SRE_CATEGORY_NOT_LINEBREAK:
2845 case SRE_CATEGORY_LOC_WORD:
2846 case SRE_CATEGORY_LOC_NOT_WORD:
2847 case SRE_CATEGORY_UNI_DIGIT:
2848 case SRE_CATEGORY_UNI_NOT_DIGIT:
2849 case SRE_CATEGORY_UNI_SPACE:
2850 case SRE_CATEGORY_UNI_NOT_SPACE:
2851 case SRE_CATEGORY_UNI_WORD:
2852 case SRE_CATEGORY_UNI_NOT_WORD:
2853 case SRE_CATEGORY_UNI_LINEBREAK:
2854 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2855 break;
2856 default:
2857 FAIL;
2858 }
2859 break;
2860
2861 default:
2862 FAIL;
2863
2864 }
2865 }
2866
2867 return 1;
2868 }
2869
2870 static int
2871 _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2872 {
2873 /* Some variables are manipulated by the macros above */
2874 SRE_CODE op;
2875 SRE_CODE arg;
2876 SRE_CODE skip;
2877
2878 VTRACE(("code=%p, end=%p\n", code, end));
2879
2880 if (code > end)
2881 FAIL;
2882
2883 while (code < end) {
2884 GET_OP;
2885 switch (op) {
2886
2887 case SRE_OP_MARK:
2888 /* We don't check whether marks are properly nested; the
2889 sre_match() code is robust even if they don't, and the worst
2890 you can get is nonsensical match results. */
2891 GET_ARG;
2892 if (arg > 2*groups+1) {
2893 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2894 FAIL;
2895 }
2896 break;
2897
2898 case SRE_OP_LITERAL:
2899 case SRE_OP_NOT_LITERAL:
2900 case SRE_OP_LITERAL_IGNORE:
2901 case SRE_OP_NOT_LITERAL_IGNORE:
2902 GET_ARG;
2903 /* The arg is just a character, nothing to check */
2904 break;
2905
2906 case SRE_OP_SUCCESS:
2907 case SRE_OP_FAILURE:
2908 /* Nothing to check; these normally end the matching process */
2909 break;
2910
2911 case SRE_OP_AT:
2912 GET_ARG;
2913 switch (arg) {
2914 case SRE_AT_BEGINNING:
2915 case SRE_AT_BEGINNING_STRING:
2916 case SRE_AT_BEGINNING_LINE:
2917 case SRE_AT_END:
2918 case SRE_AT_END_LINE:
2919 case SRE_AT_END_STRING:
2920 case SRE_AT_BOUNDARY:
2921 case SRE_AT_NON_BOUNDARY:
2922 case SRE_AT_LOC_BOUNDARY:
2923 case SRE_AT_LOC_NON_BOUNDARY:
2924 case SRE_AT_UNI_BOUNDARY:
2925 case SRE_AT_UNI_NON_BOUNDARY:
2926 break;
2927 default:
2928 FAIL;
2929 }
2930 break;
2931
2932 case SRE_OP_ANY:
2933 case SRE_OP_ANY_ALL:
2934 /* These have no operands */
2935 break;
2936
2937 case SRE_OP_IN:
2938 case SRE_OP_IN_IGNORE:
2939 GET_SKIP;
2940 /* Stop 1 before the end; we check the FAILURE below */
2941 if (!_validate_charset(code, code+skip-2))
2942 FAIL;
2943 if (code[skip-2] != SRE_OP_FAILURE)
2944 FAIL;
2945 code += skip-1;
2946 break;
2947
2948 case SRE_OP_INFO:
2949 {
2950 /* A minimal info field is
2951 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2952 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2953 more follows. */
2954 SRE_CODE flags, i;
2955 SRE_CODE *newcode;
2956 GET_SKIP;
2957 newcode = code+skip-1;
2958 GET_ARG; flags = arg;
2959 GET_ARG; /* min */
2960 GET_ARG; /* max */
2961 /* Check that only valid flags are present */
2962 if ((flags & ~(SRE_INFO_PREFIX |
2963 SRE_INFO_LITERAL |
2964 SRE_INFO_CHARSET)) != 0)
2965 FAIL;
2966 /* PREFIX and CHARSET are mutually exclusive */
2967 if ((flags & SRE_INFO_PREFIX) &&
2968 (flags & SRE_INFO_CHARSET))
2969 FAIL;
2970 /* LITERAL implies PREFIX */
2971 if ((flags & SRE_INFO_LITERAL) &&
2972 !(flags & SRE_INFO_PREFIX))
2973 FAIL;
2974 /* Validate the prefix */
2975 if (flags & SRE_INFO_PREFIX) {
2976 SRE_CODE prefix_len;
2977 GET_ARG; prefix_len = arg;
2978 GET_ARG; /* prefix skip */
2979 /* Here comes the prefix string */
2980 if (code+prefix_len < code || code+prefix_len > newcode)
2981 FAIL;
2982 code += prefix_len;
2983 /* And here comes the overlap table */
2984 if (code+prefix_len < code || code+prefix_len > newcode)
2985 FAIL;
2986 /* Each overlap value should be < prefix_len */
2987 for (i = 0; i < prefix_len; i++) {
2988 if (code[i] >= prefix_len)
2989 FAIL;
2990 }
2991 code += prefix_len;
2992 }
2993 /* Validate the charset */
2994 if (flags & SRE_INFO_CHARSET) {
2995 if (!_validate_charset(code, newcode-1))
2996 FAIL;
2997 if (newcode[-1] != SRE_OP_FAILURE)
2998 FAIL;
2999 code = newcode;
3000 }
3001 else if (code != newcode) {
3002 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3003 FAIL;
3004 }
3005 }
3006 break;
3007
3008 case SRE_OP_BRANCH:
3009 {
3010 SRE_CODE *target = NULL;
3011 for (;;) {
3012 GET_SKIP;
3013 if (skip == 0)
3014 break;
3015 /* Stop 2 before the end; we check the JUMP below */
3016 if (!_validate_inner(code, code+skip-3, groups))
3017 FAIL;
3018 code += skip-3;
3019 /* Check that it ends with a JUMP, and that each JUMP
3020 has the same target */
3021 GET_OP;
3022 if (op != SRE_OP_JUMP)
3023 FAIL;
3024 GET_SKIP;
3025 if (target == NULL)
3026 target = code+skip-1;
3027 else if (code+skip-1 != target)
3028 FAIL;
3029 }
3030 }
3031 break;
3032
3033 case SRE_OP_REPEAT_ONE:
3034 case SRE_OP_MIN_REPEAT_ONE:
3035 {
3036 SRE_CODE min, max;
3037 GET_SKIP;
3038 GET_ARG; min = arg;
3039 GET_ARG; max = arg;
3040 if (min > max)
3041 FAIL;
3042 #ifdef Py_UNICODE_WIDE
3043 if (max > 65535)
3044 FAIL;
3045 #endif
3046 if (!_validate_inner(code, code+skip-4, groups))
3047 FAIL;
3048 code += skip-4;
3049 GET_OP;
3050 if (op != SRE_OP_SUCCESS)
3051 FAIL;
3052 }
3053 break;
3054
3055 case SRE_OP_REPEAT:
3056 {
3057 SRE_CODE min, max;
3058 GET_SKIP;
3059 GET_ARG; min = arg;
3060 GET_ARG; max = arg;
3061 if (min > max)
3062 FAIL;
3063 #ifdef Py_UNICODE_WIDE
3064 if (max > 65535)
3065 FAIL;
3066 #endif
3067 if (!_validate_inner(code, code+skip-3, groups))
3068 FAIL;
3069 code += skip-3;
3070 GET_OP;
3071 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3072 FAIL;
3073 }
3074 break;
3075
3076 case SRE_OP_GROUPREF:
3077 case SRE_OP_GROUPREF_IGNORE:
3078 GET_ARG;
3079 if (arg >= groups)
3080 FAIL;
3081 break;
3082
3083 case SRE_OP_GROUPREF_EXISTS:
3084 /* The regex syntax for this is: '(?(group)then|else)', where
3085 'group' is either an integer group number or a group name,
3086 'then' and 'else' are sub-regexes, and 'else' is optional. */
3087 GET_ARG;
3088 if (arg >= groups)
3089 FAIL;
3090 GET_SKIP_ADJ(1);
3091 code--; /* The skip is relative to the first arg! */
3092 /* There are two possibilities here: if there is both a 'then'
3093 part and an 'else' part, the generated code looks like:
3094
3095 GROUPREF_EXISTS
3096 <group>
3097 <skipyes>
3098 ...then part...
3099 JUMP
3100 <skipno>
3101 (<skipyes> jumps here)
3102 ...else part...
3103 (<skipno> jumps here)
3104
3105 If there is only a 'then' part, it looks like:
3106
3107 GROUPREF_EXISTS
3108 <group>
3109 <skip>
3110 ...then part...
3111 (<skip> jumps here)
3112
3113 There is no direct way to decide which it is, and we don't want
3114 to allow arbitrary jumps anywhere in the code; so we just look
3115 for a JUMP opcode preceding our skip target.
3116 */
3117 if (skip >= 3 && code+skip-3 >= code &&
3118 code[skip-3] == SRE_OP_JUMP)
3119 {
3120 VTRACE(("both then and else parts present\n"));
3121 if (!_validate_inner(code+1, code+skip-3, groups))
3122 FAIL;
3123 code += skip-2; /* Position after JUMP, at <skipno> */
3124 GET_SKIP;
3125 if (!_validate_inner(code, code+skip-1, groups))
3126 FAIL;
3127 code += skip-1;
3128 }
3129 else {
3130 VTRACE(("only a then part present\n"));
3131 if (!_validate_inner(code+1, code+skip-1, groups))
3132 FAIL;
3133 code += skip-1;
3134 }
3135 break;
3136
3137 case SRE_OP_ASSERT:
3138 case SRE_OP_ASSERT_NOT:
3139 GET_SKIP;
3140 GET_ARG; /* 0 for lookahead, width for lookbehind */
3141 code--; /* Back up over arg to simplify math below */
3142 if (arg & 0x80000000)
3143 FAIL; /* Width too large */
3144 /* Stop 1 before the end; we check the SUCCESS below */
3145 if (!_validate_inner(code+1, code+skip-2, groups))
3146 FAIL;
3147 code += skip-2;
3148 GET_OP;
3149 if (op != SRE_OP_SUCCESS)
3150 FAIL;
3151 break;
3152
3153 default:
3154 FAIL;
3155
3156 }
3157 }
3158
3159 VTRACE(("okay\n"));
3160 return 1;
3161 }
3162
3163 static int
3164 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3165 {
3166 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3167 FAIL;
3168 if (groups == 0) /* fix for simplejson */
3169 groups = 100; /* 100 groups should always be safe */
3170 return _validate_inner(code, end-1, groups);
3171 }
3172
3173 static int
3174 _validate(PatternObject *self)
3175 {
3176 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3177 {
3178 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3179 return 0;
3180 }
3181 else
3182 VTRACE(("Success!\n"));
3183 return 1;
3184 }
3185
3186 /* -------------------------------------------------------------------- */
3187 /* match methods */
3188
3189 static void
3190 match_dealloc(MatchObject* self)
3191 {
3192 Py_XDECREF(self->regs);
3193 Py_XDECREF(self->string);
3194 Py_DECREF(self->pattern);
3195 PyObject_DEL(self);
3196 }
3197
3198 static PyObject*
3199 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
3200 {
3201 if (index < 0 || index >= self->groups) {
3202 /* raise IndexError if we were given a bad group number */
3203 PyErr_SetString(
3204 PyExc_IndexError,
3205 "no such group"
3206 );
3207 return NULL;
3208 }
3209
3210 index *= 2;
3211
3212 if (self->string == Py_None || self->mark[index] < 0) {
3213 /* return default value if the string or group is undefined */
3214 Py_INCREF(def);
3215 return def;
3216 }
3217
3218 return PySequence_GetSlice(
3219 self->string, self->mark[index], self->mark[index+1]
3220 );
3221 }
3222
3223 static Py_ssize_t
3224 match_getindex(MatchObject* self, PyObject* index)
3225 {
3226 Py_ssize_t i;
3227
3228 if (PyInt_Check(index))
3229 return PyInt_AsSsize_t(index);
3230
3231 i = -1;
3232
3233 if (self->pattern->groupindex) {
3234 index = PyObject_GetItem(self->pattern->groupindex, index);
3235 if (index) {
3236 if (PyInt_Check(index) || PyLong_Check(index))
3237 i = PyInt_AsSsize_t(index);
3238 Py_DECREF(index);
3239 } else
3240 PyErr_Clear();
3241 }
3242
3243 return i;
3244 }
3245
3246 static PyObject*
3247 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
3248 {
3249 return match_getslice_by_index(self, match_getindex(self, index), def);
3250 }
3251
3252 static PyObject*
3253 match_expand(MatchObject* self, PyObject* ptemplate)
3254 {
3255 /* delegate to Python code */
3256 return call(
3257 SRE_PY_MODULE, "_expand",
3258 PyTuple_Pack(3, self->pattern, self, ptemplate)
3259 );
3260 }
3261
3262 static PyObject*
3263 match_group(MatchObject* self, PyObject* args)
3264 {
3265 PyObject* result;
3266 Py_ssize_t i, size;
3267
3268 size = PyTuple_GET_SIZE(args);
3269
3270 switch (size) {
3271 case 0:
3272 result = match_getslice(self, Py_False, Py_None);
3273 break;
3274 case 1:
3275 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3276 break;
3277 default:
3278 /* fetch multiple items */
3279 result = PyTuple_New(size);
3280 if (!result)
3281 return NULL;
3282 for (i = 0; i < size; i++) {
3283 PyObject* item = match_getslice(
3284 self, PyTuple_GET_ITEM(args, i), Py_None
3285 );
3286 if (!item) {
3287 Py_DECREF(result);
3288 return NULL;
3289 }
3290 PyTuple_SET_ITEM(result, i, item);
3291 }
3292 break;
3293 }
3294 return result;
3295 }
3296
3297 static PyObject*
3298 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
3299 {
3300 PyObject* result;
3301 Py_ssize_t index;
3302
3303 PyObject* def = Py_None;
3304 static char* kwlist[] = { "default", NULL };
3305 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
3306 return NULL;
3307
3308 result = PyTuple_New(self->groups-1);
3309 if (!result)
3310 return NULL;
3311
3312 for (index = 1; index < self->groups; index++) {
3313 PyObject* item;
3314 item = match_getslice_by_index(self, index, def);
3315 if (!item) {
3316 Py_DECREF(result);
3317 return NULL;
3318 }
3319 PyTuple_SET_ITEM(result, index-1, item);
3320 }
3321
3322 return result;
3323 }
3324
3325 static PyObject*
3326 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
3327 {
3328 PyObject* result;
3329 PyObject* keys;
3330 Py_ssize_t index;
3331
3332 PyObject* def = Py_None;
3333 static char* kwlist[] = { "default", NULL };
3334 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
3335 return NULL;
3336
3337 result = PyDict_New();
3338 if (!result || !self->pattern->groupindex)
3339 return result;
3340
3341 keys = PyMapping_Keys(self->pattern->groupindex);
3342 if (!keys)
3343 goto failed;
3344
3345 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
3346 int status;
3347 PyObject* key;
3348 PyObject* value;
3349 key = PyList_GET_ITEM(keys, index);
3350 if (!key)
3351 goto failed;
3352 value = match_getslice(self, key, def);
3353 if (!value) {
3354 Py_DECREF(key);
3355 goto failed;
3356 }
3357 status = PyDict_SetItem(result, key, value);
3358 Py_DECREF(value);
3359 if (status < 0)
3360 goto failed;
3361 }
3362
3363 Py_DECREF(keys);
3364
3365 return result;
3366
3367 failed:
3368 Py_XDECREF(keys);
3369 Py_DECREF(result);
3370 return NULL;
3371 }
3372
3373 static PyObject*
3374 match_start(MatchObject* self, PyObject* args)
3375 {
3376 Py_ssize_t index;
3377
3378 PyObject* index_ = Py_False; /* zero */
3379 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
3380 return NULL;
3381
3382 index = match_getindex(self, index_);
3383
3384 if (index < 0 || index >= self->groups) {
3385 PyErr_SetString(
3386 PyExc_IndexError,
3387 "no such group"
3388 );
3389 return NULL;
3390 }
3391
3392 /* mark is -1 if group is undefined */
3393 return Py_BuildValue("i", self->mark[index*2]);
3394 }
3395
3396 static PyObject*
3397 match_end(MatchObject* self, PyObject* args)
3398 {
3399 Py_ssize_t index;
3400
3401 PyObject* index_ = Py_False; /* zero */
3402 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
3403 return NULL;
3404
3405 index = match_getindex(self, index_);
3406
3407 if (index < 0 || index >= self->groups) {
3408 PyErr_SetString(
3409 PyExc_IndexError,
3410 "no such group"
3411 );
3412 return NULL;
3413 }
3414
3415 /* mark is -1 if group is undefined */
3416 return Py_BuildValue("i", self->mark[index*2+1]);
3417 }
3418
3419 LOCAL(PyObject*)
3420 _pair(Py_ssize_t i1, Py_ssize_t i2)
3421 {
3422 PyObject* pair;
3423 PyObject* item;
3424
3425 pair = PyTuple_New(2);
3426 if (!pair)
3427 return NULL;
3428
3429 item = PyInt_FromSsize_t(i1);
3430 if (!item)
3431 goto error;
3432 PyTuple_SET_ITEM(pair, 0, item);
3433
3434 item = PyInt_FromSsize_t(i2);
3435 if (!item)
3436 goto error;
3437 PyTuple_SET_ITEM(pair, 1, item);
3438
3439 return pair;
3440
3441 error:
3442 Py_DECREF(pair);
3443 return NULL;
3444 }
3445
3446 static PyObject*
3447 match_span(MatchObject* self, PyObject* args)
3448 {
3449 Py_ssize_t index;
3450
3451 PyObject* index_ = Py_False; /* zero */
3452 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
3453 return NULL;
3454
3455 index = match_getindex(self, index_);
3456
3457 if (index < 0 || index >= self->groups) {
3458 PyErr_SetString(
3459 PyExc_IndexError,
3460 "no such group"
3461 );
3462 return NULL;
3463 }
3464
3465 /* marks are -1 if group is undefined */
3466 return _pair(self->mark[index*2], self->mark[index*2+1]);
3467 }
3468
3469 static PyObject*
3470 match_regs(MatchObject* self)
3471 {
3472 PyObject* regs;
3473 PyObject* item;
3474 Py_ssize_t index;
3475
3476 regs = PyTuple_New(self->groups);
3477 if (!regs)
3478 return NULL;
3479
3480 for (index = 0; index < self->groups; index++) {
3481 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3482 if (!item) {
3483 Py_DECREF(regs);
3484 return NULL;
3485 }
3486 PyTuple_SET_ITEM(regs, index, item);
3487 }
3488
3489 Py_INCREF(regs);
3490 self->regs = regs;
3491
3492 return regs;
3493 }
3494
3495 static PyObject*
3496 match_copy(MatchObject* self, PyObject *unused)
3497 {
3498 #ifdef USE_BUILTIN_COPY
3499 MatchObject* copy;
3500 Py_ssize_t slots, offset;
3501
3502 slots = 2 * (self->pattern->groups+1);
3503
3504 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3505 if (!copy)
3506 return NULL;
3507
3508 /* this value a constant, but any compiler should be able to
3509 figure that out all by itself */
3510 offset = offsetof(MatchObject, string);
3511
3512 Py_XINCREF(self->pattern);
3513 Py_XINCREF(self->string);
3514 Py_XINCREF(self->regs);
3515
3516 memcpy((char*) copy + offset, (char*) self + offset,
3517 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3518
3519 return (PyObject*) copy;
3520 #else
3521 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3522 return NULL;
3523 #endif
3524 }
3525
3526 static PyObject*
3527 match_deepcopy(MatchObject* self, PyObject* memo)
3528 {
3529 #ifdef USE_BUILTIN_COPY
3530 MatchObject* copy;
3531
3532 copy = (MatchObject*) match_copy(self);
3533 if (!copy)
3534 return NULL;
3535
3536 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3537 !deepcopy(&copy->string, memo) ||
3538 !deepcopy(&copy->regs, memo)) {
3539 Py_DECREF(copy);
3540 return NULL;
3541 }
3542
3543 #else
3544 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3545 return NULL;
3546 #endif
3547 }
3548
3549 static struct PyMethodDef match_methods[] = {
3550 {"group", (PyCFunction) match_group, METH_VARARGS},
3551 {"start", (PyCFunction) match_start, METH_VARARGS},
3552 {"end", (PyCFunction) match_end, METH_VARARGS},
3553 {"span", (PyCFunction) match_span, METH_VARARGS},
3554 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3555 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3556 {"expand", (PyCFunction) match_expand, METH_O},
3557 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3558 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3559 {NULL, NULL}
3560 };
3561
3562 static PyObject *
3563 match_lastindex_get(MatchObject *self)
3564 {
3565 if (self->lastindex >= 0)
3566 return Py_BuildValue("i", self->lastindex);
3567 Py_INCREF(Py_None);
3568 return Py_None;
3569 }
3570
3571 static PyObject *
3572 match_lastgroup_get(MatchObject *self)
3573 {
3574 if (self->pattern->indexgroup && self->lastindex >= 0) {
3575 PyObject* result = PySequence_GetItem(
3576 self->pattern->indexgroup, self->lastindex
3577 );
3578 if (result)
3579 return result;
3580 PyErr_Clear();
3581 }
3582 Py_INCREF(Py_None);
3583 return Py_None;
3584 }
3585
3586 static PyObject *
3587 match_regs_get(MatchObject *self)
3588 {
3589 if (self->regs) {
3590 Py_INCREF(self->regs);
3591 return self->regs;
3592 } else
3593 return match_regs(self);
3594 }
3595
3596 static PyGetSetDef match_getset[] = {
3597 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3598 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3599 {"regs", (getter)match_regs_get, (setter)NULL},
3600 {NULL}
3601 };
3602
3603 #define MATCH_OFF(x) offsetof(MatchObject, x)
3604 static PyMemberDef match_members[] = {
3605 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3606 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3607 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3608 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3609 {NULL}
3610 };
3611
3612
3613 /* FIXME: implement setattr("string", None) as a special case (to
3614 detach the associated string, if any */
3615
3616 static PyTypeObject Match_Type = {
3617 PyVarObject_HEAD_INIT(NULL, 0)
3618 "_" SRE_MODULE ".SRE_Match",
3619 sizeof(MatchObject), sizeof(Py_ssize_t),
3620 (destructor)match_dealloc, /* tp_dealloc */
3621 0, /* tp_print */
3622 0, /* tp_getattr */
3623 0, /* tp_setattr */
3624 0, /* tp_compare */
3625 0, /* tp_repr */
3626 0, /* tp_as_number */
3627 0, /* tp_as_sequence */
3628 0, /* tp_as_mapping */
3629 0, /* tp_hash */
3630 0, /* tp_call */
3631 0, /* tp_str */
3632 0, /* tp_getattro */
3633 0, /* tp_setattro */
3634 0, /* tp_as_buffer */
3635 Py_TPFLAGS_DEFAULT,
3636 0, /* tp_doc */
3637 0, /* tp_traverse */
3638 0, /* tp_clear */
3639 0, /* tp_richcompare */
3640 0, /* tp_weaklistoffset */
3641 0, /* tp_iter */
3642 0, /* tp_iternext */
3643 match_methods, /* tp_methods */
3644 match_members, /* tp_members */
3645 match_getset, /* tp_getset */
3646 };
3647
3648 static PyObject*
3649 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3650 {
3651 /* create match object (from state object) */
3652
3653 MatchObject* match;
3654 Py_ssize_t i, j;
3655 char* base;
3656 int n;
3657
3658 if (status > 0) {
3659
3660 /* create match object (with room for extra group marks) */
3661 /* coverity[ampersand_in_size] */
3662 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3663 2*(pattern->groups+1));
3664 if (!match)
3665 return NULL;
3666
3667 Py_INCREF(pattern);
3668 match->pattern = pattern;
3669
3670 Py_INCREF(state->string);
3671 match->string = state->string;
3672
3673 match->regs = NULL;
3674 match->groups = pattern->groups+1;
3675
3676 /* fill in group slices */
3677
3678 base = (char*) state->beginning;
3679 n = state->charsize;
3680
3681 match->mark[0] = ((char*) state->start - base) / n;
3682 match->mark[1] = ((char*) state->ptr - base) / n;
3683
3684 for (i = j = 0; i < pattern->groups; i++, j+=2)
3685 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3686 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3687 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3688 } else
3689 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3690
3691 match->pos = state->pos;
3692 match->endpos = state->endpos;
3693
3694 match->lastindex = state->lastindex;
3695
3696 return (PyObject*) match;
3697
3698 } else if (status == 0) {
3699
3700 /* no match */
3701 Py_INCREF(Py_None);
3702 return Py_None;
3703
3704 }
3705
3706 /* internal error */
3707 pattern_error(status);
3708 return NULL;
3709 }
3710
3711
3712 /* -------------------------------------------------------------------- */
3713 /* scanner methods (experimental) */
3714
3715 static void
3716 scanner_dealloc(ScannerObject* self)
3717 {
3718 state_fini(&self->state);
3719 Py_XDECREF(self->pattern);
3720 PyObject_DEL(self);
3721 }
3722
3723 static PyObject*
3724 scanner_match(ScannerObject* self, PyObject *unused)
3725 {
3726 SRE_STATE* state = &self->state;
3727 PyObject* match;
3728 int status;
3729
3730 state_reset(state);
3731
3732 state->ptr = state->start;
3733
3734 if (state->charsize == 1) {
3735 status = sre_match(state, PatternObject_GetCode(self->pattern));
3736 } else {
3737 #if defined(HAVE_UNICODE)
3738 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3739 #endif
3740 }
3741 if (PyErr_Occurred())
3742 return NULL;
3743
3744 match = pattern_new_match((PatternObject*) self->pattern,
3745 state, status);
3746
3747 if (status == 0 || state->ptr == state->start)
3748 state->start = (void*) ((char*) state->ptr + state->charsize);
3749 else
3750 state->start = state->ptr;
3751
3752 return match;
3753 }
3754
3755
3756 static PyObject*
3757 scanner_search(ScannerObject* self, PyObject *unused)
3758 {
3759 SRE_STATE* state = &self->state;
3760 PyObject* match;
3761 int status;
3762
3763 state_reset(state);
3764
3765 state->ptr = state->start;
3766
3767 if (state->charsize == 1) {
3768 status = sre_search(state, PatternObject_GetCode(self->pattern));
3769 } else {
3770 #if defined(HAVE_UNICODE)
3771 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3772 #endif
3773 }
3774 if (PyErr_Occurred())
3775 return NULL;
3776
3777 match = pattern_new_match((PatternObject*) self->pattern,
3778 state, status);
3779
3780 if (status == 0 || state->ptr == state->start)
3781 state->start = (void*) ((char*) state->ptr + state->charsize);
3782 else
3783 state->start = state->ptr;
3784
3785 return match;
3786 }
3787
3788 static PyMethodDef scanner_methods[] = {
3789 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3790 {"search", (PyCFunction) scanner_search, METH_NOARGS},
3791 {NULL, NULL}
3792 };
3793
3794 #define SCAN_OFF(x) offsetof(ScannerObject, x)
3795 static PyMemberDef scanner_members[] = {
3796 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3797 {NULL} /* Sentinel */
3798 };
3799
3800 statichere PyTypeObject Scanner_Type = {
3801 PyObject_HEAD_INIT(NULL)
3802 0, "_" SRE_MODULE ".SRE_Scanner",
3803 sizeof(ScannerObject), 0,
3804 (destructor)scanner_dealloc, /*tp_dealloc*/
3805 0, /* tp_print */
3806 0, /* tp_getattr */
3807 0, /* tp_setattr */
3808 0, /* tp_reserved */
3809 0, /* tp_repr */
3810 0, /* tp_as_number */
3811 0, /* tp_as_sequence */
3812 0, /* tp_as_mapping */
3813 0, /* tp_hash */
3814 0, /* tp_call */
3815 0, /* tp_str */
3816 0, /* tp_getattro */
3817 0, /* tp_setattro */
3818 0, /* tp_as_buffer */
3819 Py_TPFLAGS_DEFAULT, /* tp_flags */
3820 0, /* tp_doc */
3821 0, /* tp_traverse */
3822 0, /* tp_clear */
3823 0, /* tp_richcompare */
3824 0, /* tp_weaklistoffset */
3825 0, /* tp_iter */
3826 0, /* tp_iternext */
3827 scanner_methods, /* tp_methods */
3828 scanner_members, /* tp_members */
3829 0, /* tp_getset */
3830 };
3831
3832 static PyObject*
3833 pattern_scanner(PatternObject* pattern, PyObject* args)
3834 {
3835 /* create search state object */
3836
3837 ScannerObject* self;
3838
3839 PyObject* string;
3840 Py_ssize_t start = 0;
3841 Py_ssize_t end = PY_SSIZE_T_MAX;
3842 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3843 return NULL;
3844
3845 /* create scanner object */
3846 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3847 if (!self)
3848 return NULL;
3849 self->pattern = NULL;
3850
3851 string = state_init(&self->state, pattern, string, start, end);
3852 if (!string) {
3853 Py_DECREF(self);
3854 return NULL;
3855 }
3856
3857 Py_INCREF(pattern);
3858 self->pattern = (PyObject*) pattern;
3859
3860 return (PyObject*) self;
3861 }
3862
3863 static PyMethodDef _functions[] = {
3864 {"compile", _compile, METH_VARARGS},
3865 {"getcodesize", sre_codesize, METH_NOARGS},
3866 {"getlower", sre_getlower, METH_VARARGS},
3867 {NULL, NULL}
3868 };
3869
3870 #if PY_VERSION_HEX < 0x02030000
3871 DL_EXPORT(void) init_sre(void)
3872 #else
3873 PyMODINIT_FUNC init_sre(void)
3874 #endif
3875 {
3876 PyObject* m;
3877 PyObject* d;
3878 PyObject* x;
3879
3880 /* Patch object types */
3881 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3882 PyType_Ready(&Scanner_Type))
3883 return;
3884
3885 m = Py_InitModule("_" SRE_MODULE, _functions);
3886 if (m == NULL)
3887 return;
3888 d = PyModule_GetDict(m);
3889
3890 x = PyInt_FromLong(SRE_MAGIC);
3891 if (x) {
3892 PyDict_SetItemString(d, "MAGIC", x);
3893 Py_DECREF(x);
3894 }
3895
3896 x = PyInt_FromLong(sizeof(SRE_CODE));
3897 if (x) {
3898 PyDict_SetItemString(d, "CODESIZE", x);
3899 Py_DECREF(x);
3900 }
3901
3902 x = PyString_FromString(copyright);
3903 if (x) {
3904 PyDict_SetItemString(d, "copyright", x);
3905 Py_DECREF(x);
3906 }
3907 }
3908
3909 #endif /* !defined(SRE_RECURSIVE) */
3910
3911 /* vim:ts=4:sw=4:et
3912 */