]> git.proxmox.com Git - mirror_edk2.git/blob - AppPkg/Applications/Python/PyMod-2.7.1/Modules/_sre.c
Basic Core Python interpreter.
[mirror_edk2.git] / AppPkg / Applications / Python / PyMod-2.7.1 / Modules / _sre.c
1 /*
2 * Secret Labs' Regular Expression Engine
3 *
4 * regular expression matching engine
5 *
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
25 *
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
27 *
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
35 */
36
37 /* Get rid of these macros to prevent collisions between EFI and Python in this file. */
38 #undef RETURN_ERROR
39 #undef RETURN_SUCCESS
40
41 #ifndef SRE_RECURSIVE
42
43 static char copyright[] =
44 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
45
46 #define PY_SSIZE_T_CLEAN
47
48 #include "Python.h"
49 #include "structmember.h" /* offsetof */
50
51 #include "sre.h"
52
53 #include <ctype.h>
54
55 /* name of this module, minus the leading underscore */
56 #if !defined(SRE_MODULE)
57 #define SRE_MODULE "sre"
58 #endif
59
60 #define SRE_PY_MODULE "re"
61
62 /* defining this one enables tracing */
63 #undef VERBOSE
64
65 #if PY_VERSION_HEX >= 0x01060000
66 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
67 /* defining this enables unicode support (default under 1.6a1 and later) */
68 #define HAVE_UNICODE
69 #endif
70 #endif
71
72 /* -------------------------------------------------------------------- */
73 /* optional features */
74
75 /* enables fast searching */
76 #define USE_FAST_SEARCH
77
78 /* enables aggressive inlining (always on for Visual C) */
79 #undef USE_INLINE
80
81 /* enables copy/deepcopy handling (work in progress) */
82 #undef USE_BUILTIN_COPY
83
84 #if PY_VERSION_HEX < 0x01060000
85 #define PyObject_DEL(op) PyMem_DEL((op))
86 #endif
87
88 /* -------------------------------------------------------------------- */
89
90 #if defined(_MSC_VER)
91 #pragma optimize("gt", on) /* doesn't seem to make much difference... */
92 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
93 /* fastest possible local call under MSVC */
94 #define LOCAL(type) static __inline type __fastcall
95 #elif defined(USE_INLINE)
96 #define LOCAL(type) static inline type
97 #else
98 #define LOCAL(type) static type
99 #endif
100
101 /* error codes */
102 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
103 #define SRE_ERROR_STATE -2 /* illegal state */
104 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
105 #define SRE_ERROR_MEMORY -9 /* out of memory */
106 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
107
108 #if defined(VERBOSE)
109 #define TRACE(v) printf v
110 #else
111 #define TRACE(v)
112 #endif
113
114 /* -------------------------------------------------------------------- */
115 /* search engine state */
116
117 /* default character predicates (run sre_chars.py to regenerate tables) */
118
119 #define SRE_DIGIT_MASK 1
120 #define SRE_SPACE_MASK 2
121 #define SRE_LINEBREAK_MASK 4
122 #define SRE_ALNUM_MASK 8
123 #define SRE_WORD_MASK 16
124
125 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
126
127 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
128 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
130 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
131 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
132 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
133 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
134
135 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
136 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
137 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
138 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
139 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
140 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
141 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
142 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
143 120, 121, 122, 123, 124, 125, 126, 127 };
144
145 #define SRE_IS_DIGIT(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
147 #define SRE_IS_SPACE(ch)\
148 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
149 #define SRE_IS_LINEBREAK(ch)\
150 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
151 #define SRE_IS_ALNUM(ch)\
152 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
153 #define SRE_IS_WORD(ch)\
154 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
155
156 static unsigned int sre_lower(unsigned int ch)
157 {
158 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
159 }
160
161 /* locale-specific character predicates */
162 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
163 * warnings when c's type supports only numbers < N+1 */
164 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
165 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
166 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
167 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
168 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
169
170 static unsigned int sre_lower_locale(unsigned int ch)
171 {
172 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
173 }
174
175 /* unicode-specific character predicates */
176
177 #if defined(HAVE_UNICODE)
178
179 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
180 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
181 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
182 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
183 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
184
185 static unsigned int sre_lower_unicode(unsigned int ch)
186 {
187 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
188 }
189
190 #endif
191
192 LOCAL(int)
193 sre_category(SRE_CODE category, unsigned int ch)
194 {
195 switch (category) {
196
197 case SRE_CATEGORY_DIGIT:
198 return SRE_IS_DIGIT(ch);
199 case SRE_CATEGORY_NOT_DIGIT:
200 return !SRE_IS_DIGIT(ch);
201 case SRE_CATEGORY_SPACE:
202 return SRE_IS_SPACE(ch);
203 case SRE_CATEGORY_NOT_SPACE:
204 return !SRE_IS_SPACE(ch);
205 case SRE_CATEGORY_WORD:
206 return SRE_IS_WORD(ch);
207 case SRE_CATEGORY_NOT_WORD:
208 return !SRE_IS_WORD(ch);
209 case SRE_CATEGORY_LINEBREAK:
210 return SRE_IS_LINEBREAK(ch);
211 case SRE_CATEGORY_NOT_LINEBREAK:
212 return !SRE_IS_LINEBREAK(ch);
213
214 case SRE_CATEGORY_LOC_WORD:
215 return SRE_LOC_IS_WORD(ch);
216 case SRE_CATEGORY_LOC_NOT_WORD:
217 return !SRE_LOC_IS_WORD(ch);
218
219 #if defined(HAVE_UNICODE)
220 case SRE_CATEGORY_UNI_DIGIT:
221 return SRE_UNI_IS_DIGIT(ch);
222 case SRE_CATEGORY_UNI_NOT_DIGIT:
223 return !SRE_UNI_IS_DIGIT(ch);
224 case SRE_CATEGORY_UNI_SPACE:
225 return SRE_UNI_IS_SPACE(ch);
226 case SRE_CATEGORY_UNI_NOT_SPACE:
227 return !SRE_UNI_IS_SPACE(ch);
228 case SRE_CATEGORY_UNI_WORD:
229 return SRE_UNI_IS_WORD(ch);
230 case SRE_CATEGORY_UNI_NOT_WORD:
231 return !SRE_UNI_IS_WORD(ch);
232 case SRE_CATEGORY_UNI_LINEBREAK:
233 return SRE_UNI_IS_LINEBREAK(ch);
234 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
235 return !SRE_UNI_IS_LINEBREAK(ch);
236 #else
237 case SRE_CATEGORY_UNI_DIGIT:
238 return SRE_IS_DIGIT(ch);
239 case SRE_CATEGORY_UNI_NOT_DIGIT:
240 return !SRE_IS_DIGIT(ch);
241 case SRE_CATEGORY_UNI_SPACE:
242 return SRE_IS_SPACE(ch);
243 case SRE_CATEGORY_UNI_NOT_SPACE:
244 return !SRE_IS_SPACE(ch);
245 case SRE_CATEGORY_UNI_WORD:
246 return SRE_LOC_IS_WORD(ch);
247 case SRE_CATEGORY_UNI_NOT_WORD:
248 return !SRE_LOC_IS_WORD(ch);
249 case SRE_CATEGORY_UNI_LINEBREAK:
250 return SRE_IS_LINEBREAK(ch);
251 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
252 return !SRE_IS_LINEBREAK(ch);
253 #endif
254 }
255 return 0;
256 }
257
258 /* helpers */
259
260 static void
261 data_stack_dealloc(SRE_STATE* state)
262 {
263 if (state->data_stack) {
264 PyMem_FREE(state->data_stack);
265 state->data_stack = NULL;
266 }
267 state->data_stack_size = state->data_stack_base = 0;
268 }
269
270 static int
271 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
272 {
273 Py_ssize_t minsize, cursize;
274 minsize = state->data_stack_base+size;
275 cursize = state->data_stack_size;
276 if (cursize < minsize) {
277 void* stack;
278 cursize = minsize+minsize/4+1024;
279 TRACE(("allocate/grow stack %d\n", cursize));
280 stack = PyMem_REALLOC(state->data_stack, cursize);
281 if (!stack) {
282 data_stack_dealloc(state);
283 return SRE_ERROR_MEMORY;
284 }
285 state->data_stack = (char *)stack;
286 state->data_stack_size = cursize;
287 }
288 return 0;
289 }
290
291 /* generate 8-bit version */
292
293 #define SRE_CHAR unsigned char
294 #define SRE_AT sre_at
295 #define SRE_COUNT sre_count
296 #define SRE_CHARSET sre_charset
297 #define SRE_INFO sre_info
298 #define SRE_MATCH sre_match
299 #define SRE_MATCH_CONTEXT sre_match_context
300 #define SRE_SEARCH sre_search
301 #define SRE_LITERAL_TEMPLATE sre_literal_template
302
303 #if defined(HAVE_UNICODE)
304
305 #define SRE_RECURSIVE
306 #include "_sre.c"
307 #undef SRE_RECURSIVE
308
309 #undef SRE_LITERAL_TEMPLATE
310 #undef SRE_SEARCH
311 #undef SRE_MATCH
312 #undef SRE_MATCH_CONTEXT
313 #undef SRE_INFO
314 #undef SRE_CHARSET
315 #undef SRE_COUNT
316 #undef SRE_AT
317 #undef SRE_CHAR
318
319 /* generate 16-bit unicode version */
320
321 #define SRE_CHAR Py_UNICODE
322 #define SRE_AT sre_uat
323 #define SRE_COUNT sre_ucount
324 #define SRE_CHARSET sre_ucharset
325 #define SRE_INFO sre_uinfo
326 #define SRE_MATCH sre_umatch
327 #define SRE_MATCH_CONTEXT sre_umatch_context
328 #define SRE_SEARCH sre_usearch
329 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
330 #endif
331
332 #endif /* SRE_RECURSIVE */
333
334 /* -------------------------------------------------------------------- */
335 /* String matching engine */
336
337 /* the following section is compiled twice, with different character
338 settings */
339
340 LOCAL(int)
341 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
342 {
343 /* check if pointer is at given position */
344
345 Py_ssize_t thisp, thatp;
346
347 switch (at) {
348
349 case SRE_AT_BEGINNING:
350 case SRE_AT_BEGINNING_STRING:
351 return ((void*) ptr == state->beginning);
352
353 case SRE_AT_BEGINNING_LINE:
354 return ((void*) ptr == state->beginning ||
355 SRE_IS_LINEBREAK((int) ptr[-1]));
356
357 case SRE_AT_END:
358 return (((void*) (ptr+1) == state->end &&
359 SRE_IS_LINEBREAK((int) ptr[0])) ||
360 ((void*) ptr == state->end));
361
362 case SRE_AT_END_LINE:
363 return ((void*) ptr == state->end ||
364 SRE_IS_LINEBREAK((int) ptr[0]));
365
366 case SRE_AT_END_STRING:
367 return ((void*) ptr == state->end);
368
369 case SRE_AT_BOUNDARY:
370 if (state->beginning == state->end)
371 return 0;
372 thatp = ((void*) ptr > state->beginning) ?
373 SRE_IS_WORD((int) ptr[-1]) : 0;
374 thisp = ((void*) ptr < state->end) ?
375 SRE_IS_WORD((int) ptr[0]) : 0;
376 return thisp != thatp;
377
378 case SRE_AT_NON_BOUNDARY:
379 if (state->beginning == state->end)
380 return 0;
381 thatp = ((void*) ptr > state->beginning) ?
382 SRE_IS_WORD((int) ptr[-1]) : 0;
383 thisp = ((void*) ptr < state->end) ?
384 SRE_IS_WORD((int) ptr[0]) : 0;
385 return thisp == thatp;
386
387 case SRE_AT_LOC_BOUNDARY:
388 if (state->beginning == state->end)
389 return 0;
390 thatp = ((void*) ptr > state->beginning) ?
391 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
392 thisp = ((void*) ptr < state->end) ?
393 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
394 return thisp != thatp;
395
396 case SRE_AT_LOC_NON_BOUNDARY:
397 if (state->beginning == state->end)
398 return 0;
399 thatp = ((void*) ptr > state->beginning) ?
400 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
401 thisp = ((void*) ptr < state->end) ?
402 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
403 return thisp == thatp;
404
405 #if defined(HAVE_UNICODE)
406 case SRE_AT_UNI_BOUNDARY:
407 if (state->beginning == state->end)
408 return 0;
409 thatp = ((void*) ptr > state->beginning) ?
410 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
411 thisp = ((void*) ptr < state->end) ?
412 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
413 return thisp != thatp;
414
415 case SRE_AT_UNI_NON_BOUNDARY:
416 if (state->beginning == state->end)
417 return 0;
418 thatp = ((void*) ptr > state->beginning) ?
419 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
420 thisp = ((void*) ptr < state->end) ?
421 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
422 return thisp == thatp;
423 #endif
424
425 }
426
427 return 0;
428 }
429
430 LOCAL(int)
431 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
432 {
433 /* check if character is a member of the given set */
434
435 int ok = 1;
436
437 for (;;) {
438 switch (*set++) {
439
440 case SRE_OP_FAILURE:
441 return !ok;
442
443 case SRE_OP_LITERAL:
444 /* <LITERAL> <code> */
445 if (ch == set[0])
446 return ok;
447 set++;
448 break;
449
450 case SRE_OP_CATEGORY:
451 /* <CATEGORY> <code> */
452 if (sre_category(set[0], (int) ch))
453 return ok;
454 set += 1;
455 break;
456
457 case SRE_OP_CHARSET:
458 if (sizeof(SRE_CODE) == 2) {
459 /* <CHARSET> <bitmap> (16 bits per code word) */
460 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
461 return ok;
462 set += 16;
463 }
464 else {
465 /* <CHARSET> <bitmap> (32 bits per code word) */
466 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
467 return ok;
468 set += 8;
469 }
470 break;
471
472 case SRE_OP_RANGE:
473 /* <RANGE> <lower> <upper> */
474 if (set[0] <= ch && ch <= set[1])
475 return ok;
476 set += 2;
477 break;
478
479 case SRE_OP_NEGATE:
480 ok = !ok;
481 break;
482
483 case SRE_OP_BIGCHARSET:
484 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
485 {
486 Py_ssize_t count, block;
487 count = *(set++);
488
489 if (sizeof(SRE_CODE) == 2) {
490 block = ((unsigned char*)set)[ch >> 8];
491 set += 128;
492 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
493 return ok;
494 set += count*16;
495 }
496 else {
497 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
498 * warnings when c's type supports only numbers < N+1 */
499 if (!(ch & ~65535))
500 block = ((unsigned char*)set)[ch >> 8];
501 else
502 block = -1;
503 set += 64;
504 if (block >=0 &&
505 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
506 return ok;
507 set += count*8;
508 }
509 break;
510 }
511
512 default:
513 /* internal error -- there's not much we can do about it
514 here, so let's just pretend it didn't match... */
515 return 0;
516 }
517 }
518 }
519
520 LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
521
522 LOCAL(Py_ssize_t)
523 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
524 {
525 SRE_CODE chr;
526 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
527 SRE_CHAR* end = (SRE_CHAR *)state->end;
528 Py_ssize_t i;
529
530 /* adjust end */
531 if (maxcount < end - ptr && maxcount != 65535)
532 end = ptr + maxcount;
533
534 switch (pattern[0]) {
535
536 case SRE_OP_IN:
537 /* repeated set */
538 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
539 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
540 ptr++;
541 break;
542
543 case SRE_OP_ANY:
544 /* repeated dot wildcard. */
545 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
546 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
547 ptr++;
548 break;
549
550 case SRE_OP_ANY_ALL:
551 /* repeated dot wildcard. skip to the end of the target
552 string, and backtrack from there */
553 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
554 ptr = end;
555 break;
556
557 case SRE_OP_LITERAL:
558 /* repeated literal */
559 chr = pattern[1];
560 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
561 while (ptr < end && (SRE_CODE) *ptr == chr)
562 ptr++;
563 break;
564
565 case SRE_OP_LITERAL_IGNORE:
566 /* repeated literal */
567 chr = pattern[1];
568 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
569 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
570 ptr++;
571 break;
572
573 case SRE_OP_NOT_LITERAL:
574 /* repeated non-literal */
575 chr = pattern[1];
576 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
577 while (ptr < end && (SRE_CODE) *ptr != chr)
578 ptr++;
579 break;
580
581 case SRE_OP_NOT_LITERAL_IGNORE:
582 /* repeated non-literal */
583 chr = pattern[1];
584 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
585 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
586 ptr++;
587 break;
588
589 default:
590 /* repeated single character pattern */
591 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
592 while ((SRE_CHAR*) state->ptr < end) {
593 i = SRE_MATCH(state, pattern);
594 if (i < 0)
595 return i;
596 if (!i)
597 break;
598 }
599 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
600 (SRE_CHAR*) state->ptr - ptr));
601 return (SRE_CHAR*) state->ptr - ptr;
602 }
603
604 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
605 return ptr - (SRE_CHAR*) state->ptr;
606 }
607
608 #if 0 /* not used in this release */
609 LOCAL(int)
610 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
611 {
612 /* check if an SRE_OP_INFO block matches at the current position.
613 returns the number of SRE_CODE objects to skip if successful, 0
614 if no match */
615
616 SRE_CHAR* end = state->end;
617 SRE_CHAR* ptr = state->ptr;
618 Py_ssize_t i;
619
620 /* check minimal length */
621 if (pattern[3] && (end - ptr) < pattern[3])
622 return 0;
623
624 /* check known prefix */
625 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
626 /* <length> <skip> <prefix data> <overlap data> */
627 for (i = 0; i < pattern[5]; i++)
628 if ((SRE_CODE) ptr[i] != pattern[7 + i])
629 return 0;
630 return pattern[0] + 2 * pattern[6];
631 }
632 return pattern[0];
633 }
634 #endif
635
636 /* The macros below should be used to protect recursive SRE_MATCH()
637 * calls that *failed* and do *not* return immediately (IOW, those
638 * that will backtrack). Explaining:
639 *
640 * - Recursive SRE_MATCH() returned true: that's usually a success
641 * (besides atypical cases like ASSERT_NOT), therefore there's no
642 * reason to restore lastmark;
643 *
644 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
645 * is returning to the caller: If the current SRE_MATCH() is the
646 * top function of the recursion, returning false will be a matching
647 * failure, and it doesn't matter where lastmark is pointing to.
648 * If it's *not* the top function, it will be a recursive SRE_MATCH()
649 * failure by itself, and the calling SRE_MATCH() will have to deal
650 * with the failure by the same rules explained here (it will restore
651 * lastmark by itself if necessary);
652 *
653 * - Recursive SRE_MATCH() returned false, and will continue the
654 * outside 'for' loop: must be protected when breaking, since the next
655 * OP could potentially depend on lastmark;
656 *
657 * - Recursive SRE_MATCH() returned false, and will be called again
658 * inside a local for/while loop: must be protected between each
659 * loop iteration, since the recursive SRE_MATCH() could do anything,
660 * and could potentially depend on lastmark.
661 *
662 * For more information, check the discussion at SF patch #712900.
663 */
664 #define LASTMARK_SAVE() \
665 do { \
666 ctx->lastmark = state->lastmark; \
667 ctx->lastindex = state->lastindex; \
668 } while (0)
669 #define LASTMARK_RESTORE() \
670 do { \
671 state->lastmark = ctx->lastmark; \
672 state->lastindex = ctx->lastindex; \
673 } while (0)
674
675 #define RETURN_ERROR(i) do { return i; } while(0)
676 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
677 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
678
679 #define RETURN_ON_ERROR(i) \
680 do { if (i < 0) RETURN_ERROR(i); } while (0)
681 #define RETURN_ON_SUCCESS(i) \
682 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
683 #define RETURN_ON_FAILURE(i) \
684 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
685
686 #define SFY(x) #x
687
688 #define DATA_STACK_ALLOC(state, type, ptr) \
689 do { \
690 alloc_pos = state->data_stack_base; \
691 TRACE(("allocating %s in %d (%d)\n", \
692 SFY(type), alloc_pos, sizeof(type))); \
693 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
694 int j = data_stack_grow(state, sizeof(type)); \
695 if (j < 0) return j; \
696 if (ctx_pos != -1) \
697 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
698 } \
699 ptr = (type*)(state->data_stack+alloc_pos); \
700 state->data_stack_base += sizeof(type); \
701 } while (0)
702
703 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
704 do { \
705 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
706 ptr = (type*)(state->data_stack+pos); \
707 } while (0)
708
709 #define DATA_STACK_PUSH(state, data, size) \
710 do { \
711 TRACE(("copy data in %p to %d (%d)\n", \
712 data, state->data_stack_base, size)); \
713 if (state->data_stack_size < state->data_stack_base+size) { \
714 int j = data_stack_grow(state, size); \
715 if (j < 0) return j; \
716 if (ctx_pos != -1) \
717 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
718 } \
719 memcpy(state->data_stack+state->data_stack_base, data, size); \
720 state->data_stack_base += size; \
721 } while (0)
722
723 #define DATA_STACK_POP(state, data, size, discard) \
724 do { \
725 TRACE(("copy data to %p from %d (%d)\n", \
726 data, state->data_stack_base-size, size)); \
727 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
728 if (discard) \
729 state->data_stack_base -= size; \
730 } while (0)
731
732 #define DATA_STACK_POP_DISCARD(state, size) \
733 do { \
734 TRACE(("discard data from %d (%d)\n", \
735 state->data_stack_base-size, size)); \
736 state->data_stack_base -= size; \
737 } while(0)
738
739 #define DATA_PUSH(x) \
740 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
741 #define DATA_POP(x) \
742 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
743 #define DATA_POP_DISCARD(x) \
744 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
745 #define DATA_ALLOC(t,p) \
746 DATA_STACK_ALLOC(state, t, p)
747 #define DATA_LOOKUP_AT(t,p,pos) \
748 DATA_STACK_LOOKUP_AT(state,t,p,pos)
749
750 #define MARK_PUSH(lastmark) \
751 do if (lastmark > 0) { \
752 i = lastmark; /* ctx->lastmark may change if reallocated */ \
753 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
754 } while (0)
755 #define MARK_POP(lastmark) \
756 do if (lastmark > 0) { \
757 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
758 } while (0)
759 #define MARK_POP_KEEP(lastmark) \
760 do if (lastmark > 0) { \
761 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
762 } while (0)
763 #define MARK_POP_DISCARD(lastmark) \
764 do if (lastmark > 0) { \
765 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
766 } while (0)
767
768 #define JUMP_NONE 0
769 #define JUMP_MAX_UNTIL_1 1
770 #define JUMP_MAX_UNTIL_2 2
771 #define JUMP_MAX_UNTIL_3 3
772 #define JUMP_MIN_UNTIL_1 4
773 #define JUMP_MIN_UNTIL_2 5
774 #define JUMP_MIN_UNTIL_3 6
775 #define JUMP_REPEAT 7
776 #define JUMP_REPEAT_ONE_1 8
777 #define JUMP_REPEAT_ONE_2 9
778 #define JUMP_MIN_REPEAT_ONE 10
779 #define JUMP_BRANCH 11
780 #define JUMP_ASSERT 12
781 #define JUMP_ASSERT_NOT 13
782
783 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
784 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
785 nextctx->last_ctx_pos = ctx_pos; \
786 nextctx->jump = jumpvalue; \
787 nextctx->pattern = nextpattern; \
788 ctx_pos = alloc_pos; \
789 ctx = nextctx; \
790 goto entrance; \
791 jumplabel: \
792 while (0) /* gcc doesn't like labels at end of scopes */ \
793
794 typedef struct {
795 Py_ssize_t last_ctx_pos;
796 Py_ssize_t jump;
797 SRE_CHAR* ptr;
798 SRE_CODE* pattern;
799 Py_ssize_t count;
800 Py_ssize_t lastmark;
801 Py_ssize_t lastindex;
802 union {
803 SRE_CODE chr;
804 SRE_REPEAT* rep;
805 } u;
806 } SRE_MATCH_CONTEXT;
807
808 /* check if string matches the given pattern. returns <0 for
809 error, 0 for failure, and 1 for success */
810 LOCAL(Py_ssize_t)
811 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
812 {
813 SRE_CHAR* end = (SRE_CHAR *)state->end;
814 Py_ssize_t alloc_pos, ctx_pos = -1;
815 Py_ssize_t i, ret = 0;
816 Py_ssize_t jump;
817 unsigned int sigcount=0;
818
819 SRE_MATCH_CONTEXT* ctx;
820 SRE_MATCH_CONTEXT* nextctx;
821
822 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
823
824 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
825 ctx->last_ctx_pos = -1;
826 ctx->jump = JUMP_NONE;
827 ctx->pattern = pattern;
828 ctx_pos = alloc_pos;
829
830 entrance:
831
832 ctx->ptr = (SRE_CHAR *)state->ptr;
833
834 if (ctx->pattern[0] == SRE_OP_INFO) {
835 /* optimization info block */
836 /* <INFO> <1=skip> <2=flags> <3=min> ... */
837 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
838 TRACE(("reject (got %d chars, need %d)\n",
839 (end - ctx->ptr), ctx->pattern[3]));
840 RETURN_FAILURE;
841 }
842 ctx->pattern += ctx->pattern[1] + 1;
843 }
844
845 for (;;) {
846 ++sigcount;
847 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
848 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
849
850 switch (*ctx->pattern++) {
851
852 case SRE_OP_MARK:
853 /* set mark */
854 /* <MARK> <gid> */
855 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
856 ctx->ptr, ctx->pattern[0]));
857 i = ctx->pattern[0];
858 if (i & 1)
859 state->lastindex = i/2 + 1;
860 if (i > state->lastmark) {
861 /* state->lastmark is the highest valid index in the
862 state->mark array. If it is increased by more than 1,
863 the intervening marks must be set to NULL to signal
864 that these marks have not been encountered. */
865 Py_ssize_t j = state->lastmark + 1;
866 while (j < i)
867 state->mark[j++] = NULL;
868 state->lastmark = i;
869 }
870 state->mark[i] = ctx->ptr;
871 ctx->pattern++;
872 break;
873
874 case SRE_OP_LITERAL:
875 /* match literal string */
876 /* <LITERAL> <code> */
877 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
878 ctx->ptr, *ctx->pattern));
879 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
880 RETURN_FAILURE;
881 ctx->pattern++;
882 ctx->ptr++;
883 break;
884
885 case SRE_OP_NOT_LITERAL:
886 /* match anything that is not literal character */
887 /* <NOT_LITERAL> <code> */
888 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
889 ctx->ptr, *ctx->pattern));
890 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
891 RETURN_FAILURE;
892 ctx->pattern++;
893 ctx->ptr++;
894 break;
895
896 case SRE_OP_SUCCESS:
897 /* end of pattern */
898 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
899 state->ptr = ctx->ptr;
900 RETURN_SUCCESS;
901
902 case SRE_OP_AT:
903 /* match at given position */
904 /* <AT> <code> */
905 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
906 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
907 RETURN_FAILURE;
908 ctx->pattern++;
909 break;
910
911 case SRE_OP_CATEGORY:
912 /* match at given category */
913 /* <CATEGORY> <code> */
914 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
915 ctx->ptr, *ctx->pattern));
916 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
917 RETURN_FAILURE;
918 ctx->pattern++;
919 ctx->ptr++;
920 break;
921
922 case SRE_OP_ANY:
923 /* match anything (except a newline) */
924 /* <ANY> */
925 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
926 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
927 RETURN_FAILURE;
928 ctx->ptr++;
929 break;
930
931 case SRE_OP_ANY_ALL:
932 /* match anything */
933 /* <ANY_ALL> */
934 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
935 if (ctx->ptr >= end)
936 RETURN_FAILURE;
937 ctx->ptr++;
938 break;
939
940 case SRE_OP_IN:
941 /* match set member (or non_member) */
942 /* <IN> <skip> <set> */
943 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
944 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
945 RETURN_FAILURE;
946 ctx->pattern += ctx->pattern[0];
947 ctx->ptr++;
948 break;
949
950 case SRE_OP_LITERAL_IGNORE:
951 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
952 ctx->pattern, ctx->ptr, ctx->pattern[0]));
953 if (ctx->ptr >= end ||
954 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
955 RETURN_FAILURE;
956 ctx->pattern++;
957 ctx->ptr++;
958 break;
959
960 case SRE_OP_NOT_LITERAL_IGNORE:
961 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
962 ctx->pattern, ctx->ptr, *ctx->pattern));
963 if (ctx->ptr >= end ||
964 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
965 RETURN_FAILURE;
966 ctx->pattern++;
967 ctx->ptr++;
968 break;
969
970 case SRE_OP_IN_IGNORE:
971 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
972 if (ctx->ptr >= end
973 || !SRE_CHARSET(ctx->pattern+1,
974 (SRE_CODE)state->lower(*ctx->ptr)))
975 RETURN_FAILURE;
976 ctx->pattern += ctx->pattern[0];
977 ctx->ptr++;
978 break;
979
980 case SRE_OP_JUMP:
981 case SRE_OP_INFO:
982 /* jump forward */
983 /* <JUMP> <offset> */
984 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
985 ctx->ptr, ctx->pattern[0]));
986 ctx->pattern += ctx->pattern[0];
987 break;
988
989 case SRE_OP_BRANCH:
990 /* alternation */
991 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
992 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
993 LASTMARK_SAVE();
994 ctx->u.rep = state->repeat;
995 if (ctx->u.rep)
996 MARK_PUSH(ctx->lastmark);
997 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
998 if (ctx->pattern[1] == SRE_OP_LITERAL &&
999 (ctx->ptr >= end ||
1000 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
1001 continue;
1002 if (ctx->pattern[1] == SRE_OP_IN &&
1003 (ctx->ptr >= end ||
1004 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
1005 continue;
1006 state->ptr = ctx->ptr;
1007 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1008 if (ret) {
1009 if (ctx->u.rep)
1010 MARK_POP_DISCARD(ctx->lastmark);
1011 RETURN_ON_ERROR(ret);
1012 RETURN_SUCCESS;
1013 }
1014 if (ctx->u.rep)
1015 MARK_POP_KEEP(ctx->lastmark);
1016 LASTMARK_RESTORE();
1017 }
1018 if (ctx->u.rep)
1019 MARK_POP_DISCARD(ctx->lastmark);
1020 RETURN_FAILURE;
1021
1022 case SRE_OP_REPEAT_ONE:
1023 /* match repeated sequence (maximizing regexp) */
1024
1025 /* this operator only works if the repeated item is
1026 exactly one character wide, and we're not already
1027 collecting backtracking points. for other cases,
1028 use the MAX_REPEAT operator */
1029
1030 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1031
1032 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1033 ctx->pattern[1], ctx->pattern[2]));
1034
1035 if (ctx->ptr + ctx->pattern[1] > end)
1036 RETURN_FAILURE; /* cannot match */
1037
1038 state->ptr = ctx->ptr;
1039
1040 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1041 RETURN_ON_ERROR(ret);
1042 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1043 ctx->count = ret;
1044 ctx->ptr += ctx->count;
1045
1046 /* when we arrive here, count contains the number of
1047 matches, and ctx->ptr points to the tail of the target
1048 string. check if the rest of the pattern matches,
1049 and backtrack if not. */
1050
1051 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1052 RETURN_FAILURE;
1053
1054 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1055 /* tail is empty. we're finished */
1056 state->ptr = ctx->ptr;
1057 RETURN_SUCCESS;
1058 }
1059
1060 LASTMARK_SAVE();
1061
1062 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1063 /* tail starts with a literal. skip positions where
1064 the rest of the pattern cannot possibly match */
1065 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1066 for (;;) {
1067 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1068 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1069 ctx->ptr--;
1070 ctx->count--;
1071 }
1072 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1073 break;
1074 state->ptr = ctx->ptr;
1075 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1076 ctx->pattern+ctx->pattern[0]);
1077 if (ret) {
1078 RETURN_ON_ERROR(ret);
1079 RETURN_SUCCESS;
1080 }
1081
1082 LASTMARK_RESTORE();
1083
1084 ctx->ptr--;
1085 ctx->count--;
1086 }
1087
1088 } else {
1089 /* general case */
1090 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1091 state->ptr = ctx->ptr;
1092 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1093 ctx->pattern+ctx->pattern[0]);
1094 if (ret) {
1095 RETURN_ON_ERROR(ret);
1096 RETURN_SUCCESS;
1097 }
1098 ctx->ptr--;
1099 ctx->count--;
1100 LASTMARK_RESTORE();
1101 }
1102 }
1103 RETURN_FAILURE;
1104
1105 case SRE_OP_MIN_REPEAT_ONE:
1106 /* match repeated sequence (minimizing regexp) */
1107
1108 /* this operator only works if the repeated item is
1109 exactly one character wide, and we're not already
1110 collecting backtracking points. for other cases,
1111 use the MIN_REPEAT operator */
1112
1113 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1114
1115 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1116 ctx->pattern[1], ctx->pattern[2]));
1117
1118 if (ctx->ptr + ctx->pattern[1] > end)
1119 RETURN_FAILURE; /* cannot match */
1120
1121 state->ptr = ctx->ptr;
1122
1123 if (ctx->pattern[1] == 0)
1124 ctx->count = 0;
1125 else {
1126 /* count using pattern min as the maximum */
1127 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1128 RETURN_ON_ERROR(ret);
1129 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1130 if (ret < (Py_ssize_t) ctx->pattern[1])
1131 /* didn't match minimum number of times */
1132 RETURN_FAILURE;
1133 /* advance past minimum matches of repeat */
1134 ctx->count = ret;
1135 ctx->ptr += ctx->count;
1136 }
1137
1138 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1139 /* tail is empty. we're finished */
1140 state->ptr = ctx->ptr;
1141 RETURN_SUCCESS;
1142
1143 } else {
1144 /* general case */
1145 LASTMARK_SAVE();
1146 while ((Py_ssize_t)ctx->pattern[2] == 65535
1147 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1148 state->ptr = ctx->ptr;
1149 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1150 ctx->pattern+ctx->pattern[0]);
1151 if (ret) {
1152 RETURN_ON_ERROR(ret);
1153 RETURN_SUCCESS;
1154 }
1155 state->ptr = ctx->ptr;
1156 ret = SRE_COUNT(state, ctx->pattern+3, 1);
1157 RETURN_ON_ERROR(ret);
1158 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1159 if (ret == 0)
1160 break;
1161 assert(ret == 1);
1162 ctx->ptr++;
1163 ctx->count++;
1164 LASTMARK_RESTORE();
1165 }
1166 }
1167 RETURN_FAILURE;
1168
1169 case SRE_OP_REPEAT:
1170 /* create repeat context. all the hard work is done
1171 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1172 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1173 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1174 ctx->pattern[1], ctx->pattern[2]));
1175
1176 /* install new repeat context */
1177 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1178 if (!ctx->u.rep) {
1179 PyErr_NoMemory();
1180 RETURN_FAILURE;
1181 }
1182 ctx->u.rep->count = -1;
1183 ctx->u.rep->pattern = ctx->pattern;
1184 ctx->u.rep->prev = state->repeat;
1185 ctx->u.rep->last_ptr = NULL;
1186 state->repeat = ctx->u.rep;
1187
1188 state->ptr = ctx->ptr;
1189 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1190 state->repeat = ctx->u.rep->prev;
1191 PyObject_FREE(ctx->u.rep);
1192
1193 if (ret) {
1194 RETURN_ON_ERROR(ret);
1195 RETURN_SUCCESS;
1196 }
1197 RETURN_FAILURE;
1198
1199 case SRE_OP_MAX_UNTIL:
1200 /* maximizing repeat */
1201 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1202
1203 /* FIXME: we probably need to deal with zero-width
1204 matches in here... */
1205
1206 ctx->u.rep = state->repeat;
1207 if (!ctx->u.rep)
1208 RETURN_ERROR(SRE_ERROR_STATE);
1209
1210 state->ptr = ctx->ptr;
1211
1212 ctx->count = ctx->u.rep->count+1;
1213
1214 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1215 ctx->ptr, ctx->count));
1216
1217 if (ctx->count < ctx->u.rep->pattern[1]) {
1218 /* not enough matches */
1219 ctx->u.rep->count = ctx->count;
1220 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1221 ctx->u.rep->pattern+3);
1222 if (ret) {
1223 RETURN_ON_ERROR(ret);
1224 RETURN_SUCCESS;
1225 }
1226 ctx->u.rep->count = ctx->count-1;
1227 state->ptr = ctx->ptr;
1228 RETURN_FAILURE;
1229 }
1230
1231 if ((ctx->count < ctx->u.rep->pattern[2] ||
1232 ctx->u.rep->pattern[2] == 65535) &&
1233 state->ptr != ctx->u.rep->last_ptr) {
1234 /* we may have enough matches, but if we can
1235 match another item, do so */
1236 ctx->u.rep->count = ctx->count;
1237 LASTMARK_SAVE();
1238 MARK_PUSH(ctx->lastmark);
1239 /* zero-width match protection */
1240 DATA_PUSH(&ctx->u.rep->last_ptr);
1241 ctx->u.rep->last_ptr = state->ptr;
1242 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1243 ctx->u.rep->pattern+3);
1244 DATA_POP(&ctx->u.rep->last_ptr);
1245 if (ret) {
1246 MARK_POP_DISCARD(ctx->lastmark);
1247 RETURN_ON_ERROR(ret);
1248 RETURN_SUCCESS;
1249 }
1250 MARK_POP(ctx->lastmark);
1251 LASTMARK_RESTORE();
1252 ctx->u.rep->count = ctx->count-1;
1253 state->ptr = ctx->ptr;
1254 }
1255
1256 /* cannot match more repeated items here. make sure the
1257 tail matches */
1258 state->repeat = ctx->u.rep->prev;
1259 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1260 RETURN_ON_SUCCESS(ret);
1261 state->repeat = ctx->u.rep;
1262 state->ptr = ctx->ptr;
1263 RETURN_FAILURE;
1264
1265 case SRE_OP_MIN_UNTIL:
1266 /* minimizing repeat */
1267 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1268
1269 ctx->u.rep = state->repeat;
1270 if (!ctx->u.rep)
1271 RETURN_ERROR(SRE_ERROR_STATE);
1272
1273 state->ptr = ctx->ptr;
1274
1275 ctx->count = ctx->u.rep->count+1;
1276
1277 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1278 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1279
1280 if (ctx->count < ctx->u.rep->pattern[1]) {
1281 /* not enough matches */
1282 ctx->u.rep->count = ctx->count;
1283 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1284 ctx->u.rep->pattern+3);
1285 if (ret) {
1286 RETURN_ON_ERROR(ret);
1287 RETURN_SUCCESS;
1288 }
1289 ctx->u.rep->count = ctx->count-1;
1290 state->ptr = ctx->ptr;
1291 RETURN_FAILURE;
1292 }
1293
1294 LASTMARK_SAVE();
1295
1296 /* see if the tail matches */
1297 state->repeat = ctx->u.rep->prev;
1298 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1299 if (ret) {
1300 RETURN_ON_ERROR(ret);
1301 RETURN_SUCCESS;
1302 }
1303
1304 state->repeat = ctx->u.rep;
1305 state->ptr = ctx->ptr;
1306
1307 LASTMARK_RESTORE();
1308
1309 if (ctx->count >= ctx->u.rep->pattern[2]
1310 && ctx->u.rep->pattern[2] != 65535)
1311 RETURN_FAILURE;
1312
1313 ctx->u.rep->count = ctx->count;
1314 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1315 ctx->u.rep->pattern+3);
1316 if (ret) {
1317 RETURN_ON_ERROR(ret);
1318 RETURN_SUCCESS;
1319 }
1320 ctx->u.rep->count = ctx->count-1;
1321 state->ptr = ctx->ptr;
1322 RETURN_FAILURE;
1323
1324 case SRE_OP_GROUPREF:
1325 /* match backreference */
1326 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1327 ctx->ptr, ctx->pattern[0]));
1328 i = ctx->pattern[0];
1329 {
1330 Py_ssize_t groupref = i+i;
1331 if (groupref >= state->lastmark) {
1332 RETURN_FAILURE;
1333 } else {
1334 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1335 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1336 if (!p || !e || e < p)
1337 RETURN_FAILURE;
1338 while (p < e) {
1339 if (ctx->ptr >= end || *ctx->ptr != *p)
1340 RETURN_FAILURE;
1341 p++; ctx->ptr++;
1342 }
1343 }
1344 }
1345 ctx->pattern++;
1346 break;
1347
1348 case SRE_OP_GROUPREF_IGNORE:
1349 /* match backreference */
1350 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1351 ctx->ptr, ctx->pattern[0]));
1352 i = ctx->pattern[0];
1353 {
1354 Py_ssize_t groupref = i+i;
1355 if (groupref >= state->lastmark) {
1356 RETURN_FAILURE;
1357 } else {
1358 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1359 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1360 if (!p || !e || e < p)
1361 RETURN_FAILURE;
1362 while (p < e) {
1363 if (ctx->ptr >= end ||
1364 state->lower(*ctx->ptr) != state->lower(*p))
1365 RETURN_FAILURE;
1366 p++; ctx->ptr++;
1367 }
1368 }
1369 }
1370 ctx->pattern++;
1371 break;
1372
1373 case SRE_OP_GROUPREF_EXISTS:
1374 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1375 ctx->ptr, ctx->pattern[0]));
1376 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1377 i = ctx->pattern[0];
1378 {
1379 Py_ssize_t groupref = i+i;
1380 if (groupref >= state->lastmark) {
1381 ctx->pattern += ctx->pattern[1];
1382 break;
1383 } else {
1384 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1385 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1386 if (!p || !e || e < p) {
1387 ctx->pattern += ctx->pattern[1];
1388 break;
1389 }
1390 }
1391 }
1392 ctx->pattern += 2;
1393 break;
1394
1395 case SRE_OP_ASSERT:
1396 /* assert subpattern */
1397 /* <ASSERT> <skip> <back> <pattern> */
1398 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1399 ctx->ptr, ctx->pattern[1]));
1400 state->ptr = ctx->ptr - ctx->pattern[1];
1401 if (state->ptr < state->beginning)
1402 RETURN_FAILURE;
1403 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1404 RETURN_ON_FAILURE(ret);
1405 ctx->pattern += ctx->pattern[0];
1406 break;
1407
1408 case SRE_OP_ASSERT_NOT:
1409 /* assert not subpattern */
1410 /* <ASSERT_NOT> <skip> <back> <pattern> */
1411 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1412 ctx->ptr, ctx->pattern[1]));
1413 state->ptr = ctx->ptr - ctx->pattern[1];
1414 if (state->ptr >= state->beginning) {
1415 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1416 if (ret) {
1417 RETURN_ON_ERROR(ret);
1418 RETURN_FAILURE;
1419 }
1420 }
1421 ctx->pattern += ctx->pattern[0];
1422 break;
1423
1424 case SRE_OP_FAILURE:
1425 /* immediate failure */
1426 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1427 RETURN_FAILURE;
1428
1429 default:
1430 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1431 ctx->pattern[-1]));
1432 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1433 }
1434 }
1435
1436 exit:
1437 ctx_pos = ctx->last_ctx_pos;
1438 jump = ctx->jump;
1439 DATA_POP_DISCARD(ctx);
1440 if (ctx_pos == -1)
1441 return ret;
1442 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1443
1444 switch (jump) {
1445 case JUMP_MAX_UNTIL_2:
1446 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1447 goto jump_max_until_2;
1448 case JUMP_MAX_UNTIL_3:
1449 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1450 goto jump_max_until_3;
1451 case JUMP_MIN_UNTIL_2:
1452 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1453 goto jump_min_until_2;
1454 case JUMP_MIN_UNTIL_3:
1455 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1456 goto jump_min_until_3;
1457 case JUMP_BRANCH:
1458 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1459 goto jump_branch;
1460 case JUMP_MAX_UNTIL_1:
1461 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1462 goto jump_max_until_1;
1463 case JUMP_MIN_UNTIL_1:
1464 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1465 goto jump_min_until_1;
1466 case JUMP_REPEAT:
1467 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1468 goto jump_repeat;
1469 case JUMP_REPEAT_ONE_1:
1470 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1471 goto jump_repeat_one_1;
1472 case JUMP_REPEAT_ONE_2:
1473 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1474 goto jump_repeat_one_2;
1475 case JUMP_MIN_REPEAT_ONE:
1476 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1477 goto jump_min_repeat_one;
1478 case JUMP_ASSERT:
1479 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1480 goto jump_assert;
1481 case JUMP_ASSERT_NOT:
1482 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1483 goto jump_assert_not;
1484 case JUMP_NONE:
1485 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1486 break;
1487 }
1488
1489 return ret; /* should never get here */
1490 }
1491
1492 LOCAL(Py_ssize_t)
1493 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1494 {
1495 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1496 SRE_CHAR* end = (SRE_CHAR *)state->end;
1497 Py_ssize_t status = 0;
1498 Py_ssize_t prefix_len = 0;
1499 Py_ssize_t prefix_skip = 0;
1500 SRE_CODE* prefix = NULL;
1501 SRE_CODE* charset = NULL;
1502 SRE_CODE* overlap = NULL;
1503 int flags = 0;
1504
1505 if (pattern[0] == SRE_OP_INFO) {
1506 /* optimization info block */
1507 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1508
1509 flags = pattern[2];
1510
1511 if (pattern[3] > 1) {
1512 /* adjust end point (but make sure we leave at least one
1513 character in there, so literal search will work) */
1514 end -= pattern[3]-1;
1515 if (end <= ptr)
1516 end = ptr+1;
1517 }
1518
1519 if (flags & SRE_INFO_PREFIX) {
1520 /* pattern starts with a known prefix */
1521 /* <length> <skip> <prefix data> <overlap data> */
1522 prefix_len = pattern[5];
1523 prefix_skip = pattern[6];
1524 prefix = pattern + 7;
1525 overlap = prefix + prefix_len - 1;
1526 } else if (flags & SRE_INFO_CHARSET)
1527 /* pattern starts with a character from a known set */
1528 /* <charset> */
1529 charset = pattern + 5;
1530
1531 pattern += 1 + pattern[1];
1532 }
1533
1534 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1535 TRACE(("charset = %p\n", charset));
1536
1537 #if defined(USE_FAST_SEARCH)
1538 if (prefix_len > 1) {
1539 /* pattern starts with a known prefix. use the overlap
1540 table to skip forward as fast as we possibly can */
1541 Py_ssize_t i = 0;
1542 end = (SRE_CHAR *)state->end;
1543 while (ptr < end) {
1544 for (;;) {
1545 if ((SRE_CODE) ptr[0] != prefix[i]) {
1546 if (!i)
1547 break;
1548 else
1549 i = overlap[i];
1550 } else {
1551 if (++i == prefix_len) {
1552 /* found a potential match */
1553 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1554 state->start = ptr + 1 - prefix_len;
1555 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1556 if (flags & SRE_INFO_LITERAL)
1557 return 1; /* we got all of it */
1558 status = SRE_MATCH(state, pattern + 2*prefix_skip);
1559 if (status != 0)
1560 return status;
1561 /* close but no cigar -- try again */
1562 i = overlap[i];
1563 }
1564 break;
1565 }
1566 }
1567 ptr++;
1568 }
1569 return 0;
1570 }
1571 #endif
1572
1573 if (pattern[0] == SRE_OP_LITERAL) {
1574 /* pattern starts with a literal character. this is used
1575 for short prefixes, and if fast search is disabled */
1576 SRE_CODE chr = pattern[1];
1577 end = (SRE_CHAR *)state->end;
1578 for (;;) {
1579 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1580 ptr++;
1581 if (ptr >= end)
1582 return 0;
1583 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1584 state->start = ptr;
1585 state->ptr = ++ptr;
1586 if (flags & SRE_INFO_LITERAL)
1587 return 1; /* we got all of it */
1588 status = SRE_MATCH(state, pattern + 2);
1589 if (status != 0)
1590 break;
1591 }
1592 } else if (charset) {
1593 /* pattern starts with a character from a known set */
1594 end = (SRE_CHAR *)state->end;
1595 for (;;) {
1596 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1597 ptr++;
1598 if (ptr >= end)
1599 return 0;
1600 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1601 state->start = ptr;
1602 state->ptr = ptr;
1603 status = SRE_MATCH(state, pattern);
1604 if (status != 0)
1605 break;
1606 ptr++;
1607 }
1608 } else
1609 /* general case */
1610 while (ptr <= end) {
1611 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1612 state->start = state->ptr = ptr++;
1613 status = SRE_MATCH(state, pattern);
1614 if (status != 0)
1615 break;
1616 }
1617
1618 return status;
1619 }
1620
1621 LOCAL(int)
1622 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1623 {
1624 /* check if given string is a literal template (i.e. no escapes) */
1625 while (len-- > 0)
1626 if (*ptr++ == '\\')
1627 return 0;
1628 return 1;
1629 }
1630
1631 #if !defined(SRE_RECURSIVE)
1632
1633 /* -------------------------------------------------------------------- */
1634 /* factories and destructors */
1635
1636 /* see sre.h for object declarations */
1637 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1638 static PyObject*pattern_scanner(PatternObject*, PyObject*);
1639
1640 static PyObject *
1641 sre_codesize(PyObject* self, PyObject *unused)
1642 {
1643 return Py_BuildValue("l", sizeof(SRE_CODE));
1644 }
1645
1646 static PyObject *
1647 sre_getlower(PyObject* self, PyObject* args)
1648 {
1649 int character, flags;
1650 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1651 return NULL;
1652 if (flags & SRE_FLAG_LOCALE)
1653 return Py_BuildValue("i", sre_lower_locale(character));
1654 if (flags & SRE_FLAG_UNICODE)
1655 #if defined(HAVE_UNICODE)
1656 return Py_BuildValue("i", sre_lower_unicode(character));
1657 #else
1658 return Py_BuildValue("i", sre_lower_locale(character));
1659 #endif
1660 return Py_BuildValue("i", sre_lower(character));
1661 }
1662
1663 LOCAL(void)
1664 state_reset(SRE_STATE* state)
1665 {
1666 /* FIXME: dynamic! */
1667 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1668
1669 state->lastmark = -1;
1670 state->lastindex = -1;
1671
1672 state->repeat = NULL;
1673
1674 data_stack_dealloc(state);
1675 }
1676
1677 static void*
1678 getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1679 {
1680 /* given a python object, return a data pointer, a length (in
1681 characters), and a character size. return NULL if the object
1682 is not a string (or not compatible) */
1683
1684 PyBufferProcs *buffer;
1685 Py_ssize_t size, bytes;
1686 int charsize;
1687 void* ptr;
1688
1689 #if defined(HAVE_UNICODE)
1690 if (PyUnicode_Check(string)) {
1691 /* unicode strings doesn't always support the buffer interface */
1692 ptr = (void*) PyUnicode_AS_DATA(string);
1693 /* bytes = PyUnicode_GET_DATA_SIZE(string); */
1694 size = PyUnicode_GET_SIZE(string);
1695 charsize = sizeof(Py_UNICODE);
1696
1697 } else {
1698 #endif
1699
1700 /* get pointer to string buffer */
1701 buffer = Py_TYPE(string)->tp_as_buffer;
1702 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1703 buffer->bf_getsegcount(string, NULL) != 1) {
1704 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1705 return NULL;
1706 }
1707
1708 /* determine buffer size */
1709 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1710 if (bytes < 0) {
1711 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1712 return NULL;
1713 }
1714
1715 /* determine character size */
1716 #if PY_VERSION_HEX >= 0x01060000
1717 size = PyObject_Size(string);
1718 #else
1719 size = PyObject_Length(string);
1720 #endif
1721
1722 if (PyString_Check(string) || bytes == size)
1723 charsize = 1;
1724 #if defined(HAVE_UNICODE)
1725 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1726 charsize = sizeof(Py_UNICODE);
1727 #endif
1728 else {
1729 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1730 return NULL;
1731 }
1732
1733 #if defined(HAVE_UNICODE)
1734 }
1735 #endif
1736
1737 *p_length = size;
1738 *p_charsize = charsize;
1739
1740 return ptr;
1741 }
1742
1743 LOCAL(PyObject*)
1744 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1745 Py_ssize_t start, Py_ssize_t end)
1746 {
1747 /* prepare state object */
1748
1749 Py_ssize_t length;
1750 int charsize;
1751 void* ptr;
1752
1753 memset(state, 0, sizeof(SRE_STATE));
1754
1755 state->lastmark = -1;
1756 state->lastindex = -1;
1757
1758 ptr = getstring(string, &length, &charsize);
1759 if (!ptr)
1760 return NULL;
1761
1762 /* adjust boundaries */
1763 if (start < 0)
1764 start = 0;
1765 else if (start > length)
1766 start = length;
1767
1768 if (end < 0)
1769 end = 0;
1770 else if (end > length)
1771 end = length;
1772
1773 state->charsize = charsize;
1774
1775 state->beginning = ptr;
1776
1777 state->start = (void*) ((char*) ptr + start * state->charsize);
1778 state->end = (void*) ((char*) ptr + end * state->charsize);
1779
1780 Py_INCREF(string);
1781 state->string = string;
1782 state->pos = start;
1783 state->endpos = end;
1784
1785 if (pattern->flags & SRE_FLAG_LOCALE)
1786 state->lower = sre_lower_locale;
1787 else if (pattern->flags & SRE_FLAG_UNICODE)
1788 #if defined(HAVE_UNICODE)
1789 state->lower = sre_lower_unicode;
1790 #else
1791 state->lower = sre_lower_locale;
1792 #endif
1793 else
1794 state->lower = sre_lower;
1795
1796 return string;
1797 }
1798
1799 LOCAL(void)
1800 state_fini(SRE_STATE* state)
1801 {
1802 Py_XDECREF(state->string);
1803 data_stack_dealloc(state);
1804 }
1805
1806 /* calculate offset from start of string */
1807 #define STATE_OFFSET(state, member)\
1808 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1809
1810 LOCAL(PyObject*)
1811 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1812 {
1813 Py_ssize_t i, j;
1814
1815 index = (index - 1) * 2;
1816
1817 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1818 if (empty)
1819 /* want empty string */
1820 i = j = 0;
1821 else {
1822 Py_INCREF(Py_None);
1823 return Py_None;
1824 }
1825 } else {
1826 i = STATE_OFFSET(state, state->mark[index]);
1827 j = STATE_OFFSET(state, state->mark[index+1]);
1828 }
1829
1830 return PySequence_GetSlice(string, i, j);
1831 }
1832
1833 static void
1834 pattern_error(int status)
1835 {
1836 switch (status) {
1837 case SRE_ERROR_RECURSION_LIMIT:
1838 PyErr_SetString(
1839 PyExc_RuntimeError,
1840 "maximum recursion limit exceeded"
1841 );
1842 break;
1843 case SRE_ERROR_MEMORY:
1844 PyErr_NoMemory();
1845 break;
1846 case SRE_ERROR_INTERRUPTED:
1847 /* An exception has already been raised, so let it fly */
1848 break;
1849 default:
1850 /* other error codes indicate compiler/engine bugs */
1851 PyErr_SetString(
1852 PyExc_RuntimeError,
1853 "internal error in regular expression engine"
1854 );
1855 }
1856 }
1857
1858 static void
1859 pattern_dealloc(PatternObject* self)
1860 {
1861 if (self->weakreflist != NULL)
1862 PyObject_ClearWeakRefs((PyObject *) self);
1863 Py_XDECREF(self->pattern);
1864 Py_XDECREF(self->groupindex);
1865 Py_XDECREF(self->indexgroup);
1866 PyObject_DEL(self);
1867 }
1868
1869 static PyObject*
1870 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1871 {
1872 SRE_STATE state;
1873 int status;
1874
1875 PyObject* string;
1876 Py_ssize_t start = 0;
1877 Py_ssize_t end = PY_SSIZE_T_MAX;
1878 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1879 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
1880 &string, &start, &end))
1881 return NULL;
1882
1883 string = state_init(&state, self, string, start, end);
1884 if (!string)
1885 return NULL;
1886
1887 state.ptr = state.start;
1888
1889 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1890
1891 if (state.charsize == 1) {
1892 status = sre_match(&state, PatternObject_GetCode(self));
1893 } else {
1894 #if defined(HAVE_UNICODE)
1895 status = sre_umatch(&state, PatternObject_GetCode(self));
1896 #endif
1897 }
1898
1899 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1900 if (PyErr_Occurred())
1901 return NULL;
1902
1903 state_fini(&state);
1904
1905 return pattern_new_match(self, &state, status);
1906 }
1907
1908 static PyObject*
1909 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1910 {
1911 SRE_STATE state;
1912 int status;
1913
1914 PyObject* string;
1915 Py_ssize_t start = 0;
1916 Py_ssize_t end = PY_SSIZE_T_MAX;
1917 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1918 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
1919 &string, &start, &end))
1920 return NULL;
1921
1922 string = state_init(&state, self, string, start, end);
1923 if (!string)
1924 return NULL;
1925
1926 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1927
1928 if (state.charsize == 1) {
1929 status = sre_search(&state, PatternObject_GetCode(self));
1930 } else {
1931 #if defined(HAVE_UNICODE)
1932 status = sre_usearch(&state, PatternObject_GetCode(self));
1933 #endif
1934 }
1935
1936 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1937
1938 state_fini(&state);
1939
1940 if (PyErr_Occurred())
1941 return NULL;
1942
1943 return pattern_new_match(self, &state, status);
1944 }
1945
1946 static PyObject*
1947 call(char* module, char* function, PyObject* args)
1948 {
1949 PyObject* name;
1950 PyObject* mod;
1951 PyObject* func;
1952 PyObject* result;
1953
1954 if (!args)
1955 return NULL;
1956 name = PyString_FromString(module);
1957 if (!name)
1958 return NULL;
1959 mod = PyImport_Import(name);
1960 Py_DECREF(name);
1961 if (!mod)
1962 return NULL;
1963 func = PyObject_GetAttrString(mod, function);
1964 Py_DECREF(mod);
1965 if (!func)
1966 return NULL;
1967 result = PyObject_CallObject(func, args);
1968 Py_DECREF(func);
1969 Py_DECREF(args);
1970 return result;
1971 }
1972
1973 #ifdef USE_BUILTIN_COPY
1974 static int
1975 deepcopy(PyObject** object, PyObject* memo)
1976 {
1977 PyObject* copy;
1978
1979 copy = call(
1980 "copy", "deepcopy",
1981 PyTuple_Pack(2, *object, memo)
1982 );
1983 if (!copy)
1984 return 0;
1985
1986 Py_DECREF(*object);
1987 *object = copy;
1988
1989 return 1; /* success */
1990 }
1991 #endif
1992
1993 static PyObject*
1994 join_list(PyObject* list, PyObject* string)
1995 {
1996 /* join list elements */
1997
1998 PyObject* joiner;
1999 #if PY_VERSION_HEX >= 0x01060000
2000 PyObject* function;
2001 PyObject* args;
2002 #endif
2003 PyObject* result;
2004
2005 joiner = PySequence_GetSlice(string, 0, 0);
2006 if (!joiner)
2007 return NULL;
2008
2009 if (PyList_GET_SIZE(list) == 0) {
2010 Py_DECREF(list);
2011 return joiner;
2012 }
2013
2014 #if PY_VERSION_HEX >= 0x01060000
2015 function = PyObject_GetAttrString(joiner, "join");
2016 if (!function) {
2017 Py_DECREF(joiner);
2018 return NULL;
2019 }
2020 args = PyTuple_New(1);
2021 if (!args) {
2022 Py_DECREF(function);
2023 Py_DECREF(joiner);
2024 return NULL;
2025 }
2026 PyTuple_SET_ITEM(args, 0, list);
2027 result = PyObject_CallObject(function, args);
2028 Py_DECREF(args); /* also removes list */
2029 Py_DECREF(function);
2030 #else
2031 result = call(
2032 "string", "join",
2033 PyTuple_Pack(2, list, joiner)
2034 );
2035 #endif
2036 Py_DECREF(joiner);
2037
2038 return result;
2039 }
2040
2041 static PyObject*
2042 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2043 {
2044 SRE_STATE state;
2045 PyObject* list;
2046 int status;
2047 Py_ssize_t i, b, e;
2048
2049 PyObject* string;
2050 Py_ssize_t start = 0;
2051 Py_ssize_t end = PY_SSIZE_T_MAX;
2052 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2053 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
2054 &string, &start, &end))
2055 return NULL;
2056
2057 string = state_init(&state, self, string, start, end);
2058 if (!string)
2059 return NULL;
2060
2061 list = PyList_New(0);
2062 if (!list) {
2063 state_fini(&state);
2064 return NULL;
2065 }
2066
2067 while (state.start <= state.end) {
2068
2069 PyObject* item;
2070
2071 state_reset(&state);
2072
2073 state.ptr = state.start;
2074
2075 if (state.charsize == 1) {
2076 status = sre_search(&state, PatternObject_GetCode(self));
2077 } else {
2078 #if defined(HAVE_UNICODE)
2079 status = sre_usearch(&state, PatternObject_GetCode(self));
2080 #endif
2081 }
2082
2083 if (PyErr_Occurred())
2084 goto error;
2085
2086 if (status <= 0) {
2087 if (status == 0)
2088 break;
2089 pattern_error(status);
2090 goto error;
2091 }
2092
2093 /* don't bother to build a match object */
2094 switch (self->groups) {
2095 case 0:
2096 b = STATE_OFFSET(&state, state.start);
2097 e = STATE_OFFSET(&state, state.ptr);
2098 item = PySequence_GetSlice(string, b, e);
2099 if (!item)
2100 goto error;
2101 break;
2102 case 1:
2103 item = state_getslice(&state, 1, string, 1);
2104 if (!item)
2105 goto error;
2106 break;
2107 default:
2108 item = PyTuple_New(self->groups);
2109 if (!item)
2110 goto error;
2111 for (i = 0; i < self->groups; i++) {
2112 PyObject* o = state_getslice(&state, i+1, string, 1);
2113 if (!o) {
2114 Py_DECREF(item);
2115 goto error;
2116 }
2117 PyTuple_SET_ITEM(item, i, o);
2118 }
2119 break;
2120 }
2121
2122 status = PyList_Append(list, item);
2123 Py_DECREF(item);
2124 if (status < 0)
2125 goto error;
2126
2127 if (state.ptr == state.start)
2128 state.start = (void*) ((char*) state.ptr + state.charsize);
2129 else
2130 state.start = state.ptr;
2131
2132 }
2133
2134 state_fini(&state);
2135 return list;
2136
2137 error:
2138 Py_DECREF(list);
2139 state_fini(&state);
2140 return NULL;
2141
2142 }
2143
2144 #if PY_VERSION_HEX >= 0x02020000
2145 static PyObject*
2146 pattern_finditer(PatternObject* pattern, PyObject* args)
2147 {
2148 PyObject* scanner;
2149 PyObject* search;
2150 PyObject* iterator;
2151
2152 scanner = pattern_scanner(pattern, args);
2153 if (!scanner)
2154 return NULL;
2155
2156 search = PyObject_GetAttrString(scanner, "search");
2157 Py_DECREF(scanner);
2158 if (!search)
2159 return NULL;
2160
2161 iterator = PyCallIter_New(search, Py_None);
2162 Py_DECREF(search);
2163
2164 return iterator;
2165 }
2166 #endif
2167
2168 static PyObject*
2169 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2170 {
2171 SRE_STATE state;
2172 PyObject* list;
2173 PyObject* item;
2174 int status;
2175 Py_ssize_t n;
2176 Py_ssize_t i;
2177 void* last;
2178
2179 PyObject* string;
2180 Py_ssize_t maxsplit = 0;
2181 static char* kwlist[] = { "source", "maxsplit", NULL };
2182 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
2183 &string, &maxsplit))
2184 return NULL;
2185
2186 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2187 if (!string)
2188 return NULL;
2189
2190 list = PyList_New(0);
2191 if (!list) {
2192 state_fini(&state);
2193 return NULL;
2194 }
2195
2196 n = 0;
2197 last = state.start;
2198
2199 while (!maxsplit || n < maxsplit) {
2200
2201 state_reset(&state);
2202
2203 state.ptr = state.start;
2204
2205 if (state.charsize == 1) {
2206 status = sre_search(&state, PatternObject_GetCode(self));
2207 } else {
2208 #if defined(HAVE_UNICODE)
2209 status = sre_usearch(&state, PatternObject_GetCode(self));
2210 #endif
2211 }
2212
2213 if (PyErr_Occurred())
2214 goto error;
2215
2216 if (status <= 0) {
2217 if (status == 0)
2218 break;
2219 pattern_error(status);
2220 goto error;
2221 }
2222
2223 if (state.start == state.ptr) {
2224 if (last == state.end)
2225 break;
2226 /* skip one character */
2227 state.start = (void*) ((char*) state.ptr + state.charsize);
2228 continue;
2229 }
2230
2231 /* get segment before this match */
2232 item = PySequence_GetSlice(
2233 string, STATE_OFFSET(&state, last),
2234 STATE_OFFSET(&state, state.start)
2235 );
2236 if (!item)
2237 goto error;
2238 status = PyList_Append(list, item);
2239 Py_DECREF(item);
2240 if (status < 0)
2241 goto error;
2242
2243 /* add groups (if any) */
2244 for (i = 0; i < self->groups; i++) {
2245 item = state_getslice(&state, i+1, string, 0);
2246 if (!item)
2247 goto error;
2248 status = PyList_Append(list, item);
2249 Py_DECREF(item);
2250 if (status < 0)
2251 goto error;
2252 }
2253
2254 n = n + 1;
2255
2256 last = state.start = state.ptr;
2257
2258 }
2259
2260 /* get segment following last match (even if empty) */
2261 item = PySequence_GetSlice(
2262 string, STATE_OFFSET(&state, last), state.endpos
2263 );
2264 if (!item)
2265 goto error;
2266 status = PyList_Append(list, item);
2267 Py_DECREF(item);
2268 if (status < 0)
2269 goto error;
2270
2271 state_fini(&state);
2272 return list;
2273
2274 error:
2275 Py_DECREF(list);
2276 state_fini(&state);
2277 return NULL;
2278
2279 }
2280
2281 static PyObject*
2282 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2283 Py_ssize_t count, Py_ssize_t subn)
2284 {
2285 SRE_STATE state;
2286 PyObject* list;
2287 PyObject* item;
2288 PyObject* filter;
2289 PyObject* args;
2290 PyObject* match;
2291 void* ptr;
2292 int status;
2293 Py_ssize_t n;
2294 Py_ssize_t i, b, e;
2295 int bint;
2296 int filter_is_callable;
2297
2298 if (PyCallable_Check(ptemplate)) {
2299 /* sub/subn takes either a function or a template */
2300 filter = ptemplate;
2301 Py_INCREF(filter);
2302 filter_is_callable = 1;
2303 } else {
2304 /* if not callable, check if it's a literal string */
2305 int literal;
2306 ptr = getstring(ptemplate, &n, &bint);
2307 b = bint;
2308 if (ptr) {
2309 if (b == 1) {
2310 literal = sre_literal_template((unsigned char *)ptr, n);
2311 } else {
2312 #if defined(HAVE_UNICODE)
2313 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2314 #endif
2315 }
2316 } else {
2317 PyErr_Clear();
2318 literal = 0;
2319 }
2320 if (literal) {
2321 filter = ptemplate;
2322 Py_INCREF(filter);
2323 filter_is_callable = 0;
2324 } else {
2325 /* not a literal; hand it over to the template compiler */
2326 filter = call(
2327 SRE_PY_MODULE, "_subx",
2328 PyTuple_Pack(2, self, ptemplate)
2329 );
2330 if (!filter)
2331 return NULL;
2332 filter_is_callable = PyCallable_Check(filter);
2333 }
2334 }
2335
2336 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2337 if (!string) {
2338 Py_DECREF(filter);
2339 return NULL;
2340 }
2341
2342 list = PyList_New(0);
2343 if (!list) {
2344 Py_DECREF(filter);
2345 state_fini(&state);
2346 return NULL;
2347 }
2348
2349 n = i = 0;
2350
2351 while (!count || n < count) {
2352
2353 state_reset(&state);
2354
2355 state.ptr = state.start;
2356
2357 if (state.charsize == 1) {
2358 status = sre_search(&state, PatternObject_GetCode(self));
2359 } else {
2360 #if defined(HAVE_UNICODE)
2361 status = sre_usearch(&state, PatternObject_GetCode(self));
2362 #endif
2363 }
2364
2365 if (PyErr_Occurred())
2366 goto error;
2367
2368 if (status <= 0) {
2369 if (status == 0)
2370 break;
2371 pattern_error(status);
2372 goto error;
2373 }
2374
2375 b = STATE_OFFSET(&state, state.start);
2376 e = STATE_OFFSET(&state, state.ptr);
2377
2378 if (i < b) {
2379 /* get segment before this match */
2380 item = PySequence_GetSlice(string, i, b);
2381 if (!item)
2382 goto error;
2383 status = PyList_Append(list, item);
2384 Py_DECREF(item);
2385 if (status < 0)
2386 goto error;
2387
2388 } else if (i == b && i == e && n > 0)
2389 /* ignore empty match on latest position */
2390 goto next;
2391
2392 if (filter_is_callable) {
2393 /* pass match object through filter */
2394 match = pattern_new_match(self, &state, 1);
2395 if (!match)
2396 goto error;
2397 args = PyTuple_Pack(1, match);
2398 if (!args) {
2399 Py_DECREF(match);
2400 goto error;
2401 }
2402 item = PyObject_CallObject(filter, args);
2403 Py_DECREF(args);
2404 Py_DECREF(match);
2405 if (!item)
2406 goto error;
2407 } else {
2408 /* filter is literal string */
2409 item = filter;
2410 Py_INCREF(item);
2411 }
2412
2413 /* add to list */
2414 if (item != Py_None) {
2415 status = PyList_Append(list, item);
2416 Py_DECREF(item);
2417 if (status < 0)
2418 goto error;
2419 }
2420
2421 i = e;
2422 n = n + 1;
2423
2424 next:
2425 /* move on */
2426 if (state.ptr == state.start)
2427 state.start = (void*) ((char*) state.ptr + state.charsize);
2428 else
2429 state.start = state.ptr;
2430
2431 }
2432
2433 /* get segment following last match */
2434 if (i < state.endpos) {
2435 item = PySequence_GetSlice(string, i, state.endpos);
2436 if (!item)
2437 goto error;
2438 status = PyList_Append(list, item);
2439 Py_DECREF(item);
2440 if (status < 0)
2441 goto error;
2442 }
2443
2444 state_fini(&state);
2445
2446 Py_DECREF(filter);
2447
2448 /* convert list to single string (also removes list) */
2449 item = join_list(list, string);
2450
2451 if (!item)
2452 return NULL;
2453
2454 if (subn)
2455 return Py_BuildValue("Ni", item, n);
2456
2457 return item;
2458
2459 error:
2460 Py_DECREF(list);
2461 state_fini(&state);
2462 Py_DECREF(filter);
2463 return NULL;
2464
2465 }
2466
2467 static PyObject*
2468 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2469 {
2470 PyObject* ptemplate;
2471 PyObject* string;
2472 Py_ssize_t count = 0;
2473 static char* kwlist[] = { "repl", "string", "count", NULL };
2474 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2475 &ptemplate, &string, &count))
2476 return NULL;
2477
2478 return pattern_subx(self, ptemplate, string, count, 0);
2479 }
2480
2481 static PyObject*
2482 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2483 {
2484 PyObject* ptemplate;
2485 PyObject* string;
2486 Py_ssize_t count = 0;
2487 static char* kwlist[] = { "repl", "string", "count", NULL };
2488 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2489 &ptemplate, &string, &count))
2490 return NULL;
2491
2492 return pattern_subx(self, ptemplate, string, count, 1);
2493 }
2494
2495 static PyObject*
2496 pattern_copy(PatternObject* self, PyObject *unused)
2497 {
2498 #ifdef USE_BUILTIN_COPY
2499 PatternObject* copy;
2500 int offset;
2501
2502 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2503 if (!copy)
2504 return NULL;
2505
2506 offset = offsetof(PatternObject, groups);
2507
2508 Py_XINCREF(self->groupindex);
2509 Py_XINCREF(self->indexgroup);
2510 Py_XINCREF(self->pattern);
2511
2512 memcpy((char*) copy + offset, (char*) self + offset,
2513 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2514 copy->weakreflist = NULL;
2515
2516 return (PyObject*) copy;
2517 #else
2518 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2519 return NULL;
2520 #endif
2521 }
2522
2523 static PyObject*
2524 pattern_deepcopy(PatternObject* self, PyObject* memo)
2525 {
2526 #ifdef USE_BUILTIN_COPY
2527 PatternObject* copy;
2528
2529 copy = (PatternObject*) pattern_copy(self);
2530 if (!copy)
2531 return NULL;
2532
2533 if (!deepcopy(&copy->groupindex, memo) ||
2534 !deepcopy(&copy->indexgroup, memo) ||
2535 !deepcopy(&copy->pattern, memo)) {
2536 Py_DECREF(copy);
2537 return NULL;
2538 }
2539
2540 #else
2541 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2542 return NULL;
2543 #endif
2544 }
2545
2546 PyDoc_STRVAR(pattern_match_doc,
2547 "match(string[, pos[, endpos]]) --> match object or None.\n\
2548 Matches zero or more characters at the beginning of the string");
2549
2550 PyDoc_STRVAR(pattern_search_doc,
2551 "search(string[, pos[, endpos]]) --> match object or None.\n\
2552 Scan through string looking for a match, and return a corresponding\n\
2553 MatchObject instance. Return None if no position in the string matches.");
2554
2555 PyDoc_STRVAR(pattern_split_doc,
2556 "split(string[, maxsplit = 0]) --> list.\n\
2557 Split string by the occurrences of pattern.");
2558
2559 PyDoc_STRVAR(pattern_findall_doc,
2560 "findall(string[, pos[, endpos]]) --> list.\n\
2561 Return a list of all non-overlapping matches of pattern in string.");
2562
2563 PyDoc_STRVAR(pattern_finditer_doc,
2564 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2565 Return an iterator over all non-overlapping matches for the \n\
2566 RE pattern in string. For each match, the iterator returns a\n\
2567 match object.");
2568
2569 PyDoc_STRVAR(pattern_sub_doc,
2570 "sub(repl, string[, count = 0]) --> newstring\n\
2571 Return the string obtained by replacing the leftmost non-overlapping\n\
2572 occurrences of pattern in string by the replacement repl.");
2573
2574 PyDoc_STRVAR(pattern_subn_doc,
2575 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2576 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2577 the leftmost non-overlapping occurrences of pattern with the\n\
2578 replacement repl.");
2579
2580 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2581
2582 static PyMethodDef pattern_methods[] = {
2583 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2584 pattern_match_doc},
2585 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2586 pattern_search_doc},
2587 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2588 pattern_sub_doc},
2589 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2590 pattern_subn_doc},
2591 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2592 pattern_split_doc},
2593 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2594 pattern_findall_doc},
2595 #if PY_VERSION_HEX >= 0x02020000
2596 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2597 pattern_finditer_doc},
2598 #endif
2599 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2600 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2601 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2602 {NULL, NULL}
2603 };
2604
2605 static PyObject*
2606 pattern_getattr(PatternObject* self, char* name)
2607 {
2608 PyObject* res;
2609
2610 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
2611
2612 if (res)
2613 return res;
2614
2615 PyErr_Clear();
2616
2617 /* attributes */
2618 if (!strcmp(name, "pattern")) {
2619 Py_INCREF(self->pattern);
2620 return self->pattern;
2621 }
2622
2623 if (!strcmp(name, "flags"))
2624 return Py_BuildValue("i", self->flags);
2625
2626 if (!strcmp(name, "groups"))
2627 return Py_BuildValue("i", self->groups);
2628
2629 if (!strcmp(name, "groupindex") && self->groupindex) {
2630 Py_INCREF(self->groupindex);
2631 return self->groupindex;
2632 }
2633
2634 PyErr_SetString(PyExc_AttributeError, name);
2635 return NULL;
2636 }
2637
2638 statichere PyTypeObject Pattern_Type = {
2639 PyObject_HEAD_INIT(NULL)
2640 0, "_" SRE_MODULE ".SRE_Pattern",
2641 sizeof(PatternObject), sizeof(SRE_CODE),
2642 (destructor)pattern_dealloc, /*tp_dealloc*/
2643 0, /*tp_print*/
2644 (getattrfunc)pattern_getattr, /*tp_getattr*/
2645 0, /* tp_setattr */
2646 0, /* tp_compare */
2647 0, /* tp_repr */
2648 0, /* tp_as_number */
2649 0, /* tp_as_sequence */
2650 0, /* tp_as_mapping */
2651 0, /* tp_hash */
2652 0, /* tp_call */
2653 0, /* tp_str */
2654 0, /* tp_getattro */
2655 0, /* tp_setattro */
2656 0, /* tp_as_buffer */
2657 Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
2658 pattern_doc, /* tp_doc */
2659 0, /* tp_traverse */
2660 0, /* tp_clear */
2661 0, /* tp_richcompare */
2662 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2663 };
2664
2665 static int _validate(PatternObject *self); /* Forward */
2666
2667 static PyObject *
2668 _compile(PyObject* self_, PyObject* args)
2669 {
2670 /* "compile" pattern descriptor to pattern object */
2671
2672 PatternObject* self;
2673 Py_ssize_t i, n;
2674
2675 PyObject* pattern;
2676 int flags = 0;
2677 PyObject* code;
2678 Py_ssize_t groups = 0;
2679 PyObject* groupindex = NULL;
2680 PyObject* indexgroup = NULL;
2681 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2682 &PyList_Type, &code, &groups,
2683 &groupindex, &indexgroup))
2684 return NULL;
2685
2686 n = PyList_GET_SIZE(code);
2687 /* coverity[ampersand_in_size] */
2688 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2689 if (!self)
2690 return NULL;
2691 self->weakreflist = NULL;
2692 self->pattern = NULL;
2693 self->groupindex = NULL;
2694 self->indexgroup = NULL;
2695
2696 self->codesize = n;
2697
2698 for (i = 0; i < n; i++) {
2699 PyObject *o = PyList_GET_ITEM(code, i);
2700 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2701 : PyLong_AsUnsignedLong(o);
2702 self->code[i] = (SRE_CODE) value;
2703 if ((unsigned long) self->code[i] != value) {
2704 PyErr_SetString(PyExc_OverflowError,
2705 "regular expression code size limit exceeded");
2706 break;
2707 }
2708 }
2709
2710 if (PyErr_Occurred()) {
2711 Py_DECREF(self);
2712 return NULL;
2713 }
2714
2715 Py_INCREF(pattern);
2716 self->pattern = pattern;
2717
2718 self->flags = flags;
2719
2720 self->groups = groups;
2721
2722 Py_XINCREF(groupindex);
2723 self->groupindex = groupindex;
2724
2725 Py_XINCREF(indexgroup);
2726 self->indexgroup = indexgroup;
2727
2728 self->weakreflist = NULL;
2729
2730 if (!_validate(self)) {
2731 Py_DECREF(self);
2732 return NULL;
2733 }
2734
2735 return (PyObject*) self;
2736 }
2737
2738 /* -------------------------------------------------------------------- */
2739 /* Code validation */
2740
2741 /* To learn more about this code, have a look at the _compile() function in
2742 Lib/sre_compile.py. The validation functions below checks the code array
2743 for conformance with the code patterns generated there.
2744
2745 The nice thing about the generated code is that it is position-independent:
2746 all jumps are relative jumps forward. Also, jumps don't cross each other:
2747 the target of a later jump is always earlier than the target of an earlier
2748 jump. IOW, this is okay:
2749
2750 J---------J-------T--------T
2751 \ \_____/ /
2752 \______________________/
2753
2754 but this is not:
2755
2756 J---------J-------T--------T
2757 \_________\_____/ /
2758 \____________/
2759
2760 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2761 bytes wide (the latter if Python is compiled for "wide" unicode support).
2762 */
2763
2764 /* Defining this one enables tracing of the validator */
2765 #undef VVERBOSE
2766
2767 /* Trace macro for the validator */
2768 #if defined(VVERBOSE)
2769 #define VTRACE(v) printf v
2770 #else
2771 #define VTRACE(v)
2772 #endif
2773
2774 /* Report failure */
2775 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2776
2777 /* Extract opcode, argument, or skip count from code array */
2778 #define GET_OP \
2779 do { \
2780 VTRACE(("%p: ", code)); \
2781 if (code >= end) FAIL; \
2782 op = *code++; \
2783 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2784 } while (0)
2785 #define GET_ARG \
2786 do { \
2787 VTRACE(("%p= ", code)); \
2788 if (code >= end) FAIL; \
2789 arg = *code++; \
2790 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2791 } while (0)
2792 #define GET_SKIP_ADJ(adj) \
2793 do { \
2794 VTRACE(("%p= ", code)); \
2795 if (code >= end) FAIL; \
2796 skip = *code; \
2797 VTRACE(("%lu (skip to %p)\n", \
2798 (unsigned long)skip, code+skip)); \
2799 if (code+skip-adj < code || code+skip-adj > end)\
2800 FAIL; \
2801 code++; \
2802 } while (0)
2803 #define GET_SKIP GET_SKIP_ADJ(0)
2804
2805 static int
2806 _validate_charset(SRE_CODE *code, SRE_CODE *end)
2807 {
2808 /* Some variables are manipulated by the macros above */
2809 SRE_CODE op;
2810 SRE_CODE arg;
2811 SRE_CODE offset;
2812 int i;
2813
2814 while (code < end) {
2815 GET_OP;
2816 switch (op) {
2817
2818 case SRE_OP_NEGATE:
2819 break;
2820
2821 case SRE_OP_LITERAL:
2822 GET_ARG;
2823 break;
2824
2825 case SRE_OP_RANGE:
2826 GET_ARG;
2827 GET_ARG;
2828 break;
2829
2830 case SRE_OP_CHARSET:
2831 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2832 if (code+offset < code || code+offset > end)
2833 FAIL;
2834 code += offset;
2835 break;
2836
2837 case SRE_OP_BIGCHARSET:
2838 GET_ARG; /* Number of blocks */
2839 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2840 if (code+offset < code || code+offset > end)
2841 FAIL;
2842 /* Make sure that each byte points to a valid block */
2843 for (i = 0; i < 256; i++) {
2844 if (((unsigned char *)code)[i] >= arg)
2845 FAIL;
2846 }
2847 code += offset;
2848 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2849 if (code+offset < code || code+offset > end)
2850 FAIL;
2851 code += offset;
2852 break;
2853
2854 case SRE_OP_CATEGORY:
2855 GET_ARG;
2856 switch (arg) {
2857 case SRE_CATEGORY_DIGIT:
2858 case SRE_CATEGORY_NOT_DIGIT:
2859 case SRE_CATEGORY_SPACE:
2860 case SRE_CATEGORY_NOT_SPACE:
2861 case SRE_CATEGORY_WORD:
2862 case SRE_CATEGORY_NOT_WORD:
2863 case SRE_CATEGORY_LINEBREAK:
2864 case SRE_CATEGORY_NOT_LINEBREAK:
2865 case SRE_CATEGORY_LOC_WORD:
2866 case SRE_CATEGORY_LOC_NOT_WORD:
2867 case SRE_CATEGORY_UNI_DIGIT:
2868 case SRE_CATEGORY_UNI_NOT_DIGIT:
2869 case SRE_CATEGORY_UNI_SPACE:
2870 case SRE_CATEGORY_UNI_NOT_SPACE:
2871 case SRE_CATEGORY_UNI_WORD:
2872 case SRE_CATEGORY_UNI_NOT_WORD:
2873 case SRE_CATEGORY_UNI_LINEBREAK:
2874 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2875 break;
2876 default:
2877 FAIL;
2878 }
2879 break;
2880
2881 default:
2882 FAIL;
2883
2884 }
2885 }
2886
2887 return 1;
2888 }
2889
2890 static int
2891 _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2892 {
2893 /* Some variables are manipulated by the macros above */
2894 SRE_CODE op;
2895 SRE_CODE arg;
2896 SRE_CODE skip;
2897
2898 VTRACE(("code=%p, end=%p\n", code, end));
2899
2900 if (code > end)
2901 FAIL;
2902
2903 while (code < end) {
2904 GET_OP;
2905 switch (op) {
2906
2907 case SRE_OP_MARK:
2908 /* We don't check whether marks are properly nested; the
2909 sre_match() code is robust even if they don't, and the worst
2910 you can get is nonsensical match results. */
2911 GET_ARG;
2912 if (arg > 2*groups+1) {
2913 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2914 FAIL;
2915 }
2916 break;
2917
2918 case SRE_OP_LITERAL:
2919 case SRE_OP_NOT_LITERAL:
2920 case SRE_OP_LITERAL_IGNORE:
2921 case SRE_OP_NOT_LITERAL_IGNORE:
2922 GET_ARG;
2923 /* The arg is just a character, nothing to check */
2924 break;
2925
2926 case SRE_OP_SUCCESS:
2927 case SRE_OP_FAILURE:
2928 /* Nothing to check; these normally end the matching process */
2929 break;
2930
2931 case SRE_OP_AT:
2932 GET_ARG;
2933 switch (arg) {
2934 case SRE_AT_BEGINNING:
2935 case SRE_AT_BEGINNING_STRING:
2936 case SRE_AT_BEGINNING_LINE:
2937 case SRE_AT_END:
2938 case SRE_AT_END_LINE:
2939 case SRE_AT_END_STRING:
2940 case SRE_AT_BOUNDARY:
2941 case SRE_AT_NON_BOUNDARY:
2942 case SRE_AT_LOC_BOUNDARY:
2943 case SRE_AT_LOC_NON_BOUNDARY:
2944 case SRE_AT_UNI_BOUNDARY:
2945 case SRE_AT_UNI_NON_BOUNDARY:
2946 break;
2947 default:
2948 FAIL;
2949 }
2950 break;
2951
2952 case SRE_OP_ANY:
2953 case SRE_OP_ANY_ALL:
2954 /* These have no operands */
2955 break;
2956
2957 case SRE_OP_IN:
2958 case SRE_OP_IN_IGNORE:
2959 GET_SKIP;
2960 /* Stop 1 before the end; we check the FAILURE below */
2961 if (!_validate_charset(code, code+skip-2))
2962 FAIL;
2963 if (code[skip-2] != SRE_OP_FAILURE)
2964 FAIL;
2965 code += skip-1;
2966 break;
2967
2968 case SRE_OP_INFO:
2969 {
2970 /* A minimal info field is
2971 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2972 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2973 more follows. */
2974 SRE_CODE flags, i;
2975 SRE_CODE *newcode;
2976 GET_SKIP;
2977 newcode = code+skip-1;
2978 GET_ARG; flags = arg;
2979 GET_ARG; /* min */
2980 GET_ARG; /* max */
2981 /* Check that only valid flags are present */
2982 if ((flags & ~(SRE_INFO_PREFIX |
2983 SRE_INFO_LITERAL |
2984 SRE_INFO_CHARSET)) != 0)
2985 FAIL;
2986 /* PREFIX and CHARSET are mutually exclusive */
2987 if ((flags & SRE_INFO_PREFIX) &&
2988 (flags & SRE_INFO_CHARSET))
2989 FAIL;
2990 /* LITERAL implies PREFIX */
2991 if ((flags & SRE_INFO_LITERAL) &&
2992 !(flags & SRE_INFO_PREFIX))
2993 FAIL;
2994 /* Validate the prefix */
2995 if (flags & SRE_INFO_PREFIX) {
2996 SRE_CODE prefix_len;
2997 GET_ARG; prefix_len = arg;
2998 GET_ARG; /* prefix skip */
2999 /* Here comes the prefix string */
3000 if (code+prefix_len < code || code+prefix_len > newcode)
3001 FAIL;
3002 code += prefix_len;
3003 /* And here comes the overlap table */
3004 if (code+prefix_len < code || code+prefix_len > newcode)
3005 FAIL;
3006 /* Each overlap value should be < prefix_len */
3007 for (i = 0; i < prefix_len; i++) {
3008 if (code[i] >= prefix_len)
3009 FAIL;
3010 }
3011 code += prefix_len;
3012 }
3013 /* Validate the charset */
3014 if (flags & SRE_INFO_CHARSET) {
3015 if (!_validate_charset(code, newcode-1))
3016 FAIL;
3017 if (newcode[-1] != SRE_OP_FAILURE)
3018 FAIL;
3019 code = newcode;
3020 }
3021 else if (code != newcode) {
3022 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3023 FAIL;
3024 }
3025 }
3026 break;
3027
3028 case SRE_OP_BRANCH:
3029 {
3030 SRE_CODE *target = NULL;
3031 for (;;) {
3032 GET_SKIP;
3033 if (skip == 0)
3034 break;
3035 /* Stop 2 before the end; we check the JUMP below */
3036 if (!_validate_inner(code, code+skip-3, groups))
3037 FAIL;
3038 code += skip-3;
3039 /* Check that it ends with a JUMP, and that each JUMP
3040 has the same target */
3041 GET_OP;
3042 if (op != SRE_OP_JUMP)
3043 FAIL;
3044 GET_SKIP;
3045 if (target == NULL)
3046 target = code+skip-1;
3047 else if (code+skip-1 != target)
3048 FAIL;
3049 }
3050 }
3051 break;
3052
3053 case SRE_OP_REPEAT_ONE:
3054 case SRE_OP_MIN_REPEAT_ONE:
3055 {
3056 SRE_CODE min, max;
3057 GET_SKIP;
3058 GET_ARG; min = arg;
3059 GET_ARG; max = arg;
3060 if (min > max)
3061 FAIL;
3062 #ifdef Py_UNICODE_WIDE
3063 if (max > 65535)
3064 FAIL;
3065 #endif
3066 if (!_validate_inner(code, code+skip-4, groups))
3067 FAIL;
3068 code += skip-4;
3069 GET_OP;
3070 if (op != SRE_OP_SUCCESS)
3071 FAIL;
3072 }
3073 break;
3074
3075 case SRE_OP_REPEAT:
3076 {
3077 SRE_CODE min, max;
3078 GET_SKIP;
3079 GET_ARG; min = arg;
3080 GET_ARG; max = arg;
3081 if (min > max)
3082 FAIL;
3083 #ifdef Py_UNICODE_WIDE
3084 if (max > 65535)
3085 FAIL;
3086 #endif
3087 if (!_validate_inner(code, code+skip-3, groups))
3088 FAIL;
3089 code += skip-3;
3090 GET_OP;
3091 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3092 FAIL;
3093 }
3094 break;
3095
3096 case SRE_OP_GROUPREF:
3097 case SRE_OP_GROUPREF_IGNORE:
3098 GET_ARG;
3099 if (arg >= groups)
3100 FAIL;
3101 break;
3102
3103 case SRE_OP_GROUPREF_EXISTS:
3104 /* The regex syntax for this is: '(?(group)then|else)', where
3105 'group' is either an integer group number or a group name,
3106 'then' and 'else' are sub-regexes, and 'else' is optional. */
3107 GET_ARG;
3108 if (arg >= groups)
3109 FAIL;
3110 GET_SKIP_ADJ(1);
3111 code--; /* The skip is relative to the first arg! */
3112 /* There are two possibilities here: if there is both a 'then'
3113 part and an 'else' part, the generated code looks like:
3114
3115 GROUPREF_EXISTS
3116 <group>
3117 <skipyes>
3118 ...then part...
3119 JUMP
3120 <skipno>
3121 (<skipyes> jumps here)
3122 ...else part...
3123 (<skipno> jumps here)
3124
3125 If there is only a 'then' part, it looks like:
3126
3127 GROUPREF_EXISTS
3128 <group>
3129 <skip>
3130 ...then part...
3131 (<skip> jumps here)
3132
3133 There is no direct way to decide which it is, and we don't want
3134 to allow arbitrary jumps anywhere in the code; so we just look
3135 for a JUMP opcode preceding our skip target.
3136 */
3137 if (skip >= 3 && code+skip-3 >= code &&
3138 code[skip-3] == SRE_OP_JUMP)
3139 {
3140 VTRACE(("both then and else parts present\n"));
3141 if (!_validate_inner(code+1, code+skip-3, groups))
3142 FAIL;
3143 code += skip-2; /* Position after JUMP, at <skipno> */
3144 GET_SKIP;
3145 if (!_validate_inner(code, code+skip-1, groups))
3146 FAIL;
3147 code += skip-1;
3148 }
3149 else {
3150 VTRACE(("only a then part present\n"));
3151 if (!_validate_inner(code+1, code+skip-1, groups))
3152 FAIL;
3153 code += skip-1;
3154 }
3155 break;
3156
3157 case SRE_OP_ASSERT:
3158 case SRE_OP_ASSERT_NOT:
3159 GET_SKIP;
3160 GET_ARG; /* 0 for lookahead, width for lookbehind */
3161 code--; /* Back up over arg to simplify math below */
3162 if (arg & 0x80000000)
3163 FAIL; /* Width too large */
3164 /* Stop 1 before the end; we check the SUCCESS below */
3165 if (!_validate_inner(code+1, code+skip-2, groups))
3166 FAIL;
3167 code += skip-2;
3168 GET_OP;
3169 if (op != SRE_OP_SUCCESS)
3170 FAIL;
3171 break;
3172
3173 default:
3174 FAIL;
3175
3176 }
3177 }
3178
3179 VTRACE(("okay\n"));
3180 return 1;
3181 }
3182
3183 static int
3184 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3185 {
3186 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3187 FAIL;
3188 if (groups == 0) /* fix for simplejson */
3189 groups = 100; /* 100 groups should always be safe */
3190 return _validate_inner(code, end-1, groups);
3191 }
3192
3193 static int
3194 _validate(PatternObject *self)
3195 {
3196 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3197 {
3198 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3199 return 0;
3200 }
3201 else
3202 VTRACE(("Success!\n"));
3203 return 1;
3204 }
3205
3206 /* -------------------------------------------------------------------- */
3207 /* match methods */
3208
3209 static void
3210 match_dealloc(MatchObject* self)
3211 {
3212 Py_XDECREF(self->regs);
3213 Py_XDECREF(self->string);
3214 Py_DECREF(self->pattern);
3215 PyObject_DEL(self);
3216 }
3217
3218 static PyObject*
3219 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
3220 {
3221 if (index < 0 || index >= self->groups) {
3222 /* raise IndexError if we were given a bad group number */
3223 PyErr_SetString(
3224 PyExc_IndexError,
3225 "no such group"
3226 );
3227 return NULL;
3228 }
3229
3230 index *= 2;
3231
3232 if (self->string == Py_None || self->mark[index] < 0) {
3233 /* return default value if the string or group is undefined */
3234 Py_INCREF(def);
3235 return def;
3236 }
3237
3238 return PySequence_GetSlice(
3239 self->string, self->mark[index], self->mark[index+1]
3240 );
3241 }
3242
3243 static Py_ssize_t
3244 match_getindex(MatchObject* self, PyObject* index)
3245 {
3246 Py_ssize_t i;
3247
3248 if (PyInt_Check(index))
3249 return PyInt_AsSsize_t(index);
3250
3251 i = -1;
3252
3253 if (self->pattern->groupindex) {
3254 index = PyObject_GetItem(self->pattern->groupindex, index);
3255 if (index) {
3256 if (PyInt_Check(index) || PyLong_Check(index))
3257 i = PyInt_AsSsize_t(index);
3258 Py_DECREF(index);
3259 } else
3260 PyErr_Clear();
3261 }
3262
3263 return i;
3264 }
3265
3266 static PyObject*
3267 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
3268 {
3269 return match_getslice_by_index(self, match_getindex(self, index), def);
3270 }
3271
3272 static PyObject*
3273 match_expand(MatchObject* self, PyObject* ptemplate)
3274 {
3275 /* delegate to Python code */
3276 return call(
3277 SRE_PY_MODULE, "_expand",
3278 PyTuple_Pack(3, self->pattern, self, ptemplate)
3279 );
3280 }
3281
3282 static PyObject*
3283 match_group(MatchObject* self, PyObject* args)
3284 {
3285 PyObject* result;
3286 Py_ssize_t i, size;
3287
3288 size = PyTuple_GET_SIZE(args);
3289
3290 switch (size) {
3291 case 0:
3292 result = match_getslice(self, Py_False, Py_None);
3293 break;
3294 case 1:
3295 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3296 break;
3297 default:
3298 /* fetch multiple items */
3299 result = PyTuple_New(size);
3300 if (!result)
3301 return NULL;
3302 for (i = 0; i < size; i++) {
3303 PyObject* item = match_getslice(
3304 self, PyTuple_GET_ITEM(args, i), Py_None
3305 );
3306 if (!item) {
3307 Py_DECREF(result);
3308 return NULL;
3309 }
3310 PyTuple_SET_ITEM(result, i, item);
3311 }
3312 break;
3313 }
3314 return result;
3315 }
3316
3317 static PyObject*
3318 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
3319 {
3320 PyObject* result;
3321 Py_ssize_t index;
3322
3323 PyObject* def = Py_None;
3324 static char* kwlist[] = { "default", NULL };
3325 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
3326 return NULL;
3327
3328 result = PyTuple_New(self->groups-1);
3329 if (!result)
3330 return NULL;
3331
3332 for (index = 1; index < self->groups; index++) {
3333 PyObject* item;
3334 item = match_getslice_by_index(self, index, def);
3335 if (!item) {
3336 Py_DECREF(result);
3337 return NULL;
3338 }
3339 PyTuple_SET_ITEM(result, index-1, item);
3340 }
3341
3342 return result;
3343 }
3344
3345 static PyObject*
3346 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
3347 {
3348 PyObject* result;
3349 PyObject* keys;
3350 Py_ssize_t index;
3351
3352 PyObject* def = Py_None;
3353 static char* kwlist[] = { "default", NULL };
3354 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
3355 return NULL;
3356
3357 result = PyDict_New();
3358 if (!result || !self->pattern->groupindex)
3359 return result;
3360
3361 keys = PyMapping_Keys(self->pattern->groupindex);
3362 if (!keys)
3363 goto failed;
3364
3365 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
3366 int status;
3367 PyObject* key;
3368 PyObject* value;
3369 key = PyList_GET_ITEM(keys, index);
3370 if (!key)
3371 goto failed;
3372 value = match_getslice(self, key, def);
3373 if (!value) {
3374 Py_DECREF(key);
3375 goto failed;
3376 }
3377 status = PyDict_SetItem(result, key, value);
3378 Py_DECREF(value);
3379 if (status < 0)
3380 goto failed;
3381 }
3382
3383 Py_DECREF(keys);
3384
3385 return result;
3386
3387 failed:
3388 Py_XDECREF(keys);
3389 Py_DECREF(result);
3390 return NULL;
3391 }
3392
3393 static PyObject*
3394 match_start(MatchObject* self, PyObject* args)
3395 {
3396 Py_ssize_t index;
3397
3398 PyObject* index_ = Py_False; /* zero */
3399 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
3400 return NULL;
3401
3402 index = match_getindex(self, index_);
3403
3404 if (index < 0 || index >= self->groups) {
3405 PyErr_SetString(
3406 PyExc_IndexError,
3407 "no such group"
3408 );
3409 return NULL;
3410 }
3411
3412 /* mark is -1 if group is undefined */
3413 return Py_BuildValue("i", self->mark[index*2]);
3414 }
3415
3416 static PyObject*
3417 match_end(MatchObject* self, PyObject* args)
3418 {
3419 Py_ssize_t index;
3420
3421 PyObject* index_ = Py_False; /* zero */
3422 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
3423 return NULL;
3424
3425 index = match_getindex(self, index_);
3426
3427 if (index < 0 || index >= self->groups) {
3428 PyErr_SetString(
3429 PyExc_IndexError,
3430 "no such group"
3431 );
3432 return NULL;
3433 }
3434
3435 /* mark is -1 if group is undefined */
3436 return Py_BuildValue("i", self->mark[index*2+1]);
3437 }
3438
3439 LOCAL(PyObject*)
3440 _pair(Py_ssize_t i1, Py_ssize_t i2)
3441 {
3442 PyObject* pair;
3443 PyObject* item;
3444
3445 pair = PyTuple_New(2);
3446 if (!pair)
3447 return NULL;
3448
3449 item = PyInt_FromSsize_t(i1);
3450 if (!item)
3451 goto error;
3452 PyTuple_SET_ITEM(pair, 0, item);
3453
3454 item = PyInt_FromSsize_t(i2);
3455 if (!item)
3456 goto error;
3457 PyTuple_SET_ITEM(pair, 1, item);
3458
3459 return pair;
3460
3461 error:
3462 Py_DECREF(pair);
3463 return NULL;
3464 }
3465
3466 static PyObject*
3467 match_span(MatchObject* self, PyObject* args)
3468 {
3469 Py_ssize_t index;
3470
3471 PyObject* index_ = Py_False; /* zero */
3472 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
3473 return NULL;
3474
3475 index = match_getindex(self, index_);
3476
3477 if (index < 0 || index >= self->groups) {
3478 PyErr_SetString(
3479 PyExc_IndexError,
3480 "no such group"
3481 );
3482 return NULL;
3483 }
3484
3485 /* marks are -1 if group is undefined */
3486 return _pair(self->mark[index*2], self->mark[index*2+1]);
3487 }
3488
3489 static PyObject*
3490 match_regs(MatchObject* self)
3491 {
3492 PyObject* regs;
3493 PyObject* item;
3494 Py_ssize_t index;
3495
3496 regs = PyTuple_New(self->groups);
3497 if (!regs)
3498 return NULL;
3499
3500 for (index = 0; index < self->groups; index++) {
3501 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3502 if (!item) {
3503 Py_DECREF(regs);
3504 return NULL;
3505 }
3506 PyTuple_SET_ITEM(regs, index, item);
3507 }
3508
3509 Py_INCREF(regs);
3510 self->regs = regs;
3511
3512 return regs;
3513 }
3514
3515 static PyObject*
3516 match_copy(MatchObject* self, PyObject *unused)
3517 {
3518 #ifdef USE_BUILTIN_COPY
3519 MatchObject* copy;
3520 Py_ssize_t slots, offset;
3521
3522 slots = 2 * (self->pattern->groups+1);
3523
3524 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3525 if (!copy)
3526 return NULL;
3527
3528 /* this value a constant, but any compiler should be able to
3529 figure that out all by itself */
3530 offset = offsetof(MatchObject, string);
3531
3532 Py_XINCREF(self->pattern);
3533 Py_XINCREF(self->string);
3534 Py_XINCREF(self->regs);
3535
3536 memcpy((char*) copy + offset, (char*) self + offset,
3537 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3538
3539 return (PyObject*) copy;
3540 #else
3541 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3542 return NULL;
3543 #endif
3544 }
3545
3546 static PyObject*
3547 match_deepcopy(MatchObject* self, PyObject* memo)
3548 {
3549 #ifdef USE_BUILTIN_COPY
3550 MatchObject* copy;
3551
3552 copy = (MatchObject*) match_copy(self);
3553 if (!copy)
3554 return NULL;
3555
3556 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3557 !deepcopy(&copy->string, memo) ||
3558 !deepcopy(&copy->regs, memo)) {
3559 Py_DECREF(copy);
3560 return NULL;
3561 }
3562
3563 #else
3564 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3565 return NULL;
3566 #endif
3567 }
3568
3569 static PyMethodDef match_methods[] = {
3570 {"group", (PyCFunction) match_group, METH_VARARGS},
3571 {"start", (PyCFunction) match_start, METH_VARARGS},
3572 {"end", (PyCFunction) match_end, METH_VARARGS},
3573 {"span", (PyCFunction) match_span, METH_VARARGS},
3574 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3575 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3576 {"expand", (PyCFunction) match_expand, METH_O},
3577 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3578 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3579 {NULL, NULL}
3580 };
3581
3582 static PyObject*
3583 match_getattr(MatchObject* self, char* name)
3584 {
3585 PyObject* res;
3586
3587 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3588 if (res)
3589 return res;
3590
3591 PyErr_Clear();
3592
3593 if (!strcmp(name, "lastindex")) {
3594 if (self->lastindex >= 0)
3595 return Py_BuildValue("i", self->lastindex);
3596 Py_INCREF(Py_None);
3597 return Py_None;
3598 }
3599
3600 if (!strcmp(name, "lastgroup")) {
3601 if (self->pattern->indexgroup && self->lastindex >= 0) {
3602 PyObject* result = PySequence_GetItem(
3603 self->pattern->indexgroup, self->lastindex
3604 );
3605 if (result)
3606 return result;
3607 PyErr_Clear();
3608 }
3609 Py_INCREF(Py_None);
3610 return Py_None;
3611 }
3612
3613 if (!strcmp(name, "string")) {
3614 if (self->string) {
3615 Py_INCREF(self->string);
3616 return self->string;
3617 } else {
3618 Py_INCREF(Py_None);
3619 return Py_None;
3620 }
3621 }
3622
3623 if (!strcmp(name, "regs")) {
3624 if (self->regs) {
3625 Py_INCREF(self->regs);
3626 return self->regs;
3627 } else
3628 return match_regs(self);
3629 }
3630
3631 if (!strcmp(name, "re")) {
3632 Py_INCREF(self->pattern);
3633 return (PyObject*) self->pattern;
3634 }
3635
3636 if (!strcmp(name, "pos"))
3637 return Py_BuildValue("i", self->pos);
3638
3639 if (!strcmp(name, "endpos"))
3640 return Py_BuildValue("i", self->endpos);
3641
3642 PyErr_SetString(PyExc_AttributeError, name);
3643 return NULL;
3644 }
3645
3646 /* FIXME: implement setattr("string", None) as a special case (to
3647 detach the associated string, if any */
3648
3649 statichere PyTypeObject Match_Type = {
3650 PyObject_HEAD_INIT(NULL)
3651 0, "_" SRE_MODULE ".SRE_Match",
3652 sizeof(MatchObject), sizeof(Py_ssize_t),
3653 (destructor)match_dealloc, /*tp_dealloc*/
3654 0, /*tp_print*/
3655 (getattrfunc)match_getattr /*tp_getattr*/
3656 };
3657
3658 static PyObject*
3659 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3660 {
3661 /* create match object (from state object) */
3662
3663 MatchObject* match;
3664 Py_ssize_t i, j;
3665 char* base;
3666 int n;
3667
3668 if (status > 0) {
3669
3670 /* create match object (with room for extra group marks) */
3671 /* coverity[ampersand_in_size] */
3672 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3673 2*(pattern->groups+1));
3674 if (!match)
3675 return NULL;
3676
3677 Py_INCREF(pattern);
3678 match->pattern = pattern;
3679
3680 Py_INCREF(state->string);
3681 match->string = state->string;
3682
3683 match->regs = NULL;
3684 match->groups = pattern->groups+1;
3685
3686 /* fill in group slices */
3687
3688 base = (char*) state->beginning;
3689 n = state->charsize;
3690
3691 match->mark[0] = ((char*) state->start - base) / n;
3692 match->mark[1] = ((char*) state->ptr - base) / n;
3693
3694 for (i = j = 0; i < pattern->groups; i++, j+=2)
3695 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3696 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3697 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3698 } else
3699 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3700
3701 match->pos = state->pos;
3702 match->endpos = state->endpos;
3703
3704 match->lastindex = state->lastindex;
3705
3706 return (PyObject*) match;
3707
3708 } else if (status == 0) {
3709
3710 /* no match */
3711 Py_INCREF(Py_None);
3712 return Py_None;
3713
3714 }
3715
3716 /* internal error */
3717 pattern_error(status);
3718 return NULL;
3719 }
3720
3721
3722 /* -------------------------------------------------------------------- */
3723 /* scanner methods (experimental) */
3724
3725 static void
3726 scanner_dealloc(ScannerObject* self)
3727 {
3728 state_fini(&self->state);
3729 Py_XDECREF(self->pattern);
3730 PyObject_DEL(self);
3731 }
3732
3733 static PyObject*
3734 scanner_match(ScannerObject* self, PyObject *unused)
3735 {
3736 SRE_STATE* state = &self->state;
3737 PyObject* match;
3738 int status;
3739
3740 state_reset(state);
3741
3742 state->ptr = state->start;
3743
3744 if (state->charsize == 1) {
3745 status = sre_match(state, PatternObject_GetCode(self->pattern));
3746 } else {
3747 #if defined(HAVE_UNICODE)
3748 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3749 #endif
3750 }
3751 if (PyErr_Occurred())
3752 return NULL;
3753
3754 match = pattern_new_match((PatternObject*) self->pattern,
3755 state, status);
3756
3757 if (status == 0 || state->ptr == state->start)
3758 state->start = (void*) ((char*) state->ptr + state->charsize);
3759 else
3760 state->start = state->ptr;
3761
3762 return match;
3763 }
3764
3765
3766 static PyObject*
3767 scanner_search(ScannerObject* self, PyObject *unused)
3768 {
3769 SRE_STATE* state = &self->state;
3770 PyObject* match;
3771 int status;
3772
3773 state_reset(state);
3774
3775 state->ptr = state->start;
3776
3777 if (state->charsize == 1) {
3778 status = sre_search(state, PatternObject_GetCode(self->pattern));
3779 } else {
3780 #if defined(HAVE_UNICODE)
3781 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3782 #endif
3783 }
3784 if (PyErr_Occurred())
3785 return NULL;
3786
3787 match = pattern_new_match((PatternObject*) self->pattern,
3788 state, status);
3789
3790 if (status == 0 || state->ptr == state->start)
3791 state->start = (void*) ((char*) state->ptr + state->charsize);
3792 else
3793 state->start = state->ptr;
3794
3795 return match;
3796 }
3797
3798 static PyMethodDef scanner_methods[] = {
3799 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3800 {"search", (PyCFunction) scanner_search, METH_NOARGS},
3801 {NULL, NULL}
3802 };
3803
3804 static PyObject*
3805 scanner_getattr(ScannerObject* self, char* name)
3806 {
3807 PyObject* res;
3808
3809 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3810 if (res)
3811 return res;
3812
3813 PyErr_Clear();
3814
3815 /* attributes */
3816 if (!strcmp(name, "pattern")) {
3817 Py_INCREF(self->pattern);
3818 return self->pattern;
3819 }
3820
3821 PyErr_SetString(PyExc_AttributeError, name);
3822 return NULL;
3823 }
3824
3825 statichere PyTypeObject Scanner_Type = {
3826 PyObject_HEAD_INIT(NULL)
3827 0, "_" SRE_MODULE ".SRE_Scanner",
3828 sizeof(ScannerObject), 0,
3829 (destructor)scanner_dealloc, /*tp_dealloc*/
3830 0, /*tp_print*/
3831 (getattrfunc)scanner_getattr, /*tp_getattr*/
3832 };
3833
3834 static PyObject*
3835 pattern_scanner(PatternObject* pattern, PyObject* args)
3836 {
3837 /* create search state object */
3838
3839 ScannerObject* self;
3840
3841 PyObject* string;
3842 Py_ssize_t start = 0;
3843 Py_ssize_t end = PY_SSIZE_T_MAX;
3844 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3845 return NULL;
3846
3847 /* create scanner object */
3848 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3849 if (!self)
3850 return NULL;
3851 self->pattern = NULL;
3852
3853 string = state_init(&self->state, pattern, string, start, end);
3854 if (!string) {
3855 Py_DECREF(self);
3856 return NULL;
3857 }
3858
3859 Py_INCREF(pattern);
3860 self->pattern = (PyObject*) pattern;
3861
3862 return (PyObject*) self;
3863 }
3864
3865 static PyMethodDef _functions[] = {
3866 {"compile", _compile, METH_VARARGS},
3867 {"getcodesize", sre_codesize, METH_NOARGS},
3868 {"getlower", sre_getlower, METH_VARARGS},
3869 {NULL, NULL}
3870 };
3871
3872 #if PY_VERSION_HEX < 0x02030000
3873 DL_EXPORT(void) init_sre(void)
3874 #else
3875 PyMODINIT_FUNC init_sre(void)
3876 #endif
3877 {
3878 PyObject* m;
3879 PyObject* d;
3880 PyObject* x;
3881
3882 /* Patch object types */
3883 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3884 PyType_Ready(&Scanner_Type))
3885 return;
3886
3887 m = Py_InitModule("_" SRE_MODULE, _functions);
3888 if (m == NULL)
3889 return;
3890 d = PyModule_GetDict(m);
3891
3892 x = PyInt_FromLong(SRE_MAGIC);
3893 if (x) {
3894 PyDict_SetItemString(d, "MAGIC", x);
3895 Py_DECREF(x);
3896 }
3897
3898 x = PyInt_FromLong(sizeof(SRE_CODE));
3899 if (x) {
3900 PyDict_SetItemString(d, "CODESIZE", x);
3901 Py_DECREF(x);
3902 }
3903
3904 x = PyString_FromString(copyright);
3905 if (x) {
3906 PyDict_SetItemString(d, "copyright", x);
3907 Py_DECREF(x);
3908 }
3909 }
3910
3911 #endif /* !defined(SRE_RECURSIVE) */
3912
3913 /* vim:ts=4:sw=4:et
3914 */