2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
37 /* Get rid of these macros to prevent collisions between EFI and Python in this file. */
43 static char copyright
[] =
44 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
46 #define PY_SSIZE_T_CLEAN
49 #include "structmember.h" /* offsetof */
55 /* name of this module, minus the leading underscore */
56 #if !defined(SRE_MODULE)
57 #define SRE_MODULE "sre"
60 #define SRE_PY_MODULE "re"
62 /* defining this one enables tracing */
65 #if PY_VERSION_HEX >= 0x01060000
66 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
67 /* defining this enables unicode support (default under 1.6a1 and later) */
72 /* -------------------------------------------------------------------- */
73 /* optional features */
75 /* enables fast searching */
76 #define USE_FAST_SEARCH
78 /* enables aggressive inlining (always on for Visual C) */
81 /* enables copy/deepcopy handling (work in progress) */
82 #undef USE_BUILTIN_COPY
84 #if PY_VERSION_HEX < 0x01060000
85 #define PyObject_DEL(op) PyMem_DEL((op))
88 /* -------------------------------------------------------------------- */
91 #pragma optimize("gt", on) /* doesn't seem to make much difference... */
92 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
93 /* fastest possible local call under MSVC */
94 #define LOCAL(type) static __inline type __fastcall
95 #elif defined(USE_INLINE)
96 #define LOCAL(type) static inline type
98 #define LOCAL(type) static type
102 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
103 #define SRE_ERROR_STATE -2 /* illegal state */
104 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
105 #define SRE_ERROR_MEMORY -9 /* out of memory */
106 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
109 #define TRACE(v) printf v
114 /* -------------------------------------------------------------------- */
115 /* search engine state */
117 /* default character predicates (run sre_chars.py to regenerate tables) */
119 #define SRE_DIGIT_MASK 1
120 #define SRE_SPACE_MASK 2
121 #define SRE_LINEBREAK_MASK 4
122 #define SRE_ALNUM_MASK 8
123 #define SRE_WORD_MASK 16
125 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
127 static char sre_char_info
[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
128 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
130 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
131 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
132 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
133 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
135 static char sre_char_lower
[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
136 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
137 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
138 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
139 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
140 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
141 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
142 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
143 120, 121, 122, 123, 124, 125, 126, 127 };
145 #define SRE_IS_DIGIT(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
147 #define SRE_IS_SPACE(ch)\
148 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
149 #define SRE_IS_LINEBREAK(ch)\
150 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
151 #define SRE_IS_ALNUM(ch)\
152 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
153 #define SRE_IS_WORD(ch)\
154 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
156 static unsigned int sre_lower(unsigned int ch
)
158 return ((ch
) < 128 ? (unsigned int)sre_char_lower
[ch
] : ch
);
161 /* locale-specific character predicates */
162 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
163 * warnings when c's type supports only numbers < N+1 */
164 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
165 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
166 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
167 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
168 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
170 static unsigned int sre_lower_locale(unsigned int ch
)
172 return ((ch
) < 256 ? (unsigned int)tolower((ch
)) : ch
);
175 /* unicode-specific character predicates */
177 #if defined(HAVE_UNICODE)
179 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
180 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
181 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
182 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
183 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
185 static unsigned int sre_lower_unicode(unsigned int ch
)
187 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE
)(ch
));
193 sre_category(SRE_CODE category
, unsigned int ch
)
197 case SRE_CATEGORY_DIGIT
:
198 return SRE_IS_DIGIT(ch
);
199 case SRE_CATEGORY_NOT_DIGIT
:
200 return !SRE_IS_DIGIT(ch
);
201 case SRE_CATEGORY_SPACE
:
202 return SRE_IS_SPACE(ch
);
203 case SRE_CATEGORY_NOT_SPACE
:
204 return !SRE_IS_SPACE(ch
);
205 case SRE_CATEGORY_WORD
:
206 return SRE_IS_WORD(ch
);
207 case SRE_CATEGORY_NOT_WORD
:
208 return !SRE_IS_WORD(ch
);
209 case SRE_CATEGORY_LINEBREAK
:
210 return SRE_IS_LINEBREAK(ch
);
211 case SRE_CATEGORY_NOT_LINEBREAK
:
212 return !SRE_IS_LINEBREAK(ch
);
214 case SRE_CATEGORY_LOC_WORD
:
215 return SRE_LOC_IS_WORD(ch
);
216 case SRE_CATEGORY_LOC_NOT_WORD
:
217 return !SRE_LOC_IS_WORD(ch
);
219 #if defined(HAVE_UNICODE)
220 case SRE_CATEGORY_UNI_DIGIT
:
221 return SRE_UNI_IS_DIGIT(ch
);
222 case SRE_CATEGORY_UNI_NOT_DIGIT
:
223 return !SRE_UNI_IS_DIGIT(ch
);
224 case SRE_CATEGORY_UNI_SPACE
:
225 return SRE_UNI_IS_SPACE(ch
);
226 case SRE_CATEGORY_UNI_NOT_SPACE
:
227 return !SRE_UNI_IS_SPACE(ch
);
228 case SRE_CATEGORY_UNI_WORD
:
229 return SRE_UNI_IS_WORD(ch
);
230 case SRE_CATEGORY_UNI_NOT_WORD
:
231 return !SRE_UNI_IS_WORD(ch
);
232 case SRE_CATEGORY_UNI_LINEBREAK
:
233 return SRE_UNI_IS_LINEBREAK(ch
);
234 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
235 return !SRE_UNI_IS_LINEBREAK(ch
);
237 case SRE_CATEGORY_UNI_DIGIT
:
238 return SRE_IS_DIGIT(ch
);
239 case SRE_CATEGORY_UNI_NOT_DIGIT
:
240 return !SRE_IS_DIGIT(ch
);
241 case SRE_CATEGORY_UNI_SPACE
:
242 return SRE_IS_SPACE(ch
);
243 case SRE_CATEGORY_UNI_NOT_SPACE
:
244 return !SRE_IS_SPACE(ch
);
245 case SRE_CATEGORY_UNI_WORD
:
246 return SRE_LOC_IS_WORD(ch
);
247 case SRE_CATEGORY_UNI_NOT_WORD
:
248 return !SRE_LOC_IS_WORD(ch
);
249 case SRE_CATEGORY_UNI_LINEBREAK
:
250 return SRE_IS_LINEBREAK(ch
);
251 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
252 return !SRE_IS_LINEBREAK(ch
);
261 data_stack_dealloc(SRE_STATE
* state
)
263 if (state
->data_stack
) {
264 PyMem_FREE(state
->data_stack
);
265 state
->data_stack
= NULL
;
267 state
->data_stack_size
= state
->data_stack_base
= 0;
271 data_stack_grow(SRE_STATE
* state
, Py_ssize_t size
)
273 Py_ssize_t minsize
, cursize
;
274 minsize
= state
->data_stack_base
+size
;
275 cursize
= state
->data_stack_size
;
276 if (cursize
< minsize
) {
278 cursize
= minsize
+minsize
/4+1024;
279 TRACE(("allocate/grow stack %d\n", cursize
));
280 stack
= PyMem_REALLOC(state
->data_stack
, cursize
);
282 data_stack_dealloc(state
);
283 return SRE_ERROR_MEMORY
;
285 state
->data_stack
= (char *)stack
;
286 state
->data_stack_size
= cursize
;
291 /* generate 8-bit version */
293 #define SRE_CHAR unsigned char
294 #define SRE_AT sre_at
295 #define SRE_COUNT sre_count
296 #define SRE_CHARSET sre_charset
297 #define SRE_INFO sre_info
298 #define SRE_MATCH sre_match
299 #define SRE_MATCH_CONTEXT sre_match_context
300 #define SRE_SEARCH sre_search
301 #define SRE_LITERAL_TEMPLATE sre_literal_template
303 #if defined(HAVE_UNICODE)
305 #define SRE_RECURSIVE
309 #undef SRE_LITERAL_TEMPLATE
312 #undef SRE_MATCH_CONTEXT
319 /* generate 16-bit unicode version */
321 #define SRE_CHAR Py_UNICODE
322 #define SRE_AT sre_uat
323 #define SRE_COUNT sre_ucount
324 #define SRE_CHARSET sre_ucharset
325 #define SRE_INFO sre_uinfo
326 #define SRE_MATCH sre_umatch
327 #define SRE_MATCH_CONTEXT sre_umatch_context
328 #define SRE_SEARCH sre_usearch
329 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
332 #endif /* SRE_RECURSIVE */
334 /* -------------------------------------------------------------------- */
335 /* String matching engine */
337 /* the following section is compiled twice, with different character
341 SRE_AT(SRE_STATE
* state
, SRE_CHAR
* ptr
, SRE_CODE at
)
343 /* check if pointer is at given position */
345 Py_ssize_t thisp
, thatp
;
349 case SRE_AT_BEGINNING
:
350 case SRE_AT_BEGINNING_STRING
:
351 return ((void*) ptr
== state
->beginning
);
353 case SRE_AT_BEGINNING_LINE
:
354 return ((void*) ptr
== state
->beginning
||
355 SRE_IS_LINEBREAK((int) ptr
[-1]));
358 return (((void*) (ptr
+1) == state
->end
&&
359 SRE_IS_LINEBREAK((int) ptr
[0])) ||
360 ((void*) ptr
== state
->end
));
362 case SRE_AT_END_LINE
:
363 return ((void*) ptr
== state
->end
||
364 SRE_IS_LINEBREAK((int) ptr
[0]));
366 case SRE_AT_END_STRING
:
367 return ((void*) ptr
== state
->end
);
369 case SRE_AT_BOUNDARY
:
370 if (state
->beginning
== state
->end
)
372 thatp
= ((void*) ptr
> state
->beginning
) ?
373 SRE_IS_WORD((int) ptr
[-1]) : 0;
374 thisp
= ((void*) ptr
< state
->end
) ?
375 SRE_IS_WORD((int) ptr
[0]) : 0;
376 return thisp
!= thatp
;
378 case SRE_AT_NON_BOUNDARY
:
379 if (state
->beginning
== state
->end
)
381 thatp
= ((void*) ptr
> state
->beginning
) ?
382 SRE_IS_WORD((int) ptr
[-1]) : 0;
383 thisp
= ((void*) ptr
< state
->end
) ?
384 SRE_IS_WORD((int) ptr
[0]) : 0;
385 return thisp
== thatp
;
387 case SRE_AT_LOC_BOUNDARY
:
388 if (state
->beginning
== state
->end
)
390 thatp
= ((void*) ptr
> state
->beginning
) ?
391 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
392 thisp
= ((void*) ptr
< state
->end
) ?
393 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
394 return thisp
!= thatp
;
396 case SRE_AT_LOC_NON_BOUNDARY
:
397 if (state
->beginning
== state
->end
)
399 thatp
= ((void*) ptr
> state
->beginning
) ?
400 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
401 thisp
= ((void*) ptr
< state
->end
) ?
402 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
403 return thisp
== thatp
;
405 #if defined(HAVE_UNICODE)
406 case SRE_AT_UNI_BOUNDARY
:
407 if (state
->beginning
== state
->end
)
409 thatp
= ((void*) ptr
> state
->beginning
) ?
410 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
411 thisp
= ((void*) ptr
< state
->end
) ?
412 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
413 return thisp
!= thatp
;
415 case SRE_AT_UNI_NON_BOUNDARY
:
416 if (state
->beginning
== state
->end
)
418 thatp
= ((void*) ptr
> state
->beginning
) ?
419 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
420 thisp
= ((void*) ptr
< state
->end
) ?
421 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
422 return thisp
== thatp
;
431 SRE_CHARSET(SRE_CODE
* set
, SRE_CODE ch
)
433 /* check if character is a member of the given set */
444 /* <LITERAL> <code> */
450 case SRE_OP_CATEGORY
:
451 /* <CATEGORY> <code> */
452 if (sre_category(set
[0], (int) ch
))
458 if (sizeof(SRE_CODE
) == 2) {
459 /* <CHARSET> <bitmap> (16 bits per code word) */
460 if (ch
< 256 && (set
[ch
>> 4] & (1 << (ch
& 15))))
465 /* <CHARSET> <bitmap> (32 bits per code word) */
466 if (ch
< 256 && (set
[ch
>> 5] & (1 << (ch
& 31))))
473 /* <RANGE> <lower> <upper> */
474 if (set
[0] <= ch
&& ch
<= set
[1])
483 case SRE_OP_BIGCHARSET
:
484 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
486 Py_ssize_t count
, block
;
489 if (sizeof(SRE_CODE
) == 2) {
490 block
= ((unsigned char*)set
)[ch
>> 8];
492 if (set
[block
*16 + ((ch
& 255)>>4)] & (1 << (ch
& 15)))
497 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
498 * warnings when c's type supports only numbers < N+1 */
500 block
= ((unsigned char*)set
)[ch
>> 8];
505 (set
[block
*8 + ((ch
& 255)>>5)] & (1 << (ch
& 31))))
513 /* internal error -- there's not much we can do about it
514 here, so let's just pretend it didn't match... */
520 LOCAL(Py_ssize_t
) SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
);
523 SRE_COUNT(SRE_STATE
* state
, SRE_CODE
* pattern
, Py_ssize_t maxcount
)
526 SRE_CHAR
* ptr
= (SRE_CHAR
*)state
->ptr
;
527 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
531 if (maxcount
< end
- ptr
&& maxcount
!= 65535)
532 end
= ptr
+ maxcount
;
534 switch (pattern
[0]) {
538 TRACE(("|%p|%p|COUNT IN\n", pattern
, ptr
));
539 while (ptr
< end
&& SRE_CHARSET(pattern
+ 2, *ptr
))
544 /* repeated dot wildcard. */
545 TRACE(("|%p|%p|COUNT ANY\n", pattern
, ptr
));
546 while (ptr
< end
&& !SRE_IS_LINEBREAK(*ptr
))
551 /* repeated dot wildcard. skip to the end of the target
552 string, and backtrack from there */
553 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern
, ptr
));
558 /* repeated literal */
560 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern
, ptr
, chr
));
561 while (ptr
< end
&& (SRE_CODE
) *ptr
== chr
)
565 case SRE_OP_LITERAL_IGNORE
:
566 /* repeated literal */
568 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
569 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) == chr
)
573 case SRE_OP_NOT_LITERAL
:
574 /* repeated non-literal */
576 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern
, ptr
, chr
));
577 while (ptr
< end
&& (SRE_CODE
) *ptr
!= chr
)
581 case SRE_OP_NOT_LITERAL_IGNORE
:
582 /* repeated non-literal */
584 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
585 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) != chr
)
590 /* repeated single character pattern */
591 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern
, ptr
));
592 while ((SRE_CHAR
*) state
->ptr
< end
) {
593 i
= SRE_MATCH(state
, pattern
);
599 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
,
600 (SRE_CHAR
*) state
->ptr
- ptr
));
601 return (SRE_CHAR
*) state
->ptr
- ptr
;
604 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
, ptr
- (SRE_CHAR
*) state
->ptr
));
605 return ptr
- (SRE_CHAR
*) state
->ptr
;
608 #if 0 /* not used in this release */
610 SRE_INFO(SRE_STATE
* state
, SRE_CODE
* pattern
)
612 /* check if an SRE_OP_INFO block matches at the current position.
613 returns the number of SRE_CODE objects to skip if successful, 0
616 SRE_CHAR
* end
= state
->end
;
617 SRE_CHAR
* ptr
= state
->ptr
;
620 /* check minimal length */
621 if (pattern
[3] && (end
- ptr
) < pattern
[3])
624 /* check known prefix */
625 if (pattern
[2] & SRE_INFO_PREFIX
&& pattern
[5] > 1) {
626 /* <length> <skip> <prefix data> <overlap data> */
627 for (i
= 0; i
< pattern
[5]; i
++)
628 if ((SRE_CODE
) ptr
[i
] != pattern
[7 + i
])
630 return pattern
[0] + 2 * pattern
[6];
636 /* The macros below should be used to protect recursive SRE_MATCH()
637 * calls that *failed* and do *not* return immediately (IOW, those
638 * that will backtrack). Explaining:
640 * - Recursive SRE_MATCH() returned true: that's usually a success
641 * (besides atypical cases like ASSERT_NOT), therefore there's no
642 * reason to restore lastmark;
644 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
645 * is returning to the caller: If the current SRE_MATCH() is the
646 * top function of the recursion, returning false will be a matching
647 * failure, and it doesn't matter where lastmark is pointing to.
648 * If it's *not* the top function, it will be a recursive SRE_MATCH()
649 * failure by itself, and the calling SRE_MATCH() will have to deal
650 * with the failure by the same rules explained here (it will restore
651 * lastmark by itself if necessary);
653 * - Recursive SRE_MATCH() returned false, and will continue the
654 * outside 'for' loop: must be protected when breaking, since the next
655 * OP could potentially depend on lastmark;
657 * - Recursive SRE_MATCH() returned false, and will be called again
658 * inside a local for/while loop: must be protected between each
659 * loop iteration, since the recursive SRE_MATCH() could do anything,
660 * and could potentially depend on lastmark.
662 * For more information, check the discussion at SF patch #712900.
664 #define LASTMARK_SAVE() \
666 ctx->lastmark = state->lastmark; \
667 ctx->lastindex = state->lastindex; \
669 #define LASTMARK_RESTORE() \
671 state->lastmark = ctx->lastmark; \
672 state->lastindex = ctx->lastindex; \
675 #define RETURN_ERROR(i) do { return i; } while(0)
676 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
677 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
679 #define RETURN_ON_ERROR(i) \
680 do { if (i < 0) RETURN_ERROR(i); } while (0)
681 #define RETURN_ON_SUCCESS(i) \
682 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
683 #define RETURN_ON_FAILURE(i) \
684 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
688 #define DATA_STACK_ALLOC(state, type, ptr) \
690 alloc_pos = state->data_stack_base; \
691 TRACE(("allocating %s in %d (%d)\n", \
692 SFY(type), alloc_pos, sizeof(type))); \
693 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
694 int j = data_stack_grow(state, sizeof(type)); \
695 if (j < 0) return j; \
697 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
699 ptr = (type*)(state->data_stack+alloc_pos); \
700 state->data_stack_base += sizeof(type); \
703 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
705 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
706 ptr = (type*)(state->data_stack+pos); \
709 #define DATA_STACK_PUSH(state, data, size) \
711 TRACE(("copy data in %p to %d (%d)\n", \
712 data, state->data_stack_base, size)); \
713 if (state->data_stack_size < state->data_stack_base+size) { \
714 int j = data_stack_grow(state, size); \
715 if (j < 0) return j; \
717 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
719 memcpy(state->data_stack+state->data_stack_base, data, size); \
720 state->data_stack_base += size; \
723 #define DATA_STACK_POP(state, data, size, discard) \
725 TRACE(("copy data to %p from %d (%d)\n", \
726 data, state->data_stack_base-size, size)); \
727 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
729 state->data_stack_base -= size; \
732 #define DATA_STACK_POP_DISCARD(state, size) \
734 TRACE(("discard data from %d (%d)\n", \
735 state->data_stack_base-size, size)); \
736 state->data_stack_base -= size; \
739 #define DATA_PUSH(x) \
740 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
741 #define DATA_POP(x) \
742 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
743 #define DATA_POP_DISCARD(x) \
744 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
745 #define DATA_ALLOC(t,p) \
746 DATA_STACK_ALLOC(state, t, p)
747 #define DATA_LOOKUP_AT(t,p,pos) \
748 DATA_STACK_LOOKUP_AT(state,t,p,pos)
750 #define MARK_PUSH(lastmark) \
751 do if (lastmark > 0) { \
752 i = lastmark; /* ctx->lastmark may change if reallocated */ \
753 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
755 #define MARK_POP(lastmark) \
756 do if (lastmark > 0) { \
757 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
759 #define MARK_POP_KEEP(lastmark) \
760 do if (lastmark > 0) { \
761 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
763 #define MARK_POP_DISCARD(lastmark) \
764 do if (lastmark > 0) { \
765 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
769 #define JUMP_MAX_UNTIL_1 1
770 #define JUMP_MAX_UNTIL_2 2
771 #define JUMP_MAX_UNTIL_3 3
772 #define JUMP_MIN_UNTIL_1 4
773 #define JUMP_MIN_UNTIL_2 5
774 #define JUMP_MIN_UNTIL_3 6
775 #define JUMP_REPEAT 7
776 #define JUMP_REPEAT_ONE_1 8
777 #define JUMP_REPEAT_ONE_2 9
778 #define JUMP_MIN_REPEAT_ONE 10
779 #define JUMP_BRANCH 11
780 #define JUMP_ASSERT 12
781 #define JUMP_ASSERT_NOT 13
783 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
784 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
785 nextctx->last_ctx_pos = ctx_pos; \
786 nextctx->jump = jumpvalue; \
787 nextctx->pattern = nextpattern; \
788 ctx_pos = alloc_pos; \
792 while (0) /* gcc doesn't like labels at end of scopes */ \
795 Py_ssize_t last_ctx_pos
;
801 Py_ssize_t lastindex
;
808 /* check if string matches the given pattern. returns <0 for
809 error, 0 for failure, and 1 for success */
811 SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
813 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
814 Py_ssize_t alloc_pos
, ctx_pos
= -1;
815 Py_ssize_t i
, ret
= 0;
817 unsigned int sigcount
=0;
819 SRE_MATCH_CONTEXT
* ctx
;
820 SRE_MATCH_CONTEXT
* nextctx
;
822 TRACE(("|%p|%p|ENTER\n", pattern
, state
->ptr
));
824 DATA_ALLOC(SRE_MATCH_CONTEXT
, ctx
);
825 ctx
->last_ctx_pos
= -1;
826 ctx
->jump
= JUMP_NONE
;
827 ctx
->pattern
= pattern
;
832 ctx
->ptr
= (SRE_CHAR
*)state
->ptr
;
834 if (ctx
->pattern
[0] == SRE_OP_INFO
) {
835 /* optimization info block */
836 /* <INFO> <1=skip> <2=flags> <3=min> ... */
837 if (ctx
->pattern
[3] && (end
- ctx
->ptr
) < ctx
->pattern
[3]) {
838 TRACE(("reject (got %d chars, need %d)\n",
839 (end
- ctx
->ptr
), ctx
->pattern
[3]));
842 ctx
->pattern
+= ctx
->pattern
[1] + 1;
847 if ((0 == (sigcount
& 0xfff)) && PyErr_CheckSignals())
848 RETURN_ERROR(SRE_ERROR_INTERRUPTED
);
850 switch (*ctx
->pattern
++) {
855 TRACE(("|%p|%p|MARK %d\n", ctx
->pattern
,
856 ctx
->ptr
, ctx
->pattern
[0]));
859 state
->lastindex
= i
/2 + 1;
860 if (i
> state
->lastmark
) {
861 /* state->lastmark is the highest valid index in the
862 state->mark array. If it is increased by more than 1,
863 the intervening marks must be set to NULL to signal
864 that these marks have not been encountered. */
865 Py_ssize_t j
= state
->lastmark
+ 1;
867 state
->mark
[j
++] = NULL
;
870 state
->mark
[i
] = ctx
->ptr
;
875 /* match literal string */
876 /* <LITERAL> <code> */
877 TRACE(("|%p|%p|LITERAL %d\n", ctx
->pattern
,
878 ctx
->ptr
, *ctx
->pattern
));
879 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] != ctx
->pattern
[0])
885 case SRE_OP_NOT_LITERAL
:
886 /* match anything that is not literal character */
887 /* <NOT_LITERAL> <code> */
888 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx
->pattern
,
889 ctx
->ptr
, *ctx
->pattern
));
890 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] == ctx
->pattern
[0])
898 TRACE(("|%p|%p|SUCCESS\n", ctx
->pattern
, ctx
->ptr
));
899 state
->ptr
= ctx
->ptr
;
903 /* match at given position */
905 TRACE(("|%p|%p|AT %d\n", ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
906 if (!SRE_AT(state
, ctx
->ptr
, *ctx
->pattern
))
911 case SRE_OP_CATEGORY
:
912 /* match at given category */
913 /* <CATEGORY> <code> */
914 TRACE(("|%p|%p|CATEGORY %d\n", ctx
->pattern
,
915 ctx
->ptr
, *ctx
->pattern
));
916 if (ctx
->ptr
>= end
|| !sre_category(ctx
->pattern
[0], ctx
->ptr
[0]))
923 /* match anything (except a newline) */
925 TRACE(("|%p|%p|ANY\n", ctx
->pattern
, ctx
->ptr
));
926 if (ctx
->ptr
>= end
|| SRE_IS_LINEBREAK(ctx
->ptr
[0]))
934 TRACE(("|%p|%p|ANY_ALL\n", ctx
->pattern
, ctx
->ptr
));
941 /* match set member (or non_member) */
942 /* <IN> <skip> <set> */
943 TRACE(("|%p|%p|IN\n", ctx
->pattern
, ctx
->ptr
));
944 if (ctx
->ptr
>= end
|| !SRE_CHARSET(ctx
->pattern
+ 1, *ctx
->ptr
))
946 ctx
->pattern
+= ctx
->pattern
[0];
950 case SRE_OP_LITERAL_IGNORE
:
951 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
952 ctx
->pattern
, ctx
->ptr
, ctx
->pattern
[0]));
953 if (ctx
->ptr
>= end
||
954 state
->lower(*ctx
->ptr
) != state
->lower(*ctx
->pattern
))
960 case SRE_OP_NOT_LITERAL_IGNORE
:
961 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
962 ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
963 if (ctx
->ptr
>= end
||
964 state
->lower(*ctx
->ptr
) == state
->lower(*ctx
->pattern
))
970 case SRE_OP_IN_IGNORE
:
971 TRACE(("|%p|%p|IN_IGNORE\n", ctx
->pattern
, ctx
->ptr
));
973 || !SRE_CHARSET(ctx
->pattern
+1,
974 (SRE_CODE
)state
->lower(*ctx
->ptr
)))
976 ctx
->pattern
+= ctx
->pattern
[0];
983 /* <JUMP> <offset> */
984 TRACE(("|%p|%p|JUMP %d\n", ctx
->pattern
,
985 ctx
->ptr
, ctx
->pattern
[0]));
986 ctx
->pattern
+= ctx
->pattern
[0];
991 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
992 TRACE(("|%p|%p|BRANCH\n", ctx
->pattern
, ctx
->ptr
));
994 ctx
->u
.rep
= state
->repeat
;
996 MARK_PUSH(ctx
->lastmark
);
997 for (; ctx
->pattern
[0]; ctx
->pattern
+= ctx
->pattern
[0]) {
998 if (ctx
->pattern
[1] == SRE_OP_LITERAL
&&
1000 (SRE_CODE
) *ctx
->ptr
!= ctx
->pattern
[2]))
1002 if (ctx
->pattern
[1] == SRE_OP_IN
&&
1004 !SRE_CHARSET(ctx
->pattern
+ 3, (SRE_CODE
) *ctx
->ptr
)))
1006 state
->ptr
= ctx
->ptr
;
1007 DO_JUMP(JUMP_BRANCH
, jump_branch
, ctx
->pattern
+1);
1010 MARK_POP_DISCARD(ctx
->lastmark
);
1011 RETURN_ON_ERROR(ret
);
1015 MARK_POP_KEEP(ctx
->lastmark
);
1019 MARK_POP_DISCARD(ctx
->lastmark
);
1022 case SRE_OP_REPEAT_ONE
:
1023 /* match repeated sequence (maximizing regexp) */
1025 /* this operator only works if the repeated item is
1026 exactly one character wide, and we're not already
1027 collecting backtracking points. for other cases,
1028 use the MAX_REPEAT operator */
1030 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1032 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1033 ctx
->pattern
[1], ctx
->pattern
[2]));
1035 if (ctx
->ptr
+ ctx
->pattern
[1] > end
)
1036 RETURN_FAILURE
; /* cannot match */
1038 state
->ptr
= ctx
->ptr
;
1040 ret
= SRE_COUNT(state
, ctx
->pattern
+3, ctx
->pattern
[2]);
1041 RETURN_ON_ERROR(ret
);
1042 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1044 ctx
->ptr
+= ctx
->count
;
1046 /* when we arrive here, count contains the number of
1047 matches, and ctx->ptr points to the tail of the target
1048 string. check if the rest of the pattern matches,
1049 and backtrack if not. */
1051 if (ctx
->count
< (Py_ssize_t
) ctx
->pattern
[1])
1054 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1055 /* tail is empty. we're finished */
1056 state
->ptr
= ctx
->ptr
;
1062 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_LITERAL
) {
1063 /* tail starts with a literal. skip positions where
1064 the rest of the pattern cannot possibly match */
1065 ctx
->u
.chr
= ctx
->pattern
[ctx
->pattern
[0]+1];
1067 while (ctx
->count
>= (Py_ssize_t
) ctx
->pattern
[1] &&
1068 (ctx
->ptr
>= end
|| *ctx
->ptr
!= ctx
->u
.chr
)) {
1072 if (ctx
->count
< (Py_ssize_t
) ctx
->pattern
[1])
1074 state
->ptr
= ctx
->ptr
;
1075 DO_JUMP(JUMP_REPEAT_ONE_1
, jump_repeat_one_1
,
1076 ctx
->pattern
+ctx
->pattern
[0]);
1078 RETURN_ON_ERROR(ret
);
1090 while (ctx
->count
>= (Py_ssize_t
) ctx
->pattern
[1]) {
1091 state
->ptr
= ctx
->ptr
;
1092 DO_JUMP(JUMP_REPEAT_ONE_2
, jump_repeat_one_2
,
1093 ctx
->pattern
+ctx
->pattern
[0]);
1095 RETURN_ON_ERROR(ret
);
1105 case SRE_OP_MIN_REPEAT_ONE
:
1106 /* match repeated sequence (minimizing regexp) */
1108 /* this operator only works if the repeated item is
1109 exactly one character wide, and we're not already
1110 collecting backtracking points. for other cases,
1111 use the MIN_REPEAT operator */
1113 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1115 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1116 ctx
->pattern
[1], ctx
->pattern
[2]));
1118 if (ctx
->ptr
+ ctx
->pattern
[1] > end
)
1119 RETURN_FAILURE
; /* cannot match */
1121 state
->ptr
= ctx
->ptr
;
1123 if (ctx
->pattern
[1] == 0)
1126 /* count using pattern min as the maximum */
1127 ret
= SRE_COUNT(state
, ctx
->pattern
+3, ctx
->pattern
[1]);
1128 RETURN_ON_ERROR(ret
);
1129 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1130 if (ret
< (Py_ssize_t
) ctx
->pattern
[1])
1131 /* didn't match minimum number of times */
1133 /* advance past minimum matches of repeat */
1135 ctx
->ptr
+= ctx
->count
;
1138 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1139 /* tail is empty. we're finished */
1140 state
->ptr
= ctx
->ptr
;
1146 while ((Py_ssize_t
)ctx
->pattern
[2] == 65535
1147 || ctx
->count
<= (Py_ssize_t
)ctx
->pattern
[2]) {
1148 state
->ptr
= ctx
->ptr
;
1149 DO_JUMP(JUMP_MIN_REPEAT_ONE
,jump_min_repeat_one
,
1150 ctx
->pattern
+ctx
->pattern
[0]);
1152 RETURN_ON_ERROR(ret
);
1155 state
->ptr
= ctx
->ptr
;
1156 ret
= SRE_COUNT(state
, ctx
->pattern
+3, 1);
1157 RETURN_ON_ERROR(ret
);
1158 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1170 /* create repeat context. all the hard work is done
1171 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1172 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1173 TRACE(("|%p|%p|REPEAT %d %d\n", ctx
->pattern
, ctx
->ptr
,
1174 ctx
->pattern
[1], ctx
->pattern
[2]));
1176 /* install new repeat context */
1177 ctx
->u
.rep
= (SRE_REPEAT
*) PyObject_MALLOC(sizeof(*ctx
->u
.rep
));
1182 ctx
->u
.rep
->count
= -1;
1183 ctx
->u
.rep
->pattern
= ctx
->pattern
;
1184 ctx
->u
.rep
->prev
= state
->repeat
;
1185 ctx
->u
.rep
->last_ptr
= NULL
;
1186 state
->repeat
= ctx
->u
.rep
;
1188 state
->ptr
= ctx
->ptr
;
1189 DO_JUMP(JUMP_REPEAT
, jump_repeat
, ctx
->pattern
+ctx
->pattern
[0]);
1190 state
->repeat
= ctx
->u
.rep
->prev
;
1191 PyObject_FREE(ctx
->u
.rep
);
1194 RETURN_ON_ERROR(ret
);
1199 case SRE_OP_MAX_UNTIL
:
1200 /* maximizing repeat */
1201 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1203 /* FIXME: we probably need to deal with zero-width
1204 matches in here... */
1206 ctx
->u
.rep
= state
->repeat
;
1208 RETURN_ERROR(SRE_ERROR_STATE
);
1210 state
->ptr
= ctx
->ptr
;
1212 ctx
->count
= ctx
->u
.rep
->count
+1;
1214 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx
->pattern
,
1215 ctx
->ptr
, ctx
->count
));
1217 if (ctx
->count
< ctx
->u
.rep
->pattern
[1]) {
1218 /* not enough matches */
1219 ctx
->u
.rep
->count
= ctx
->count
;
1220 DO_JUMP(JUMP_MAX_UNTIL_1
, jump_max_until_1
,
1221 ctx
->u
.rep
->pattern
+3);
1223 RETURN_ON_ERROR(ret
);
1226 ctx
->u
.rep
->count
= ctx
->count
-1;
1227 state
->ptr
= ctx
->ptr
;
1231 if ((ctx
->count
< ctx
->u
.rep
->pattern
[2] ||
1232 ctx
->u
.rep
->pattern
[2] == 65535) &&
1233 state
->ptr
!= ctx
->u
.rep
->last_ptr
) {
1234 /* we may have enough matches, but if we can
1235 match another item, do so */
1236 ctx
->u
.rep
->count
= ctx
->count
;
1238 MARK_PUSH(ctx
->lastmark
);
1239 /* zero-width match protection */
1240 DATA_PUSH(&ctx
->u
.rep
->last_ptr
);
1241 ctx
->u
.rep
->last_ptr
= state
->ptr
;
1242 DO_JUMP(JUMP_MAX_UNTIL_2
, jump_max_until_2
,
1243 ctx
->u
.rep
->pattern
+3);
1244 DATA_POP(&ctx
->u
.rep
->last_ptr
);
1246 MARK_POP_DISCARD(ctx
->lastmark
);
1247 RETURN_ON_ERROR(ret
);
1250 MARK_POP(ctx
->lastmark
);
1252 ctx
->u
.rep
->count
= ctx
->count
-1;
1253 state
->ptr
= ctx
->ptr
;
1256 /* cannot match more repeated items here. make sure the
1258 state
->repeat
= ctx
->u
.rep
->prev
;
1259 DO_JUMP(JUMP_MAX_UNTIL_3
, jump_max_until_3
, ctx
->pattern
);
1260 RETURN_ON_SUCCESS(ret
);
1261 state
->repeat
= ctx
->u
.rep
;
1262 state
->ptr
= ctx
->ptr
;
1265 case SRE_OP_MIN_UNTIL
:
1266 /* minimizing repeat */
1267 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1269 ctx
->u
.rep
= state
->repeat
;
1271 RETURN_ERROR(SRE_ERROR_STATE
);
1273 state
->ptr
= ctx
->ptr
;
1275 ctx
->count
= ctx
->u
.rep
->count
+1;
1277 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx
->pattern
,
1278 ctx
->ptr
, ctx
->count
, ctx
->u
.rep
->pattern
));
1280 if (ctx
->count
< ctx
->u
.rep
->pattern
[1]) {
1281 /* not enough matches */
1282 ctx
->u
.rep
->count
= ctx
->count
;
1283 DO_JUMP(JUMP_MIN_UNTIL_1
, jump_min_until_1
,
1284 ctx
->u
.rep
->pattern
+3);
1286 RETURN_ON_ERROR(ret
);
1289 ctx
->u
.rep
->count
= ctx
->count
-1;
1290 state
->ptr
= ctx
->ptr
;
1296 /* see if the tail matches */
1297 state
->repeat
= ctx
->u
.rep
->prev
;
1298 DO_JUMP(JUMP_MIN_UNTIL_2
, jump_min_until_2
, ctx
->pattern
);
1300 RETURN_ON_ERROR(ret
);
1304 state
->repeat
= ctx
->u
.rep
;
1305 state
->ptr
= ctx
->ptr
;
1309 if (ctx
->count
>= ctx
->u
.rep
->pattern
[2]
1310 && ctx
->u
.rep
->pattern
[2] != 65535)
1313 ctx
->u
.rep
->count
= ctx
->count
;
1314 DO_JUMP(JUMP_MIN_UNTIL_3
,jump_min_until_3
,
1315 ctx
->u
.rep
->pattern
+3);
1317 RETURN_ON_ERROR(ret
);
1320 ctx
->u
.rep
->count
= ctx
->count
-1;
1321 state
->ptr
= ctx
->ptr
;
1324 case SRE_OP_GROUPREF
:
1325 /* match backreference */
1326 TRACE(("|%p|%p|GROUPREF %d\n", ctx
->pattern
,
1327 ctx
->ptr
, ctx
->pattern
[0]));
1328 i
= ctx
->pattern
[0];
1330 Py_ssize_t groupref
= i
+i
;
1331 if (groupref
>= state
->lastmark
) {
1334 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1335 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1336 if (!p
|| !e
|| e
< p
)
1339 if (ctx
->ptr
>= end
|| *ctx
->ptr
!= *p
)
1348 case SRE_OP_GROUPREF_IGNORE
:
1349 /* match backreference */
1350 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx
->pattern
,
1351 ctx
->ptr
, ctx
->pattern
[0]));
1352 i
= ctx
->pattern
[0];
1354 Py_ssize_t groupref
= i
+i
;
1355 if (groupref
>= state
->lastmark
) {
1358 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1359 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1360 if (!p
|| !e
|| e
< p
)
1363 if (ctx
->ptr
>= end
||
1364 state
->lower(*ctx
->ptr
) != state
->lower(*p
))
1373 case SRE_OP_GROUPREF_EXISTS
:
1374 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx
->pattern
,
1375 ctx
->ptr
, ctx
->pattern
[0]));
1376 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1377 i
= ctx
->pattern
[0];
1379 Py_ssize_t groupref
= i
+i
;
1380 if (groupref
>= state
->lastmark
) {
1381 ctx
->pattern
+= ctx
->pattern
[1];
1384 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1385 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1386 if (!p
|| !e
|| e
< p
) {
1387 ctx
->pattern
+= ctx
->pattern
[1];
1396 /* assert subpattern */
1397 /* <ASSERT> <skip> <back> <pattern> */
1398 TRACE(("|%p|%p|ASSERT %d\n", ctx
->pattern
,
1399 ctx
->ptr
, ctx
->pattern
[1]));
1400 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1401 if (state
->ptr
< state
->beginning
)
1403 DO_JUMP(JUMP_ASSERT
, jump_assert
, ctx
->pattern
+2);
1404 RETURN_ON_FAILURE(ret
);
1405 ctx
->pattern
+= ctx
->pattern
[0];
1408 case SRE_OP_ASSERT_NOT
:
1409 /* assert not subpattern */
1410 /* <ASSERT_NOT> <skip> <back> <pattern> */
1411 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx
->pattern
,
1412 ctx
->ptr
, ctx
->pattern
[1]));
1413 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1414 if (state
->ptr
>= state
->beginning
) {
1415 DO_JUMP(JUMP_ASSERT_NOT
, jump_assert_not
, ctx
->pattern
+2);
1417 RETURN_ON_ERROR(ret
);
1421 ctx
->pattern
+= ctx
->pattern
[0];
1424 case SRE_OP_FAILURE
:
1425 /* immediate failure */
1426 TRACE(("|%p|%p|FAILURE\n", ctx
->pattern
, ctx
->ptr
));
1430 TRACE(("|%p|%p|UNKNOWN %d\n", ctx
->pattern
, ctx
->ptr
,
1432 RETURN_ERROR(SRE_ERROR_ILLEGAL
);
1437 ctx_pos
= ctx
->last_ctx_pos
;
1439 DATA_POP_DISCARD(ctx
);
1442 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1445 case JUMP_MAX_UNTIL_2
:
1446 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1447 goto jump_max_until_2
;
1448 case JUMP_MAX_UNTIL_3
:
1449 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1450 goto jump_max_until_3
;
1451 case JUMP_MIN_UNTIL_2
:
1452 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1453 goto jump_min_until_2
;
1454 case JUMP_MIN_UNTIL_3
:
1455 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1456 goto jump_min_until_3
;
1458 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx
->pattern
, ctx
->ptr
));
1460 case JUMP_MAX_UNTIL_1
:
1461 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1462 goto jump_max_until_1
;
1463 case JUMP_MIN_UNTIL_1
:
1464 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1465 goto jump_min_until_1
;
1467 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx
->pattern
, ctx
->ptr
));
1469 case JUMP_REPEAT_ONE_1
:
1470 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx
->pattern
, ctx
->ptr
));
1471 goto jump_repeat_one_1
;
1472 case JUMP_REPEAT_ONE_2
:
1473 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx
->pattern
, ctx
->ptr
));
1474 goto jump_repeat_one_2
;
1475 case JUMP_MIN_REPEAT_ONE
:
1476 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx
->pattern
, ctx
->ptr
));
1477 goto jump_min_repeat_one
;
1479 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx
->pattern
, ctx
->ptr
));
1481 case JUMP_ASSERT_NOT
:
1482 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx
->pattern
, ctx
->ptr
));
1483 goto jump_assert_not
;
1485 TRACE(("|%p|%p|RETURN %d\n", ctx
->pattern
, ctx
->ptr
, ret
));
1489 return ret
; /* should never get here */
1493 SRE_SEARCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
1495 SRE_CHAR
* ptr
= (SRE_CHAR
*)state
->start
;
1496 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
1497 Py_ssize_t status
= 0;
1498 Py_ssize_t prefix_len
= 0;
1499 Py_ssize_t prefix_skip
= 0;
1500 SRE_CODE
* prefix
= NULL
;
1501 SRE_CODE
* charset
= NULL
;
1502 SRE_CODE
* overlap
= NULL
;
1505 if (pattern
[0] == SRE_OP_INFO
) {
1506 /* optimization info block */
1507 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1511 if (pattern
[3] > 1) {
1512 /* adjust end point (but make sure we leave at least one
1513 character in there, so literal search will work) */
1514 end
-= pattern
[3]-1;
1519 if (flags
& SRE_INFO_PREFIX
) {
1520 /* pattern starts with a known prefix */
1521 /* <length> <skip> <prefix data> <overlap data> */
1522 prefix_len
= pattern
[5];
1523 prefix_skip
= pattern
[6];
1524 prefix
= pattern
+ 7;
1525 overlap
= prefix
+ prefix_len
- 1;
1526 } else if (flags
& SRE_INFO_CHARSET
)
1527 /* pattern starts with a character from a known set */
1529 charset
= pattern
+ 5;
1531 pattern
+= 1 + pattern
[1];
1534 TRACE(("prefix = %p %d %d\n", prefix
, prefix_len
, prefix_skip
));
1535 TRACE(("charset = %p\n", charset
));
1537 #if defined(USE_FAST_SEARCH)
1538 if (prefix_len
> 1) {
1539 /* pattern starts with a known prefix. use the overlap
1540 table to skip forward as fast as we possibly can */
1542 end
= (SRE_CHAR
*)state
->end
;
1545 if ((SRE_CODE
) ptr
[0] != prefix
[i
]) {
1551 if (++i
== prefix_len
) {
1552 /* found a potential match */
1553 TRACE(("|%p|%p|SEARCH SCAN\n", pattern
, ptr
));
1554 state
->start
= ptr
+ 1 - prefix_len
;
1555 state
->ptr
= ptr
+ 1 - prefix_len
+ prefix_skip
;
1556 if (flags
& SRE_INFO_LITERAL
)
1557 return 1; /* we got all of it */
1558 status
= SRE_MATCH(state
, pattern
+ 2*prefix_skip
);
1561 /* close but no cigar -- try again */
1573 if (pattern
[0] == SRE_OP_LITERAL
) {
1574 /* pattern starts with a literal character. this is used
1575 for short prefixes, and if fast search is disabled */
1576 SRE_CODE chr
= pattern
[1];
1577 end
= (SRE_CHAR
*)state
->end
;
1579 while (ptr
< end
&& (SRE_CODE
) ptr
[0] != chr
)
1583 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern
, ptr
));
1586 if (flags
& SRE_INFO_LITERAL
)
1587 return 1; /* we got all of it */
1588 status
= SRE_MATCH(state
, pattern
+ 2);
1592 } else if (charset
) {
1593 /* pattern starts with a character from a known set */
1594 end
= (SRE_CHAR
*)state
->end
;
1596 while (ptr
< end
&& !SRE_CHARSET(charset
, ptr
[0]))
1600 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern
, ptr
));
1603 status
= SRE_MATCH(state
, pattern
);
1610 while (ptr
<= end
) {
1611 TRACE(("|%p|%p|SEARCH\n", pattern
, ptr
));
1612 state
->start
= state
->ptr
= ptr
++;
1613 status
= SRE_MATCH(state
, pattern
);
1622 SRE_LITERAL_TEMPLATE(SRE_CHAR
* ptr
, Py_ssize_t len
)
1624 /* check if given string is a literal template (i.e. no escapes) */
1631 #if !defined(SRE_RECURSIVE)
1633 /* -------------------------------------------------------------------- */
1634 /* factories and destructors */
1636 /* see sre.h for object declarations */
1637 static PyObject
*pattern_new_match(PatternObject
*, SRE_STATE
*, int);
1638 static PyObject
*pattern_scanner(PatternObject
*, PyObject
*);
1641 sre_codesize(PyObject
* self
, PyObject
*unused
)
1643 return Py_BuildValue("l", sizeof(SRE_CODE
));
1647 sre_getlower(PyObject
* self
, PyObject
* args
)
1649 int character
, flags
;
1650 if (!PyArg_ParseTuple(args
, "ii", &character
, &flags
))
1652 if (flags
& SRE_FLAG_LOCALE
)
1653 return Py_BuildValue("i", sre_lower_locale(character
));
1654 if (flags
& SRE_FLAG_UNICODE
)
1655 #if defined(HAVE_UNICODE)
1656 return Py_BuildValue("i", sre_lower_unicode(character
));
1658 return Py_BuildValue("i", sre_lower_locale(character
));
1660 return Py_BuildValue("i", sre_lower(character
));
1664 state_reset(SRE_STATE
* state
)
1666 /* FIXME: dynamic! */
1667 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1669 state
->lastmark
= -1;
1670 state
->lastindex
= -1;
1672 state
->repeat
= NULL
;
1674 data_stack_dealloc(state
);
1678 getstring(PyObject
* string
, Py_ssize_t
* p_length
, int* p_charsize
)
1680 /* given a python object, return a data pointer, a length (in
1681 characters), and a character size. return NULL if the object
1682 is not a string (or not compatible) */
1684 PyBufferProcs
*buffer
;
1685 Py_ssize_t size
, bytes
;
1689 #if defined(HAVE_UNICODE)
1690 if (PyUnicode_Check(string
)) {
1691 /* unicode strings doesn't always support the buffer interface */
1692 ptr
= (void*) PyUnicode_AS_DATA(string
);
1693 /* bytes = PyUnicode_GET_DATA_SIZE(string); */
1694 size
= PyUnicode_GET_SIZE(string
);
1695 charsize
= sizeof(Py_UNICODE
);
1700 /* get pointer to string buffer */
1701 buffer
= Py_TYPE(string
)->tp_as_buffer
;
1702 if (!buffer
|| !buffer
->bf_getreadbuffer
|| !buffer
->bf_getsegcount
||
1703 buffer
->bf_getsegcount(string
, NULL
) != 1) {
1704 PyErr_SetString(PyExc_TypeError
, "expected string or buffer");
1708 /* determine buffer size */
1709 bytes
= buffer
->bf_getreadbuffer(string
, 0, &ptr
);
1711 PyErr_SetString(PyExc_TypeError
, "buffer has negative size");
1715 /* determine character size */
1716 #if PY_VERSION_HEX >= 0x01060000
1717 size
= PyObject_Size(string
);
1719 size
= PyObject_Length(string
);
1722 if (PyString_Check(string
) || bytes
== size
)
1724 #if defined(HAVE_UNICODE)
1725 else if (bytes
== (Py_ssize_t
) (size
* sizeof(Py_UNICODE
)))
1726 charsize
= sizeof(Py_UNICODE
);
1729 PyErr_SetString(PyExc_TypeError
, "buffer size mismatch");
1733 #if defined(HAVE_UNICODE)
1738 *p_charsize
= charsize
;
1744 state_init(SRE_STATE
* state
, PatternObject
* pattern
, PyObject
* string
,
1745 Py_ssize_t start
, Py_ssize_t end
)
1747 /* prepare state object */
1753 memset(state
, 0, sizeof(SRE_STATE
));
1755 state
->lastmark
= -1;
1756 state
->lastindex
= -1;
1758 ptr
= getstring(string
, &length
, &charsize
);
1762 /* adjust boundaries */
1765 else if (start
> length
)
1770 else if (end
> length
)
1773 state
->charsize
= charsize
;
1775 state
->beginning
= ptr
;
1777 state
->start
= (void*) ((char*) ptr
+ start
* state
->charsize
);
1778 state
->end
= (void*) ((char*) ptr
+ end
* state
->charsize
);
1781 state
->string
= string
;
1783 state
->endpos
= end
;
1785 if (pattern
->flags
& SRE_FLAG_LOCALE
)
1786 state
->lower
= sre_lower_locale
;
1787 else if (pattern
->flags
& SRE_FLAG_UNICODE
)
1788 #if defined(HAVE_UNICODE)
1789 state
->lower
= sre_lower_unicode
;
1791 state
->lower
= sre_lower_locale
;
1794 state
->lower
= sre_lower
;
1800 state_fini(SRE_STATE
* state
)
1802 Py_XDECREF(state
->string
);
1803 data_stack_dealloc(state
);
1806 /* calculate offset from start of string */
1807 #define STATE_OFFSET(state, member)\
1808 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1811 state_getslice(SRE_STATE
* state
, Py_ssize_t index
, PyObject
* string
, int empty
)
1815 index
= (index
- 1) * 2;
1817 if (string
== Py_None
|| index
>= state
->lastmark
|| !state
->mark
[index
] || !state
->mark
[index
+1]) {
1819 /* want empty string */
1826 i
= STATE_OFFSET(state
, state
->mark
[index
]);
1827 j
= STATE_OFFSET(state
, state
->mark
[index
+1]);
1830 return PySequence_GetSlice(string
, i
, j
);
1834 pattern_error(int status
)
1837 case SRE_ERROR_RECURSION_LIMIT
:
1840 "maximum recursion limit exceeded"
1843 case SRE_ERROR_MEMORY
:
1846 case SRE_ERROR_INTERRUPTED
:
1847 /* An exception has already been raised, so let it fly */
1850 /* other error codes indicate compiler/engine bugs */
1853 "internal error in regular expression engine"
1859 pattern_dealloc(PatternObject
* self
)
1861 if (self
->weakreflist
!= NULL
)
1862 PyObject_ClearWeakRefs((PyObject
*) self
);
1863 Py_XDECREF(self
->pattern
);
1864 Py_XDECREF(self
->groupindex
);
1865 Py_XDECREF(self
->indexgroup
);
1870 pattern_match(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1876 Py_ssize_t start
= 0;
1877 Py_ssize_t end
= PY_SSIZE_T_MAX
;
1878 static char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
1879 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|nn:match", kwlist
,
1880 &string
, &start
, &end
))
1883 string
= state_init(&state
, self
, string
, start
, end
);
1887 state
.ptr
= state
.start
;
1889 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self
), state
.ptr
));
1891 if (state
.charsize
== 1) {
1892 status
= sre_match(&state
, PatternObject_GetCode(self
));
1894 #if defined(HAVE_UNICODE)
1895 status
= sre_umatch(&state
, PatternObject_GetCode(self
));
1899 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
1900 if (PyErr_Occurred())
1905 return pattern_new_match(self
, &state
, status
);
1909 pattern_search(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1915 Py_ssize_t start
= 0;
1916 Py_ssize_t end
= PY_SSIZE_T_MAX
;
1917 static char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
1918 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|nn:search", kwlist
,
1919 &string
, &start
, &end
))
1922 string
= state_init(&state
, self
, string
, start
, end
);
1926 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self
), state
.ptr
));
1928 if (state
.charsize
== 1) {
1929 status
= sre_search(&state
, PatternObject_GetCode(self
));
1931 #if defined(HAVE_UNICODE)
1932 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
1936 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
1940 if (PyErr_Occurred())
1943 return pattern_new_match(self
, &state
, status
);
1947 call(char* module
, char* function
, PyObject
* args
)
1956 name
= PyString_FromString(module
);
1959 mod
= PyImport_Import(name
);
1963 func
= PyObject_GetAttrString(mod
, function
);
1967 result
= PyObject_CallObject(func
, args
);
1973 #ifdef USE_BUILTIN_COPY
1975 deepcopy(PyObject
** object
, PyObject
* memo
)
1981 PyTuple_Pack(2, *object
, memo
)
1989 return 1; /* success */
1994 join_list(PyObject
* list
, PyObject
* string
)
1996 /* join list elements */
1999 #if PY_VERSION_HEX >= 0x01060000
2005 joiner
= PySequence_GetSlice(string
, 0, 0);
2009 if (PyList_GET_SIZE(list
) == 0) {
2014 #if PY_VERSION_HEX >= 0x01060000
2015 function
= PyObject_GetAttrString(joiner
, "join");
2020 args
= PyTuple_New(1);
2022 Py_DECREF(function
);
2026 PyTuple_SET_ITEM(args
, 0, list
);
2027 result
= PyObject_CallObject(function
, args
);
2028 Py_DECREF(args
); /* also removes list */
2029 Py_DECREF(function
);
2033 PyTuple_Pack(2, list
, joiner
)
2042 pattern_findall(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2050 Py_ssize_t start
= 0;
2051 Py_ssize_t end
= PY_SSIZE_T_MAX
;
2052 static char* kwlist
[] = { "source", "pos", "endpos", NULL
};
2053 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|nn:findall", kwlist
,
2054 &string
, &start
, &end
))
2057 string
= state_init(&state
, self
, string
, start
, end
);
2061 list
= PyList_New(0);
2067 while (state
.start
<= state
.end
) {
2071 state_reset(&state
);
2073 state
.ptr
= state
.start
;
2075 if (state
.charsize
== 1) {
2076 status
= sre_search(&state
, PatternObject_GetCode(self
));
2078 #if defined(HAVE_UNICODE)
2079 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2083 if (PyErr_Occurred())
2089 pattern_error(status
);
2093 /* don't bother to build a match object */
2094 switch (self
->groups
) {
2096 b
= STATE_OFFSET(&state
, state
.start
);
2097 e
= STATE_OFFSET(&state
, state
.ptr
);
2098 item
= PySequence_GetSlice(string
, b
, e
);
2103 item
= state_getslice(&state
, 1, string
, 1);
2108 item
= PyTuple_New(self
->groups
);
2111 for (i
= 0; i
< self
->groups
; i
++) {
2112 PyObject
* o
= state_getslice(&state
, i
+1, string
, 1);
2117 PyTuple_SET_ITEM(item
, i
, o
);
2122 status
= PyList_Append(list
, item
);
2127 if (state
.ptr
== state
.start
)
2128 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2130 state
.start
= state
.ptr
;
2144 #if PY_VERSION_HEX >= 0x02020000
2146 pattern_finditer(PatternObject
* pattern
, PyObject
* args
)
2152 scanner
= pattern_scanner(pattern
, args
);
2156 search
= PyObject_GetAttrString(scanner
, "search");
2161 iterator
= PyCallIter_New(search
, Py_None
);
2169 pattern_split(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2180 Py_ssize_t maxsplit
= 0;
2181 static char* kwlist
[] = { "source", "maxsplit", NULL
};
2182 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|n:split", kwlist
,
2183 &string
, &maxsplit
))
2186 string
= state_init(&state
, self
, string
, 0, PY_SSIZE_T_MAX
);
2190 list
= PyList_New(0);
2199 while (!maxsplit
|| n
< maxsplit
) {
2201 state_reset(&state
);
2203 state
.ptr
= state
.start
;
2205 if (state
.charsize
== 1) {
2206 status
= sre_search(&state
, PatternObject_GetCode(self
));
2208 #if defined(HAVE_UNICODE)
2209 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2213 if (PyErr_Occurred())
2219 pattern_error(status
);
2223 if (state
.start
== state
.ptr
) {
2224 if (last
== state
.end
)
2226 /* skip one character */
2227 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2231 /* get segment before this match */
2232 item
= PySequence_GetSlice(
2233 string
, STATE_OFFSET(&state
, last
),
2234 STATE_OFFSET(&state
, state
.start
)
2238 status
= PyList_Append(list
, item
);
2243 /* add groups (if any) */
2244 for (i
= 0; i
< self
->groups
; i
++) {
2245 item
= state_getslice(&state
, i
+1, string
, 0);
2248 status
= PyList_Append(list
, item
);
2256 last
= state
.start
= state
.ptr
;
2260 /* get segment following last match (even if empty) */
2261 item
= PySequence_GetSlice(
2262 string
, STATE_OFFSET(&state
, last
), state
.endpos
2266 status
= PyList_Append(list
, item
);
2282 pattern_subx(PatternObject
* self
, PyObject
* ptemplate
, PyObject
* string
,
2283 Py_ssize_t count
, Py_ssize_t subn
)
2296 int filter_is_callable
;
2298 if (PyCallable_Check(ptemplate
)) {
2299 /* sub/subn takes either a function or a template */
2302 filter_is_callable
= 1;
2304 /* if not callable, check if it's a literal string */
2306 ptr
= getstring(ptemplate
, &n
, &bint
);
2310 literal
= sre_literal_template((unsigned char *)ptr
, n
);
2312 #if defined(HAVE_UNICODE)
2313 literal
= sre_uliteral_template((Py_UNICODE
*)ptr
, n
);
2323 filter_is_callable
= 0;
2325 /* not a literal; hand it over to the template compiler */
2327 SRE_PY_MODULE
, "_subx",
2328 PyTuple_Pack(2, self
, ptemplate
)
2332 filter_is_callable
= PyCallable_Check(filter
);
2336 string
= state_init(&state
, self
, string
, 0, PY_SSIZE_T_MAX
);
2342 list
= PyList_New(0);
2351 while (!count
|| n
< count
) {
2353 state_reset(&state
);
2355 state
.ptr
= state
.start
;
2357 if (state
.charsize
== 1) {
2358 status
= sre_search(&state
, PatternObject_GetCode(self
));
2360 #if defined(HAVE_UNICODE)
2361 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2365 if (PyErr_Occurred())
2371 pattern_error(status
);
2375 b
= STATE_OFFSET(&state
, state
.start
);
2376 e
= STATE_OFFSET(&state
, state
.ptr
);
2379 /* get segment before this match */
2380 item
= PySequence_GetSlice(string
, i
, b
);
2383 status
= PyList_Append(list
, item
);
2388 } else if (i
== b
&& i
== e
&& n
> 0)
2389 /* ignore empty match on latest position */
2392 if (filter_is_callable
) {
2393 /* pass match object through filter */
2394 match
= pattern_new_match(self
, &state
, 1);
2397 args
= PyTuple_Pack(1, match
);
2402 item
= PyObject_CallObject(filter
, args
);
2408 /* filter is literal string */
2414 if (item
!= Py_None
) {
2415 status
= PyList_Append(list
, item
);
2426 if (state
.ptr
== state
.start
)
2427 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2429 state
.start
= state
.ptr
;
2433 /* get segment following last match */
2434 if (i
< state
.endpos
) {
2435 item
= PySequence_GetSlice(string
, i
, state
.endpos
);
2438 status
= PyList_Append(list
, item
);
2448 /* convert list to single string (also removes list) */
2449 item
= join_list(list
, string
);
2455 return Py_BuildValue("Ni", item
, n
);
2468 pattern_sub(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2470 PyObject
* ptemplate
;
2472 Py_ssize_t count
= 0;
2473 static char* kwlist
[] = { "repl", "string", "count", NULL
};
2474 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|n:sub", kwlist
,
2475 &ptemplate
, &string
, &count
))
2478 return pattern_subx(self
, ptemplate
, string
, count
, 0);
2482 pattern_subn(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2484 PyObject
* ptemplate
;
2486 Py_ssize_t count
= 0;
2487 static char* kwlist
[] = { "repl", "string", "count", NULL
};
2488 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|n:subn", kwlist
,
2489 &ptemplate
, &string
, &count
))
2492 return pattern_subx(self
, ptemplate
, string
, count
, 1);
2496 pattern_copy(PatternObject
* self
, PyObject
*unused
)
2498 #ifdef USE_BUILTIN_COPY
2499 PatternObject
* copy
;
2502 copy
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, self
->codesize
);
2506 offset
= offsetof(PatternObject
, groups
);
2508 Py_XINCREF(self
->groupindex
);
2509 Py_XINCREF(self
->indexgroup
);
2510 Py_XINCREF(self
->pattern
);
2512 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
2513 sizeof(PatternObject
) + self
->codesize
* sizeof(SRE_CODE
) - offset
);
2514 copy
->weakreflist
= NULL
;
2516 return (PyObject
*) copy
;
2518 PyErr_SetString(PyExc_TypeError
, "cannot copy this pattern object");
2524 pattern_deepcopy(PatternObject
* self
, PyObject
* memo
)
2526 #ifdef USE_BUILTIN_COPY
2527 PatternObject
* copy
;
2529 copy
= (PatternObject
*) pattern_copy(self
);
2533 if (!deepcopy(©
->groupindex
, memo
) ||
2534 !deepcopy(©
->indexgroup
, memo
) ||
2535 !deepcopy(©
->pattern
, memo
)) {
2541 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this pattern object");
2546 PyDoc_STRVAR(pattern_match_doc
,
2547 "match(string[, pos[, endpos]]) --> match object or None.\n\
2548 Matches zero or more characters at the beginning of the string");
2550 PyDoc_STRVAR(pattern_search_doc
,
2551 "search(string[, pos[, endpos]]) --> match object or None.\n\
2552 Scan through string looking for a match, and return a corresponding\n\
2553 MatchObject instance. Return None if no position in the string matches.");
2555 PyDoc_STRVAR(pattern_split_doc
,
2556 "split(string[, maxsplit = 0]) --> list.\n\
2557 Split string by the occurrences of pattern.");
2559 PyDoc_STRVAR(pattern_findall_doc
,
2560 "findall(string[, pos[, endpos]]) --> list.\n\
2561 Return a list of all non-overlapping matches of pattern in string.");
2563 PyDoc_STRVAR(pattern_finditer_doc
,
2564 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2565 Return an iterator over all non-overlapping matches for the \n\
2566 RE pattern in string. For each match, the iterator returns a\n\
2569 PyDoc_STRVAR(pattern_sub_doc
,
2570 "sub(repl, string[, count = 0]) --> newstring\n\
2571 Return the string obtained by replacing the leftmost non-overlapping\n\
2572 occurrences of pattern in string by the replacement repl.");
2574 PyDoc_STRVAR(pattern_subn_doc
,
2575 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2576 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2577 the leftmost non-overlapping occurrences of pattern with the\n\
2578 replacement repl.");
2580 PyDoc_STRVAR(pattern_doc
, "Compiled regular expression objects");
2582 static PyMethodDef pattern_methods
[] = {
2583 {"match", (PyCFunction
) pattern_match
, METH_VARARGS
|METH_KEYWORDS
,
2585 {"search", (PyCFunction
) pattern_search
, METH_VARARGS
|METH_KEYWORDS
,
2586 pattern_search_doc
},
2587 {"sub", (PyCFunction
) pattern_sub
, METH_VARARGS
|METH_KEYWORDS
,
2589 {"subn", (PyCFunction
) pattern_subn
, METH_VARARGS
|METH_KEYWORDS
,
2591 {"split", (PyCFunction
) pattern_split
, METH_VARARGS
|METH_KEYWORDS
,
2593 {"findall", (PyCFunction
) pattern_findall
, METH_VARARGS
|METH_KEYWORDS
,
2594 pattern_findall_doc
},
2595 #if PY_VERSION_HEX >= 0x02020000
2596 {"finditer", (PyCFunction
) pattern_finditer
, METH_VARARGS
,
2597 pattern_finditer_doc
},
2599 {"scanner", (PyCFunction
) pattern_scanner
, METH_VARARGS
},
2600 {"__copy__", (PyCFunction
) pattern_copy
, METH_NOARGS
},
2601 {"__deepcopy__", (PyCFunction
) pattern_deepcopy
, METH_O
},
2606 pattern_getattr(PatternObject
* self
, char* name
)
2610 res
= Py_FindMethod(pattern_methods
, (PyObject
*) self
, name
);
2618 if (!strcmp(name
, "pattern")) {
2619 Py_INCREF(self
->pattern
);
2620 return self
->pattern
;
2623 if (!strcmp(name
, "flags"))
2624 return Py_BuildValue("i", self
->flags
);
2626 if (!strcmp(name
, "groups"))
2627 return Py_BuildValue("i", self
->groups
);
2629 if (!strcmp(name
, "groupindex") && self
->groupindex
) {
2630 Py_INCREF(self
->groupindex
);
2631 return self
->groupindex
;
2634 PyErr_SetString(PyExc_AttributeError
, name
);
2638 statichere PyTypeObject Pattern_Type
= {
2639 PyObject_HEAD_INIT(NULL
)
2640 0, "_" SRE_MODULE
".SRE_Pattern",
2641 sizeof(PatternObject
), sizeof(SRE_CODE
),
2642 (destructor
)pattern_dealloc
, /*tp_dealloc*/
2644 (getattrfunc
)pattern_getattr
, /*tp_getattr*/
2648 0, /* tp_as_number */
2649 0, /* tp_as_sequence */
2650 0, /* tp_as_mapping */
2654 0, /* tp_getattro */
2655 0, /* tp_setattro */
2656 0, /* tp_as_buffer */
2657 Py_TPFLAGS_HAVE_WEAKREFS
, /* tp_flags */
2658 pattern_doc
, /* tp_doc */
2659 0, /* tp_traverse */
2661 0, /* tp_richcompare */
2662 offsetof(PatternObject
, weakreflist
), /* tp_weaklistoffset */
2665 static int _validate(PatternObject
*self
); /* Forward */
2668 _compile(PyObject
* self_
, PyObject
* args
)
2670 /* "compile" pattern descriptor to pattern object */
2672 PatternObject
* self
;
2678 Py_ssize_t groups
= 0;
2679 PyObject
* groupindex
= NULL
;
2680 PyObject
* indexgroup
= NULL
;
2681 if (!PyArg_ParseTuple(args
, "OiO!|nOO", &pattern
, &flags
,
2682 &PyList_Type
, &code
, &groups
,
2683 &groupindex
, &indexgroup
))
2686 n
= PyList_GET_SIZE(code
);
2687 /* coverity[ampersand_in_size] */
2688 self
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, n
);
2691 self
->weakreflist
= NULL
;
2692 self
->pattern
= NULL
;
2693 self
->groupindex
= NULL
;
2694 self
->indexgroup
= NULL
;
2698 for (i
= 0; i
< n
; i
++) {
2699 PyObject
*o
= PyList_GET_ITEM(code
, i
);
2700 unsigned long value
= PyInt_Check(o
) ? (unsigned long)PyInt_AsLong(o
)
2701 : PyLong_AsUnsignedLong(o
);
2702 self
->code
[i
] = (SRE_CODE
) value
;
2703 if ((unsigned long) self
->code
[i
] != value
) {
2704 PyErr_SetString(PyExc_OverflowError
,
2705 "regular expression code size limit exceeded");
2710 if (PyErr_Occurred()) {
2716 self
->pattern
= pattern
;
2718 self
->flags
= flags
;
2720 self
->groups
= groups
;
2722 Py_XINCREF(groupindex
);
2723 self
->groupindex
= groupindex
;
2725 Py_XINCREF(indexgroup
);
2726 self
->indexgroup
= indexgroup
;
2728 self
->weakreflist
= NULL
;
2730 if (!_validate(self
)) {
2735 return (PyObject
*) self
;
2738 /* -------------------------------------------------------------------- */
2739 /* Code validation */
2741 /* To learn more about this code, have a look at the _compile() function in
2742 Lib/sre_compile.py. The validation functions below checks the code array
2743 for conformance with the code patterns generated there.
2745 The nice thing about the generated code is that it is position-independent:
2746 all jumps are relative jumps forward. Also, jumps don't cross each other:
2747 the target of a later jump is always earlier than the target of an earlier
2748 jump. IOW, this is okay:
2750 J---------J-------T--------T
2752 \______________________/
2756 J---------J-------T--------T
2760 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2761 bytes wide (the latter if Python is compiled for "wide" unicode support).
2764 /* Defining this one enables tracing of the validator */
2767 /* Trace macro for the validator */
2768 #if defined(VVERBOSE)
2769 #define VTRACE(v) printf v
2774 /* Report failure */
2775 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2777 /* Extract opcode, argument, or skip count from code array */
2780 VTRACE(("%p: ", code)); \
2781 if (code >= end) FAIL; \
2783 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2787 VTRACE(("%p= ", code)); \
2788 if (code >= end) FAIL; \
2790 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2792 #define GET_SKIP_ADJ(adj) \
2794 VTRACE(("%p= ", code)); \
2795 if (code >= end) FAIL; \
2797 VTRACE(("%lu (skip to %p)\n", \
2798 (unsigned long)skip, code+skip)); \
2799 if (code+skip-adj < code || code+skip-adj > end)\
2803 #define GET_SKIP GET_SKIP_ADJ(0)
2806 _validate_charset(SRE_CODE
*code
, SRE_CODE
*end
)
2808 /* Some variables are manipulated by the macros above */
2814 while (code
< end
) {
2821 case SRE_OP_LITERAL
:
2830 case SRE_OP_CHARSET
:
2831 offset
= 32/sizeof(SRE_CODE
); /* 32-byte bitmap */
2832 if (code
+offset
< code
|| code
+offset
> end
)
2837 case SRE_OP_BIGCHARSET
:
2838 GET_ARG
; /* Number of blocks */
2839 offset
= 256/sizeof(SRE_CODE
); /* 256-byte table */
2840 if (code
+offset
< code
|| code
+offset
> end
)
2842 /* Make sure that each byte points to a valid block */
2843 for (i
= 0; i
< 256; i
++) {
2844 if (((unsigned char *)code
)[i
] >= arg
)
2848 offset
= arg
* 32/sizeof(SRE_CODE
); /* 32-byte bitmap times arg */
2849 if (code
+offset
< code
|| code
+offset
> end
)
2854 case SRE_OP_CATEGORY
:
2857 case SRE_CATEGORY_DIGIT
:
2858 case SRE_CATEGORY_NOT_DIGIT
:
2859 case SRE_CATEGORY_SPACE
:
2860 case SRE_CATEGORY_NOT_SPACE
:
2861 case SRE_CATEGORY_WORD
:
2862 case SRE_CATEGORY_NOT_WORD
:
2863 case SRE_CATEGORY_LINEBREAK
:
2864 case SRE_CATEGORY_NOT_LINEBREAK
:
2865 case SRE_CATEGORY_LOC_WORD
:
2866 case SRE_CATEGORY_LOC_NOT_WORD
:
2867 case SRE_CATEGORY_UNI_DIGIT
:
2868 case SRE_CATEGORY_UNI_NOT_DIGIT
:
2869 case SRE_CATEGORY_UNI_SPACE
:
2870 case SRE_CATEGORY_UNI_NOT_SPACE
:
2871 case SRE_CATEGORY_UNI_WORD
:
2872 case SRE_CATEGORY_UNI_NOT_WORD
:
2873 case SRE_CATEGORY_UNI_LINEBREAK
:
2874 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
2891 _validate_inner(SRE_CODE
*code
, SRE_CODE
*end
, Py_ssize_t groups
)
2893 /* Some variables are manipulated by the macros above */
2898 VTRACE(("code=%p, end=%p\n", code
, end
));
2903 while (code
< end
) {
2908 /* We don't check whether marks are properly nested; the
2909 sre_match() code is robust even if they don't, and the worst
2910 you can get is nonsensical match results. */
2912 if (arg
> 2*groups
+1) {
2913 VTRACE(("arg=%d, groups=%d\n", (int)arg
, (int)groups
));
2918 case SRE_OP_LITERAL
:
2919 case SRE_OP_NOT_LITERAL
:
2920 case SRE_OP_LITERAL_IGNORE
:
2921 case SRE_OP_NOT_LITERAL_IGNORE
:
2923 /* The arg is just a character, nothing to check */
2926 case SRE_OP_SUCCESS
:
2927 case SRE_OP_FAILURE
:
2928 /* Nothing to check; these normally end the matching process */
2934 case SRE_AT_BEGINNING
:
2935 case SRE_AT_BEGINNING_STRING
:
2936 case SRE_AT_BEGINNING_LINE
:
2938 case SRE_AT_END_LINE
:
2939 case SRE_AT_END_STRING
:
2940 case SRE_AT_BOUNDARY
:
2941 case SRE_AT_NON_BOUNDARY
:
2942 case SRE_AT_LOC_BOUNDARY
:
2943 case SRE_AT_LOC_NON_BOUNDARY
:
2944 case SRE_AT_UNI_BOUNDARY
:
2945 case SRE_AT_UNI_NON_BOUNDARY
:
2953 case SRE_OP_ANY_ALL
:
2954 /* These have no operands */
2958 case SRE_OP_IN_IGNORE
:
2960 /* Stop 1 before the end; we check the FAILURE below */
2961 if (!_validate_charset(code
, code
+skip
-2))
2963 if (code
[skip
-2] != SRE_OP_FAILURE
)
2970 /* A minimal info field is
2971 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2972 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2977 newcode
= code
+skip
-1;
2978 GET_ARG
; flags
= arg
;
2981 /* Check that only valid flags are present */
2982 if ((flags
& ~(SRE_INFO_PREFIX
|
2984 SRE_INFO_CHARSET
)) != 0)
2986 /* PREFIX and CHARSET are mutually exclusive */
2987 if ((flags
& SRE_INFO_PREFIX
) &&
2988 (flags
& SRE_INFO_CHARSET
))
2990 /* LITERAL implies PREFIX */
2991 if ((flags
& SRE_INFO_LITERAL
) &&
2992 !(flags
& SRE_INFO_PREFIX
))
2994 /* Validate the prefix */
2995 if (flags
& SRE_INFO_PREFIX
) {
2996 SRE_CODE prefix_len
;
2997 GET_ARG
; prefix_len
= arg
;
2998 GET_ARG
; /* prefix skip */
2999 /* Here comes the prefix string */
3000 if (code
+prefix_len
< code
|| code
+prefix_len
> newcode
)
3003 /* And here comes the overlap table */
3004 if (code
+prefix_len
< code
|| code
+prefix_len
> newcode
)
3006 /* Each overlap value should be < prefix_len */
3007 for (i
= 0; i
< prefix_len
; i
++) {
3008 if (code
[i
] >= prefix_len
)
3013 /* Validate the charset */
3014 if (flags
& SRE_INFO_CHARSET
) {
3015 if (!_validate_charset(code
, newcode
-1))
3017 if (newcode
[-1] != SRE_OP_FAILURE
)
3021 else if (code
!= newcode
) {
3022 VTRACE(("code=%p, newcode=%p\n", code
, newcode
));
3030 SRE_CODE
*target
= NULL
;
3035 /* Stop 2 before the end; we check the JUMP below */
3036 if (!_validate_inner(code
, code
+skip
-3, groups
))
3039 /* Check that it ends with a JUMP, and that each JUMP
3040 has the same target */
3042 if (op
!= SRE_OP_JUMP
)
3046 target
= code
+skip
-1;
3047 else if (code
+skip
-1 != target
)
3053 case SRE_OP_REPEAT_ONE
:
3054 case SRE_OP_MIN_REPEAT_ONE
:
3062 #ifdef Py_UNICODE_WIDE
3066 if (!_validate_inner(code
, code
+skip
-4, groups
))
3070 if (op
!= SRE_OP_SUCCESS
)
3083 #ifdef Py_UNICODE_WIDE
3087 if (!_validate_inner(code
, code
+skip
-3, groups
))
3091 if (op
!= SRE_OP_MAX_UNTIL
&& op
!= SRE_OP_MIN_UNTIL
)
3096 case SRE_OP_GROUPREF
:
3097 case SRE_OP_GROUPREF_IGNORE
:
3103 case SRE_OP_GROUPREF_EXISTS
:
3104 /* The regex syntax for this is: '(?(group)then|else)', where
3105 'group' is either an integer group number or a group name,
3106 'then' and 'else' are sub-regexes, and 'else' is optional. */
3111 code
--; /* The skip is relative to the first arg! */
3112 /* There are two possibilities here: if there is both a 'then'
3113 part and an 'else' part, the generated code looks like:
3121 (<skipyes> jumps here)
3123 (<skipno> jumps here)
3125 If there is only a 'then' part, it looks like:
3133 There is no direct way to decide which it is, and we don't want
3134 to allow arbitrary jumps anywhere in the code; so we just look
3135 for a JUMP opcode preceding our skip target.
3137 if (skip
>= 3 && code
+skip
-3 >= code
&&
3138 code
[skip
-3] == SRE_OP_JUMP
)
3140 VTRACE(("both then and else parts present\n"));
3141 if (!_validate_inner(code
+1, code
+skip
-3, groups
))
3143 code
+= skip
-2; /* Position after JUMP, at <skipno> */
3145 if (!_validate_inner(code
, code
+skip
-1, groups
))
3150 VTRACE(("only a then part present\n"));
3151 if (!_validate_inner(code
+1, code
+skip
-1, groups
))
3158 case SRE_OP_ASSERT_NOT
:
3160 GET_ARG
; /* 0 for lookahead, width for lookbehind */
3161 code
--; /* Back up over arg to simplify math below */
3162 if (arg
& 0x80000000)
3163 FAIL
; /* Width too large */
3164 /* Stop 1 before the end; we check the SUCCESS below */
3165 if (!_validate_inner(code
+1, code
+skip
-2, groups
))
3169 if (op
!= SRE_OP_SUCCESS
)
3184 _validate_outer(SRE_CODE
*code
, SRE_CODE
*end
, Py_ssize_t groups
)
3186 if (groups
< 0 || groups
> 100 || code
>= end
|| end
[-1] != SRE_OP_SUCCESS
)
3188 if (groups
== 0) /* fix for simplejson */
3189 groups
= 100; /* 100 groups should always be safe */
3190 return _validate_inner(code
, end
-1, groups
);
3194 _validate(PatternObject
*self
)
3196 if (!_validate_outer(self
->code
, self
->code
+self
->codesize
, self
->groups
))
3198 PyErr_SetString(PyExc_RuntimeError
, "invalid SRE code");
3202 VTRACE(("Success!\n"));
3206 /* -------------------------------------------------------------------- */
3210 match_dealloc(MatchObject
* self
)
3212 Py_XDECREF(self
->regs
);
3213 Py_XDECREF(self
->string
);
3214 Py_DECREF(self
->pattern
);
3219 match_getslice_by_index(MatchObject
* self
, Py_ssize_t index
, PyObject
* def
)
3221 if (index
< 0 || index
>= self
->groups
) {
3222 /* raise IndexError if we were given a bad group number */
3232 if (self
->string
== Py_None
|| self
->mark
[index
] < 0) {
3233 /* return default value if the string or group is undefined */
3238 return PySequence_GetSlice(
3239 self
->string
, self
->mark
[index
], self
->mark
[index
+1]
3244 match_getindex(MatchObject
* self
, PyObject
* index
)
3248 if (PyInt_Check(index
))
3249 return PyInt_AsSsize_t(index
);
3253 if (self
->pattern
->groupindex
) {
3254 index
= PyObject_GetItem(self
->pattern
->groupindex
, index
);
3256 if (PyInt_Check(index
) || PyLong_Check(index
))
3257 i
= PyInt_AsSsize_t(index
);
3267 match_getslice(MatchObject
* self
, PyObject
* index
, PyObject
* def
)
3269 return match_getslice_by_index(self
, match_getindex(self
, index
), def
);
3273 match_expand(MatchObject
* self
, PyObject
* ptemplate
)
3275 /* delegate to Python code */
3277 SRE_PY_MODULE
, "_expand",
3278 PyTuple_Pack(3, self
->pattern
, self
, ptemplate
)
3283 match_group(MatchObject
* self
, PyObject
* args
)
3288 size
= PyTuple_GET_SIZE(args
);
3292 result
= match_getslice(self
, Py_False
, Py_None
);
3295 result
= match_getslice(self
, PyTuple_GET_ITEM(args
, 0), Py_None
);
3298 /* fetch multiple items */
3299 result
= PyTuple_New(size
);
3302 for (i
= 0; i
< size
; i
++) {
3303 PyObject
* item
= match_getslice(
3304 self
, PyTuple_GET_ITEM(args
, i
), Py_None
3310 PyTuple_SET_ITEM(result
, i
, item
);
3318 match_groups(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
3323 PyObject
* def
= Py_None
;
3324 static char* kwlist
[] = { "default", NULL
};
3325 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groups", kwlist
, &def
))
3328 result
= PyTuple_New(self
->groups
-1);
3332 for (index
= 1; index
< self
->groups
; index
++) {
3334 item
= match_getslice_by_index(self
, index
, def
);
3339 PyTuple_SET_ITEM(result
, index
-1, item
);
3346 match_groupdict(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
3352 PyObject
* def
= Py_None
;
3353 static char* kwlist
[] = { "default", NULL
};
3354 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groupdict", kwlist
, &def
))
3357 result
= PyDict_New();
3358 if (!result
|| !self
->pattern
->groupindex
)
3361 keys
= PyMapping_Keys(self
->pattern
->groupindex
);
3365 for (index
= 0; index
< PyList_GET_SIZE(keys
); index
++) {
3369 key
= PyList_GET_ITEM(keys
, index
);
3372 value
= match_getslice(self
, key
, def
);
3377 status
= PyDict_SetItem(result
, key
, value
);
3394 match_start(MatchObject
* self
, PyObject
* args
)
3398 PyObject
* index_
= Py_False
; /* zero */
3399 if (!PyArg_UnpackTuple(args
, "start", 0, 1, &index_
))
3402 index
= match_getindex(self
, index_
);
3404 if (index
< 0 || index
>= self
->groups
) {
3412 /* mark is -1 if group is undefined */
3413 return Py_BuildValue("i", self
->mark
[index
*2]);
3417 match_end(MatchObject
* self
, PyObject
* args
)
3421 PyObject
* index_
= Py_False
; /* zero */
3422 if (!PyArg_UnpackTuple(args
, "end", 0, 1, &index_
))
3425 index
= match_getindex(self
, index_
);
3427 if (index
< 0 || index
>= self
->groups
) {
3435 /* mark is -1 if group is undefined */
3436 return Py_BuildValue("i", self
->mark
[index
*2+1]);
3440 _pair(Py_ssize_t i1
, Py_ssize_t i2
)
3445 pair
= PyTuple_New(2);
3449 item
= PyInt_FromSsize_t(i1
);
3452 PyTuple_SET_ITEM(pair
, 0, item
);
3454 item
= PyInt_FromSsize_t(i2
);
3457 PyTuple_SET_ITEM(pair
, 1, item
);
3467 match_span(MatchObject
* self
, PyObject
* args
)
3471 PyObject
* index_
= Py_False
; /* zero */
3472 if (!PyArg_UnpackTuple(args
, "span", 0, 1, &index_
))
3475 index
= match_getindex(self
, index_
);
3477 if (index
< 0 || index
>= self
->groups
) {
3485 /* marks are -1 if group is undefined */
3486 return _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3490 match_regs(MatchObject
* self
)
3496 regs
= PyTuple_New(self
->groups
);
3500 for (index
= 0; index
< self
->groups
; index
++) {
3501 item
= _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3506 PyTuple_SET_ITEM(regs
, index
, item
);
3516 match_copy(MatchObject
* self
, PyObject
*unused
)
3518 #ifdef USE_BUILTIN_COPY
3520 Py_ssize_t slots
, offset
;
3522 slots
= 2 * (self
->pattern
->groups
+1);
3524 copy
= PyObject_NEW_VAR(MatchObject
, &Match_Type
, slots
);
3528 /* this value a constant, but any compiler should be able to
3529 figure that out all by itself */
3530 offset
= offsetof(MatchObject
, string
);
3532 Py_XINCREF(self
->pattern
);
3533 Py_XINCREF(self
->string
);
3534 Py_XINCREF(self
->regs
);
3536 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
3537 sizeof(MatchObject
) + slots
* sizeof(Py_ssize_t
) - offset
);
3539 return (PyObject
*) copy
;
3541 PyErr_SetString(PyExc_TypeError
, "cannot copy this match object");
3547 match_deepcopy(MatchObject
* self
, PyObject
* memo
)
3549 #ifdef USE_BUILTIN_COPY
3552 copy
= (MatchObject
*) match_copy(self
);
3556 if (!deepcopy((PyObject
**) ©
->pattern
, memo
) ||
3557 !deepcopy(©
->string
, memo
) ||
3558 !deepcopy(©
->regs
, memo
)) {
3564 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this match object");
3569 static PyMethodDef match_methods
[] = {
3570 {"group", (PyCFunction
) match_group
, METH_VARARGS
},
3571 {"start", (PyCFunction
) match_start
, METH_VARARGS
},
3572 {"end", (PyCFunction
) match_end
, METH_VARARGS
},
3573 {"span", (PyCFunction
) match_span
, METH_VARARGS
},
3574 {"groups", (PyCFunction
) match_groups
, METH_VARARGS
|METH_KEYWORDS
},
3575 {"groupdict", (PyCFunction
) match_groupdict
, METH_VARARGS
|METH_KEYWORDS
},
3576 {"expand", (PyCFunction
) match_expand
, METH_O
},
3577 {"__copy__", (PyCFunction
) match_copy
, METH_NOARGS
},
3578 {"__deepcopy__", (PyCFunction
) match_deepcopy
, METH_O
},
3583 match_getattr(MatchObject
* self
, char* name
)
3587 res
= Py_FindMethod(match_methods
, (PyObject
*) self
, name
);
3593 if (!strcmp(name
, "lastindex")) {
3594 if (self
->lastindex
>= 0)
3595 return Py_BuildValue("i", self
->lastindex
);
3600 if (!strcmp(name
, "lastgroup")) {
3601 if (self
->pattern
->indexgroup
&& self
->lastindex
>= 0) {
3602 PyObject
* result
= PySequence_GetItem(
3603 self
->pattern
->indexgroup
, self
->lastindex
3613 if (!strcmp(name
, "string")) {
3615 Py_INCREF(self
->string
);
3616 return self
->string
;
3623 if (!strcmp(name
, "regs")) {
3625 Py_INCREF(self
->regs
);
3628 return match_regs(self
);
3631 if (!strcmp(name
, "re")) {
3632 Py_INCREF(self
->pattern
);
3633 return (PyObject
*) self
->pattern
;
3636 if (!strcmp(name
, "pos"))
3637 return Py_BuildValue("i", self
->pos
);
3639 if (!strcmp(name
, "endpos"))
3640 return Py_BuildValue("i", self
->endpos
);
3642 PyErr_SetString(PyExc_AttributeError
, name
);
3646 /* FIXME: implement setattr("string", None) as a special case (to
3647 detach the associated string, if any */
3649 statichere PyTypeObject Match_Type
= {
3650 PyObject_HEAD_INIT(NULL
)
3651 0, "_" SRE_MODULE
".SRE_Match",
3652 sizeof(MatchObject
), sizeof(Py_ssize_t
),
3653 (destructor
)match_dealloc
, /*tp_dealloc*/
3655 (getattrfunc
)match_getattr
/*tp_getattr*/
3659 pattern_new_match(PatternObject
* pattern
, SRE_STATE
* state
, int status
)
3661 /* create match object (from state object) */
3670 /* create match object (with room for extra group marks) */
3671 /* coverity[ampersand_in_size] */
3672 match
= PyObject_NEW_VAR(MatchObject
, &Match_Type
,
3673 2*(pattern
->groups
+1));
3678 match
->pattern
= pattern
;
3680 Py_INCREF(state
->string
);
3681 match
->string
= state
->string
;
3684 match
->groups
= pattern
->groups
+1;
3686 /* fill in group slices */
3688 base
= (char*) state
->beginning
;
3689 n
= state
->charsize
;
3691 match
->mark
[0] = ((char*) state
->start
- base
) / n
;
3692 match
->mark
[1] = ((char*) state
->ptr
- base
) / n
;
3694 for (i
= j
= 0; i
< pattern
->groups
; i
++, j
+=2)
3695 if (j
+1 <= state
->lastmark
&& state
->mark
[j
] && state
->mark
[j
+1]) {
3696 match
->mark
[j
+2] = ((char*) state
->mark
[j
] - base
) / n
;
3697 match
->mark
[j
+3] = ((char*) state
->mark
[j
+1] - base
) / n
;
3699 match
->mark
[j
+2] = match
->mark
[j
+3] = -1; /* undefined */
3701 match
->pos
= state
->pos
;
3702 match
->endpos
= state
->endpos
;
3704 match
->lastindex
= state
->lastindex
;
3706 return (PyObject
*) match
;
3708 } else if (status
== 0) {
3716 /* internal error */
3717 pattern_error(status
);
3722 /* -------------------------------------------------------------------- */
3723 /* scanner methods (experimental) */
3726 scanner_dealloc(ScannerObject
* self
)
3728 state_fini(&self
->state
);
3729 Py_XDECREF(self
->pattern
);
3734 scanner_match(ScannerObject
* self
, PyObject
*unused
)
3736 SRE_STATE
* state
= &self
->state
;
3742 state
->ptr
= state
->start
;
3744 if (state
->charsize
== 1) {
3745 status
= sre_match(state
, PatternObject_GetCode(self
->pattern
));
3747 #if defined(HAVE_UNICODE)
3748 status
= sre_umatch(state
, PatternObject_GetCode(self
->pattern
));
3751 if (PyErr_Occurred())
3754 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3757 if (status
== 0 || state
->ptr
== state
->start
)
3758 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3760 state
->start
= state
->ptr
;
3767 scanner_search(ScannerObject
* self
, PyObject
*unused
)
3769 SRE_STATE
* state
= &self
->state
;
3775 state
->ptr
= state
->start
;
3777 if (state
->charsize
== 1) {
3778 status
= sre_search(state
, PatternObject_GetCode(self
->pattern
));
3780 #if defined(HAVE_UNICODE)
3781 status
= sre_usearch(state
, PatternObject_GetCode(self
->pattern
));
3784 if (PyErr_Occurred())
3787 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3790 if (status
== 0 || state
->ptr
== state
->start
)
3791 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3793 state
->start
= state
->ptr
;
3798 static PyMethodDef scanner_methods
[] = {
3799 {"match", (PyCFunction
) scanner_match
, METH_NOARGS
},
3800 {"search", (PyCFunction
) scanner_search
, METH_NOARGS
},
3805 scanner_getattr(ScannerObject
* self
, char* name
)
3809 res
= Py_FindMethod(scanner_methods
, (PyObject
*) self
, name
);
3816 if (!strcmp(name
, "pattern")) {
3817 Py_INCREF(self
->pattern
);
3818 return self
->pattern
;
3821 PyErr_SetString(PyExc_AttributeError
, name
);
3825 statichere PyTypeObject Scanner_Type
= {
3826 PyObject_HEAD_INIT(NULL
)
3827 0, "_" SRE_MODULE
".SRE_Scanner",
3828 sizeof(ScannerObject
), 0,
3829 (destructor
)scanner_dealloc
, /*tp_dealloc*/
3831 (getattrfunc
)scanner_getattr
, /*tp_getattr*/
3835 pattern_scanner(PatternObject
* pattern
, PyObject
* args
)
3837 /* create search state object */
3839 ScannerObject
* self
;
3842 Py_ssize_t start
= 0;
3843 Py_ssize_t end
= PY_SSIZE_T_MAX
;
3844 if (!PyArg_ParseTuple(args
, "O|nn:scanner", &string
, &start
, &end
))
3847 /* create scanner object */
3848 self
= PyObject_NEW(ScannerObject
, &Scanner_Type
);
3851 self
->pattern
= NULL
;
3853 string
= state_init(&self
->state
, pattern
, string
, start
, end
);
3860 self
->pattern
= (PyObject
*) pattern
;
3862 return (PyObject
*) self
;
3865 static PyMethodDef _functions
[] = {
3866 {"compile", _compile
, METH_VARARGS
},
3867 {"getcodesize", sre_codesize
, METH_NOARGS
},
3868 {"getlower", sre_getlower
, METH_VARARGS
},
3872 #if PY_VERSION_HEX < 0x02030000
3873 DL_EXPORT(void) init_sre(void)
3875 PyMODINIT_FUNC
init_sre(void)
3882 /* Patch object types */
3883 if (PyType_Ready(&Pattern_Type
) || PyType_Ready(&Match_Type
) ||
3884 PyType_Ready(&Scanner_Type
))
3887 m
= Py_InitModule("_" SRE_MODULE
, _functions
);
3890 d
= PyModule_GetDict(m
);
3892 x
= PyInt_FromLong(SRE_MAGIC
);
3894 PyDict_SetItemString(d
, "MAGIC", x
);
3898 x
= PyInt_FromLong(sizeof(SRE_CODE
));
3900 PyDict_SetItemString(d
, "CODESIZE", x
);
3904 x
= PyString_FromString(copyright
);
3906 PyDict_SetItemString(d
, "copyright", x
);
3911 #endif /* !defined(SRE_RECURSIVE) */