2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
37 /* Get rid of these macros to prevent collisions between EFI and Python in this file. */
43 static char copyright
[] =
44 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
46 #define PY_SSIZE_T_CLEAN
49 #include "structmember.h" /* offsetof */
55 /* name of this module, minus the leading underscore */
56 #if !defined(SRE_MODULE)
57 #define SRE_MODULE "sre"
60 #define SRE_PY_MODULE "re"
62 /* defining this one enables tracing */
65 #if PY_VERSION_HEX >= 0x01060000
66 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
67 /* defining this enables unicode support (default under 1.6a1 and later) */
72 /* -------------------------------------------------------------------- */
73 /* optional features */
75 /* enables fast searching */
76 #define USE_FAST_SEARCH
78 /* enables aggressive inlining (always on for Visual C) */
81 /* enables copy/deepcopy handling (work in progress) */
82 #undef USE_BUILTIN_COPY
84 #if PY_VERSION_HEX < 0x01060000
85 #define PyObject_DEL(op) PyMem_DEL((op))
88 /* -------------------------------------------------------------------- */
91 #pragma optimize("gt", on) /* doesn't seem to make much difference... */
92 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
93 /* fastest possible local call under MSVC */
94 #define LOCAL(type) static __inline type __fastcall
95 #elif defined(USE_INLINE)
96 #define LOCAL(type) static inline type
98 #define LOCAL(type) static type
102 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
103 #define SRE_ERROR_STATE -2 /* illegal state */
104 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
105 #define SRE_ERROR_MEMORY -9 /* out of memory */
106 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
109 #define TRACE(v) printf v
114 /* -------------------------------------------------------------------- */
115 /* search engine state */
117 /* default character predicates (run sre_chars.py to regenerate tables) */
119 #define SRE_DIGIT_MASK 1
120 #define SRE_SPACE_MASK 2
121 #define SRE_LINEBREAK_MASK 4
122 #define SRE_ALNUM_MASK 8
123 #define SRE_WORD_MASK 16
125 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
127 static char sre_char_info
[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
128 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
130 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
131 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
132 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
133 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
135 static char sre_char_lower
[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
136 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
137 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
138 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
139 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
140 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
141 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
142 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
143 120, 121, 122, 123, 124, 125, 126, 127 };
145 #define SRE_IS_DIGIT(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
147 #define SRE_IS_SPACE(ch)\
148 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
149 #define SRE_IS_LINEBREAK(ch)\
150 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
151 #define SRE_IS_ALNUM(ch)\
152 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
153 #define SRE_IS_WORD(ch)\
154 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
156 static unsigned int sre_lower(unsigned int ch
)
158 return ((ch
) < 128 ? (unsigned int)sre_char_lower
[ch
] : ch
);
161 /* locale-specific character predicates */
162 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
163 * warnings when c's type supports only numbers < N+1 */
164 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
165 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
166 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
167 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
168 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
170 static unsigned int sre_lower_locale(unsigned int ch
)
172 return ((ch
) < 256 ? (unsigned int)tolower((ch
)) : ch
);
175 /* unicode-specific character predicates */
177 #if defined(HAVE_UNICODE)
179 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
180 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
181 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
182 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
183 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
185 static unsigned int sre_lower_unicode(unsigned int ch
)
187 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE
)(ch
));
193 sre_category(SRE_CODE category
, unsigned int ch
)
197 case SRE_CATEGORY_DIGIT
:
198 return SRE_IS_DIGIT(ch
);
199 case SRE_CATEGORY_NOT_DIGIT
:
200 return !SRE_IS_DIGIT(ch
);
201 case SRE_CATEGORY_SPACE
:
202 return SRE_IS_SPACE(ch
);
203 case SRE_CATEGORY_NOT_SPACE
:
204 return !SRE_IS_SPACE(ch
);
205 case SRE_CATEGORY_WORD
:
206 return SRE_IS_WORD(ch
);
207 case SRE_CATEGORY_NOT_WORD
:
208 return !SRE_IS_WORD(ch
);
209 case SRE_CATEGORY_LINEBREAK
:
210 return SRE_IS_LINEBREAK(ch
);
211 case SRE_CATEGORY_NOT_LINEBREAK
:
212 return !SRE_IS_LINEBREAK(ch
);
214 case SRE_CATEGORY_LOC_WORD
:
215 return SRE_LOC_IS_WORD(ch
);
216 case SRE_CATEGORY_LOC_NOT_WORD
:
217 return !SRE_LOC_IS_WORD(ch
);
219 #if defined(HAVE_UNICODE)
220 case SRE_CATEGORY_UNI_DIGIT
:
221 return SRE_UNI_IS_DIGIT(ch
);
222 case SRE_CATEGORY_UNI_NOT_DIGIT
:
223 return !SRE_UNI_IS_DIGIT(ch
);
224 case SRE_CATEGORY_UNI_SPACE
:
225 return SRE_UNI_IS_SPACE(ch
);
226 case SRE_CATEGORY_UNI_NOT_SPACE
:
227 return !SRE_UNI_IS_SPACE(ch
);
228 case SRE_CATEGORY_UNI_WORD
:
229 return SRE_UNI_IS_WORD(ch
);
230 case SRE_CATEGORY_UNI_NOT_WORD
:
231 return !SRE_UNI_IS_WORD(ch
);
232 case SRE_CATEGORY_UNI_LINEBREAK
:
233 return SRE_UNI_IS_LINEBREAK(ch
);
234 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
235 return !SRE_UNI_IS_LINEBREAK(ch
);
237 case SRE_CATEGORY_UNI_DIGIT
:
238 return SRE_IS_DIGIT(ch
);
239 case SRE_CATEGORY_UNI_NOT_DIGIT
:
240 return !SRE_IS_DIGIT(ch
);
241 case SRE_CATEGORY_UNI_SPACE
:
242 return SRE_IS_SPACE(ch
);
243 case SRE_CATEGORY_UNI_NOT_SPACE
:
244 return !SRE_IS_SPACE(ch
);
245 case SRE_CATEGORY_UNI_WORD
:
246 return SRE_LOC_IS_WORD(ch
);
247 case SRE_CATEGORY_UNI_NOT_WORD
:
248 return !SRE_LOC_IS_WORD(ch
);
249 case SRE_CATEGORY_UNI_LINEBREAK
:
250 return SRE_IS_LINEBREAK(ch
);
251 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
252 return !SRE_IS_LINEBREAK(ch
);
261 data_stack_dealloc(SRE_STATE
* state
)
263 if (state
->data_stack
) {
264 PyMem_FREE(state
->data_stack
);
265 state
->data_stack
= NULL
;
267 state
->data_stack_size
= state
->data_stack_base
= 0;
271 data_stack_grow(SRE_STATE
* state
, Py_ssize_t size
)
273 Py_ssize_t minsize
, cursize
;
274 minsize
= state
->data_stack_base
+size
;
275 cursize
= state
->data_stack_size
;
276 if (cursize
< minsize
) {
278 cursize
= minsize
+minsize
/4+1024;
279 TRACE(("allocate/grow stack %d\n", cursize
));
280 stack
= PyMem_REALLOC(state
->data_stack
, cursize
);
282 data_stack_dealloc(state
);
283 return SRE_ERROR_MEMORY
;
285 state
->data_stack
= (char *)stack
;
286 state
->data_stack_size
= cursize
;
291 /* generate 8-bit version */
293 #define SRE_CHAR unsigned char
294 #define SRE_AT sre_at
295 #define SRE_COUNT sre_count
296 #define SRE_CHARSET sre_charset
297 #define SRE_INFO sre_info
298 #define SRE_MATCH sre_match
299 #define SRE_MATCH_CONTEXT sre_match_context
300 #define SRE_SEARCH sre_search
301 #define SRE_LITERAL_TEMPLATE sre_literal_template
303 #if defined(HAVE_UNICODE)
305 #define SRE_RECURSIVE
309 #undef SRE_LITERAL_TEMPLATE
312 #undef SRE_MATCH_CONTEXT
319 /* generate 16-bit unicode version */
321 #define SRE_CHAR Py_UNICODE
322 #define SRE_AT sre_uat
323 #define SRE_COUNT sre_ucount
324 #define SRE_CHARSET sre_ucharset
325 #define SRE_INFO sre_uinfo
326 #define SRE_MATCH sre_umatch
327 #define SRE_MATCH_CONTEXT sre_umatch_context
328 #define SRE_SEARCH sre_usearch
329 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
332 #endif /* SRE_RECURSIVE */
334 /* -------------------------------------------------------------------- */
335 /* String matching engine */
337 /* the following section is compiled twice, with different character
341 SRE_AT(SRE_STATE
* state
, SRE_CHAR
* ptr
, SRE_CODE at
)
343 /* check if pointer is at given position */
345 Py_ssize_t thisp
, thatp
;
349 case SRE_AT_BEGINNING
:
350 case SRE_AT_BEGINNING_STRING
:
351 return ((void*) ptr
== state
->beginning
);
353 case SRE_AT_BEGINNING_LINE
:
354 return ((void*) ptr
== state
->beginning
||
355 SRE_IS_LINEBREAK((int) ptr
[-1]));
358 return (((void*) (ptr
+1) == state
->end
&&
359 SRE_IS_LINEBREAK((int) ptr
[0])) ||
360 ((void*) ptr
== state
->end
));
362 case SRE_AT_END_LINE
:
363 return ((void*) ptr
== state
->end
||
364 SRE_IS_LINEBREAK((int) ptr
[0]));
366 case SRE_AT_END_STRING
:
367 return ((void*) ptr
== state
->end
);
369 case SRE_AT_BOUNDARY
:
370 if (state
->beginning
== state
->end
)
372 thatp
= ((void*) ptr
> state
->beginning
) ?
373 SRE_IS_WORD((int) ptr
[-1]) : 0;
374 thisp
= ((void*) ptr
< state
->end
) ?
375 SRE_IS_WORD((int) ptr
[0]) : 0;
376 return thisp
!= thatp
;
378 case SRE_AT_NON_BOUNDARY
:
379 if (state
->beginning
== state
->end
)
381 thatp
= ((void*) ptr
> state
->beginning
) ?
382 SRE_IS_WORD((int) ptr
[-1]) : 0;
383 thisp
= ((void*) ptr
< state
->end
) ?
384 SRE_IS_WORD((int) ptr
[0]) : 0;
385 return thisp
== thatp
;
387 case SRE_AT_LOC_BOUNDARY
:
388 if (state
->beginning
== state
->end
)
390 thatp
= ((void*) ptr
> state
->beginning
) ?
391 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
392 thisp
= ((void*) ptr
< state
->end
) ?
393 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
394 return thisp
!= thatp
;
396 case SRE_AT_LOC_NON_BOUNDARY
:
397 if (state
->beginning
== state
->end
)
399 thatp
= ((void*) ptr
> state
->beginning
) ?
400 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
401 thisp
= ((void*) ptr
< state
->end
) ?
402 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
403 return thisp
== thatp
;
405 #if defined(HAVE_UNICODE)
406 case SRE_AT_UNI_BOUNDARY
:
407 if (state
->beginning
== state
->end
)
409 thatp
= ((void*) ptr
> state
->beginning
) ?
410 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
411 thisp
= ((void*) ptr
< state
->end
) ?
412 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
413 return thisp
!= thatp
;
415 case SRE_AT_UNI_NON_BOUNDARY
:
416 if (state
->beginning
== state
->end
)
418 thatp
= ((void*) ptr
> state
->beginning
) ?
419 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
420 thisp
= ((void*) ptr
< state
->end
) ?
421 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
422 return thisp
== thatp
;
431 SRE_CHARSET(SRE_CODE
* set
, SRE_CODE ch
)
433 /* check if character is a member of the given set */
444 /* <LITERAL> <code> */
450 case SRE_OP_CATEGORY
:
451 /* <CATEGORY> <code> */
452 if (sre_category(set
[0], (int) ch
))
458 if (sizeof(SRE_CODE
) == 2) {
459 /* <CHARSET> <bitmap> (16 bits per code word) */
460 if (ch
< 256 && (set
[ch
>> 4] & (1 << (ch
& 15))))
465 /* <CHARSET> <bitmap> (32 bits per code word) */
466 if (ch
< 256 && (set
[ch
>> 5] & (1 << (ch
& 31))))
473 /* <RANGE> <lower> <upper> */
474 if (set
[0] <= ch
&& ch
<= set
[1])
483 case SRE_OP_BIGCHARSET
:
484 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
486 Py_ssize_t count
, block
;
489 if (sizeof(SRE_CODE
) == 2) {
490 block
= ((unsigned char*)set
)[ch
>> 8];
492 if (set
[block
*16 + ((ch
& 255)>>4)] & (1 << (ch
& 15)))
497 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
498 * warnings when c's type supports only numbers < N+1 */
500 block
= ((unsigned char*)set
)[ch
>> 8];
505 (set
[block
*8 + ((ch
& 255)>>5)] & (1 << (ch
& 31))))
513 /* internal error -- there's not much we can do about it
514 here, so let's just pretend it didn't match... */
520 LOCAL(Py_ssize_t
) SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
);
523 SRE_COUNT(SRE_STATE
* state
, SRE_CODE
* pattern
, Py_ssize_t maxcount
)
526 SRE_CHAR
* ptr
= (SRE_CHAR
*)state
->ptr
;
527 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
531 if (maxcount
< end
- ptr
&& maxcount
!= 65535)
532 end
= ptr
+ maxcount
;
534 switch (pattern
[0]) {
538 TRACE(("|%p|%p|COUNT IN\n", pattern
, ptr
));
539 while (ptr
< end
&& SRE_CHARSET(pattern
+ 2, *ptr
))
544 /* repeated dot wildcard. */
545 TRACE(("|%p|%p|COUNT ANY\n", pattern
, ptr
));
546 while (ptr
< end
&& !SRE_IS_LINEBREAK(*ptr
))
551 /* repeated dot wildcard. skip to the end of the target
552 string, and backtrack from there */
553 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern
, ptr
));
558 /* repeated literal */
560 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern
, ptr
, chr
));
561 while (ptr
< end
&& (SRE_CODE
) *ptr
== chr
)
565 case SRE_OP_LITERAL_IGNORE
:
566 /* repeated literal */
568 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
569 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) == chr
)
573 case SRE_OP_NOT_LITERAL
:
574 /* repeated non-literal */
576 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern
, ptr
, chr
));
577 while (ptr
< end
&& (SRE_CODE
) *ptr
!= chr
)
581 case SRE_OP_NOT_LITERAL_IGNORE
:
582 /* repeated non-literal */
584 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
585 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) != chr
)
590 /* repeated single character pattern */
591 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern
, ptr
));
592 while ((SRE_CHAR
*) state
->ptr
< end
) {
593 i
= SRE_MATCH(state
, pattern
);
599 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
,
600 (SRE_CHAR
*) state
->ptr
- ptr
));
601 return (SRE_CHAR
*) state
->ptr
- ptr
;
604 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
, ptr
- (SRE_CHAR
*) state
->ptr
));
605 return ptr
- (SRE_CHAR
*) state
->ptr
;
608 #if 0 /* not used in this release */
610 SRE_INFO(SRE_STATE
* state
, SRE_CODE
* pattern
)
612 /* check if an SRE_OP_INFO block matches at the current position.
613 returns the number of SRE_CODE objects to skip if successful, 0
616 SRE_CHAR
* end
= state
->end
;
617 SRE_CHAR
* ptr
= state
->ptr
;
620 /* check minimal length */
621 if (pattern
[3] && (end
- ptr
) < pattern
[3])
624 /* check known prefix */
625 if (pattern
[2] & SRE_INFO_PREFIX
&& pattern
[5] > 1) {
626 /* <length> <skip> <prefix data> <overlap data> */
627 for (i
= 0; i
< pattern
[5]; i
++)
628 if ((SRE_CODE
) ptr
[i
] != pattern
[7 + i
])
630 return pattern
[0] + 2 * pattern
[6];
636 /* The macros below should be used to protect recursive SRE_MATCH()
637 * calls that *failed* and do *not* return immediately (IOW, those
638 * that will backtrack). Explaining:
640 * - Recursive SRE_MATCH() returned true: that's usually a success
641 * (besides atypical cases like ASSERT_NOT), therefore there's no
642 * reason to restore lastmark;
644 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
645 * is returning to the caller: If the current SRE_MATCH() is the
646 * top function of the recursion, returning false will be a matching
647 * failure, and it doesn't matter where lastmark is pointing to.
648 * If it's *not* the top function, it will be a recursive SRE_MATCH()
649 * failure by itself, and the calling SRE_MATCH() will have to deal
650 * with the failure by the same rules explained here (it will restore
651 * lastmark by itself if necessary);
653 * - Recursive SRE_MATCH() returned false, and will continue the
654 * outside 'for' loop: must be protected when breaking, since the next
655 * OP could potentially depend on lastmark;
657 * - Recursive SRE_MATCH() returned false, and will be called again
658 * inside a local for/while loop: must be protected between each
659 * loop iteration, since the recursive SRE_MATCH() could do anything,
660 * and could potentially depend on lastmark.
662 * For more information, check the discussion at SF patch #712900.
664 #define LASTMARK_SAVE() \
666 ctx->lastmark = state->lastmark; \
667 ctx->lastindex = state->lastindex; \
669 #define LASTMARK_RESTORE() \
671 state->lastmark = ctx->lastmark; \
672 state->lastindex = ctx->lastindex; \
675 #define RETURN_ERROR(i) do { return i; } while(0)
676 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
677 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
679 #define RETURN_ON_ERROR(i) \
680 do { if (i < 0) RETURN_ERROR(i); } while (0)
681 #define RETURN_ON_SUCCESS(i) \
682 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
683 #define RETURN_ON_FAILURE(i) \
684 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
688 #define DATA_STACK_ALLOC(state, type, ptr) \
690 alloc_pos = state->data_stack_base; \
691 TRACE(("allocating %s in %d (%d)\n", \
692 SFY(type), alloc_pos, sizeof(type))); \
693 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
694 int j = data_stack_grow(state, sizeof(type)); \
695 if (j < 0) return j; \
697 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
699 ptr = (type*)(state->data_stack+alloc_pos); \
700 state->data_stack_base += sizeof(type); \
703 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
705 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
706 ptr = (type*)(state->data_stack+pos); \
709 #define DATA_STACK_PUSH(state, data, size) \
711 TRACE(("copy data in %p to %d (%d)\n", \
712 data, state->data_stack_base, size)); \
713 if (state->data_stack_size < state->data_stack_base+size) { \
714 int j = data_stack_grow(state, size); \
715 if (j < 0) return j; \
717 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
719 memcpy(state->data_stack+state->data_stack_base, data, size); \
720 state->data_stack_base += size; \
723 #define DATA_STACK_POP(state, data, size, discard) \
725 TRACE(("copy data to %p from %d (%d)\n", \
726 data, state->data_stack_base-size, size)); \
727 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
729 state->data_stack_base -= size; \
732 #define DATA_STACK_POP_DISCARD(state, size) \
734 TRACE(("discard data from %d (%d)\n", \
735 state->data_stack_base-size, size)); \
736 state->data_stack_base -= size; \
739 #define DATA_PUSH(x) \
740 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
741 #define DATA_POP(x) \
742 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
743 #define DATA_POP_DISCARD(x) \
744 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
745 #define DATA_ALLOC(t,p) \
746 DATA_STACK_ALLOC(state, t, p)
747 #define DATA_LOOKUP_AT(t,p,pos) \
748 DATA_STACK_LOOKUP_AT(state,t,p,pos)
750 #define MARK_PUSH(lastmark) \
751 do if (lastmark > 0) { \
752 i = lastmark; /* ctx->lastmark may change if reallocated */ \
753 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
755 #define MARK_POP(lastmark) \
756 do if (lastmark > 0) { \
757 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
759 #define MARK_POP_KEEP(lastmark) \
760 do if (lastmark > 0) { \
761 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
763 #define MARK_POP_DISCARD(lastmark) \
764 do if (lastmark > 0) { \
765 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
769 #define JUMP_MAX_UNTIL_1 1
770 #define JUMP_MAX_UNTIL_2 2
771 #define JUMP_MAX_UNTIL_3 3
772 #define JUMP_MIN_UNTIL_1 4
773 #define JUMP_MIN_UNTIL_2 5
774 #define JUMP_MIN_UNTIL_3 6
775 #define JUMP_REPEAT 7
776 #define JUMP_REPEAT_ONE_1 8
777 #define JUMP_REPEAT_ONE_2 9
778 #define JUMP_MIN_REPEAT_ONE 10
779 #define JUMP_BRANCH 11
780 #define JUMP_ASSERT 12
781 #define JUMP_ASSERT_NOT 13
783 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
784 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
785 nextctx->last_ctx_pos = ctx_pos; \
786 nextctx->jump = jumpvalue; \
787 nextctx->pattern = nextpattern; \
788 ctx_pos = alloc_pos; \
792 while (0) /* gcc doesn't like labels at end of scopes */ \
795 Py_ssize_t last_ctx_pos
;
801 Py_ssize_t lastindex
;
808 /* check if string matches the given pattern. returns <0 for
809 error, 0 for failure, and 1 for success */
811 SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
813 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
814 Py_ssize_t alloc_pos
, ctx_pos
= -1;
815 Py_ssize_t i
, ret
= 0;
817 unsigned int sigcount
=0;
819 SRE_MATCH_CONTEXT
* ctx
;
820 SRE_MATCH_CONTEXT
* nextctx
;
822 TRACE(("|%p|%p|ENTER\n", pattern
, state
->ptr
));
824 DATA_ALLOC(SRE_MATCH_CONTEXT
, ctx
);
825 ctx
->last_ctx_pos
= -1;
826 ctx
->jump
= JUMP_NONE
;
827 ctx
->pattern
= pattern
;
832 ctx
->ptr
= (SRE_CHAR
*)state
->ptr
;
834 if (ctx
->pattern
[0] == SRE_OP_INFO
) {
835 /* optimization info block */
836 /* <INFO> <1=skip> <2=flags> <3=min> ... */
837 if (ctx
->pattern
[3] && (end
- ctx
->ptr
) < ctx
->pattern
[3]) {
838 TRACE(("reject (got %d chars, need %d)\n",
839 (end
- ctx
->ptr
), ctx
->pattern
[3]));
842 ctx
->pattern
+= ctx
->pattern
[1] + 1;
847 if ((0 == (sigcount
& 0xfff)) && PyErr_CheckSignals())
848 RETURN_ERROR(SRE_ERROR_INTERRUPTED
);
850 switch (*ctx
->pattern
++) {
855 TRACE(("|%p|%p|MARK %d\n", ctx
->pattern
,
856 ctx
->ptr
, ctx
->pattern
[0]));
859 state
->lastindex
= i
/2 + 1;
860 if (i
> state
->lastmark
) {
861 /* state->lastmark is the highest valid index in the
862 state->mark array. If it is increased by more than 1,
863 the intervening marks must be set to NULL to signal
864 that these marks have not been encountered. */
865 Py_ssize_t j
= state
->lastmark
+ 1;
867 state
->mark
[j
++] = NULL
;
870 state
->mark
[i
] = ctx
->ptr
;
875 /* match literal string */
876 /* <LITERAL> <code> */
877 TRACE(("|%p|%p|LITERAL %d\n", ctx
->pattern
,
878 ctx
->ptr
, *ctx
->pattern
));
879 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] != ctx
->pattern
[0])
885 case SRE_OP_NOT_LITERAL
:
886 /* match anything that is not literal character */
887 /* <NOT_LITERAL> <code> */
888 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx
->pattern
,
889 ctx
->ptr
, *ctx
->pattern
));
890 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] == ctx
->pattern
[0])
898 TRACE(("|%p|%p|SUCCESS\n", ctx
->pattern
, ctx
->ptr
));
899 state
->ptr
= ctx
->ptr
;
903 /* match at given position */
905 TRACE(("|%p|%p|AT %d\n", ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
906 if (!SRE_AT(state
, ctx
->ptr
, *ctx
->pattern
))
911 case SRE_OP_CATEGORY
:
912 /* match at given category */
913 /* <CATEGORY> <code> */
914 TRACE(("|%p|%p|CATEGORY %d\n", ctx
->pattern
,
915 ctx
->ptr
, *ctx
->pattern
));
916 if (ctx
->ptr
>= end
|| !sre_category(ctx
->pattern
[0], ctx
->ptr
[0]))
923 /* match anything (except a newline) */
925 TRACE(("|%p|%p|ANY\n", ctx
->pattern
, ctx
->ptr
));
926 if (ctx
->ptr
>= end
|| SRE_IS_LINEBREAK(ctx
->ptr
[0]))
934 TRACE(("|%p|%p|ANY_ALL\n", ctx
->pattern
, ctx
->ptr
));
941 /* match set member (or non_member) */
942 /* <IN> <skip> <set> */
943 TRACE(("|%p|%p|IN\n", ctx
->pattern
, ctx
->ptr
));
944 if (ctx
->ptr
>= end
|| !SRE_CHARSET(ctx
->pattern
+ 1, *ctx
->ptr
))
946 ctx
->pattern
+= ctx
->pattern
[0];
950 case SRE_OP_LITERAL_IGNORE
:
951 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
952 ctx
->pattern
, ctx
->ptr
, ctx
->pattern
[0]));
953 if (ctx
->ptr
>= end
||
954 state
->lower(*ctx
->ptr
) != state
->lower(*ctx
->pattern
))
960 case SRE_OP_NOT_LITERAL_IGNORE
:
961 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
962 ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
963 if (ctx
->ptr
>= end
||
964 state
->lower(*ctx
->ptr
) == state
->lower(*ctx
->pattern
))
970 case SRE_OP_IN_IGNORE
:
971 TRACE(("|%p|%p|IN_IGNORE\n", ctx
->pattern
, ctx
->ptr
));
973 || !SRE_CHARSET(ctx
->pattern
+1,
974 (SRE_CODE
)state
->lower(*ctx
->ptr
)))
976 ctx
->pattern
+= ctx
->pattern
[0];
983 /* <JUMP> <offset> */
984 TRACE(("|%p|%p|JUMP %d\n", ctx
->pattern
,
985 ctx
->ptr
, ctx
->pattern
[0]));
986 ctx
->pattern
+= ctx
->pattern
[0];
991 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
992 TRACE(("|%p|%p|BRANCH\n", ctx
->pattern
, ctx
->ptr
));
994 ctx
->u
.rep
= state
->repeat
;
996 MARK_PUSH(ctx
->lastmark
);
997 for (; ctx
->pattern
[0]; ctx
->pattern
+= ctx
->pattern
[0]) {
998 if (ctx
->pattern
[1] == SRE_OP_LITERAL
&&
1000 (SRE_CODE
) *ctx
->ptr
!= ctx
->pattern
[2]))
1002 if (ctx
->pattern
[1] == SRE_OP_IN
&&
1004 !SRE_CHARSET(ctx
->pattern
+ 3, (SRE_CODE
) *ctx
->ptr
)))
1006 state
->ptr
= ctx
->ptr
;
1007 DO_JUMP(JUMP_BRANCH
, jump_branch
, ctx
->pattern
+1);
1010 MARK_POP_DISCARD(ctx
->lastmark
);
1011 RETURN_ON_ERROR(ret
);
1015 MARK_POP_KEEP(ctx
->lastmark
);
1019 MARK_POP_DISCARD(ctx
->lastmark
);
1022 case SRE_OP_REPEAT_ONE
:
1023 /* match repeated sequence (maximizing regexp) */
1025 /* this operator only works if the repeated item is
1026 exactly one character wide, and we're not already
1027 collecting backtracking points. for other cases,
1028 use the MAX_REPEAT operator */
1030 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1032 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1033 ctx
->pattern
[1], ctx
->pattern
[2]));
1035 if (ctx
->ptr
+ ctx
->pattern
[1] > end
)
1036 RETURN_FAILURE
; /* cannot match */
1038 state
->ptr
= ctx
->ptr
;
1040 ret
= SRE_COUNT(state
, ctx
->pattern
+3, ctx
->pattern
[2]);
1041 RETURN_ON_ERROR(ret
);
1042 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1044 ctx
->ptr
+= ctx
->count
;
1046 /* when we arrive here, count contains the number of
1047 matches, and ctx->ptr points to the tail of the target
1048 string. check if the rest of the pattern matches,
1049 and backtrack if not. */
1051 if (ctx
->count
< (Py_ssize_t
) ctx
->pattern
[1])
1054 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1055 /* tail is empty. we're finished */
1056 state
->ptr
= ctx
->ptr
;
1062 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_LITERAL
) {
1063 /* tail starts with a literal. skip positions where
1064 the rest of the pattern cannot possibly match */
1065 ctx
->u
.chr
= ctx
->pattern
[ctx
->pattern
[0]+1];
1067 while (ctx
->count
>= (Py_ssize_t
) ctx
->pattern
[1] &&
1068 (ctx
->ptr
>= end
|| *ctx
->ptr
!= ctx
->u
.chr
)) {
1072 if (ctx
->count
< (Py_ssize_t
) ctx
->pattern
[1])
1074 state
->ptr
= ctx
->ptr
;
1075 DO_JUMP(JUMP_REPEAT_ONE_1
, jump_repeat_one_1
,
1076 ctx
->pattern
+ctx
->pattern
[0]);
1078 RETURN_ON_ERROR(ret
);
1090 while (ctx
->count
>= (Py_ssize_t
) ctx
->pattern
[1]) {
1091 state
->ptr
= ctx
->ptr
;
1092 DO_JUMP(JUMP_REPEAT_ONE_2
, jump_repeat_one_2
,
1093 ctx
->pattern
+ctx
->pattern
[0]);
1095 RETURN_ON_ERROR(ret
);
1105 case SRE_OP_MIN_REPEAT_ONE
:
1106 /* match repeated sequence (minimizing regexp) */
1108 /* this operator only works if the repeated item is
1109 exactly one character wide, and we're not already
1110 collecting backtracking points. for other cases,
1111 use the MIN_REPEAT operator */
1113 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1115 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1116 ctx
->pattern
[1], ctx
->pattern
[2]));
1118 if (ctx
->ptr
+ ctx
->pattern
[1] > end
)
1119 RETURN_FAILURE
; /* cannot match */
1121 state
->ptr
= ctx
->ptr
;
1123 if (ctx
->pattern
[1] == 0)
1126 /* count using pattern min as the maximum */
1127 ret
= SRE_COUNT(state
, ctx
->pattern
+3, ctx
->pattern
[1]);
1128 RETURN_ON_ERROR(ret
);
1129 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1130 if (ret
< (Py_ssize_t
) ctx
->pattern
[1])
1131 /* didn't match minimum number of times */
1133 /* advance past minimum matches of repeat */
1135 ctx
->ptr
+= ctx
->count
;
1138 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1139 /* tail is empty. we're finished */
1140 state
->ptr
= ctx
->ptr
;
1146 while ((Py_ssize_t
)ctx
->pattern
[2] == 65535
1147 || ctx
->count
<= (Py_ssize_t
)ctx
->pattern
[2]) {
1148 state
->ptr
= ctx
->ptr
;
1149 DO_JUMP(JUMP_MIN_REPEAT_ONE
,jump_min_repeat_one
,
1150 ctx
->pattern
+ctx
->pattern
[0]);
1152 RETURN_ON_ERROR(ret
);
1155 state
->ptr
= ctx
->ptr
;
1156 ret
= SRE_COUNT(state
, ctx
->pattern
+3, 1);
1157 RETURN_ON_ERROR(ret
);
1158 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1170 /* create repeat context. all the hard work is done
1171 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1172 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1173 TRACE(("|%p|%p|REPEAT %d %d\n", ctx
->pattern
, ctx
->ptr
,
1174 ctx
->pattern
[1], ctx
->pattern
[2]));
1176 /* install new repeat context */
1177 ctx
->u
.rep
= (SRE_REPEAT
*) PyObject_MALLOC(sizeof(*ctx
->u
.rep
));
1182 ctx
->u
.rep
->count
= -1;
1183 ctx
->u
.rep
->pattern
= ctx
->pattern
;
1184 ctx
->u
.rep
->prev
= state
->repeat
;
1185 ctx
->u
.rep
->last_ptr
= NULL
;
1186 state
->repeat
= ctx
->u
.rep
;
1188 state
->ptr
= ctx
->ptr
;
1189 DO_JUMP(JUMP_REPEAT
, jump_repeat
, ctx
->pattern
+ctx
->pattern
[0]);
1190 state
->repeat
= ctx
->u
.rep
->prev
;
1191 PyObject_FREE(ctx
->u
.rep
);
1194 RETURN_ON_ERROR(ret
);
1199 case SRE_OP_MAX_UNTIL
:
1200 /* maximizing repeat */
1201 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1203 /* FIXME: we probably need to deal with zero-width
1204 matches in here... */
1206 ctx
->u
.rep
= state
->repeat
;
1208 RETURN_ERROR(SRE_ERROR_STATE
);
1210 state
->ptr
= ctx
->ptr
;
1212 ctx
->count
= ctx
->u
.rep
->count
+1;
1214 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx
->pattern
,
1215 ctx
->ptr
, ctx
->count
));
1217 if (ctx
->count
< ctx
->u
.rep
->pattern
[1]) {
1218 /* not enough matches */
1219 ctx
->u
.rep
->count
= ctx
->count
;
1220 DO_JUMP(JUMP_MAX_UNTIL_1
, jump_max_until_1
,
1221 ctx
->u
.rep
->pattern
+3);
1223 RETURN_ON_ERROR(ret
);
1226 ctx
->u
.rep
->count
= ctx
->count
-1;
1227 state
->ptr
= ctx
->ptr
;
1231 if ((ctx
->count
< ctx
->u
.rep
->pattern
[2] ||
1232 ctx
->u
.rep
->pattern
[2] == 65535) &&
1233 state
->ptr
!= ctx
->u
.rep
->last_ptr
) {
1234 /* we may have enough matches, but if we can
1235 match another item, do so */
1236 ctx
->u
.rep
->count
= ctx
->count
;
1238 MARK_PUSH(ctx
->lastmark
);
1239 /* zero-width match protection */
1240 DATA_PUSH(&ctx
->u
.rep
->last_ptr
);
1241 ctx
->u
.rep
->last_ptr
= state
->ptr
;
1242 DO_JUMP(JUMP_MAX_UNTIL_2
, jump_max_until_2
,
1243 ctx
->u
.rep
->pattern
+3);
1244 DATA_POP(&ctx
->u
.rep
->last_ptr
);
1246 MARK_POP_DISCARD(ctx
->lastmark
);
1247 RETURN_ON_ERROR(ret
);
1250 MARK_POP(ctx
->lastmark
);
1252 ctx
->u
.rep
->count
= ctx
->count
-1;
1253 state
->ptr
= ctx
->ptr
;
1256 /* cannot match more repeated items here. make sure the
1258 state
->repeat
= ctx
->u
.rep
->prev
;
1259 DO_JUMP(JUMP_MAX_UNTIL_3
, jump_max_until_3
, ctx
->pattern
);
1260 RETURN_ON_SUCCESS(ret
);
1261 state
->repeat
= ctx
->u
.rep
;
1262 state
->ptr
= ctx
->ptr
;
1265 case SRE_OP_MIN_UNTIL
:
1266 /* minimizing repeat */
1267 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1269 ctx
->u
.rep
= state
->repeat
;
1271 RETURN_ERROR(SRE_ERROR_STATE
);
1273 state
->ptr
= ctx
->ptr
;
1275 ctx
->count
= ctx
->u
.rep
->count
+1;
1277 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx
->pattern
,
1278 ctx
->ptr
, ctx
->count
, ctx
->u
.rep
->pattern
));
1280 if (ctx
->count
< ctx
->u
.rep
->pattern
[1]) {
1281 /* not enough matches */
1282 ctx
->u
.rep
->count
= ctx
->count
;
1283 DO_JUMP(JUMP_MIN_UNTIL_1
, jump_min_until_1
,
1284 ctx
->u
.rep
->pattern
+3);
1286 RETURN_ON_ERROR(ret
);
1289 ctx
->u
.rep
->count
= ctx
->count
-1;
1290 state
->ptr
= ctx
->ptr
;
1296 /* see if the tail matches */
1297 state
->repeat
= ctx
->u
.rep
->prev
;
1298 DO_JUMP(JUMP_MIN_UNTIL_2
, jump_min_until_2
, ctx
->pattern
);
1300 RETURN_ON_ERROR(ret
);
1304 state
->repeat
= ctx
->u
.rep
;
1305 state
->ptr
= ctx
->ptr
;
1309 if (ctx
->count
>= ctx
->u
.rep
->pattern
[2]
1310 && ctx
->u
.rep
->pattern
[2] != 65535)
1313 ctx
->u
.rep
->count
= ctx
->count
;
1314 DO_JUMP(JUMP_MIN_UNTIL_3
,jump_min_until_3
,
1315 ctx
->u
.rep
->pattern
+3);
1317 RETURN_ON_ERROR(ret
);
1320 ctx
->u
.rep
->count
= ctx
->count
-1;
1321 state
->ptr
= ctx
->ptr
;
1324 case SRE_OP_GROUPREF
:
1325 /* match backreference */
1326 TRACE(("|%p|%p|GROUPREF %d\n", ctx
->pattern
,
1327 ctx
->ptr
, ctx
->pattern
[0]));
1328 i
= ctx
->pattern
[0];
1330 Py_ssize_t groupref
= i
+i
;
1331 if (groupref
>= state
->lastmark
) {
1334 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1335 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1336 if (!p
|| !e
|| e
< p
)
1339 if (ctx
->ptr
>= end
|| *ctx
->ptr
!= *p
)
1348 case SRE_OP_GROUPREF_IGNORE
:
1349 /* match backreference */
1350 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx
->pattern
,
1351 ctx
->ptr
, ctx
->pattern
[0]));
1352 i
= ctx
->pattern
[0];
1354 Py_ssize_t groupref
= i
+i
;
1355 if (groupref
>= state
->lastmark
) {
1358 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1359 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1360 if (!p
|| !e
|| e
< p
)
1363 if (ctx
->ptr
>= end
||
1364 state
->lower(*ctx
->ptr
) != state
->lower(*p
))
1373 case SRE_OP_GROUPREF_EXISTS
:
1374 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx
->pattern
,
1375 ctx
->ptr
, ctx
->pattern
[0]));
1376 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1377 i
= ctx
->pattern
[0];
1379 Py_ssize_t groupref
= i
+i
;
1380 if (groupref
>= state
->lastmark
) {
1381 ctx
->pattern
+= ctx
->pattern
[1];
1384 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1385 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1386 if (!p
|| !e
|| e
< p
) {
1387 ctx
->pattern
+= ctx
->pattern
[1];
1396 /* assert subpattern */
1397 /* <ASSERT> <skip> <back> <pattern> */
1398 TRACE(("|%p|%p|ASSERT %d\n", ctx
->pattern
,
1399 ctx
->ptr
, ctx
->pattern
[1]));
1400 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1401 if (state
->ptr
< state
->beginning
)
1403 DO_JUMP(JUMP_ASSERT
, jump_assert
, ctx
->pattern
+2);
1404 RETURN_ON_FAILURE(ret
);
1405 ctx
->pattern
+= ctx
->pattern
[0];
1408 case SRE_OP_ASSERT_NOT
:
1409 /* assert not subpattern */
1410 /* <ASSERT_NOT> <skip> <back> <pattern> */
1411 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx
->pattern
,
1412 ctx
->ptr
, ctx
->pattern
[1]));
1413 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1414 if (state
->ptr
>= state
->beginning
) {
1415 DO_JUMP(JUMP_ASSERT_NOT
, jump_assert_not
, ctx
->pattern
+2);
1417 RETURN_ON_ERROR(ret
);
1421 ctx
->pattern
+= ctx
->pattern
[0];
1424 case SRE_OP_FAILURE
:
1425 /* immediate failure */
1426 TRACE(("|%p|%p|FAILURE\n", ctx
->pattern
, ctx
->ptr
));
1430 TRACE(("|%p|%p|UNKNOWN %d\n", ctx
->pattern
, ctx
->ptr
,
1432 RETURN_ERROR(SRE_ERROR_ILLEGAL
);
1437 ctx_pos
= ctx
->last_ctx_pos
;
1439 DATA_POP_DISCARD(ctx
);
1442 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1445 case JUMP_MAX_UNTIL_2
:
1446 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1447 goto jump_max_until_2
;
1448 case JUMP_MAX_UNTIL_3
:
1449 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1450 goto jump_max_until_3
;
1451 case JUMP_MIN_UNTIL_2
:
1452 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1453 goto jump_min_until_2
;
1454 case JUMP_MIN_UNTIL_3
:
1455 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1456 goto jump_min_until_3
;
1458 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx
->pattern
, ctx
->ptr
));
1460 case JUMP_MAX_UNTIL_1
:
1461 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1462 goto jump_max_until_1
;
1463 case JUMP_MIN_UNTIL_1
:
1464 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1465 goto jump_min_until_1
;
1467 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx
->pattern
, ctx
->ptr
));
1469 case JUMP_REPEAT_ONE_1
:
1470 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx
->pattern
, ctx
->ptr
));
1471 goto jump_repeat_one_1
;
1472 case JUMP_REPEAT_ONE_2
:
1473 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx
->pattern
, ctx
->ptr
));
1474 goto jump_repeat_one_2
;
1475 case JUMP_MIN_REPEAT_ONE
:
1476 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx
->pattern
, ctx
->ptr
));
1477 goto jump_min_repeat_one
;
1479 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx
->pattern
, ctx
->ptr
));
1481 case JUMP_ASSERT_NOT
:
1482 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx
->pattern
, ctx
->ptr
));
1483 goto jump_assert_not
;
1485 TRACE(("|%p|%p|RETURN %d\n", ctx
->pattern
, ctx
->ptr
, ret
));
1489 return ret
; /* should never get here */
1493 SRE_SEARCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
1495 SRE_CHAR
* ptr
= (SRE_CHAR
*)state
->start
;
1496 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
1497 Py_ssize_t status
= 0;
1498 Py_ssize_t prefix_len
= 0;
1499 Py_ssize_t prefix_skip
= 0;
1500 SRE_CODE
* prefix
= NULL
;
1501 SRE_CODE
* charset
= NULL
;
1502 SRE_CODE
* overlap
= NULL
;
1505 if (pattern
[0] == SRE_OP_INFO
) {
1506 /* optimization info block */
1507 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1511 if (pattern
[3] > 1) {
1512 /* adjust end point (but make sure we leave at least one
1513 character in there, so literal search will work) */
1514 end
-= pattern
[3]-1;
1519 if (flags
& SRE_INFO_PREFIX
) {
1520 /* pattern starts with a known prefix */
1521 /* <length> <skip> <prefix data> <overlap data> */
1522 prefix_len
= pattern
[5];
1523 prefix_skip
= pattern
[6];
1524 prefix
= pattern
+ 7;
1525 overlap
= prefix
+ prefix_len
- 1;
1526 } else if (flags
& SRE_INFO_CHARSET
)
1527 /* pattern starts with a character from a known set */
1529 charset
= pattern
+ 5;
1531 pattern
+= 1 + pattern
[1];
1534 TRACE(("prefix = %p %d %d\n", prefix
, prefix_len
, prefix_skip
));
1535 TRACE(("charset = %p\n", charset
));
1537 #if defined(USE_FAST_SEARCH)
1538 if (prefix_len
> 1) {
1539 /* pattern starts with a known prefix. use the overlap
1540 table to skip forward as fast as we possibly can */
1542 end
= (SRE_CHAR
*)state
->end
;
1545 if ((SRE_CODE
) ptr
[0] != prefix
[i
]) {
1551 if (++i
== prefix_len
) {
1552 /* found a potential match */
1553 TRACE(("|%p|%p|SEARCH SCAN\n", pattern
, ptr
));
1554 state
->start
= ptr
+ 1 - prefix_len
;
1555 state
->ptr
= ptr
+ 1 - prefix_len
+ prefix_skip
;
1556 if (flags
& SRE_INFO_LITERAL
)
1557 return 1; /* we got all of it */
1558 status
= SRE_MATCH(state
, pattern
+ 2*prefix_skip
);
1561 /* close but no cigar -- try again */
1573 if (pattern
[0] == SRE_OP_LITERAL
) {
1574 /* pattern starts with a literal character. this is used
1575 for short prefixes, and if fast search is disabled */
1576 SRE_CODE chr
= pattern
[1];
1577 end
= (SRE_CHAR
*)state
->end
;
1579 while (ptr
< end
&& (SRE_CODE
) ptr
[0] != chr
)
1583 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern
, ptr
));
1586 if (flags
& SRE_INFO_LITERAL
)
1587 return 1; /* we got all of it */
1588 status
= SRE_MATCH(state
, pattern
+ 2);
1592 } else if (charset
) {
1593 /* pattern starts with a character from a known set */
1594 end
= (SRE_CHAR
*)state
->end
;
1596 while (ptr
< end
&& !SRE_CHARSET(charset
, ptr
[0]))
1600 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern
, ptr
));
1603 status
= SRE_MATCH(state
, pattern
);
1610 while (ptr
<= end
) {
1611 TRACE(("|%p|%p|SEARCH\n", pattern
, ptr
));
1612 state
->start
= state
->ptr
= ptr
++;
1613 status
= SRE_MATCH(state
, pattern
);
1622 SRE_LITERAL_TEMPLATE(SRE_CHAR
* ptr
, Py_ssize_t len
)
1624 /* check if given string is a literal template (i.e. no escapes) */
1631 #if !defined(SRE_RECURSIVE)
1633 /* -------------------------------------------------------------------- */
1634 /* factories and destructors */
1636 /* see sre.h for object declarations */
1637 static PyObject
*pattern_new_match(PatternObject
*, SRE_STATE
*, int);
1638 static PyObject
*pattern_scanner(PatternObject
*, PyObject
*);
1641 sre_codesize(PyObject
* self
, PyObject
*unused
)
1643 return Py_BuildValue("l", sizeof(SRE_CODE
));
1647 sre_getlower(PyObject
* self
, PyObject
* args
)
1649 int character
, flags
;
1650 if (!PyArg_ParseTuple(args
, "ii", &character
, &flags
))
1652 if (flags
& SRE_FLAG_LOCALE
)
1653 return Py_BuildValue("i", sre_lower_locale(character
));
1654 if (flags
& SRE_FLAG_UNICODE
)
1655 #if defined(HAVE_UNICODE)
1656 return Py_BuildValue("i", sre_lower_unicode(character
));
1658 return Py_BuildValue("i", sre_lower_locale(character
));
1660 return Py_BuildValue("i", sre_lower(character
));
1664 state_reset(SRE_STATE
* state
)
1666 /* FIXME: dynamic! */
1667 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1669 state
->lastmark
= -1;
1670 state
->lastindex
= -1;
1672 state
->repeat
= NULL
;
1674 data_stack_dealloc(state
);
1678 getstring(PyObject
* string
, Py_ssize_t
* p_length
, int* p_charsize
)
1680 /* given a python object, return a data pointer, a length (in
1681 characters), and a character size. return NULL if the object
1682 is not a string (or not compatible) */
1684 PyBufferProcs
*buffer
;
1685 Py_ssize_t size
, bytes
;
1689 #if defined(HAVE_UNICODE)
1690 if (PyUnicode_Check(string
)) {
1691 /* unicode strings doesn't always support the buffer interface */
1692 ptr
= (void*) PyUnicode_AS_DATA(string
);
1693 /* bytes = PyUnicode_GET_DATA_SIZE(string); */
1694 size
= PyUnicode_GET_SIZE(string
);
1695 charsize
= sizeof(Py_UNICODE
);
1700 /* get pointer to string buffer */
1701 buffer
= Py_TYPE(string
)->tp_as_buffer
;
1702 if (!buffer
|| !buffer
->bf_getreadbuffer
|| !buffer
->bf_getsegcount
||
1703 buffer
->bf_getsegcount(string
, NULL
) != 1) {
1704 PyErr_SetString(PyExc_TypeError
, "expected string or buffer");
1708 /* determine buffer size */
1709 bytes
= buffer
->bf_getreadbuffer(string
, 0, &ptr
);
1711 PyErr_SetString(PyExc_TypeError
, "buffer has negative size");
1715 /* determine character size */
1716 #if PY_VERSION_HEX >= 0x01060000
1717 size
= PyObject_Size(string
);
1719 size
= PyObject_Length(string
);
1722 if (PyString_Check(string
) || bytes
== size
)
1724 #if defined(HAVE_UNICODE)
1725 else if (bytes
== (Py_ssize_t
) (size
* sizeof(Py_UNICODE
)))
1726 charsize
= sizeof(Py_UNICODE
);
1729 PyErr_SetString(PyExc_TypeError
, "buffer size mismatch");
1733 #if defined(HAVE_UNICODE)
1738 *p_charsize
= charsize
;
1744 state_init(SRE_STATE
* state
, PatternObject
* pattern
, PyObject
* string
,
1745 Py_ssize_t start
, Py_ssize_t end
)
1747 /* prepare state object */
1753 memset(state
, 0, sizeof(SRE_STATE
));
1755 state
->lastmark
= -1;
1756 state
->lastindex
= -1;
1758 ptr
= getstring(string
, &length
, &charsize
);
1762 /* adjust boundaries */
1765 else if (start
> length
)
1770 else if (end
> length
)
1773 state
->charsize
= charsize
;
1775 state
->beginning
= ptr
;
1777 state
->start
= (void*) ((char*) ptr
+ start
* state
->charsize
);
1778 state
->end
= (void*) ((char*) ptr
+ end
* state
->charsize
);
1781 state
->string
= string
;
1783 state
->endpos
= end
;
1785 if (pattern
->flags
& SRE_FLAG_LOCALE
)
1786 state
->lower
= sre_lower_locale
;
1787 else if (pattern
->flags
& SRE_FLAG_UNICODE
)
1788 #if defined(HAVE_UNICODE)
1789 state
->lower
= sre_lower_unicode
;
1791 state
->lower
= sre_lower_locale
;
1794 state
->lower
= sre_lower
;
1800 state_fini(SRE_STATE
* state
)
1802 Py_XDECREF(state
->string
);
1803 data_stack_dealloc(state
);
1806 /* calculate offset from start of string */
1807 #define STATE_OFFSET(state, member)\
1808 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1811 state_getslice(SRE_STATE
* state
, Py_ssize_t index
, PyObject
* string
, int empty
)
1815 index
= (index
- 1) * 2;
1817 if (string
== Py_None
|| index
>= state
->lastmark
|| !state
->mark
[index
] || !state
->mark
[index
+1]) {
1819 /* want empty string */
1826 i
= STATE_OFFSET(state
, state
->mark
[index
]);
1827 j
= STATE_OFFSET(state
, state
->mark
[index
+1]);
1830 return PySequence_GetSlice(string
, i
, j
);
1834 pattern_error(int status
)
1837 case SRE_ERROR_RECURSION_LIMIT
:
1840 "maximum recursion limit exceeded"
1843 case SRE_ERROR_MEMORY
:
1846 case SRE_ERROR_INTERRUPTED
:
1847 /* An exception has already been raised, so let it fly */
1850 /* other error codes indicate compiler/engine bugs */
1853 "internal error in regular expression engine"
1859 pattern_dealloc(PatternObject
* self
)
1861 if (self
->weakreflist
!= NULL
)
1862 PyObject_ClearWeakRefs((PyObject
*) self
);
1863 Py_XDECREF(self
->pattern
);
1864 Py_XDECREF(self
->groupindex
);
1865 Py_XDECREF(self
->indexgroup
);
1870 pattern_match(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1876 Py_ssize_t start
= 0;
1877 Py_ssize_t end
= PY_SSIZE_T_MAX
;
1878 static char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
1879 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|nn:match", kwlist
,
1880 &string
, &start
, &end
))
1883 string
= state_init(&state
, self
, string
, start
, end
);
1887 state
.ptr
= state
.start
;
1889 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self
), state
.ptr
));
1891 if (state
.charsize
== 1) {
1892 status
= sre_match(&state
, PatternObject_GetCode(self
));
1894 #if defined(HAVE_UNICODE)
1895 status
= sre_umatch(&state
, PatternObject_GetCode(self
));
1899 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
1900 if (PyErr_Occurred())
1905 return pattern_new_match(self
, &state
, status
);
1909 pattern_search(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1915 Py_ssize_t start
= 0;
1916 Py_ssize_t end
= PY_SSIZE_T_MAX
;
1917 static char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
1918 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|nn:search", kwlist
,
1919 &string
, &start
, &end
))
1922 string
= state_init(&state
, self
, string
, start
, end
);
1926 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self
), state
.ptr
));
1928 if (state
.charsize
== 1) {
1929 status
= sre_search(&state
, PatternObject_GetCode(self
));
1931 #if defined(HAVE_UNICODE)
1932 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
1936 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
1940 if (PyErr_Occurred())
1943 return pattern_new_match(self
, &state
, status
);
1947 call(char* module
, char* function
, PyObject
* args
)
1956 name
= PyString_FromString(module
);
1959 mod
= PyImport_Import(name
);
1963 func
= PyObject_GetAttrString(mod
, function
);
1967 result
= PyObject_CallObject(func
, args
);
1973 #ifdef USE_BUILTIN_COPY
1975 deepcopy(PyObject
** object
, PyObject
* memo
)
1981 PyTuple_Pack(2, *object
, memo
)
1989 return 1; /* success */
1994 join_list(PyObject
* list
, PyObject
* string
)
1996 /* join list elements */
1999 #if PY_VERSION_HEX >= 0x01060000
2005 joiner
= PySequence_GetSlice(string
, 0, 0);
2009 if (PyList_GET_SIZE(list
) == 0) {
2014 #if PY_VERSION_HEX >= 0x01060000
2015 function
= PyObject_GetAttrString(joiner
, "join");
2020 args
= PyTuple_New(1);
2022 Py_DECREF(function
);
2026 PyTuple_SET_ITEM(args
, 0, list
);
2027 result
= PyObject_CallObject(function
, args
);
2028 Py_DECREF(args
); /* also removes list */
2029 Py_DECREF(function
);
2033 PyTuple_Pack(2, list
, joiner
)
2042 pattern_findall(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2050 Py_ssize_t start
= 0;
2051 Py_ssize_t end
= PY_SSIZE_T_MAX
;
2052 static char* kwlist
[] = { "source", "pos", "endpos", NULL
};
2053 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|nn:findall", kwlist
,
2054 &string
, &start
, &end
))
2057 string
= state_init(&state
, self
, string
, start
, end
);
2061 list
= PyList_New(0);
2067 while (state
.start
<= state
.end
) {
2071 state_reset(&state
);
2073 state
.ptr
= state
.start
;
2075 if (state
.charsize
== 1) {
2076 status
= sre_search(&state
, PatternObject_GetCode(self
));
2078 #if defined(HAVE_UNICODE)
2079 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2083 if (PyErr_Occurred())
2089 pattern_error(status
);
2093 /* don't bother to build a match object */
2094 switch (self
->groups
) {
2096 b
= STATE_OFFSET(&state
, state
.start
);
2097 e
= STATE_OFFSET(&state
, state
.ptr
);
2098 item
= PySequence_GetSlice(string
, b
, e
);
2103 item
= state_getslice(&state
, 1, string
, 1);
2108 item
= PyTuple_New(self
->groups
);
2111 for (i
= 0; i
< self
->groups
; i
++) {
2112 PyObject
* o
= state_getslice(&state
, i
+1, string
, 1);
2117 PyTuple_SET_ITEM(item
, i
, o
);
2122 status
= PyList_Append(list
, item
);
2127 if (state
.ptr
== state
.start
)
2128 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2130 state
.start
= state
.ptr
;
2144 #if PY_VERSION_HEX >= 0x02020000
2146 pattern_finditer(PatternObject
* pattern
, PyObject
* args
)
2152 scanner
= pattern_scanner(pattern
, args
);
2156 search
= PyObject_GetAttrString(scanner
, "search");
2161 iterator
= PyCallIter_New(search
, Py_None
);
2169 pattern_split(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2180 Py_ssize_t maxsplit
= 0;
2181 static char* kwlist
[] = { "source", "maxsplit", NULL
};
2182 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|n:split", kwlist
,
2183 &string
, &maxsplit
))
2186 string
= state_init(&state
, self
, string
, 0, PY_SSIZE_T_MAX
);
2190 list
= PyList_New(0);
2199 while (!maxsplit
|| n
< maxsplit
) {
2201 state_reset(&state
);
2203 state
.ptr
= state
.start
;
2205 if (state
.charsize
== 1) {
2206 status
= sre_search(&state
, PatternObject_GetCode(self
));
2208 #if defined(HAVE_UNICODE)
2209 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2213 if (PyErr_Occurred())
2219 pattern_error(status
);
2223 if (state
.start
== state
.ptr
) {
2224 if (last
== state
.end
)
2226 /* skip one character */
2227 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2231 /* get segment before this match */
2232 item
= PySequence_GetSlice(
2233 string
, STATE_OFFSET(&state
, last
),
2234 STATE_OFFSET(&state
, state
.start
)
2238 status
= PyList_Append(list
, item
);
2243 /* add groups (if any) */
2244 for (i
= 0; i
< self
->groups
; i
++) {
2245 item
= state_getslice(&state
, i
+1, string
, 0);
2248 status
= PyList_Append(list
, item
);
2256 last
= state
.start
= state
.ptr
;
2260 /* get segment following last match (even if empty) */
2261 item
= PySequence_GetSlice(
2262 string
, STATE_OFFSET(&state
, last
), state
.endpos
2266 status
= PyList_Append(list
, item
);
2282 pattern_subx(PatternObject
* self
, PyObject
* ptemplate
, PyObject
* string
,
2283 Py_ssize_t count
, Py_ssize_t subn
)
2296 int filter_is_callable
;
2298 if (PyCallable_Check(ptemplate
)) {
2299 /* sub/subn takes either a function or a template */
2302 filter_is_callable
= 1;
2304 /* if not callable, check if it's a literal string */
2306 ptr
= getstring(ptemplate
, &n
, &bint
);
2310 literal
= sre_literal_template((unsigned char *)ptr
, n
);
2312 #if defined(HAVE_UNICODE)
2313 literal
= sre_uliteral_template((Py_UNICODE
*)ptr
, n
);
2323 filter_is_callable
= 0;
2325 /* not a literal; hand it over to the template compiler */
2327 SRE_PY_MODULE
, "_subx",
2328 PyTuple_Pack(2, self
, ptemplate
)
2332 filter_is_callable
= PyCallable_Check(filter
);
2336 string
= state_init(&state
, self
, string
, 0, PY_SSIZE_T_MAX
);
2342 list
= PyList_New(0);
2351 while (!count
|| n
< count
) {
2353 state_reset(&state
);
2355 state
.ptr
= state
.start
;
2357 if (state
.charsize
== 1) {
2358 status
= sre_search(&state
, PatternObject_GetCode(self
));
2360 #if defined(HAVE_UNICODE)
2361 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2365 if (PyErr_Occurred())
2371 pattern_error(status
);
2375 b
= STATE_OFFSET(&state
, state
.start
);
2376 e
= STATE_OFFSET(&state
, state
.ptr
);
2379 /* get segment before this match */
2380 item
= PySequence_GetSlice(string
, i
, b
);
2383 status
= PyList_Append(list
, item
);
2388 } else if (i
== b
&& i
== e
&& n
> 0)
2389 /* ignore empty match on latest position */
2392 if (filter_is_callable
) {
2393 /* pass match object through filter */
2394 match
= pattern_new_match(self
, &state
, 1);
2397 args
= PyTuple_Pack(1, match
);
2402 item
= PyObject_CallObject(filter
, args
);
2408 /* filter is literal string */
2414 if (item
!= Py_None
) {
2415 status
= PyList_Append(list
, item
);
2426 if (state
.ptr
== state
.start
)
2427 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2429 state
.start
= state
.ptr
;
2433 /* get segment following last match */
2434 if (i
< state
.endpos
) {
2435 item
= PySequence_GetSlice(string
, i
, state
.endpos
);
2438 status
= PyList_Append(list
, item
);
2448 /* convert list to single string (also removes list) */
2449 item
= join_list(list
, string
);
2455 return Py_BuildValue("Ni", item
, n
);
2468 pattern_sub(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2470 PyObject
* ptemplate
;
2472 Py_ssize_t count
= 0;
2473 static char* kwlist
[] = { "repl", "string", "count", NULL
};
2474 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|n:sub", kwlist
,
2475 &ptemplate
, &string
, &count
))
2478 return pattern_subx(self
, ptemplate
, string
, count
, 0);
2482 pattern_subn(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2484 PyObject
* ptemplate
;
2486 Py_ssize_t count
= 0;
2487 static char* kwlist
[] = { "repl", "string", "count", NULL
};
2488 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|n:subn", kwlist
,
2489 &ptemplate
, &string
, &count
))
2492 return pattern_subx(self
, ptemplate
, string
, count
, 1);
2496 pattern_copy(PatternObject
* self
, PyObject
*unused
)
2498 #ifdef USE_BUILTIN_COPY
2499 PatternObject
* copy
;
2502 copy
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, self
->codesize
);
2506 offset
= offsetof(PatternObject
, groups
);
2508 Py_XINCREF(self
->groupindex
);
2509 Py_XINCREF(self
->indexgroup
);
2510 Py_XINCREF(self
->pattern
);
2512 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
2513 sizeof(PatternObject
) + self
->codesize
* sizeof(SRE_CODE
) - offset
);
2514 copy
->weakreflist
= NULL
;
2516 return (PyObject
*) copy
;
2518 PyErr_SetString(PyExc_TypeError
, "cannot copy this pattern object");
2524 pattern_deepcopy(PatternObject
* self
, PyObject
* memo
)
2526 #ifdef USE_BUILTIN_COPY
2527 PatternObject
* copy
;
2529 copy
= (PatternObject
*) pattern_copy(self
);
2533 if (!deepcopy(©
->groupindex
, memo
) ||
2534 !deepcopy(©
->indexgroup
, memo
) ||
2535 !deepcopy(©
->pattern
, memo
)) {
2541 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this pattern object");
2546 PyDoc_STRVAR(pattern_match_doc
,
2547 "match(string[, pos[, endpos]]) --> match object or None.\n\
2548 Matches zero or more characters at the beginning of the string");
2550 PyDoc_STRVAR(pattern_search_doc
,
2551 "search(string[, pos[, endpos]]) --> match object or None.\n\
2552 Scan through string looking for a match, and return a corresponding\n\
2553 MatchObject instance. Return None if no position in the string matches.");
2555 PyDoc_STRVAR(pattern_split_doc
,
2556 "split(string[, maxsplit = 0]) --> list.\n\
2557 Split string by the occurrences of pattern.");
2559 PyDoc_STRVAR(pattern_findall_doc
,
2560 "findall(string[, pos[, endpos]]) --> list.\n\
2561 Return a list of all non-overlapping matches of pattern in string.");
2563 PyDoc_STRVAR(pattern_finditer_doc
,
2564 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2565 Return an iterator over all non-overlapping matches for the \n\
2566 RE pattern in string. For each match, the iterator returns a\n\
2569 PyDoc_STRVAR(pattern_sub_doc
,
2570 "sub(repl, string[, count = 0]) --> newstring\n\
2571 Return the string obtained by replacing the leftmost non-overlapping\n\
2572 occurrences of pattern in string by the replacement repl.");
2574 PyDoc_STRVAR(pattern_subn_doc
,
2575 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2576 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2577 the leftmost non-overlapping occurrences of pattern with the\n\
2578 replacement repl.");
2580 PyDoc_STRVAR(pattern_doc
, "Compiled regular expression objects");
2582 static PyMethodDef pattern_methods
[] = {
2583 {"match", (PyCFunction
) pattern_match
, METH_VARARGS
|METH_KEYWORDS
,
2585 {"search", (PyCFunction
) pattern_search
, METH_VARARGS
|METH_KEYWORDS
,
2586 pattern_search_doc
},
2587 {"sub", (PyCFunction
) pattern_sub
, METH_VARARGS
|METH_KEYWORDS
,
2589 {"subn", (PyCFunction
) pattern_subn
, METH_VARARGS
|METH_KEYWORDS
,
2591 {"split", (PyCFunction
) pattern_split
, METH_VARARGS
|METH_KEYWORDS
,
2593 {"findall", (PyCFunction
) pattern_findall
, METH_VARARGS
|METH_KEYWORDS
,
2594 pattern_findall_doc
},
2595 #if PY_VERSION_HEX >= 0x02020000
2596 {"finditer", (PyCFunction
) pattern_finditer
, METH_VARARGS
,
2597 pattern_finditer_doc
},
2599 {"scanner", (PyCFunction
) pattern_scanner
, METH_VARARGS
},
2600 {"__copy__", (PyCFunction
) pattern_copy
, METH_NOARGS
},
2601 {"__deepcopy__", (PyCFunction
) pattern_deepcopy
, METH_O
},
2605 #define PAT_OFF(x) offsetof(PatternObject, x)
2606 static PyMemberDef pattern_members
[] = {
2607 {"pattern", T_OBJECT
, PAT_OFF(pattern
), READONLY
},
2608 {"flags", T_INT
, PAT_OFF(flags
), READONLY
},
2609 {"groups", T_PYSSIZET
, PAT_OFF(groups
), READONLY
},
2610 {"groupindex", T_OBJECT
, PAT_OFF(groupindex
), READONLY
},
2611 {NULL
} /* Sentinel */
2614 statichere PyTypeObject Pattern_Type
= {
2615 PyObject_HEAD_INIT(NULL
)
2616 0, "_" SRE_MODULE
".SRE_Pattern",
2617 sizeof(PatternObject
), sizeof(SRE_CODE
),
2618 (destructor
)pattern_dealloc
, /*tp_dealloc*/
2620 0, /* tp_getattrn */
2624 0, /* tp_as_number */
2625 0, /* tp_as_sequence */
2626 0, /* tp_as_mapping */
2630 0, /* tp_getattro */
2631 0, /* tp_setattro */
2632 0, /* tp_as_buffer */
2633 Py_TPFLAGS_DEFAULT
, /* tp_flags */
2634 pattern_doc
, /* tp_doc */
2635 0, /* tp_traverse */
2637 0, /* tp_richcompare */
2638 offsetof(PatternObject
, weakreflist
), /* tp_weaklistoffset */
2640 0, /* tp_iternext */
2641 pattern_methods
, /* tp_methods */
2642 pattern_members
, /* tp_members */
2645 static int _validate(PatternObject
*self
); /* Forward */
2648 _compile(PyObject
* self_
, PyObject
* args
)
2650 /* "compile" pattern descriptor to pattern object */
2652 PatternObject
* self
;
2658 Py_ssize_t groups
= 0;
2659 PyObject
* groupindex
= NULL
;
2660 PyObject
* indexgroup
= NULL
;
2661 if (!PyArg_ParseTuple(args
, "OiO!|nOO", &pattern
, &flags
,
2662 &PyList_Type
, &code
, &groups
,
2663 &groupindex
, &indexgroup
))
2666 n
= PyList_GET_SIZE(code
);
2667 /* coverity[ampersand_in_size] */
2668 self
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, n
);
2671 self
->weakreflist
= NULL
;
2672 self
->pattern
= NULL
;
2673 self
->groupindex
= NULL
;
2674 self
->indexgroup
= NULL
;
2678 for (i
= 0; i
< n
; i
++) {
2679 PyObject
*o
= PyList_GET_ITEM(code
, i
);
2680 unsigned long value
= PyInt_Check(o
) ? (unsigned long)PyInt_AsLong(o
)
2681 : PyLong_AsUnsignedLong(o
);
2682 self
->code
[i
] = (SRE_CODE
) value
;
2683 if ((unsigned long) self
->code
[i
] != value
) {
2684 PyErr_SetString(PyExc_OverflowError
,
2685 "regular expression code size limit exceeded");
2690 if (PyErr_Occurred()) {
2696 self
->pattern
= pattern
;
2698 self
->flags
= flags
;
2700 self
->groups
= groups
;
2702 Py_XINCREF(groupindex
);
2703 self
->groupindex
= groupindex
;
2705 Py_XINCREF(indexgroup
);
2706 self
->indexgroup
= indexgroup
;
2708 self
->weakreflist
= NULL
;
2710 if (!_validate(self
)) {
2715 return (PyObject
*) self
;
2718 /* -------------------------------------------------------------------- */
2719 /* Code validation */
2721 /* To learn more about this code, have a look at the _compile() function in
2722 Lib/sre_compile.py. The validation functions below checks the code array
2723 for conformance with the code patterns generated there.
2725 The nice thing about the generated code is that it is position-independent:
2726 all jumps are relative jumps forward. Also, jumps don't cross each other:
2727 the target of a later jump is always earlier than the target of an earlier
2728 jump. IOW, this is okay:
2730 J---------J-------T--------T
2732 \______________________/
2736 J---------J-------T--------T
2740 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2741 bytes wide (the latter if Python is compiled for "wide" unicode support).
2744 /* Defining this one enables tracing of the validator */
2747 /* Trace macro for the validator */
2748 #if defined(VVERBOSE)
2749 #define VTRACE(v) printf v
2754 /* Report failure */
2755 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2757 /* Extract opcode, argument, or skip count from code array */
2760 VTRACE(("%p: ", code)); \
2761 if (code >= end) FAIL; \
2763 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2767 VTRACE(("%p= ", code)); \
2768 if (code >= end) FAIL; \
2770 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2772 #define GET_SKIP_ADJ(adj) \
2774 VTRACE(("%p= ", code)); \
2775 if (code >= end) FAIL; \
2777 VTRACE(("%lu (skip to %p)\n", \
2778 (unsigned long)skip, code+skip)); \
2779 if (code+skip-adj < code || code+skip-adj > end)\
2783 #define GET_SKIP GET_SKIP_ADJ(0)
2786 _validate_charset(SRE_CODE
*code
, SRE_CODE
*end
)
2788 /* Some variables are manipulated by the macros above */
2794 while (code
< end
) {
2801 case SRE_OP_LITERAL
:
2810 case SRE_OP_CHARSET
:
2811 offset
= 32/sizeof(SRE_CODE
); /* 32-byte bitmap */
2812 if (code
+offset
< code
|| code
+offset
> end
)
2817 case SRE_OP_BIGCHARSET
:
2818 GET_ARG
; /* Number of blocks */
2819 offset
= 256/sizeof(SRE_CODE
); /* 256-byte table */
2820 if (code
+offset
< code
|| code
+offset
> end
)
2822 /* Make sure that each byte points to a valid block */
2823 for (i
= 0; i
< 256; i
++) {
2824 if (((unsigned char *)code
)[i
] >= arg
)
2828 offset
= arg
* 32/sizeof(SRE_CODE
); /* 32-byte bitmap times arg */
2829 if (code
+offset
< code
|| code
+offset
> end
)
2834 case SRE_OP_CATEGORY
:
2837 case SRE_CATEGORY_DIGIT
:
2838 case SRE_CATEGORY_NOT_DIGIT
:
2839 case SRE_CATEGORY_SPACE
:
2840 case SRE_CATEGORY_NOT_SPACE
:
2841 case SRE_CATEGORY_WORD
:
2842 case SRE_CATEGORY_NOT_WORD
:
2843 case SRE_CATEGORY_LINEBREAK
:
2844 case SRE_CATEGORY_NOT_LINEBREAK
:
2845 case SRE_CATEGORY_LOC_WORD
:
2846 case SRE_CATEGORY_LOC_NOT_WORD
:
2847 case SRE_CATEGORY_UNI_DIGIT
:
2848 case SRE_CATEGORY_UNI_NOT_DIGIT
:
2849 case SRE_CATEGORY_UNI_SPACE
:
2850 case SRE_CATEGORY_UNI_NOT_SPACE
:
2851 case SRE_CATEGORY_UNI_WORD
:
2852 case SRE_CATEGORY_UNI_NOT_WORD
:
2853 case SRE_CATEGORY_UNI_LINEBREAK
:
2854 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
2871 _validate_inner(SRE_CODE
*code
, SRE_CODE
*end
, Py_ssize_t groups
)
2873 /* Some variables are manipulated by the macros above */
2878 VTRACE(("code=%p, end=%p\n", code
, end
));
2883 while (code
< end
) {
2888 /* We don't check whether marks are properly nested; the
2889 sre_match() code is robust even if they don't, and the worst
2890 you can get is nonsensical match results. */
2892 if (arg
> 2*groups
+1) {
2893 VTRACE(("arg=%d, groups=%d\n", (int)arg
, (int)groups
));
2898 case SRE_OP_LITERAL
:
2899 case SRE_OP_NOT_LITERAL
:
2900 case SRE_OP_LITERAL_IGNORE
:
2901 case SRE_OP_NOT_LITERAL_IGNORE
:
2903 /* The arg is just a character, nothing to check */
2906 case SRE_OP_SUCCESS
:
2907 case SRE_OP_FAILURE
:
2908 /* Nothing to check; these normally end the matching process */
2914 case SRE_AT_BEGINNING
:
2915 case SRE_AT_BEGINNING_STRING
:
2916 case SRE_AT_BEGINNING_LINE
:
2918 case SRE_AT_END_LINE
:
2919 case SRE_AT_END_STRING
:
2920 case SRE_AT_BOUNDARY
:
2921 case SRE_AT_NON_BOUNDARY
:
2922 case SRE_AT_LOC_BOUNDARY
:
2923 case SRE_AT_LOC_NON_BOUNDARY
:
2924 case SRE_AT_UNI_BOUNDARY
:
2925 case SRE_AT_UNI_NON_BOUNDARY
:
2933 case SRE_OP_ANY_ALL
:
2934 /* These have no operands */
2938 case SRE_OP_IN_IGNORE
:
2940 /* Stop 1 before the end; we check the FAILURE below */
2941 if (!_validate_charset(code
, code
+skip
-2))
2943 if (code
[skip
-2] != SRE_OP_FAILURE
)
2950 /* A minimal info field is
2951 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2952 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2957 newcode
= code
+skip
-1;
2958 GET_ARG
; flags
= arg
;
2961 /* Check that only valid flags are present */
2962 if ((flags
& ~(SRE_INFO_PREFIX
|
2964 SRE_INFO_CHARSET
)) != 0)
2966 /* PREFIX and CHARSET are mutually exclusive */
2967 if ((flags
& SRE_INFO_PREFIX
) &&
2968 (flags
& SRE_INFO_CHARSET
))
2970 /* LITERAL implies PREFIX */
2971 if ((flags
& SRE_INFO_LITERAL
) &&
2972 !(flags
& SRE_INFO_PREFIX
))
2974 /* Validate the prefix */
2975 if (flags
& SRE_INFO_PREFIX
) {
2976 SRE_CODE prefix_len
;
2977 GET_ARG
; prefix_len
= arg
;
2978 GET_ARG
; /* prefix skip */
2979 /* Here comes the prefix string */
2980 if (code
+prefix_len
< code
|| code
+prefix_len
> newcode
)
2983 /* And here comes the overlap table */
2984 if (code
+prefix_len
< code
|| code
+prefix_len
> newcode
)
2986 /* Each overlap value should be < prefix_len */
2987 for (i
= 0; i
< prefix_len
; i
++) {
2988 if (code
[i
] >= prefix_len
)
2993 /* Validate the charset */
2994 if (flags
& SRE_INFO_CHARSET
) {
2995 if (!_validate_charset(code
, newcode
-1))
2997 if (newcode
[-1] != SRE_OP_FAILURE
)
3001 else if (code
!= newcode
) {
3002 VTRACE(("code=%p, newcode=%p\n", code
, newcode
));
3010 SRE_CODE
*target
= NULL
;
3015 /* Stop 2 before the end; we check the JUMP below */
3016 if (!_validate_inner(code
, code
+skip
-3, groups
))
3019 /* Check that it ends with a JUMP, and that each JUMP
3020 has the same target */
3022 if (op
!= SRE_OP_JUMP
)
3026 target
= code
+skip
-1;
3027 else if (code
+skip
-1 != target
)
3033 case SRE_OP_REPEAT_ONE
:
3034 case SRE_OP_MIN_REPEAT_ONE
:
3042 #ifdef Py_UNICODE_WIDE
3046 if (!_validate_inner(code
, code
+skip
-4, groups
))
3050 if (op
!= SRE_OP_SUCCESS
)
3063 #ifdef Py_UNICODE_WIDE
3067 if (!_validate_inner(code
, code
+skip
-3, groups
))
3071 if (op
!= SRE_OP_MAX_UNTIL
&& op
!= SRE_OP_MIN_UNTIL
)
3076 case SRE_OP_GROUPREF
:
3077 case SRE_OP_GROUPREF_IGNORE
:
3083 case SRE_OP_GROUPREF_EXISTS
:
3084 /* The regex syntax for this is: '(?(group)then|else)', where
3085 'group' is either an integer group number or a group name,
3086 'then' and 'else' are sub-regexes, and 'else' is optional. */
3091 code
--; /* The skip is relative to the first arg! */
3092 /* There are two possibilities here: if there is both a 'then'
3093 part and an 'else' part, the generated code looks like:
3101 (<skipyes> jumps here)
3103 (<skipno> jumps here)
3105 If there is only a 'then' part, it looks like:
3113 There is no direct way to decide which it is, and we don't want
3114 to allow arbitrary jumps anywhere in the code; so we just look
3115 for a JUMP opcode preceding our skip target.
3117 if (skip
>= 3 && code
+skip
-3 >= code
&&
3118 code
[skip
-3] == SRE_OP_JUMP
)
3120 VTRACE(("both then and else parts present\n"));
3121 if (!_validate_inner(code
+1, code
+skip
-3, groups
))
3123 code
+= skip
-2; /* Position after JUMP, at <skipno> */
3125 if (!_validate_inner(code
, code
+skip
-1, groups
))
3130 VTRACE(("only a then part present\n"));
3131 if (!_validate_inner(code
+1, code
+skip
-1, groups
))
3138 case SRE_OP_ASSERT_NOT
:
3140 GET_ARG
; /* 0 for lookahead, width for lookbehind */
3141 code
--; /* Back up over arg to simplify math below */
3142 if (arg
& 0x80000000)
3143 FAIL
; /* Width too large */
3144 /* Stop 1 before the end; we check the SUCCESS below */
3145 if (!_validate_inner(code
+1, code
+skip
-2, groups
))
3149 if (op
!= SRE_OP_SUCCESS
)
3164 _validate_outer(SRE_CODE
*code
, SRE_CODE
*end
, Py_ssize_t groups
)
3166 if (groups
< 0 || groups
> 100 || code
>= end
|| end
[-1] != SRE_OP_SUCCESS
)
3168 if (groups
== 0) /* fix for simplejson */
3169 groups
= 100; /* 100 groups should always be safe */
3170 return _validate_inner(code
, end
-1, groups
);
3174 _validate(PatternObject
*self
)
3176 if (!_validate_outer(self
->code
, self
->code
+self
->codesize
, self
->groups
))
3178 PyErr_SetString(PyExc_RuntimeError
, "invalid SRE code");
3182 VTRACE(("Success!\n"));
3186 /* -------------------------------------------------------------------- */
3190 match_dealloc(MatchObject
* self
)
3192 Py_XDECREF(self
->regs
);
3193 Py_XDECREF(self
->string
);
3194 Py_DECREF(self
->pattern
);
3199 match_getslice_by_index(MatchObject
* self
, Py_ssize_t index
, PyObject
* def
)
3201 if (index
< 0 || index
>= self
->groups
) {
3202 /* raise IndexError if we were given a bad group number */
3212 if (self
->string
== Py_None
|| self
->mark
[index
] < 0) {
3213 /* return default value if the string or group is undefined */
3218 return PySequence_GetSlice(
3219 self
->string
, self
->mark
[index
], self
->mark
[index
+1]
3224 match_getindex(MatchObject
* self
, PyObject
* index
)
3228 if (PyInt_Check(index
))
3229 return PyInt_AsSsize_t(index
);
3233 if (self
->pattern
->groupindex
) {
3234 index
= PyObject_GetItem(self
->pattern
->groupindex
, index
);
3236 if (PyInt_Check(index
) || PyLong_Check(index
))
3237 i
= PyInt_AsSsize_t(index
);
3247 match_getslice(MatchObject
* self
, PyObject
* index
, PyObject
* def
)
3249 return match_getslice_by_index(self
, match_getindex(self
, index
), def
);
3253 match_expand(MatchObject
* self
, PyObject
* ptemplate
)
3255 /* delegate to Python code */
3257 SRE_PY_MODULE
, "_expand",
3258 PyTuple_Pack(3, self
->pattern
, self
, ptemplate
)
3263 match_group(MatchObject
* self
, PyObject
* args
)
3268 size
= PyTuple_GET_SIZE(args
);
3272 result
= match_getslice(self
, Py_False
, Py_None
);
3275 result
= match_getslice(self
, PyTuple_GET_ITEM(args
, 0), Py_None
);
3278 /* fetch multiple items */
3279 result
= PyTuple_New(size
);
3282 for (i
= 0; i
< size
; i
++) {
3283 PyObject
* item
= match_getslice(
3284 self
, PyTuple_GET_ITEM(args
, i
), Py_None
3290 PyTuple_SET_ITEM(result
, i
, item
);
3298 match_groups(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
3303 PyObject
* def
= Py_None
;
3304 static char* kwlist
[] = { "default", NULL
};
3305 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groups", kwlist
, &def
))
3308 result
= PyTuple_New(self
->groups
-1);
3312 for (index
= 1; index
< self
->groups
; index
++) {
3314 item
= match_getslice_by_index(self
, index
, def
);
3319 PyTuple_SET_ITEM(result
, index
-1, item
);
3326 match_groupdict(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
3332 PyObject
* def
= Py_None
;
3333 static char* kwlist
[] = { "default", NULL
};
3334 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groupdict", kwlist
, &def
))
3337 result
= PyDict_New();
3338 if (!result
|| !self
->pattern
->groupindex
)
3341 keys
= PyMapping_Keys(self
->pattern
->groupindex
);
3345 for (index
= 0; index
< PyList_GET_SIZE(keys
); index
++) {
3349 key
= PyList_GET_ITEM(keys
, index
);
3352 value
= match_getslice(self
, key
, def
);
3357 status
= PyDict_SetItem(result
, key
, value
);
3374 match_start(MatchObject
* self
, PyObject
* args
)
3378 PyObject
* index_
= Py_False
; /* zero */
3379 if (!PyArg_UnpackTuple(args
, "start", 0, 1, &index_
))
3382 index
= match_getindex(self
, index_
);
3384 if (index
< 0 || index
>= self
->groups
) {
3392 /* mark is -1 if group is undefined */
3393 return Py_BuildValue("i", self
->mark
[index
*2]);
3397 match_end(MatchObject
* self
, PyObject
* args
)
3401 PyObject
* index_
= Py_False
; /* zero */
3402 if (!PyArg_UnpackTuple(args
, "end", 0, 1, &index_
))
3405 index
= match_getindex(self
, index_
);
3407 if (index
< 0 || index
>= self
->groups
) {
3415 /* mark is -1 if group is undefined */
3416 return Py_BuildValue("i", self
->mark
[index
*2+1]);
3420 _pair(Py_ssize_t i1
, Py_ssize_t i2
)
3425 pair
= PyTuple_New(2);
3429 item
= PyInt_FromSsize_t(i1
);
3432 PyTuple_SET_ITEM(pair
, 0, item
);
3434 item
= PyInt_FromSsize_t(i2
);
3437 PyTuple_SET_ITEM(pair
, 1, item
);
3447 match_span(MatchObject
* self
, PyObject
* args
)
3451 PyObject
* index_
= Py_False
; /* zero */
3452 if (!PyArg_UnpackTuple(args
, "span", 0, 1, &index_
))
3455 index
= match_getindex(self
, index_
);
3457 if (index
< 0 || index
>= self
->groups
) {
3465 /* marks are -1 if group is undefined */
3466 return _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3470 match_regs(MatchObject
* self
)
3476 regs
= PyTuple_New(self
->groups
);
3480 for (index
= 0; index
< self
->groups
; index
++) {
3481 item
= _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3486 PyTuple_SET_ITEM(regs
, index
, item
);
3496 match_copy(MatchObject
* self
, PyObject
*unused
)
3498 #ifdef USE_BUILTIN_COPY
3500 Py_ssize_t slots
, offset
;
3502 slots
= 2 * (self
->pattern
->groups
+1);
3504 copy
= PyObject_NEW_VAR(MatchObject
, &Match_Type
, slots
);
3508 /* this value a constant, but any compiler should be able to
3509 figure that out all by itself */
3510 offset
= offsetof(MatchObject
, string
);
3512 Py_XINCREF(self
->pattern
);
3513 Py_XINCREF(self
->string
);
3514 Py_XINCREF(self
->regs
);
3516 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
3517 sizeof(MatchObject
) + slots
* sizeof(Py_ssize_t
) - offset
);
3519 return (PyObject
*) copy
;
3521 PyErr_SetString(PyExc_TypeError
, "cannot copy this match object");
3527 match_deepcopy(MatchObject
* self
, PyObject
* memo
)
3529 #ifdef USE_BUILTIN_COPY
3532 copy
= (MatchObject
*) match_copy(self
);
3536 if (!deepcopy((PyObject
**) ©
->pattern
, memo
) ||
3537 !deepcopy(©
->string
, memo
) ||
3538 !deepcopy(©
->regs
, memo
)) {
3544 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this match object");
3549 static struct PyMethodDef match_methods
[] = {
3550 {"group", (PyCFunction
) match_group
, METH_VARARGS
},
3551 {"start", (PyCFunction
) match_start
, METH_VARARGS
},
3552 {"end", (PyCFunction
) match_end
, METH_VARARGS
},
3553 {"span", (PyCFunction
) match_span
, METH_VARARGS
},
3554 {"groups", (PyCFunction
) match_groups
, METH_VARARGS
|METH_KEYWORDS
},
3555 {"groupdict", (PyCFunction
) match_groupdict
, METH_VARARGS
|METH_KEYWORDS
},
3556 {"expand", (PyCFunction
) match_expand
, METH_O
},
3557 {"__copy__", (PyCFunction
) match_copy
, METH_NOARGS
},
3558 {"__deepcopy__", (PyCFunction
) match_deepcopy
, METH_O
},
3563 match_lastindex_get(MatchObject
*self
)
3565 if (self
->lastindex
>= 0)
3566 return Py_BuildValue("i", self
->lastindex
);
3572 match_lastgroup_get(MatchObject
*self
)
3574 if (self
->pattern
->indexgroup
&& self
->lastindex
>= 0) {
3575 PyObject
* result
= PySequence_GetItem(
3576 self
->pattern
->indexgroup
, self
->lastindex
3587 match_regs_get(MatchObject
*self
)
3590 Py_INCREF(self
->regs
);
3593 return match_regs(self
);
3596 static PyGetSetDef match_getset
[] = {
3597 {"lastindex", (getter
)match_lastindex_get
, (setter
)NULL
},
3598 {"lastgroup", (getter
)match_lastgroup_get
, (setter
)NULL
},
3599 {"regs", (getter
)match_regs_get
, (setter
)NULL
},
3603 #define MATCH_OFF(x) offsetof(MatchObject, x)
3604 static PyMemberDef match_members
[] = {
3605 {"string", T_OBJECT
, MATCH_OFF(string
), READONLY
},
3606 {"re", T_OBJECT
, MATCH_OFF(pattern
), READONLY
},
3607 {"pos", T_PYSSIZET
, MATCH_OFF(pos
), READONLY
},
3608 {"endpos", T_PYSSIZET
, MATCH_OFF(endpos
), READONLY
},
3613 /* FIXME: implement setattr("string", None) as a special case (to
3614 detach the associated string, if any */
3616 static PyTypeObject Match_Type
= {
3617 PyVarObject_HEAD_INIT(NULL
, 0)
3618 "_" SRE_MODULE
".SRE_Match",
3619 sizeof(MatchObject
), sizeof(Py_ssize_t
),
3620 (destructor
)match_dealloc
, /* tp_dealloc */
3626 0, /* tp_as_number */
3627 0, /* tp_as_sequence */
3628 0, /* tp_as_mapping */
3632 0, /* tp_getattro */
3633 0, /* tp_setattro */
3634 0, /* tp_as_buffer */
3637 0, /* tp_traverse */
3639 0, /* tp_richcompare */
3640 0, /* tp_weaklistoffset */
3642 0, /* tp_iternext */
3643 match_methods
, /* tp_methods */
3644 match_members
, /* tp_members */
3645 match_getset
, /* tp_getset */
3649 pattern_new_match(PatternObject
* pattern
, SRE_STATE
* state
, int status
)
3651 /* create match object (from state object) */
3660 /* create match object (with room for extra group marks) */
3661 /* coverity[ampersand_in_size] */
3662 match
= PyObject_NEW_VAR(MatchObject
, &Match_Type
,
3663 2*(pattern
->groups
+1));
3668 match
->pattern
= pattern
;
3670 Py_INCREF(state
->string
);
3671 match
->string
= state
->string
;
3674 match
->groups
= pattern
->groups
+1;
3676 /* fill in group slices */
3678 base
= (char*) state
->beginning
;
3679 n
= state
->charsize
;
3681 match
->mark
[0] = ((char*) state
->start
- base
) / n
;
3682 match
->mark
[1] = ((char*) state
->ptr
- base
) / n
;
3684 for (i
= j
= 0; i
< pattern
->groups
; i
++, j
+=2)
3685 if (j
+1 <= state
->lastmark
&& state
->mark
[j
] && state
->mark
[j
+1]) {
3686 match
->mark
[j
+2] = ((char*) state
->mark
[j
] - base
) / n
;
3687 match
->mark
[j
+3] = ((char*) state
->mark
[j
+1] - base
) / n
;
3689 match
->mark
[j
+2] = match
->mark
[j
+3] = -1; /* undefined */
3691 match
->pos
= state
->pos
;
3692 match
->endpos
= state
->endpos
;
3694 match
->lastindex
= state
->lastindex
;
3696 return (PyObject
*) match
;
3698 } else if (status
== 0) {
3706 /* internal error */
3707 pattern_error(status
);
3712 /* -------------------------------------------------------------------- */
3713 /* scanner methods (experimental) */
3716 scanner_dealloc(ScannerObject
* self
)
3718 state_fini(&self
->state
);
3719 Py_XDECREF(self
->pattern
);
3724 scanner_match(ScannerObject
* self
, PyObject
*unused
)
3726 SRE_STATE
* state
= &self
->state
;
3732 state
->ptr
= state
->start
;
3734 if (state
->charsize
== 1) {
3735 status
= sre_match(state
, PatternObject_GetCode(self
->pattern
));
3737 #if defined(HAVE_UNICODE)
3738 status
= sre_umatch(state
, PatternObject_GetCode(self
->pattern
));
3741 if (PyErr_Occurred())
3744 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3747 if (status
== 0 || state
->ptr
== state
->start
)
3748 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3750 state
->start
= state
->ptr
;
3757 scanner_search(ScannerObject
* self
, PyObject
*unused
)
3759 SRE_STATE
* state
= &self
->state
;
3765 state
->ptr
= state
->start
;
3767 if (state
->charsize
== 1) {
3768 status
= sre_search(state
, PatternObject_GetCode(self
->pattern
));
3770 #if defined(HAVE_UNICODE)
3771 status
= sre_usearch(state
, PatternObject_GetCode(self
->pattern
));
3774 if (PyErr_Occurred())
3777 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3780 if (status
== 0 || state
->ptr
== state
->start
)
3781 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3783 state
->start
= state
->ptr
;
3788 static PyMethodDef scanner_methods
[] = {
3789 {"match", (PyCFunction
) scanner_match
, METH_NOARGS
},
3790 {"search", (PyCFunction
) scanner_search
, METH_NOARGS
},
3794 #define SCAN_OFF(x) offsetof(ScannerObject, x)
3795 static PyMemberDef scanner_members
[] = {
3796 {"pattern", T_OBJECT
, SCAN_OFF(pattern
), READONLY
},
3797 {NULL
} /* Sentinel */
3800 statichere PyTypeObject Scanner_Type
= {
3801 PyObject_HEAD_INIT(NULL
)
3802 0, "_" SRE_MODULE
".SRE_Scanner",
3803 sizeof(ScannerObject
), 0,
3804 (destructor
)scanner_dealloc
, /*tp_dealloc*/
3808 0, /* tp_reserved */
3810 0, /* tp_as_number */
3811 0, /* tp_as_sequence */
3812 0, /* tp_as_mapping */
3816 0, /* tp_getattro */
3817 0, /* tp_setattro */
3818 0, /* tp_as_buffer */
3819 Py_TPFLAGS_DEFAULT
, /* tp_flags */
3821 0, /* tp_traverse */
3823 0, /* tp_richcompare */
3824 0, /* tp_weaklistoffset */
3826 0, /* tp_iternext */
3827 scanner_methods
, /* tp_methods */
3828 scanner_members
, /* tp_members */
3833 pattern_scanner(PatternObject
* pattern
, PyObject
* args
)
3835 /* create search state object */
3837 ScannerObject
* self
;
3840 Py_ssize_t start
= 0;
3841 Py_ssize_t end
= PY_SSIZE_T_MAX
;
3842 if (!PyArg_ParseTuple(args
, "O|nn:scanner", &string
, &start
, &end
))
3845 /* create scanner object */
3846 self
= PyObject_NEW(ScannerObject
, &Scanner_Type
);
3849 self
->pattern
= NULL
;
3851 string
= state_init(&self
->state
, pattern
, string
, start
, end
);
3858 self
->pattern
= (PyObject
*) pattern
;
3860 return (PyObject
*) self
;
3863 static PyMethodDef _functions
[] = {
3864 {"compile", _compile
, METH_VARARGS
},
3865 {"getcodesize", sre_codesize
, METH_NOARGS
},
3866 {"getlower", sre_getlower
, METH_VARARGS
},
3870 #if PY_VERSION_HEX < 0x02030000
3871 DL_EXPORT(void) init_sre(void)
3873 PyMODINIT_FUNC
init_sre(void)
3880 /* Patch object types */
3881 if (PyType_Ready(&Pattern_Type
) || PyType_Ready(&Match_Type
) ||
3882 PyType_Ready(&Scanner_Type
))
3885 m
= Py_InitModule("_" SRE_MODULE
, _functions
);
3888 d
= PyModule_GetDict(m
);
3890 x
= PyInt_FromLong(SRE_MAGIC
);
3892 PyDict_SetItemString(d
, "MAGIC", x
);
3896 x
= PyInt_FromLong(sizeof(SRE_CODE
));
3898 PyDict_SetItemString(d
, "CODESIZE", x
);
3902 x
= PyString_FromString(copyright
);
3904 PyDict_SetItemString(d
, "copyright", x
);
3909 #endif /* !defined(SRE_RECURSIVE) */