2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
39 static char copyright
[] =
40 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
42 #define PY_SSIZE_T_CLEAN
45 #include "structmember.h" /* offsetof */
51 /* name of this module, minus the leading underscore */
52 #if !defined(SRE_MODULE)
53 #define SRE_MODULE "sre"
56 #define SRE_PY_MODULE "re"
58 /* defining this one enables tracing */
61 #if PY_VERSION_HEX >= 0x01060000
62 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
63 /* defining this enables unicode support (default under 1.6a1 and later) */
68 /* -------------------------------------------------------------------- */
69 /* optional features */
71 /* enables fast searching */
72 #define USE_FAST_SEARCH
74 /* enables aggressive inlining (always on for Visual C) */
77 /* enables copy/deepcopy handling (work in progress) */
78 #undef USE_BUILTIN_COPY
80 #if PY_VERSION_HEX < 0x01060000
81 #define PyObject_DEL(op) PyMem_DEL((op))
84 /* -------------------------------------------------------------------- */
87 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
88 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
89 /* fastest possible local call under MSVC */
90 #define LOCAL(type) static __inline type __fastcall
91 #elif defined(USE_INLINE)
92 #define LOCAL(type) static inline type
94 #define LOCAL(type) static type
98 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
99 #define SRE_ERROR_STATE -2 /* illegal state */
100 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
101 #define SRE_ERROR_MEMORY -9 /* out of memory */
102 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
105 #define TRACE(v) printf v
110 /* -------------------------------------------------------------------- */
111 /* search engine state */
113 /* default character predicates (run sre_chars.py to regenerate tables) */
115 #define SRE_DIGIT_MASK 1
116 #define SRE_SPACE_MASK 2
117 #define SRE_LINEBREAK_MASK 4
118 #define SRE_ALNUM_MASK 8
119 #define SRE_WORD_MASK 16
121 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
123 static char sre_char_info
[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
124 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
125 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
126 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
127 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
128 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
129 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
131 static char sre_char_lower
[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
132 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
133 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
134 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
135 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
136 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
137 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
138 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
139 120, 121, 122, 123, 124, 125, 126, 127 };
141 #define SRE_IS_DIGIT(ch)\
142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
143 #define SRE_IS_SPACE(ch)\
144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
145 #define SRE_IS_LINEBREAK(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
147 #define SRE_IS_ALNUM(ch)\
148 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
149 #define SRE_IS_WORD(ch)\
150 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
152 static unsigned int sre_lower(unsigned int ch
)
154 return ((ch
) < 128 ? (unsigned int)sre_char_lower
[ch
] : ch
);
157 /* locale-specific character predicates */
158 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
159 * warnings when c's type supports only numbers < N+1 */
160 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
161 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
162 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
163 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
164 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
166 static unsigned int sre_lower_locale(unsigned int ch
)
168 return ((ch
) < 256 ? (unsigned int)tolower((ch
)) : ch
);
171 /* unicode-specific character predicates */
173 #if defined(HAVE_UNICODE)
175 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
176 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
177 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
178 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
179 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
181 static unsigned int sre_lower_unicode(unsigned int ch
)
183 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE
)(ch
));
189 sre_category(SRE_CODE category
, unsigned int ch
)
193 case SRE_CATEGORY_DIGIT
:
194 return SRE_IS_DIGIT(ch
);
195 case SRE_CATEGORY_NOT_DIGIT
:
196 return !SRE_IS_DIGIT(ch
);
197 case SRE_CATEGORY_SPACE
:
198 return SRE_IS_SPACE(ch
);
199 case SRE_CATEGORY_NOT_SPACE
:
200 return !SRE_IS_SPACE(ch
);
201 case SRE_CATEGORY_WORD
:
202 return SRE_IS_WORD(ch
);
203 case SRE_CATEGORY_NOT_WORD
:
204 return !SRE_IS_WORD(ch
);
205 case SRE_CATEGORY_LINEBREAK
:
206 return SRE_IS_LINEBREAK(ch
);
207 case SRE_CATEGORY_NOT_LINEBREAK
:
208 return !SRE_IS_LINEBREAK(ch
);
210 case SRE_CATEGORY_LOC_WORD
:
211 return SRE_LOC_IS_WORD(ch
);
212 case SRE_CATEGORY_LOC_NOT_WORD
:
213 return !SRE_LOC_IS_WORD(ch
);
215 #if defined(HAVE_UNICODE)
216 case SRE_CATEGORY_UNI_DIGIT
:
217 return SRE_UNI_IS_DIGIT(ch
);
218 case SRE_CATEGORY_UNI_NOT_DIGIT
:
219 return !SRE_UNI_IS_DIGIT(ch
);
220 case SRE_CATEGORY_UNI_SPACE
:
221 return SRE_UNI_IS_SPACE(ch
);
222 case SRE_CATEGORY_UNI_NOT_SPACE
:
223 return !SRE_UNI_IS_SPACE(ch
);
224 case SRE_CATEGORY_UNI_WORD
:
225 return SRE_UNI_IS_WORD(ch
);
226 case SRE_CATEGORY_UNI_NOT_WORD
:
227 return !SRE_UNI_IS_WORD(ch
);
228 case SRE_CATEGORY_UNI_LINEBREAK
:
229 return SRE_UNI_IS_LINEBREAK(ch
);
230 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
231 return !SRE_UNI_IS_LINEBREAK(ch
);
233 case SRE_CATEGORY_UNI_DIGIT
:
234 return SRE_IS_DIGIT(ch
);
235 case SRE_CATEGORY_UNI_NOT_DIGIT
:
236 return !SRE_IS_DIGIT(ch
);
237 case SRE_CATEGORY_UNI_SPACE
:
238 return SRE_IS_SPACE(ch
);
239 case SRE_CATEGORY_UNI_NOT_SPACE
:
240 return !SRE_IS_SPACE(ch
);
241 case SRE_CATEGORY_UNI_WORD
:
242 return SRE_LOC_IS_WORD(ch
);
243 case SRE_CATEGORY_UNI_NOT_WORD
:
244 return !SRE_LOC_IS_WORD(ch
);
245 case SRE_CATEGORY_UNI_LINEBREAK
:
246 return SRE_IS_LINEBREAK(ch
);
247 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
248 return !SRE_IS_LINEBREAK(ch
);
257 data_stack_dealloc(SRE_STATE
* state
)
259 if (state
->data_stack
) {
260 PyMem_FREE(state
->data_stack
);
261 state
->data_stack
= NULL
;
263 state
->data_stack_size
= state
->data_stack_base
= 0;
267 data_stack_grow(SRE_STATE
* state
, Py_ssize_t size
)
269 Py_ssize_t minsize
, cursize
;
270 minsize
= state
->data_stack_base
+size
;
271 cursize
= state
->data_stack_size
;
272 if (cursize
< minsize
) {
274 cursize
= minsize
+minsize
/4+1024;
275 TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T
"d\n", cursize
));
276 stack
= PyMem_REALLOC(state
->data_stack
, cursize
);
278 data_stack_dealloc(state
);
279 return SRE_ERROR_MEMORY
;
281 state
->data_stack
= (char *)stack
;
282 state
->data_stack_size
= cursize
;
287 /* generate 8-bit version */
289 #define SRE_CHAR unsigned char
290 #define SRE_AT sre_at
291 #define SRE_COUNT sre_count
292 #define SRE_CHARSET sre_charset
293 #define SRE_INFO sre_info
294 #define SRE_MATCH sre_match
295 #define SRE_MATCH_CONTEXT sre_match_context
296 #define SRE_SEARCH sre_search
297 #define SRE_LITERAL_TEMPLATE sre_literal_template
299 #if defined(HAVE_UNICODE)
301 #define SRE_RECURSIVE
305 #undef SRE_LITERAL_TEMPLATE
308 #undef SRE_MATCH_CONTEXT
315 /* generate 16-bit unicode version */
317 #define SRE_CHAR Py_UNICODE
318 #define SRE_AT sre_uat
319 #define SRE_COUNT sre_ucount
320 #define SRE_CHARSET sre_ucharset
321 #define SRE_INFO sre_uinfo
322 #define SRE_MATCH sre_umatch
323 #define SRE_MATCH_CONTEXT sre_umatch_context
324 #define SRE_SEARCH sre_usearch
325 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
328 #endif /* SRE_RECURSIVE */
330 /* -------------------------------------------------------------------- */
331 /* String matching engine */
333 /* the following section is compiled twice, with different character
337 SRE_AT(SRE_STATE
* state
, SRE_CHAR
* ptr
, SRE_CODE at
)
339 /* check if pointer is at given position */
341 Py_ssize_t thisp
, thatp
;
345 case SRE_AT_BEGINNING
:
346 case SRE_AT_BEGINNING_STRING
:
347 return ((void*) ptr
== state
->beginning
);
349 case SRE_AT_BEGINNING_LINE
:
350 return ((void*) ptr
== state
->beginning
||
351 SRE_IS_LINEBREAK((int) ptr
[-1]));
354 return (((void*) (ptr
+1) == state
->end
&&
355 SRE_IS_LINEBREAK((int) ptr
[0])) ||
356 ((void*) ptr
== state
->end
));
358 case SRE_AT_END_LINE
:
359 return ((void*) ptr
== state
->end
||
360 SRE_IS_LINEBREAK((int) ptr
[0]));
362 case SRE_AT_END_STRING
:
363 return ((void*) ptr
== state
->end
);
365 case SRE_AT_BOUNDARY
:
366 if (state
->beginning
== state
->end
)
368 thatp
= ((void*) ptr
> state
->beginning
) ?
369 SRE_IS_WORD((int) ptr
[-1]) : 0;
370 thisp
= ((void*) ptr
< state
->end
) ?
371 SRE_IS_WORD((int) ptr
[0]) : 0;
372 return thisp
!= thatp
;
374 case SRE_AT_NON_BOUNDARY
:
375 if (state
->beginning
== state
->end
)
377 thatp
= ((void*) ptr
> state
->beginning
) ?
378 SRE_IS_WORD((int) ptr
[-1]) : 0;
379 thisp
= ((void*) ptr
< state
->end
) ?
380 SRE_IS_WORD((int) ptr
[0]) : 0;
381 return thisp
== thatp
;
383 case SRE_AT_LOC_BOUNDARY
:
384 if (state
->beginning
== state
->end
)
386 thatp
= ((void*) ptr
> state
->beginning
) ?
387 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
388 thisp
= ((void*) ptr
< state
->end
) ?
389 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
390 return thisp
!= thatp
;
392 case SRE_AT_LOC_NON_BOUNDARY
:
393 if (state
->beginning
== state
->end
)
395 thatp
= ((void*) ptr
> state
->beginning
) ?
396 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
397 thisp
= ((void*) ptr
< state
->end
) ?
398 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
399 return thisp
== thatp
;
401 #if defined(HAVE_UNICODE)
402 case SRE_AT_UNI_BOUNDARY
:
403 if (state
->beginning
== state
->end
)
405 thatp
= ((void*) ptr
> state
->beginning
) ?
406 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
407 thisp
= ((void*) ptr
< state
->end
) ?
408 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
409 return thisp
!= thatp
;
411 case SRE_AT_UNI_NON_BOUNDARY
:
412 if (state
->beginning
== state
->end
)
414 thatp
= ((void*) ptr
> state
->beginning
) ?
415 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
416 thisp
= ((void*) ptr
< state
->end
) ?
417 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
418 return thisp
== thatp
;
427 SRE_CHARSET(SRE_CODE
* set
, SRE_CODE ch
)
429 /* check if character is a member of the given set */
440 /* <LITERAL> <code> */
446 case SRE_OP_CATEGORY
:
447 /* <CATEGORY> <code> */
448 if (sre_category(set
[0], (int) ch
))
454 if (sizeof(SRE_CODE
) == 2) {
455 /* <CHARSET> <bitmap> (16 bits per code word) */
456 if (ch
< 256 && (set
[ch
>> 4] & (1 << (ch
& 15))))
461 /* <CHARSET> <bitmap> (32 bits per code word) */
462 if (ch
< 256 && (set
[ch
>> 5] & (1u << (ch
& 31))))
469 /* <RANGE> <lower> <upper> */
470 if (set
[0] <= ch
&& ch
<= set
[1])
479 case SRE_OP_BIGCHARSET
:
480 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
482 Py_ssize_t count
, block
;
485 if (sizeof(SRE_CODE
) == 2) {
486 block
= ((unsigned char*)set
)[ch
>> 8];
488 if (set
[block
*16 + ((ch
& 255)>>4)] & (1 << (ch
& 15)))
493 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
494 * warnings when c's type supports only numbers < N+1 */
496 block
= ((unsigned char*)set
)[ch
>> 8];
501 (set
[block
*8 + ((ch
& 255)>>5)] & (1u << (ch
& 31))))
509 /* internal error -- there's not much we can do about it
510 here, so let's just pretend it didn't match... */
516 LOCAL(Py_ssize_t
) SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
);
519 SRE_COUNT(SRE_STATE
* state
, SRE_CODE
* pattern
, Py_ssize_t maxcount
)
522 SRE_CHAR
* ptr
= (SRE_CHAR
*)state
->ptr
;
523 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
527 if (maxcount
< end
- ptr
&& maxcount
!= SRE_MAXREPEAT
)
528 end
= ptr
+ maxcount
;
530 switch (pattern
[0]) {
534 TRACE(("|%p|%p|COUNT IN\n", pattern
, ptr
));
535 while (ptr
< end
&& SRE_CHARSET(pattern
+ 2, *ptr
))
540 /* repeated dot wildcard. */
541 TRACE(("|%p|%p|COUNT ANY\n", pattern
, ptr
));
542 while (ptr
< end
&& !SRE_IS_LINEBREAK(*ptr
))
547 /* repeated dot wildcard. skip to the end of the target
548 string, and backtrack from there */
549 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern
, ptr
));
554 /* repeated literal */
556 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern
, ptr
, chr
));
557 while (ptr
< end
&& (SRE_CODE
) *ptr
== chr
)
561 case SRE_OP_LITERAL_IGNORE
:
562 /* repeated literal */
564 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
565 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) == chr
)
569 case SRE_OP_NOT_LITERAL
:
570 /* repeated non-literal */
572 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern
, ptr
, chr
));
573 while (ptr
< end
&& (SRE_CODE
) *ptr
!= chr
)
577 case SRE_OP_NOT_LITERAL_IGNORE
:
578 /* repeated non-literal */
580 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
581 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) != chr
)
586 /* repeated single character pattern */
587 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern
, ptr
));
588 while ((SRE_CHAR
*) state
->ptr
< end
) {
589 i
= SRE_MATCH(state
, pattern
);
595 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T
"d\n", pattern
, ptr
,
596 (SRE_CHAR
*) state
->ptr
- ptr
));
597 return (SRE_CHAR
*) state
->ptr
- ptr
;
600 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T
"d\n", pattern
, ptr
,
601 ptr
- (SRE_CHAR
*) state
->ptr
));
602 return ptr
- (SRE_CHAR
*) state
->ptr
;
605 #if 0 /* not used in this release */
607 SRE_INFO(SRE_STATE
* state
, SRE_CODE
* pattern
)
609 /* check if an SRE_OP_INFO block matches at the current position.
610 returns the number of SRE_CODE objects to skip if successful, 0
613 SRE_CHAR
* end
= state
->end
;
614 SRE_CHAR
* ptr
= state
->ptr
;
617 /* check minimal length */
618 if (pattern
[3] && (end
- ptr
) < pattern
[3])
621 /* check known prefix */
622 if (pattern
[2] & SRE_INFO_PREFIX
&& pattern
[5] > 1) {
623 /* <length> <skip> <prefix data> <overlap data> */
624 for (i
= 0; i
< pattern
[5]; i
++)
625 if ((SRE_CODE
) ptr
[i
] != pattern
[7 + i
])
627 return pattern
[0] + 2 * pattern
[6];
633 /* The macros below should be used to protect recursive SRE_MATCH()
634 * calls that *failed* and do *not* return immediately (IOW, those
635 * that will backtrack). Explaining:
637 * - Recursive SRE_MATCH() returned true: that's usually a success
638 * (besides atypical cases like ASSERT_NOT), therefore there's no
639 * reason to restore lastmark;
641 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
642 * is returning to the caller: If the current SRE_MATCH() is the
643 * top function of the recursion, returning false will be a matching
644 * failure, and it doesn't matter where lastmark is pointing to.
645 * If it's *not* the top function, it will be a recursive SRE_MATCH()
646 * failure by itself, and the calling SRE_MATCH() will have to deal
647 * with the failure by the same rules explained here (it will restore
648 * lastmark by itself if necessary);
650 * - Recursive SRE_MATCH() returned false, and will continue the
651 * outside 'for' loop: must be protected when breaking, since the next
652 * OP could potentially depend on lastmark;
654 * - Recursive SRE_MATCH() returned false, and will be called again
655 * inside a local for/while loop: must be protected between each
656 * loop iteration, since the recursive SRE_MATCH() could do anything,
657 * and could potentially depend on lastmark.
659 * For more information, check the discussion at SF patch #712900.
661 #define LASTMARK_SAVE() \
663 ctx->lastmark = state->lastmark; \
664 ctx->lastindex = state->lastindex; \
666 #define LASTMARK_RESTORE() \
668 state->lastmark = ctx->lastmark; \
669 state->lastindex = ctx->lastindex; \
672 #define RETURN_ERROR(i) do { return i; } while(0)
673 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
674 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
676 #define RETURN_ON_ERROR(i) \
677 do { if (i < 0) RETURN_ERROR(i); } while (0)
678 #define RETURN_ON_SUCCESS(i) \
679 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
680 #define RETURN_ON_FAILURE(i) \
681 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
685 #define DATA_STACK_ALLOC(state, type, ptr) \
687 alloc_pos = state->data_stack_base; \
688 TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
689 "(%" PY_FORMAT_SIZE_T "d)\n", \
690 SFY(type), alloc_pos, sizeof(type))); \
691 if (sizeof(type) > state->data_stack_size - alloc_pos) { \
692 int j = data_stack_grow(state, sizeof(type)); \
693 if (j < 0) return j; \
695 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
697 ptr = (type*)(state->data_stack+alloc_pos); \
698 state->data_stack_base += sizeof(type); \
701 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
703 TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", SFY(type), pos)); \
704 ptr = (type*)(state->data_stack+pos); \
707 #define DATA_STACK_PUSH(state, data, size) \
709 TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
710 "(%" PY_FORMAT_SIZE_T "d)\n", \
711 data, state->data_stack_base, size)); \
712 if (size > state->data_stack_size - state->data_stack_base) { \
713 int j = data_stack_grow(state, size); \
714 if (j < 0) return j; \
716 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
718 memcpy(state->data_stack+state->data_stack_base, data, size); \
719 state->data_stack_base += size; \
722 #define DATA_STACK_POP(state, data, size, discard) \
724 TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
725 "(%" PY_FORMAT_SIZE_T "d)\n", \
726 data, state->data_stack_base-size, size)); \
727 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
729 state->data_stack_base -= size; \
732 #define DATA_STACK_POP_DISCARD(state, size) \
734 TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
735 "(%" PY_FORMAT_SIZE_T "d)\n", \
736 state->data_stack_base-size, size)); \
737 state->data_stack_base -= size; \
740 #define DATA_PUSH(x) \
741 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
742 #define DATA_POP(x) \
743 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
744 #define DATA_POP_DISCARD(x) \
745 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
746 #define DATA_ALLOC(t,p) \
747 DATA_STACK_ALLOC(state, t, p)
748 #define DATA_LOOKUP_AT(t,p,pos) \
749 DATA_STACK_LOOKUP_AT(state,t,p,pos)
751 #define MARK_PUSH(lastmark) \
752 do if (lastmark > 0) { \
753 i = lastmark; /* ctx->lastmark may change if reallocated */ \
754 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
756 #define MARK_POP(lastmark) \
757 do if (lastmark > 0) { \
758 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
760 #define MARK_POP_KEEP(lastmark) \
761 do if (lastmark > 0) { \
762 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
764 #define MARK_POP_DISCARD(lastmark) \
765 do if (lastmark > 0) { \
766 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
770 #define JUMP_MAX_UNTIL_1 1
771 #define JUMP_MAX_UNTIL_2 2
772 #define JUMP_MAX_UNTIL_3 3
773 #define JUMP_MIN_UNTIL_1 4
774 #define JUMP_MIN_UNTIL_2 5
775 #define JUMP_MIN_UNTIL_3 6
776 #define JUMP_REPEAT 7
777 #define JUMP_REPEAT_ONE_1 8
778 #define JUMP_REPEAT_ONE_2 9
779 #define JUMP_MIN_REPEAT_ONE 10
780 #define JUMP_BRANCH 11
781 #define JUMP_ASSERT 12
782 #define JUMP_ASSERT_NOT 13
784 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
785 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
786 nextctx->last_ctx_pos = ctx_pos; \
787 nextctx->jump = jumpvalue; \
788 nextctx->pattern = nextpattern; \
789 ctx_pos = alloc_pos; \
793 while (0) /* gcc doesn't like labels at end of scopes */ \
796 Py_ssize_t last_ctx_pos
;
802 Py_ssize_t lastindex
;
809 /* check if string matches the given pattern. returns <0 for
810 error, 0 for failure, and 1 for success */
812 SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
814 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
815 Py_ssize_t alloc_pos
, ctx_pos
= -1;
816 Py_ssize_t i
, ret
= 0;
818 unsigned int sigcount
=0;
820 SRE_MATCH_CONTEXT
* ctx
;
821 SRE_MATCH_CONTEXT
* nextctx
;
823 TRACE(("|%p|%p|ENTER\n", pattern
, state
->ptr
));
825 DATA_ALLOC(SRE_MATCH_CONTEXT
, ctx
);
826 ctx
->last_ctx_pos
= -1;
827 ctx
->jump
= JUMP_NONE
;
828 ctx
->pattern
= pattern
;
833 ctx
->ptr
= (SRE_CHAR
*)state
->ptr
;
835 if (ctx
->pattern
[0] == SRE_OP_INFO
) {
836 /* optimization info block */
837 /* <INFO> <1=skip> <2=flags> <3=min> ... */
838 if (ctx
->pattern
[3] && (end
- ctx
->ptr
) < ctx
->pattern
[3]) {
839 TRACE(("reject (got %" PY_FORMAT_SIZE_T
"d chars, "
840 "need %" PY_FORMAT_SIZE_T
"d)\n",
841 (end
- ctx
->ptr
), (Py_ssize_t
) ctx
->pattern
[3]));
844 ctx
->pattern
+= ctx
->pattern
[1] + 1;
849 if ((0 == (sigcount
& 0xfff)) && PyErr_CheckSignals())
850 RETURN_ERROR(SRE_ERROR_INTERRUPTED
);
852 switch (*ctx
->pattern
++) {
857 TRACE(("|%p|%p|MARK %d\n", ctx
->pattern
,
858 ctx
->ptr
, ctx
->pattern
[0]));
861 state
->lastindex
= i
/2 + 1;
862 if (i
> state
->lastmark
) {
863 /* state->lastmark is the highest valid index in the
864 state->mark array. If it is increased by more than 1,
865 the intervening marks must be set to NULL to signal
866 that these marks have not been encountered. */
867 Py_ssize_t j
= state
->lastmark
+ 1;
869 state
->mark
[j
++] = NULL
;
872 state
->mark
[i
] = ctx
->ptr
;
877 /* match literal string */
878 /* <LITERAL> <code> */
879 TRACE(("|%p|%p|LITERAL %d\n", ctx
->pattern
,
880 ctx
->ptr
, *ctx
->pattern
));
881 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] != ctx
->pattern
[0])
887 case SRE_OP_NOT_LITERAL
:
888 /* match anything that is not literal character */
889 /* <NOT_LITERAL> <code> */
890 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx
->pattern
,
891 ctx
->ptr
, *ctx
->pattern
));
892 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] == ctx
->pattern
[0])
900 TRACE(("|%p|%p|SUCCESS\n", ctx
->pattern
, ctx
->ptr
));
901 state
->ptr
= ctx
->ptr
;
905 /* match at given position */
907 TRACE(("|%p|%p|AT %d\n", ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
908 if (!SRE_AT(state
, ctx
->ptr
, *ctx
->pattern
))
913 case SRE_OP_CATEGORY
:
914 /* match at given category */
915 /* <CATEGORY> <code> */
916 TRACE(("|%p|%p|CATEGORY %d\n", ctx
->pattern
,
917 ctx
->ptr
, *ctx
->pattern
));
918 if (ctx
->ptr
>= end
|| !sre_category(ctx
->pattern
[0], ctx
->ptr
[0]))
925 /* match anything (except a newline) */
927 TRACE(("|%p|%p|ANY\n", ctx
->pattern
, ctx
->ptr
));
928 if (ctx
->ptr
>= end
|| SRE_IS_LINEBREAK(ctx
->ptr
[0]))
936 TRACE(("|%p|%p|ANY_ALL\n", ctx
->pattern
, ctx
->ptr
));
943 /* match set member (or non_member) */
944 /* <IN> <skip> <set> */
945 TRACE(("|%p|%p|IN\n", ctx
->pattern
, ctx
->ptr
));
946 if (ctx
->ptr
>= end
|| !SRE_CHARSET(ctx
->pattern
+ 1, *ctx
->ptr
))
948 ctx
->pattern
+= ctx
->pattern
[0];
952 case SRE_OP_LITERAL_IGNORE
:
953 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
954 ctx
->pattern
, ctx
->ptr
, ctx
->pattern
[0]));
955 if (ctx
->ptr
>= end
||
956 state
->lower(*ctx
->ptr
) != state
->lower(*ctx
->pattern
))
962 case SRE_OP_NOT_LITERAL_IGNORE
:
963 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
964 ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
965 if (ctx
->ptr
>= end
||
966 state
->lower(*ctx
->ptr
) == state
->lower(*ctx
->pattern
))
972 case SRE_OP_IN_IGNORE
:
973 TRACE(("|%p|%p|IN_IGNORE\n", ctx
->pattern
, ctx
->ptr
));
975 || !SRE_CHARSET(ctx
->pattern
+1,
976 (SRE_CODE
)state
->lower(*ctx
->ptr
)))
978 ctx
->pattern
+= ctx
->pattern
[0];
985 /* <JUMP> <offset> */
986 TRACE(("|%p|%p|JUMP %d\n", ctx
->pattern
,
987 ctx
->ptr
, ctx
->pattern
[0]));
988 ctx
->pattern
+= ctx
->pattern
[0];
993 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
994 TRACE(("|%p|%p|BRANCH\n", ctx
->pattern
, ctx
->ptr
));
996 ctx
->u
.rep
= state
->repeat
;
998 MARK_PUSH(ctx
->lastmark
);
999 for (; ctx
->pattern
[0]; ctx
->pattern
+= ctx
->pattern
[0]) {
1000 if (ctx
->pattern
[1] == SRE_OP_LITERAL
&&
1002 (SRE_CODE
) *ctx
->ptr
!= ctx
->pattern
[2]))
1004 if (ctx
->pattern
[1] == SRE_OP_IN
&&
1006 !SRE_CHARSET(ctx
->pattern
+ 3, (SRE_CODE
) *ctx
->ptr
)))
1008 state
->ptr
= ctx
->ptr
;
1009 DO_JUMP(JUMP_BRANCH
, jump_branch
, ctx
->pattern
+1);
1012 MARK_POP_DISCARD(ctx
->lastmark
);
1013 RETURN_ON_ERROR(ret
);
1017 MARK_POP_KEEP(ctx
->lastmark
);
1021 MARK_POP_DISCARD(ctx
->lastmark
);
1024 case SRE_OP_REPEAT_ONE
:
1025 /* match repeated sequence (maximizing regexp) */
1027 /* this operator only works if the repeated item is
1028 exactly one character wide, and we're not already
1029 collecting backtracking points. for other cases,
1030 use the MAX_REPEAT operator */
1032 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1034 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1035 ctx
->pattern
[1], ctx
->pattern
[2]));
1037 if ((Py_ssize_t
) ctx
->pattern
[1] > end
- ctx
->ptr
)
1038 RETURN_FAILURE
; /* cannot match */
1040 state
->ptr
= ctx
->ptr
;
1042 ret
= SRE_COUNT(state
, ctx
->pattern
+3, ctx
->pattern
[2]);
1043 RETURN_ON_ERROR(ret
);
1044 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1046 ctx
->ptr
+= ctx
->count
;
1048 /* when we arrive here, count contains the number of
1049 matches, and ctx->ptr points to the tail of the target
1050 string. check if the rest of the pattern matches,
1051 and backtrack if not. */
1053 if (ctx
->count
< (Py_ssize_t
) ctx
->pattern
[1])
1056 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1057 /* tail is empty. we're finished */
1058 state
->ptr
= ctx
->ptr
;
1064 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_LITERAL
) {
1065 /* tail starts with a literal. skip positions where
1066 the rest of the pattern cannot possibly match */
1067 ctx
->u
.chr
= ctx
->pattern
[ctx
->pattern
[0]+1];
1069 while (ctx
->count
>= (Py_ssize_t
) ctx
->pattern
[1] &&
1070 (ctx
->ptr
>= end
|| *ctx
->ptr
!= ctx
->u
.chr
)) {
1074 if (ctx
->count
< (Py_ssize_t
) ctx
->pattern
[1])
1076 state
->ptr
= ctx
->ptr
;
1077 DO_JUMP(JUMP_REPEAT_ONE_1
, jump_repeat_one_1
,
1078 ctx
->pattern
+ctx
->pattern
[0]);
1080 RETURN_ON_ERROR(ret
);
1092 while (ctx
->count
>= (Py_ssize_t
) ctx
->pattern
[1]) {
1093 state
->ptr
= ctx
->ptr
;
1094 DO_JUMP(JUMP_REPEAT_ONE_2
, jump_repeat_one_2
,
1095 ctx
->pattern
+ctx
->pattern
[0]);
1097 RETURN_ON_ERROR(ret
);
1107 case SRE_OP_MIN_REPEAT_ONE
:
1108 /* match repeated sequence (minimizing regexp) */
1110 /* this operator only works if the repeated item is
1111 exactly one character wide, and we're not already
1112 collecting backtracking points. for other cases,
1113 use the MIN_REPEAT operator */
1115 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1117 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1118 ctx
->pattern
[1], ctx
->pattern
[2]));
1120 if ((Py_ssize_t
) ctx
->pattern
[1] > end
- ctx
->ptr
)
1121 RETURN_FAILURE
; /* cannot match */
1123 state
->ptr
= ctx
->ptr
;
1125 if (ctx
->pattern
[1] == 0)
1128 /* count using pattern min as the maximum */
1129 ret
= SRE_COUNT(state
, ctx
->pattern
+3, ctx
->pattern
[1]);
1130 RETURN_ON_ERROR(ret
);
1131 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1132 if (ret
< (Py_ssize_t
) ctx
->pattern
[1])
1133 /* didn't match minimum number of times */
1135 /* advance past minimum matches of repeat */
1137 ctx
->ptr
+= ctx
->count
;
1140 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1141 /* tail is empty. we're finished */
1142 state
->ptr
= ctx
->ptr
;
1148 while ((Py_ssize_t
)ctx
->pattern
[2] == SRE_MAXREPEAT
1149 || ctx
->count
<= (Py_ssize_t
)ctx
->pattern
[2]) {
1150 state
->ptr
= ctx
->ptr
;
1151 DO_JUMP(JUMP_MIN_REPEAT_ONE
,jump_min_repeat_one
,
1152 ctx
->pattern
+ctx
->pattern
[0]);
1154 RETURN_ON_ERROR(ret
);
1157 state
->ptr
= ctx
->ptr
;
1158 ret
= SRE_COUNT(state
, ctx
->pattern
+3, 1);
1159 RETURN_ON_ERROR(ret
);
1160 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1172 /* create repeat context. all the hard work is done
1173 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1174 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1175 TRACE(("|%p|%p|REPEAT %d %d\n", ctx
->pattern
, ctx
->ptr
,
1176 ctx
->pattern
[1], ctx
->pattern
[2]));
1178 /* install new repeat context */
1179 ctx
->u
.rep
= (SRE_REPEAT
*) PyObject_MALLOC(sizeof(*ctx
->u
.rep
));
1184 ctx
->u
.rep
->count
= -1;
1185 ctx
->u
.rep
->pattern
= ctx
->pattern
;
1186 ctx
->u
.rep
->prev
= state
->repeat
;
1187 ctx
->u
.rep
->last_ptr
= NULL
;
1188 state
->repeat
= ctx
->u
.rep
;
1190 state
->ptr
= ctx
->ptr
;
1191 DO_JUMP(JUMP_REPEAT
, jump_repeat
, ctx
->pattern
+ctx
->pattern
[0]);
1192 state
->repeat
= ctx
->u
.rep
->prev
;
1193 PyObject_FREE(ctx
->u
.rep
);
1196 RETURN_ON_ERROR(ret
);
1201 case SRE_OP_MAX_UNTIL
:
1202 /* maximizing repeat */
1203 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1205 /* FIXME: we probably need to deal with zero-width
1206 matches in here... */
1208 ctx
->u
.rep
= state
->repeat
;
1210 RETURN_ERROR(SRE_ERROR_STATE
);
1212 state
->ptr
= ctx
->ptr
;
1214 ctx
->count
= ctx
->u
.rep
->count
+1;
1216 TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T
"d\n", ctx
->pattern
,
1217 ctx
->ptr
, ctx
->count
));
1219 if (ctx
->count
< (Py_ssize_t
) ctx
->u
.rep
->pattern
[1]) {
1220 /* not enough matches */
1221 ctx
->u
.rep
->count
= ctx
->count
;
1222 DO_JUMP(JUMP_MAX_UNTIL_1
, jump_max_until_1
,
1223 ctx
->u
.rep
->pattern
+3);
1225 RETURN_ON_ERROR(ret
);
1228 ctx
->u
.rep
->count
= ctx
->count
-1;
1229 state
->ptr
= ctx
->ptr
;
1233 if ((ctx
->count
< (Py_ssize_t
) ctx
->u
.rep
->pattern
[2] ||
1234 ctx
->u
.rep
->pattern
[2] == SRE_MAXREPEAT
) &&
1235 state
->ptr
!= ctx
->u
.rep
->last_ptr
) {
1236 /* we may have enough matches, but if we can
1237 match another item, do so */
1238 ctx
->u
.rep
->count
= ctx
->count
;
1240 MARK_PUSH(ctx
->lastmark
);
1241 /* zero-width match protection */
1242 DATA_PUSH(&ctx
->u
.rep
->last_ptr
);
1243 ctx
->u
.rep
->last_ptr
= state
->ptr
;
1244 DO_JUMP(JUMP_MAX_UNTIL_2
, jump_max_until_2
,
1245 ctx
->u
.rep
->pattern
+3);
1246 DATA_POP(&ctx
->u
.rep
->last_ptr
);
1248 MARK_POP_DISCARD(ctx
->lastmark
);
1249 RETURN_ON_ERROR(ret
);
1252 MARK_POP(ctx
->lastmark
);
1254 ctx
->u
.rep
->count
= ctx
->count
-1;
1255 state
->ptr
= ctx
->ptr
;
1258 /* cannot match more repeated items here. make sure the
1260 state
->repeat
= ctx
->u
.rep
->prev
;
1261 DO_JUMP(JUMP_MAX_UNTIL_3
, jump_max_until_3
, ctx
->pattern
);
1262 RETURN_ON_SUCCESS(ret
);
1263 state
->repeat
= ctx
->u
.rep
;
1264 state
->ptr
= ctx
->ptr
;
1267 case SRE_OP_MIN_UNTIL
:
1268 /* minimizing repeat */
1269 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1271 ctx
->u
.rep
= state
->repeat
;
1273 RETURN_ERROR(SRE_ERROR_STATE
);
1275 state
->ptr
= ctx
->ptr
;
1277 ctx
->count
= ctx
->u
.rep
->count
+1;
1279 TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T
"d %p\n", ctx
->pattern
,
1280 ctx
->ptr
, ctx
->count
, ctx
->u
.rep
->pattern
));
1282 if (ctx
->count
< (Py_ssize_t
) ctx
->u
.rep
->pattern
[1]) {
1283 /* not enough matches */
1284 ctx
->u
.rep
->count
= ctx
->count
;
1285 DO_JUMP(JUMP_MIN_UNTIL_1
, jump_min_until_1
,
1286 ctx
->u
.rep
->pattern
+3);
1288 RETURN_ON_ERROR(ret
);
1291 ctx
->u
.rep
->count
= ctx
->count
-1;
1292 state
->ptr
= ctx
->ptr
;
1298 /* see if the tail matches */
1299 state
->repeat
= ctx
->u
.rep
->prev
;
1300 DO_JUMP(JUMP_MIN_UNTIL_2
, jump_min_until_2
, ctx
->pattern
);
1302 RETURN_ON_ERROR(ret
);
1306 state
->repeat
= ctx
->u
.rep
;
1307 state
->ptr
= ctx
->ptr
;
1311 if ((ctx
->count
>= (Py_ssize_t
) ctx
->u
.rep
->pattern
[2]
1312 && ctx
->u
.rep
->pattern
[2] != SRE_MAXREPEAT
) ||
1313 state
->ptr
== ctx
->u
.rep
->last_ptr
)
1316 ctx
->u
.rep
->count
= ctx
->count
;
1317 /* zero-width match protection */
1318 DATA_PUSH(&ctx
->u
.rep
->last_ptr
);
1319 ctx
->u
.rep
->last_ptr
= state
->ptr
;
1320 DO_JUMP(JUMP_MIN_UNTIL_3
,jump_min_until_3
,
1321 ctx
->u
.rep
->pattern
+3);
1322 DATA_POP(&ctx
->u
.rep
->last_ptr
);
1324 RETURN_ON_ERROR(ret
);
1327 ctx
->u
.rep
->count
= ctx
->count
-1;
1328 state
->ptr
= ctx
->ptr
;
1331 case SRE_OP_GROUPREF
:
1332 /* match backreference */
1333 TRACE(("|%p|%p|GROUPREF %d\n", ctx
->pattern
,
1334 ctx
->ptr
, ctx
->pattern
[0]));
1335 i
= ctx
->pattern
[0];
1337 Py_ssize_t groupref
= i
+i
;
1338 if (groupref
>= state
->lastmark
) {
1341 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1342 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1343 if (!p
|| !e
|| e
< p
)
1346 if (ctx
->ptr
>= end
|| *ctx
->ptr
!= *p
)
1355 case SRE_OP_GROUPREF_IGNORE
:
1356 /* match backreference */
1357 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx
->pattern
,
1358 ctx
->ptr
, ctx
->pattern
[0]));
1359 i
= ctx
->pattern
[0];
1361 Py_ssize_t groupref
= i
+i
;
1362 if (groupref
>= state
->lastmark
) {
1365 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1366 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1367 if (!p
|| !e
|| e
< p
)
1370 if (ctx
->ptr
>= end
||
1371 state
->lower(*ctx
->ptr
) != state
->lower(*p
))
1380 case SRE_OP_GROUPREF_EXISTS
:
1381 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx
->pattern
,
1382 ctx
->ptr
, ctx
->pattern
[0]));
1383 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1384 i
= ctx
->pattern
[0];
1386 Py_ssize_t groupref
= i
+i
;
1387 if (groupref
>= state
->lastmark
) {
1388 ctx
->pattern
+= ctx
->pattern
[1];
1391 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1392 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1393 if (!p
|| !e
|| e
< p
) {
1394 ctx
->pattern
+= ctx
->pattern
[1];
1403 /* assert subpattern */
1404 /* <ASSERT> <skip> <back> <pattern> */
1405 TRACE(("|%p|%p|ASSERT %d\n", ctx
->pattern
,
1406 ctx
->ptr
, ctx
->pattern
[1]));
1407 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1408 if (state
->ptr
< state
->beginning
)
1410 DO_JUMP(JUMP_ASSERT
, jump_assert
, ctx
->pattern
+2);
1411 RETURN_ON_FAILURE(ret
);
1412 ctx
->pattern
+= ctx
->pattern
[0];
1415 case SRE_OP_ASSERT_NOT
:
1416 /* assert not subpattern */
1417 /* <ASSERT_NOT> <skip> <back> <pattern> */
1418 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx
->pattern
,
1419 ctx
->ptr
, ctx
->pattern
[1]));
1420 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1421 if (state
->ptr
>= state
->beginning
) {
1422 DO_JUMP(JUMP_ASSERT_NOT
, jump_assert_not
, ctx
->pattern
+2);
1424 RETURN_ON_ERROR(ret
);
1428 ctx
->pattern
+= ctx
->pattern
[0];
1431 case SRE_OP_FAILURE
:
1432 /* immediate failure */
1433 TRACE(("|%p|%p|FAILURE\n", ctx
->pattern
, ctx
->ptr
));
1437 TRACE(("|%p|%p|UNKNOWN %d\n", ctx
->pattern
, ctx
->ptr
,
1439 RETURN_ERROR(SRE_ERROR_ILLEGAL
);
1444 ctx_pos
= ctx
->last_ctx_pos
;
1446 DATA_POP_DISCARD(ctx
);
1449 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1452 case JUMP_MAX_UNTIL_2
:
1453 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1454 goto jump_max_until_2
;
1455 case JUMP_MAX_UNTIL_3
:
1456 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1457 goto jump_max_until_3
;
1458 case JUMP_MIN_UNTIL_2
:
1459 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1460 goto jump_min_until_2
;
1461 case JUMP_MIN_UNTIL_3
:
1462 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1463 goto jump_min_until_3
;
1465 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx
->pattern
, ctx
->ptr
));
1467 case JUMP_MAX_UNTIL_1
:
1468 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1469 goto jump_max_until_1
;
1470 case JUMP_MIN_UNTIL_1
:
1471 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1472 goto jump_min_until_1
;
1474 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx
->pattern
, ctx
->ptr
));
1476 case JUMP_REPEAT_ONE_1
:
1477 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx
->pattern
, ctx
->ptr
));
1478 goto jump_repeat_one_1
;
1479 case JUMP_REPEAT_ONE_2
:
1480 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx
->pattern
, ctx
->ptr
));
1481 goto jump_repeat_one_2
;
1482 case JUMP_MIN_REPEAT_ONE
:
1483 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx
->pattern
, ctx
->ptr
));
1484 goto jump_min_repeat_one
;
1486 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx
->pattern
, ctx
->ptr
));
1488 case JUMP_ASSERT_NOT
:
1489 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx
->pattern
, ctx
->ptr
));
1490 goto jump_assert_not
;
1492 TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T
"d\n", ctx
->pattern
,
1497 return ret
; /* should never get here */
1501 SRE_SEARCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
1503 SRE_CHAR
* ptr
= (SRE_CHAR
*)state
->start
;
1504 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
1505 Py_ssize_t status
= 0;
1506 Py_ssize_t prefix_len
= 0;
1507 Py_ssize_t prefix_skip
= 0;
1508 SRE_CODE
* prefix
= NULL
;
1509 SRE_CODE
* charset
= NULL
;
1510 SRE_CODE
* overlap
= NULL
;
1513 if (pattern
[0] == SRE_OP_INFO
) {
1514 /* optimization info block */
1515 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1519 if (pattern
[3] > 1) {
1520 /* adjust end point (but make sure we leave at least one
1521 character in there, so literal search will work) */
1522 end
-= pattern
[3]-1;
1527 if (flags
& SRE_INFO_PREFIX
) {
1528 /* pattern starts with a known prefix */
1529 /* <length> <skip> <prefix data> <overlap data> */
1530 prefix_len
= pattern
[5];
1531 prefix_skip
= pattern
[6];
1532 prefix
= pattern
+ 7;
1533 overlap
= prefix
+ prefix_len
- 1;
1534 } else if (flags
& SRE_INFO_CHARSET
)
1535 /* pattern starts with a character from a known set */
1537 charset
= pattern
+ 5;
1539 pattern
+= 1 + pattern
[1];
1542 TRACE(("prefix = %p %" PY_FORMAT_SIZE_T
"d %" PY_FORMAT_SIZE_T
"d\n",
1543 prefix
, prefix_len
, prefix_skip
));
1544 TRACE(("charset = %p\n", charset
));
1546 #if defined(USE_FAST_SEARCH)
1547 if (prefix_len
> 1) {
1548 /* pattern starts with a known prefix. use the overlap
1549 table to skip forward as fast as we possibly can */
1551 end
= (SRE_CHAR
*)state
->end
;
1554 if ((SRE_CODE
) ptr
[0] != prefix
[i
]) {
1560 if (++i
== prefix_len
) {
1561 /* found a potential match */
1562 TRACE(("|%p|%p|SEARCH SCAN\n", pattern
, ptr
));
1563 state
->start
= ptr
+ 1 - prefix_len
;
1564 state
->ptr
= ptr
+ 1 - prefix_len
+ prefix_skip
;
1565 if (flags
& SRE_INFO_LITERAL
)
1566 return 1; /* we got all of it */
1567 status
= SRE_MATCH(state
, pattern
+ 2*prefix_skip
);
1570 /* close but no cigar -- try again */
1582 if (pattern
[0] == SRE_OP_LITERAL
) {
1583 /* pattern starts with a literal character. this is used
1584 for short prefixes, and if fast search is disabled */
1585 SRE_CODE chr
= pattern
[1];
1586 end
= (SRE_CHAR
*)state
->end
;
1588 while (ptr
< end
&& (SRE_CODE
) ptr
[0] != chr
)
1592 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern
, ptr
));
1595 if (flags
& SRE_INFO_LITERAL
)
1596 return 1; /* we got all of it */
1597 status
= SRE_MATCH(state
, pattern
+ 2);
1601 } else if (charset
) {
1602 /* pattern starts with a character from a known set */
1603 end
= (SRE_CHAR
*)state
->end
;
1605 while (ptr
< end
&& !SRE_CHARSET(charset
, ptr
[0]))
1609 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern
, ptr
));
1612 status
= SRE_MATCH(state
, pattern
);
1619 while (ptr
<= end
) {
1620 TRACE(("|%p|%p|SEARCH\n", pattern
, ptr
));
1621 state
->start
= state
->ptr
= ptr
++;
1622 status
= SRE_MATCH(state
, pattern
);
1631 SRE_LITERAL_TEMPLATE(SRE_CHAR
* ptr
, Py_ssize_t len
)
1633 /* check if given string is a literal template (i.e. no escapes) */
1640 #if !defined(SRE_RECURSIVE)
1642 /* -------------------------------------------------------------------- */
1643 /* factories and destructors */
1645 /* see sre.h for object declarations */
1646 static PyObject
*pattern_new_match(PatternObject
*, SRE_STATE
*, int);
1647 static PyObject
*pattern_scanner(PatternObject
*, PyObject
*);
1650 sre_codesize(PyObject
* self
, PyObject
*unused
)
1652 return PyInt_FromSize_t(sizeof(SRE_CODE
));
1656 sre_getlower(PyObject
* self
, PyObject
* args
)
1658 int character
, flags
;
1659 if (!PyArg_ParseTuple(args
, "ii", &character
, &flags
))
1661 if (flags
& SRE_FLAG_LOCALE
)
1662 return Py_BuildValue("i", sre_lower_locale(character
));
1663 if (flags
& SRE_FLAG_UNICODE
)
1664 #if defined(HAVE_UNICODE)
1665 return Py_BuildValue("i", sre_lower_unicode(character
));
1667 return Py_BuildValue("i", sre_lower_locale(character
));
1669 return Py_BuildValue("i", sre_lower(character
));
1673 state_reset(SRE_STATE
* state
)
1675 /* FIXME: dynamic! */
1676 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1678 state
->lastmark
= -1;
1679 state
->lastindex
= -1;
1681 state
->repeat
= NULL
;
1683 data_stack_dealloc(state
);
1687 getstring(PyObject
* string
, Py_ssize_t
* p_length
, int* p_charsize
)
1689 /* given a python object, return a data pointer, a length (in
1690 characters), and a character size. return NULL if the object
1691 is not a string (or not compatible) */
1693 PyBufferProcs
*buffer
;
1694 Py_ssize_t size
, bytes
;
1698 #if defined(HAVE_UNICODE)
1699 if (PyUnicode_Check(string
)) {
1700 /* unicode strings doesn't always support the buffer interface */
1701 ptr
= (void*) PyUnicode_AS_DATA(string
);
1702 /* bytes = PyUnicode_GET_DATA_SIZE(string); */
1703 size
= PyUnicode_GET_SIZE(string
);
1704 charsize
= sizeof(Py_UNICODE
);
1709 /* get pointer to string buffer */
1710 buffer
= Py_TYPE(string
)->tp_as_buffer
;
1711 if (!buffer
|| !buffer
->bf_getreadbuffer
|| !buffer
->bf_getsegcount
||
1712 buffer
->bf_getsegcount(string
, NULL
) != 1) {
1713 PyErr_SetString(PyExc_TypeError
, "expected string or buffer");
1717 /* determine buffer size */
1718 bytes
= buffer
->bf_getreadbuffer(string
, 0, &ptr
);
1720 PyErr_SetString(PyExc_TypeError
, "buffer has negative size");
1724 /* determine character size */
1725 #if PY_VERSION_HEX >= 0x01060000
1726 size
= PyObject_Size(string
);
1728 size
= PyObject_Length(string
);
1731 if (PyString_Check(string
) || bytes
== size
)
1733 #if defined(HAVE_UNICODE)
1734 else if (bytes
== (Py_ssize_t
) (size
* sizeof(Py_UNICODE
)))
1735 charsize
= sizeof(Py_UNICODE
);
1738 PyErr_SetString(PyExc_TypeError
, "buffer size mismatch");
1742 #if defined(HAVE_UNICODE)
1747 *p_charsize
= charsize
;
1753 state_init(SRE_STATE
* state
, PatternObject
* pattern
, PyObject
* string
,
1754 Py_ssize_t start
, Py_ssize_t end
)
1756 /* prepare state object */
1762 memset(state
, 0, sizeof(SRE_STATE
));
1764 state
->lastmark
= -1;
1765 state
->lastindex
= -1;
1767 ptr
= getstring(string
, &length
, &charsize
);
1771 /* adjust boundaries */
1774 else if (start
> length
)
1779 else if (end
> length
)
1782 state
->charsize
= charsize
;
1784 state
->beginning
= ptr
;
1786 state
->start
= (void*) ((char*) ptr
+ start
* state
->charsize
);
1787 state
->end
= (void*) ((char*) ptr
+ end
* state
->charsize
);
1790 state
->string
= string
;
1792 state
->endpos
= end
;
1794 if (pattern
->flags
& SRE_FLAG_LOCALE
)
1795 state
->lower
= sre_lower_locale
;
1796 else if (pattern
->flags
& SRE_FLAG_UNICODE
)
1797 #if defined(HAVE_UNICODE)
1798 state
->lower
= sre_lower_unicode
;
1800 state
->lower
= sre_lower_locale
;
1803 state
->lower
= sre_lower
;
1809 state_fini(SRE_STATE
* state
)
1811 Py_XDECREF(state
->string
);
1812 data_stack_dealloc(state
);
1815 /* calculate offset from start of string */
1816 #define STATE_OFFSET(state, member)\
1817 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1820 state_getslice(SRE_STATE
* state
, Py_ssize_t index
, PyObject
* string
, int empty
)
1824 index
= (index
- 1) * 2;
1826 if (string
== Py_None
|| index
>= state
->lastmark
|| !state
->mark
[index
] || !state
->mark
[index
+1]) {
1828 /* want empty string */
1835 i
= STATE_OFFSET(state
, state
->mark
[index
]);
1836 j
= STATE_OFFSET(state
, state
->mark
[index
+1]);
1839 return PySequence_GetSlice(string
, i
, j
);
1843 pattern_error(int status
)
1846 case SRE_ERROR_RECURSION_LIMIT
:
1849 "maximum recursion limit exceeded"
1852 case SRE_ERROR_MEMORY
:
1855 case SRE_ERROR_INTERRUPTED
:
1856 /* An exception has already been raised, so let it fly */
1859 /* other error codes indicate compiler/engine bugs */
1862 "internal error in regular expression engine"
1868 pattern_dealloc(PatternObject
* self
)
1870 if (self
->weakreflist
!= NULL
)
1871 PyObject_ClearWeakRefs((PyObject
*) self
);
1872 Py_XDECREF(self
->pattern
);
1873 Py_XDECREF(self
->groupindex
);
1874 Py_XDECREF(self
->indexgroup
);
1879 check_args_size(const char *name
, PyObject
* args
, PyObject
* kw
, int n
)
1881 Py_ssize_t m
= PyTuple_GET_SIZE(args
) + (kw
? PyDict_Size(kw
) : 0);
1884 PyErr_Format(PyExc_TypeError
,
1885 "%s() takes at most %d positional arguments (%zd given)",
1891 fix_string_param(PyObject
*string
, PyObject
*string2
, const char *oldname
)
1893 if (string2
!= NULL
) {
1895 if (string
!= NULL
) {
1896 PyErr_Format(PyExc_TypeError
,
1897 "Argument given by name ('%s') and position (1)",
1901 sprintf(buf
, "The '%s' keyword parameter name is deprecated. "
1902 "Use 'string' instead.", oldname
);
1903 if (PyErr_Warn(PyExc_DeprecationWarning
, buf
) < 0)
1907 if (string
== NULL
) {
1908 PyErr_SetString(PyExc_TypeError
,
1909 "Required argument 'string' (pos 1) not found");
1916 pattern_match(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1921 PyObject
*string
= NULL
, *string2
= NULL
;
1922 Py_ssize_t start
= 0;
1923 Py_ssize_t end
= PY_SSIZE_T_MAX
;
1924 static char* kwlist
[] = { "string", "pos", "endpos", "pattern", NULL
};
1925 if (!check_args_size("match", args
, kw
, 3))
1928 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|OnnO:match", kwlist
,
1929 &string
, &start
, &end
, &string2
))
1932 string
= fix_string_param(string
, string2
, "pattern");
1936 string
= state_init(&state
, self
, string
, start
, end
);
1940 state
.ptr
= state
.start
;
1942 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self
), state
.ptr
));
1944 if (state
.charsize
== 1) {
1945 status
= sre_match(&state
, PatternObject_GetCode(self
));
1947 #if defined(HAVE_UNICODE)
1948 status
= sre_umatch(&state
, PatternObject_GetCode(self
));
1952 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
1953 if (PyErr_Occurred())
1958 return pattern_new_match(self
, &state
, status
);
1962 pattern_search(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1967 PyObject
*string
= NULL
, *string2
= NULL
;
1968 Py_ssize_t start
= 0;
1969 Py_ssize_t end
= PY_SSIZE_T_MAX
;
1970 static char* kwlist
[] = { "string", "pos", "endpos", "pattern", NULL
};
1971 if (!check_args_size("search", args
, kw
, 3))
1974 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|OnnO:search", kwlist
,
1975 &string
, &start
, &end
, &string2
))
1978 string
= fix_string_param(string
, string2
, "pattern");
1982 string
= state_init(&state
, self
, string
, start
, end
);
1986 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self
), state
.ptr
));
1988 if (state
.charsize
== 1) {
1989 status
= sre_search(&state
, PatternObject_GetCode(self
));
1991 #if defined(HAVE_UNICODE)
1992 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
1996 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
2000 if (PyErr_Occurred())
2003 return pattern_new_match(self
, &state
, status
);
2007 call(char* module
, char* function
, PyObject
* args
)
2016 name
= PyString_FromString(module
);
2019 mod
= PyImport_Import(name
);
2023 func
= PyObject_GetAttrString(mod
, function
);
2027 result
= PyObject_CallObject(func
, args
);
2033 #ifdef USE_BUILTIN_COPY
2035 deepcopy(PyObject
** object
, PyObject
* memo
)
2041 PyTuple_Pack(2, *object
, memo
)
2049 return 1; /* success */
2054 join_list(PyObject
* list
, PyObject
* string
)
2056 /* join list elements */
2059 #if PY_VERSION_HEX >= 0x01060000
2065 joiner
= PySequence_GetSlice(string
, 0, 0);
2069 if (PyList_GET_SIZE(list
) == 0) {
2074 #if PY_VERSION_HEX >= 0x01060000
2075 function
= PyObject_GetAttrString(joiner
, "join");
2080 args
= PyTuple_New(1);
2082 Py_DECREF(function
);
2086 PyTuple_SET_ITEM(args
, 0, list
);
2087 result
= PyObject_CallObject(function
, args
);
2088 Py_DECREF(args
); /* also removes list */
2089 Py_DECREF(function
);
2093 PyTuple_Pack(2, list
, joiner
)
2102 pattern_findall(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2109 PyObject
*string
= NULL
, *string2
= NULL
;
2110 Py_ssize_t start
= 0;
2111 Py_ssize_t end
= PY_SSIZE_T_MAX
;
2112 static char* kwlist
[] = { "string", "pos", "endpos", "source", NULL
};
2113 if (!check_args_size("findall", args
, kw
, 3))
2116 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|OnnO:findall", kwlist
,
2117 &string
, &start
, &end
, &string2
))
2120 string
= fix_string_param(string
, string2
, "source");
2124 string
= state_init(&state
, self
, string
, start
, end
);
2128 list
= PyList_New(0);
2134 while (state
.start
<= state
.end
) {
2138 state_reset(&state
);
2140 state
.ptr
= state
.start
;
2142 if (state
.charsize
== 1) {
2143 status
= sre_search(&state
, PatternObject_GetCode(self
));
2145 #if defined(HAVE_UNICODE)
2146 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2150 if (PyErr_Occurred())
2156 pattern_error(status
);
2160 /* don't bother to build a match object */
2161 switch (self
->groups
) {
2163 b
= STATE_OFFSET(&state
, state
.start
);
2164 e
= STATE_OFFSET(&state
, state
.ptr
);
2165 item
= PySequence_GetSlice(string
, b
, e
);
2170 item
= state_getslice(&state
, 1, string
, 1);
2175 item
= PyTuple_New(self
->groups
);
2178 for (i
= 0; i
< self
->groups
; i
++) {
2179 PyObject
* o
= state_getslice(&state
, i
+1, string
, 1);
2184 PyTuple_SET_ITEM(item
, i
, o
);
2189 status
= PyList_Append(list
, item
);
2194 if (state
.ptr
== state
.start
)
2195 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2197 state
.start
= state
.ptr
;
2211 #if PY_VERSION_HEX >= 0x02020000
2213 pattern_finditer(PatternObject
* pattern
, PyObject
* args
)
2219 scanner
= pattern_scanner(pattern
, args
);
2223 search
= PyObject_GetAttrString(scanner
, "search");
2228 iterator
= PyCallIter_New(search
, Py_None
);
2236 pattern_split(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2246 PyObject
*string
= NULL
, *string2
= NULL
;
2247 Py_ssize_t maxsplit
= 0;
2248 static char* kwlist
[] = { "string", "maxsplit", "source", NULL
};
2249 if (!check_args_size("split", args
, kw
, 2))
2252 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|OnO:split", kwlist
,
2253 &string
, &maxsplit
, &string2
))
2256 string
= fix_string_param(string
, string2
, "source");
2260 string
= state_init(&state
, self
, string
, 0, PY_SSIZE_T_MAX
);
2264 list
= PyList_New(0);
2273 while (!maxsplit
|| n
< maxsplit
) {
2275 state_reset(&state
);
2277 state
.ptr
= state
.start
;
2279 if (state
.charsize
== 1) {
2280 status
= sre_search(&state
, PatternObject_GetCode(self
));
2282 #if defined(HAVE_UNICODE)
2283 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2287 if (PyErr_Occurred())
2293 pattern_error(status
);
2297 if (state
.start
== state
.ptr
) {
2298 if (last
== state
.end
)
2300 /* skip one character */
2301 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2305 /* get segment before this match */
2306 item
= PySequence_GetSlice(
2307 string
, STATE_OFFSET(&state
, last
),
2308 STATE_OFFSET(&state
, state
.start
)
2312 status
= PyList_Append(list
, item
);
2317 /* add groups (if any) */
2318 for (i
= 0; i
< self
->groups
; i
++) {
2319 item
= state_getslice(&state
, i
+1, string
, 0);
2322 status
= PyList_Append(list
, item
);
2330 last
= state
.start
= state
.ptr
;
2334 /* get segment following last match (even if empty) */
2335 item
= PySequence_GetSlice(
2336 string
, STATE_OFFSET(&state
, last
), state
.endpos
2340 status
= PyList_Append(list
, item
);
2356 pattern_subx(PatternObject
* self
, PyObject
* ptemplate
, PyObject
* string
,
2357 Py_ssize_t count
, Py_ssize_t subn
)
2370 int filter_is_callable
;
2372 if (PyCallable_Check(ptemplate
)) {
2373 /* sub/subn takes either a function or a template */
2376 filter_is_callable
= 1;
2378 /* if not callable, check if it's a literal string */
2380 ptr
= getstring(ptemplate
, &n
, &bint
);
2384 literal
= sre_literal_template((unsigned char *)ptr
, n
);
2386 #if defined(HAVE_UNICODE)
2387 literal
= sre_uliteral_template((Py_UNICODE
*)ptr
, n
);
2397 filter_is_callable
= 0;
2399 /* not a literal; hand it over to the template compiler */
2401 SRE_PY_MODULE
, "_subx",
2402 PyTuple_Pack(2, self
, ptemplate
)
2406 filter_is_callable
= PyCallable_Check(filter
);
2410 string
= state_init(&state
, self
, string
, 0, PY_SSIZE_T_MAX
);
2416 list
= PyList_New(0);
2425 while (!count
|| n
< count
) {
2427 state_reset(&state
);
2429 state
.ptr
= state
.start
;
2431 if (state
.charsize
== 1) {
2432 status
= sre_search(&state
, PatternObject_GetCode(self
));
2434 #if defined(HAVE_UNICODE)
2435 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2439 if (PyErr_Occurred())
2445 pattern_error(status
);
2449 b
= STATE_OFFSET(&state
, state
.start
);
2450 e
= STATE_OFFSET(&state
, state
.ptr
);
2453 /* get segment before this match */
2454 item
= PySequence_GetSlice(string
, i
, b
);
2457 status
= PyList_Append(list
, item
);
2462 } else if (i
== b
&& i
== e
&& n
> 0)
2463 /* ignore empty match on latest position */
2466 if (filter_is_callable
) {
2467 /* pass match object through filter */
2468 match
= pattern_new_match(self
, &state
, 1);
2471 args
= PyTuple_Pack(1, match
);
2476 item
= PyObject_CallObject(filter
, args
);
2482 /* filter is literal string */
2488 if (item
!= Py_None
) {
2489 status
= PyList_Append(list
, item
);
2500 if (state
.ptr
== state
.start
)
2501 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2503 state
.start
= state
.ptr
;
2507 /* get segment following last match */
2508 if (i
< state
.endpos
) {
2509 item
= PySequence_GetSlice(string
, i
, state
.endpos
);
2512 status
= PyList_Append(list
, item
);
2522 /* convert list to single string (also removes list) */
2523 item
= join_list(list
, string
);
2529 return Py_BuildValue("Nn", item
, n
);
2542 pattern_sub(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2544 PyObject
* ptemplate
;
2546 Py_ssize_t count
= 0;
2547 static char* kwlist
[] = { "repl", "string", "count", NULL
};
2548 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|n:sub", kwlist
,
2549 &ptemplate
, &string
, &count
))
2552 return pattern_subx(self
, ptemplate
, string
, count
, 0);
2556 pattern_subn(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2558 PyObject
* ptemplate
;
2560 Py_ssize_t count
= 0;
2561 static char* kwlist
[] = { "repl", "string", "count", NULL
};
2562 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|n:subn", kwlist
,
2563 &ptemplate
, &string
, &count
))
2566 return pattern_subx(self
, ptemplate
, string
, count
, 1);
2570 pattern_copy(PatternObject
* self
, PyObject
*unused
)
2572 #ifdef USE_BUILTIN_COPY
2573 PatternObject
* copy
;
2576 copy
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, self
->codesize
);
2580 offset
= offsetof(PatternObject
, groups
);
2582 Py_XINCREF(self
->groupindex
);
2583 Py_XINCREF(self
->indexgroup
);
2584 Py_XINCREF(self
->pattern
);
2586 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
2587 sizeof(PatternObject
) + self
->codesize
* sizeof(SRE_CODE
) - offset
);
2588 copy
->weakreflist
= NULL
;
2590 return (PyObject
*) copy
;
2592 PyErr_SetString(PyExc_TypeError
, "cannot copy this pattern object");
2598 pattern_deepcopy(PatternObject
* self
, PyObject
* memo
)
2600 #ifdef USE_BUILTIN_COPY
2601 PatternObject
* copy
;
2603 copy
= (PatternObject
*) pattern_copy(self
);
2607 if (!deepcopy(©
->groupindex
, memo
) ||
2608 !deepcopy(©
->indexgroup
, memo
) ||
2609 !deepcopy(©
->pattern
, memo
)) {
2615 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this pattern object");
2620 PyDoc_STRVAR(pattern_match_doc
,
2621 "match(string[, pos[, endpos]]) --> match object or None.\n\
2622 Matches zero or more characters at the beginning of the string");
2624 PyDoc_STRVAR(pattern_search_doc
,
2625 "search(string[, pos[, endpos]]) --> match object or None.\n\
2626 Scan through string looking for a match, and return a corresponding\n\
2627 match object instance. Return None if no position in the string matches.");
2629 PyDoc_STRVAR(pattern_split_doc
,
2630 "split(string[, maxsplit = 0]) --> list.\n\
2631 Split string by the occurrences of pattern.");
2633 PyDoc_STRVAR(pattern_findall_doc
,
2634 "findall(string[, pos[, endpos]]) --> list.\n\
2635 Return a list of all non-overlapping matches of pattern in string.");
2637 PyDoc_STRVAR(pattern_finditer_doc
,
2638 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2639 Return an iterator over all non-overlapping matches for the \n\
2640 RE pattern in string. For each match, the iterator returns a\n\
2643 PyDoc_STRVAR(pattern_sub_doc
,
2644 "sub(repl, string[, count = 0]) --> newstring\n\
2645 Return the string obtained by replacing the leftmost non-overlapping\n\
2646 occurrences of pattern in string by the replacement repl.");
2648 PyDoc_STRVAR(pattern_subn_doc
,
2649 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2650 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2651 the leftmost non-overlapping occurrences of pattern with the\n\
2652 replacement repl.");
2654 PyDoc_STRVAR(pattern_doc
, "Compiled regular expression objects");
2656 static PyMethodDef pattern_methods
[] = {
2657 {"match", (PyCFunction
) pattern_match
, METH_VARARGS
|METH_KEYWORDS
,
2659 {"search", (PyCFunction
) pattern_search
, METH_VARARGS
|METH_KEYWORDS
,
2660 pattern_search_doc
},
2661 {"sub", (PyCFunction
) pattern_sub
, METH_VARARGS
|METH_KEYWORDS
,
2663 {"subn", (PyCFunction
) pattern_subn
, METH_VARARGS
|METH_KEYWORDS
,
2665 {"split", (PyCFunction
) pattern_split
, METH_VARARGS
|METH_KEYWORDS
,
2667 {"findall", (PyCFunction
) pattern_findall
, METH_VARARGS
|METH_KEYWORDS
,
2668 pattern_findall_doc
},
2669 #if PY_VERSION_HEX >= 0x02020000
2670 {"finditer", (PyCFunction
) pattern_finditer
, METH_VARARGS
,
2671 pattern_finditer_doc
},
2673 {"scanner", (PyCFunction
) pattern_scanner
, METH_VARARGS
},
2674 {"__copy__", (PyCFunction
) pattern_copy
, METH_NOARGS
},
2675 {"__deepcopy__", (PyCFunction
) pattern_deepcopy
, METH_O
},
2679 #define PAT_OFF(x) offsetof(PatternObject, x)
2680 static PyMemberDef pattern_members
[] = {
2681 {"pattern", T_OBJECT
, PAT_OFF(pattern
), READONLY
},
2682 {"flags", T_INT
, PAT_OFF(flags
), READONLY
},
2683 {"groups", T_PYSSIZET
, PAT_OFF(groups
), READONLY
},
2684 {"groupindex", T_OBJECT
, PAT_OFF(groupindex
), READONLY
},
2685 {NULL
} /* Sentinel */
2688 statichere PyTypeObject Pattern_Type
= {
2689 PyObject_HEAD_INIT(NULL
)
2690 0, "_" SRE_MODULE
".SRE_Pattern",
2691 sizeof(PatternObject
), sizeof(SRE_CODE
),
2692 (destructor
)pattern_dealloc
, /*tp_dealloc*/
2694 0, /* tp_getattrn */
2698 0, /* tp_as_number */
2699 0, /* tp_as_sequence */
2700 0, /* tp_as_mapping */
2704 0, /* tp_getattro */
2705 0, /* tp_setattro */
2706 0, /* tp_as_buffer */
2707 Py_TPFLAGS_DEFAULT
, /* tp_flags */
2708 pattern_doc
, /* tp_doc */
2709 0, /* tp_traverse */
2711 0, /* tp_richcompare */
2712 offsetof(PatternObject
, weakreflist
), /* tp_weaklistoffset */
2714 0, /* tp_iternext */
2715 pattern_methods
, /* tp_methods */
2716 pattern_members
, /* tp_members */
2719 static int _validate(PatternObject
*self
); /* Forward */
2722 _compile(PyObject
* self_
, PyObject
* args
)
2724 /* "compile" pattern descriptor to pattern object */
2726 PatternObject
* self
;
2732 Py_ssize_t groups
= 0;
2733 PyObject
* groupindex
= NULL
;
2734 PyObject
* indexgroup
= NULL
;
2735 if (!PyArg_ParseTuple(args
, "OiO!|nOO", &pattern
, &flags
,
2736 &PyList_Type
, &code
, &groups
,
2737 &groupindex
, &indexgroup
))
2740 n
= PyList_GET_SIZE(code
);
2741 /* coverity[ampersand_in_size] */
2742 self
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, n
);
2745 self
->weakreflist
= NULL
;
2746 self
->pattern
= NULL
;
2747 self
->groupindex
= NULL
;
2748 self
->indexgroup
= NULL
;
2752 for (i
= 0; i
< n
; i
++) {
2753 PyObject
*o
= PyList_GET_ITEM(code
, i
);
2754 unsigned long value
= PyInt_Check(o
) ? (unsigned long)PyInt_AsLong(o
)
2755 : PyLong_AsUnsignedLong(o
);
2756 if (value
== (unsigned long)-1 && PyErr_Occurred()) {
2757 if (PyErr_ExceptionMatches(PyExc_OverflowError
)) {
2758 PyErr_SetString(PyExc_OverflowError
,
2759 "regular expression code size limit exceeded");
2763 self
->code
[i
] = (SRE_CODE
) value
;
2764 if ((unsigned long) self
->code
[i
] != value
) {
2765 PyErr_SetString(PyExc_OverflowError
,
2766 "regular expression code size limit exceeded");
2771 if (PyErr_Occurred()) {
2777 self
->pattern
= pattern
;
2779 self
->flags
= flags
;
2781 self
->groups
= groups
;
2783 Py_XINCREF(groupindex
);
2784 self
->groupindex
= groupindex
;
2786 Py_XINCREF(indexgroup
);
2787 self
->indexgroup
= indexgroup
;
2789 self
->weakreflist
= NULL
;
2791 if (!_validate(self
)) {
2796 return (PyObject
*) self
;
2799 /* -------------------------------------------------------------------- */
2800 /* Code validation */
2802 /* To learn more about this code, have a look at the _compile() function in
2803 Lib/sre_compile.py. The validation functions below checks the code array
2804 for conformance with the code patterns generated there.
2806 The nice thing about the generated code is that it is position-independent:
2807 all jumps are relative jumps forward. Also, jumps don't cross each other:
2808 the target of a later jump is always earlier than the target of an earlier
2809 jump. IOW, this is okay:
2811 J---------J-------T--------T
2813 \______________________/
2817 J---------J-------T--------T
2821 It also helps that SRE_CODE is always an unsigned type.
2824 /* Defining this one enables tracing of the validator */
2827 /* Trace macro for the validator */
2828 #if defined(VVERBOSE)
2829 #define VTRACE(v) printf v
2831 #define VTRACE(v) do {} while(0) /* do nothing */
2834 /* Report failure */
2835 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2837 /* Extract opcode, argument, or skip count from code array */
2840 VTRACE(("%p: ", code)); \
2841 if (code >= end) FAIL; \
2843 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2847 VTRACE(("%p= ", code)); \
2848 if (code >= end) FAIL; \
2850 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2852 #define GET_SKIP_ADJ(adj) \
2854 VTRACE(("%p= ", code)); \
2855 if (code >= end) FAIL; \
2857 VTRACE(("%lu (skip to %p)\n", \
2858 (unsigned long)skip, code+skip)); \
2859 if (skip-adj > end-code) \
2863 #define GET_SKIP GET_SKIP_ADJ(0)
2866 _validate_charset(SRE_CODE
*code
, SRE_CODE
*end
)
2868 /* Some variables are manipulated by the macros above */
2874 while (code
< end
) {
2881 case SRE_OP_LITERAL
:
2890 case SRE_OP_CHARSET
:
2891 offset
= 32/sizeof(SRE_CODE
); /* 32-byte bitmap */
2892 if (offset
> end
-code
)
2897 case SRE_OP_BIGCHARSET
:
2898 GET_ARG
; /* Number of blocks */
2899 offset
= 256/sizeof(SRE_CODE
); /* 256-byte table */
2900 if (offset
> end
-code
)
2902 /* Make sure that each byte points to a valid block */
2903 for (i
= 0; i
< 256; i
++) {
2904 if (((unsigned char *)code
)[i
] >= arg
)
2908 offset
= arg
* 32/sizeof(SRE_CODE
); /* 32-byte bitmap times arg */
2909 if (offset
> end
-code
)
2914 case SRE_OP_CATEGORY
:
2917 case SRE_CATEGORY_DIGIT
:
2918 case SRE_CATEGORY_NOT_DIGIT
:
2919 case SRE_CATEGORY_SPACE
:
2920 case SRE_CATEGORY_NOT_SPACE
:
2921 case SRE_CATEGORY_WORD
:
2922 case SRE_CATEGORY_NOT_WORD
:
2923 case SRE_CATEGORY_LINEBREAK
:
2924 case SRE_CATEGORY_NOT_LINEBREAK
:
2925 case SRE_CATEGORY_LOC_WORD
:
2926 case SRE_CATEGORY_LOC_NOT_WORD
:
2927 case SRE_CATEGORY_UNI_DIGIT
:
2928 case SRE_CATEGORY_UNI_NOT_DIGIT
:
2929 case SRE_CATEGORY_UNI_SPACE
:
2930 case SRE_CATEGORY_UNI_NOT_SPACE
:
2931 case SRE_CATEGORY_UNI_WORD
:
2932 case SRE_CATEGORY_UNI_NOT_WORD
:
2933 case SRE_CATEGORY_UNI_LINEBREAK
:
2934 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
2951 _validate_inner(SRE_CODE
*code
, SRE_CODE
*end
, Py_ssize_t groups
)
2953 /* Some variables are manipulated by the macros above */
2958 VTRACE(("code=%p, end=%p\n", code
, end
));
2963 while (code
< end
) {
2968 /* We don't check whether marks are properly nested; the
2969 sre_match() code is robust even if they don't, and the worst
2970 you can get is nonsensical match results. */
2972 if (arg
> 2*groups
+1) {
2973 VTRACE(("arg=%d, groups=%d\n", (int)arg
, (int)groups
));
2978 case SRE_OP_LITERAL
:
2979 case SRE_OP_NOT_LITERAL
:
2980 case SRE_OP_LITERAL_IGNORE
:
2981 case SRE_OP_NOT_LITERAL_IGNORE
:
2983 /* The arg is just a character, nothing to check */
2986 case SRE_OP_SUCCESS
:
2987 case SRE_OP_FAILURE
:
2988 /* Nothing to check; these normally end the matching process */
2994 case SRE_AT_BEGINNING
:
2995 case SRE_AT_BEGINNING_STRING
:
2996 case SRE_AT_BEGINNING_LINE
:
2998 case SRE_AT_END_LINE
:
2999 case SRE_AT_END_STRING
:
3000 case SRE_AT_BOUNDARY
:
3001 case SRE_AT_NON_BOUNDARY
:
3002 case SRE_AT_LOC_BOUNDARY
:
3003 case SRE_AT_LOC_NON_BOUNDARY
:
3004 case SRE_AT_UNI_BOUNDARY
:
3005 case SRE_AT_UNI_NON_BOUNDARY
:
3013 case SRE_OP_ANY_ALL
:
3014 /* These have no operands */
3018 case SRE_OP_IN_IGNORE
:
3020 /* Stop 1 before the end; we check the FAILURE below */
3021 if (!_validate_charset(code
, code
+skip
-2))
3023 if (code
[skip
-2] != SRE_OP_FAILURE
)
3030 /* A minimal info field is
3031 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
3032 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
3037 newcode
= code
+skip
-1;
3038 GET_ARG
; flags
= arg
;
3041 /* Check that only valid flags are present */
3042 if ((flags
& ~(SRE_INFO_PREFIX
|
3044 SRE_INFO_CHARSET
)) != 0)
3046 /* PREFIX and CHARSET are mutually exclusive */
3047 if ((flags
& SRE_INFO_PREFIX
) &&
3048 (flags
& SRE_INFO_CHARSET
))
3050 /* LITERAL implies PREFIX */
3051 if ((flags
& SRE_INFO_LITERAL
) &&
3052 !(flags
& SRE_INFO_PREFIX
))
3054 /* Validate the prefix */
3055 if (flags
& SRE_INFO_PREFIX
) {
3056 SRE_CODE prefix_len
;
3057 GET_ARG
; prefix_len
= arg
;
3058 GET_ARG
; /* prefix skip */
3059 /* Here comes the prefix string */
3060 if (prefix_len
> newcode
-code
)
3063 /* And here comes the overlap table */
3064 if (prefix_len
> newcode
-code
)
3066 /* Each overlap value should be < prefix_len */
3067 for (i
= 0; i
< prefix_len
; i
++) {
3068 if (code
[i
] >= prefix_len
)
3073 /* Validate the charset */
3074 if (flags
& SRE_INFO_CHARSET
) {
3075 if (!_validate_charset(code
, newcode
-1))
3077 if (newcode
[-1] != SRE_OP_FAILURE
)
3081 else if (code
!= newcode
) {
3082 VTRACE(("code=%p, newcode=%p\n", code
, newcode
));
3090 SRE_CODE
*target
= NULL
;
3095 /* Stop 2 before the end; we check the JUMP below */
3096 if (!_validate_inner(code
, code
+skip
-3, groups
))
3099 /* Check that it ends with a JUMP, and that each JUMP
3100 has the same target */
3102 if (op
!= SRE_OP_JUMP
)
3106 target
= code
+skip
-1;
3107 else if (code
+skip
-1 != target
)
3113 case SRE_OP_REPEAT_ONE
:
3114 case SRE_OP_MIN_REPEAT_ONE
:
3122 if (max
> SRE_MAXREPEAT
)
3124 if (!_validate_inner(code
, code
+skip
-4, groups
))
3128 if (op
!= SRE_OP_SUCCESS
)
3141 if (max
> SRE_MAXREPEAT
)
3143 if (!_validate_inner(code
, code
+skip
-3, groups
))
3147 if (op
!= SRE_OP_MAX_UNTIL
&& op
!= SRE_OP_MIN_UNTIL
)
3152 case SRE_OP_GROUPREF
:
3153 case SRE_OP_GROUPREF_IGNORE
:
3159 case SRE_OP_GROUPREF_EXISTS
:
3160 /* The regex syntax for this is: '(?(group)then|else)', where
3161 'group' is either an integer group number or a group name,
3162 'then' and 'else' are sub-regexes, and 'else' is optional. */
3167 code
--; /* The skip is relative to the first arg! */
3168 /* There are two possibilities here: if there is both a 'then'
3169 part and an 'else' part, the generated code looks like:
3177 (<skipyes> jumps here)
3179 (<skipno> jumps here)
3181 If there is only a 'then' part, it looks like:
3189 There is no direct way to decide which it is, and we don't want
3190 to allow arbitrary jumps anywhere in the code; so we just look
3191 for a JUMP opcode preceding our skip target.
3193 if (skip
>= 3 && skip
-3 < end
-code
&&
3194 code
[skip
-3] == SRE_OP_JUMP
)
3196 VTRACE(("both then and else parts present\n"));
3197 if (!_validate_inner(code
+1, code
+skip
-3, groups
))
3199 code
+= skip
-2; /* Position after JUMP, at <skipno> */
3201 if (!_validate_inner(code
, code
+skip
-1, groups
))
3206 VTRACE(("only a then part present\n"));
3207 if (!_validate_inner(code
+1, code
+skip
-1, groups
))
3214 case SRE_OP_ASSERT_NOT
:
3216 GET_ARG
; /* 0 for lookahead, width for lookbehind */
3217 code
--; /* Back up over arg to simplify math below */
3218 if (arg
& 0x80000000)
3219 FAIL
; /* Width too large */
3220 /* Stop 1 before the end; we check the SUCCESS below */
3221 if (!_validate_inner(code
+1, code
+skip
-2, groups
))
3225 if (op
!= SRE_OP_SUCCESS
)
3240 _validate_outer(SRE_CODE
*code
, SRE_CODE
*end
, Py_ssize_t groups
)
3242 if (groups
< 0 || groups
> 100 || code
>= end
|| end
[-1] != SRE_OP_SUCCESS
)
3244 if (groups
== 0) /* fix for simplejson */
3245 groups
= 100; /* 100 groups should always be safe */
3246 return _validate_inner(code
, end
-1, groups
);
3250 _validate(PatternObject
*self
)
3252 if (!_validate_outer(self
->code
, self
->code
+self
->codesize
, self
->groups
))
3254 PyErr_SetString(PyExc_RuntimeError
, "invalid SRE code");
3258 VTRACE(("Success!\n"));
3262 /* -------------------------------------------------------------------- */
3266 match_dealloc(MatchObject
* self
)
3268 Py_XDECREF(self
->regs
);
3269 Py_XDECREF(self
->string
);
3270 Py_DECREF(self
->pattern
);
3275 match_getslice_by_index(MatchObject
* self
, Py_ssize_t index
, PyObject
* def
)
3277 if (index
< 0 || index
>= self
->groups
) {
3278 /* raise IndexError if we were given a bad group number */
3288 if (self
->string
== Py_None
|| self
->mark
[index
] < 0) {
3289 /* return default value if the string or group is undefined */
3294 return PySequence_GetSlice(
3295 self
->string
, self
->mark
[index
], self
->mark
[index
+1]
3300 match_getindex(MatchObject
* self
, PyObject
* index
)
3304 if (PyInt_Check(index
) || PyLong_Check(index
))
3305 return PyInt_AsSsize_t(index
);
3309 if (self
->pattern
->groupindex
) {
3310 index
= PyObject_GetItem(self
->pattern
->groupindex
, index
);
3312 if (PyInt_Check(index
) || PyLong_Check(index
))
3313 i
= PyInt_AsSsize_t(index
);
3323 match_getslice(MatchObject
* self
, PyObject
* index
, PyObject
* def
)
3325 return match_getslice_by_index(self
, match_getindex(self
, index
), def
);
3329 match_expand(MatchObject
* self
, PyObject
* ptemplate
)
3331 /* delegate to Python code */
3333 SRE_PY_MODULE
, "_expand",
3334 PyTuple_Pack(3, self
->pattern
, self
, ptemplate
)
3339 match_group(MatchObject
* self
, PyObject
* args
)
3344 size
= PyTuple_GET_SIZE(args
);
3348 result
= match_getslice(self
, Py_False
, Py_None
);
3351 result
= match_getslice(self
, PyTuple_GET_ITEM(args
, 0), Py_None
);
3354 /* fetch multiple items */
3355 result
= PyTuple_New(size
);
3358 for (i
= 0; i
< size
; i
++) {
3359 PyObject
* item
= match_getslice(
3360 self
, PyTuple_GET_ITEM(args
, i
), Py_None
3366 PyTuple_SET_ITEM(result
, i
, item
);
3374 match_groups(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
3379 PyObject
* def
= Py_None
;
3380 static char* kwlist
[] = { "default", NULL
};
3381 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groups", kwlist
, &def
))
3384 result
= PyTuple_New(self
->groups
-1);
3388 for (index
= 1; index
< self
->groups
; index
++) {
3390 item
= match_getslice_by_index(self
, index
, def
);
3395 PyTuple_SET_ITEM(result
, index
-1, item
);
3402 match_groupdict(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
3408 PyObject
* def
= Py_None
;
3409 static char* kwlist
[] = { "default", NULL
};
3410 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groupdict", kwlist
, &def
))
3413 result
= PyDict_New();
3414 if (!result
|| !self
->pattern
->groupindex
)
3417 keys
= PyMapping_Keys(self
->pattern
->groupindex
);
3421 for (index
= 0; index
< PyList_GET_SIZE(keys
); index
++) {
3425 key
= PyList_GET_ITEM(keys
, index
);
3428 value
= match_getslice(self
, key
, def
);
3433 status
= PyDict_SetItem(result
, key
, value
);
3450 match_start(MatchObject
* self
, PyObject
* args
)
3454 PyObject
* index_
= Py_False
; /* zero */
3455 if (!PyArg_UnpackTuple(args
, "start", 0, 1, &index_
))
3458 index
= match_getindex(self
, index_
);
3460 if (index
< 0 || index
>= self
->groups
) {
3468 /* mark is -1 if group is undefined */
3469 return PyInt_FromSsize_t(self
->mark
[index
*2]);
3473 match_end(MatchObject
* self
, PyObject
* args
)
3477 PyObject
* index_
= Py_False
; /* zero */
3478 if (!PyArg_UnpackTuple(args
, "end", 0, 1, &index_
))
3481 index
= match_getindex(self
, index_
);
3483 if (index
< 0 || index
>= self
->groups
) {
3491 /* mark is -1 if group is undefined */
3492 return PyInt_FromSsize_t(self
->mark
[index
*2+1]);
3496 _pair(Py_ssize_t i1
, Py_ssize_t i2
)
3501 pair
= PyTuple_New(2);
3505 item
= PyInt_FromSsize_t(i1
);
3508 PyTuple_SET_ITEM(pair
, 0, item
);
3510 item
= PyInt_FromSsize_t(i2
);
3513 PyTuple_SET_ITEM(pair
, 1, item
);
3523 match_span(MatchObject
* self
, PyObject
* args
)
3527 PyObject
* index_
= Py_False
; /* zero */
3528 if (!PyArg_UnpackTuple(args
, "span", 0, 1, &index_
))
3531 index
= match_getindex(self
, index_
);
3533 if (index
< 0 || index
>= self
->groups
) {
3541 /* marks are -1 if group is undefined */
3542 return _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3546 match_regs(MatchObject
* self
)
3552 regs
= PyTuple_New(self
->groups
);
3556 for (index
= 0; index
< self
->groups
; index
++) {
3557 item
= _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3562 PyTuple_SET_ITEM(regs
, index
, item
);
3572 match_copy(MatchObject
* self
, PyObject
*unused
)
3574 #ifdef USE_BUILTIN_COPY
3576 Py_ssize_t slots
, offset
;
3578 slots
= 2 * (self
->pattern
->groups
+1);
3580 copy
= PyObject_NEW_VAR(MatchObject
, &Match_Type
, slots
);
3584 /* this value a constant, but any compiler should be able to
3585 figure that out all by itself */
3586 offset
= offsetof(MatchObject
, string
);
3588 Py_XINCREF(self
->pattern
);
3589 Py_XINCREF(self
->string
);
3590 Py_XINCREF(self
->regs
);
3592 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
3593 sizeof(MatchObject
) + slots
* sizeof(Py_ssize_t
) - offset
);
3595 return (PyObject
*) copy
;
3597 PyErr_SetString(PyExc_TypeError
, "cannot copy this match object");
3603 match_deepcopy(MatchObject
* self
, PyObject
* memo
)
3605 #ifdef USE_BUILTIN_COPY
3608 copy
= (MatchObject
*) match_copy(self
);
3612 if (!deepcopy((PyObject
**) ©
->pattern
, memo
) ||
3613 !deepcopy(©
->string
, memo
) ||
3614 !deepcopy(©
->regs
, memo
)) {
3620 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this match object");
3625 PyDoc_STRVAR(match_doc
,
3626 "The result of re.match() and re.search().\n\
3627 Match objects always have a boolean value of True.");
3629 PyDoc_STRVAR(match_group_doc
,
3630 "group([group1, ...]) -> str or tuple.\n\
3631 Return subgroup(s) of the match by indices or names.\n\
3632 For 0 returns the entire match.");
3634 PyDoc_STRVAR(match_start_doc
,
3635 "start([group=0]) -> int.\n\
3636 Return index of the start of the substring matched by group.");
3638 PyDoc_STRVAR(match_end_doc
,
3639 "end([group=0]) -> int.\n\
3640 Return index of the end of the substring matched by group.");
3642 PyDoc_STRVAR(match_span_doc
,
3643 "span([group]) -> tuple.\n\
3644 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3646 PyDoc_STRVAR(match_groups_doc
,
3647 "groups([default=None]) -> tuple.\n\
3648 Return a tuple containing all the subgroups of the match, from 1.\n\
3649 The default argument is used for groups\n\
3650 that did not participate in the match");
3652 PyDoc_STRVAR(match_groupdict_doc
,
3653 "groupdict([default=None]) -> dict.\n\
3654 Return a dictionary containing all the named subgroups of the match,\n\
3655 keyed by the subgroup name. The default argument is used for groups\n\
3656 that did not participate in the match");
3658 PyDoc_STRVAR(match_expand_doc
,
3659 "expand(template) -> str.\n\
3660 Return the string obtained by doing backslash substitution\n\
3661 on the string template, as done by the sub() method.");
3663 static PyMethodDef match_methods
[] = {
3664 {"group", (PyCFunction
) match_group
, METH_VARARGS
, match_group_doc
},
3665 {"start", (PyCFunction
) match_start
, METH_VARARGS
, match_start_doc
},
3666 {"end", (PyCFunction
) match_end
, METH_VARARGS
, match_end_doc
},
3667 {"span", (PyCFunction
) match_span
, METH_VARARGS
, match_span_doc
},
3668 {"groups", (PyCFunction
) match_groups
, METH_VARARGS
|METH_KEYWORDS
,
3670 {"groupdict", (PyCFunction
) match_groupdict
, METH_VARARGS
|METH_KEYWORDS
,
3671 match_groupdict_doc
},
3672 {"expand", (PyCFunction
) match_expand
, METH_O
, match_expand_doc
},
3673 {"__copy__", (PyCFunction
) match_copy
, METH_NOARGS
},
3674 {"__deepcopy__", (PyCFunction
) match_deepcopy
, METH_O
},
3679 match_lastindex_get(MatchObject
*self
)
3681 if (self
->lastindex
>= 0)
3682 return PyInt_FromSsize_t(self
->lastindex
);
3688 match_lastgroup_get(MatchObject
*self
)
3690 if (self
->pattern
->indexgroup
&& self
->lastindex
>= 0) {
3691 PyObject
* result
= PySequence_GetItem(
3692 self
->pattern
->indexgroup
, self
->lastindex
3703 match_regs_get(MatchObject
*self
)
3706 Py_INCREF(self
->regs
);
3709 return match_regs(self
);
3712 static PyGetSetDef match_getset
[] = {
3713 {"lastindex", (getter
)match_lastindex_get
, (setter
)NULL
},
3714 {"lastgroup", (getter
)match_lastgroup_get
, (setter
)NULL
},
3715 {"regs", (getter
)match_regs_get
, (setter
)NULL
},
3719 #define MATCH_OFF(x) offsetof(MatchObject, x)
3720 static PyMemberDef match_members
[] = {
3721 {"string", T_OBJECT
, MATCH_OFF(string
), READONLY
},
3722 {"re", T_OBJECT
, MATCH_OFF(pattern
), READONLY
},
3723 {"pos", T_PYSSIZET
, MATCH_OFF(pos
), READONLY
},
3724 {"endpos", T_PYSSIZET
, MATCH_OFF(endpos
), READONLY
},
3729 /* FIXME: implement setattr("string", None) as a special case (to
3730 detach the associated string, if any */
3732 static PyTypeObject Match_Type
= {
3733 PyVarObject_HEAD_INIT(NULL
, 0)
3734 "_" SRE_MODULE
".SRE_Match",
3735 sizeof(MatchObject
), sizeof(Py_ssize_t
),
3736 (destructor
)match_dealloc
, /* tp_dealloc */
3742 0, /* tp_as_number */
3743 0, /* tp_as_sequence */
3744 0, /* tp_as_mapping */
3748 0, /* tp_getattro */
3749 0, /* tp_setattro */
3750 0, /* tp_as_buffer */
3752 match_doc
, /* tp_doc */
3753 0, /* tp_traverse */
3755 0, /* tp_richcompare */
3756 0, /* tp_weaklistoffset */
3758 0, /* tp_iternext */
3759 match_methods
, /* tp_methods */
3760 match_members
, /* tp_members */
3761 match_getset
, /* tp_getset */
3765 pattern_new_match(PatternObject
* pattern
, SRE_STATE
* state
, int status
)
3767 /* create match object (from state object) */
3776 /* create match object (with room for extra group marks) */
3777 /* coverity[ampersand_in_size] */
3778 match
= PyObject_NEW_VAR(MatchObject
, &Match_Type
,
3779 2*(pattern
->groups
+1));
3784 match
->pattern
= pattern
;
3786 Py_INCREF(state
->string
);
3787 match
->string
= state
->string
;
3790 match
->groups
= pattern
->groups
+1;
3792 /* fill in group slices */
3794 base
= (char*) state
->beginning
;
3795 n
= state
->charsize
;
3797 match
->mark
[0] = ((char*) state
->start
- base
) / n
;
3798 match
->mark
[1] = ((char*) state
->ptr
- base
) / n
;
3800 for (i
= j
= 0; i
< pattern
->groups
; i
++, j
+=2)
3801 if (j
+1 <= state
->lastmark
&& state
->mark
[j
] && state
->mark
[j
+1]) {
3802 match
->mark
[j
+2] = ((char*) state
->mark
[j
] - base
) / n
;
3803 match
->mark
[j
+3] = ((char*) state
->mark
[j
+1] - base
) / n
;
3805 match
->mark
[j
+2] = match
->mark
[j
+3] = -1; /* undefined */
3807 match
->pos
= state
->pos
;
3808 match
->endpos
= state
->endpos
;
3810 match
->lastindex
= state
->lastindex
;
3812 return (PyObject
*) match
;
3814 } else if (status
== 0) {
3822 /* internal error */
3823 pattern_error(status
);
3828 /* -------------------------------------------------------------------- */
3829 /* scanner methods (experimental) */
3832 scanner_dealloc(ScannerObject
* self
)
3834 state_fini(&self
->state
);
3835 Py_XDECREF(self
->pattern
);
3840 scanner_match(ScannerObject
* self
, PyObject
*unused
)
3842 SRE_STATE
* state
= &self
->state
;
3848 state
->ptr
= state
->start
;
3850 if (state
->charsize
== 1) {
3851 status
= sre_match(state
, PatternObject_GetCode(self
->pattern
));
3853 #if defined(HAVE_UNICODE)
3854 status
= sre_umatch(state
, PatternObject_GetCode(self
->pattern
));
3857 if (PyErr_Occurred())
3860 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3863 if (status
== 0 || state
->ptr
== state
->start
)
3864 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3866 state
->start
= state
->ptr
;
3873 scanner_search(ScannerObject
* self
, PyObject
*unused
)
3875 SRE_STATE
* state
= &self
->state
;
3881 state
->ptr
= state
->start
;
3883 if (state
->charsize
== 1) {
3884 status
= sre_search(state
, PatternObject_GetCode(self
->pattern
));
3886 #if defined(HAVE_UNICODE)
3887 status
= sre_usearch(state
, PatternObject_GetCode(self
->pattern
));
3890 if (PyErr_Occurred())
3893 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3896 if (status
== 0 || state
->ptr
== state
->start
)
3897 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3899 state
->start
= state
->ptr
;
3904 static PyMethodDef scanner_methods
[] = {
3905 {"match", (PyCFunction
) scanner_match
, METH_NOARGS
},
3906 {"search", (PyCFunction
) scanner_search
, METH_NOARGS
},
3910 #define SCAN_OFF(x) offsetof(ScannerObject, x)
3911 static PyMemberDef scanner_members
[] = {
3912 {"pattern", T_OBJECT
, SCAN_OFF(pattern
), READONLY
},
3913 {NULL
} /* Sentinel */
3916 statichere PyTypeObject Scanner_Type
= {
3917 PyObject_HEAD_INIT(NULL
)
3918 0, "_" SRE_MODULE
".SRE_Scanner",
3919 sizeof(ScannerObject
), 0,
3920 (destructor
)scanner_dealloc
, /*tp_dealloc*/
3924 0, /* tp_reserved */
3926 0, /* tp_as_number */
3927 0, /* tp_as_sequence */
3928 0, /* tp_as_mapping */
3932 0, /* tp_getattro */
3933 0, /* tp_setattro */
3934 0, /* tp_as_buffer */
3935 Py_TPFLAGS_DEFAULT
, /* tp_flags */
3937 0, /* tp_traverse */
3939 0, /* tp_richcompare */
3940 0, /* tp_weaklistoffset */
3942 0, /* tp_iternext */
3943 scanner_methods
, /* tp_methods */
3944 scanner_members
, /* tp_members */
3949 pattern_scanner(PatternObject
* pattern
, PyObject
* args
)
3951 /* create search state object */
3953 ScannerObject
* self
;
3956 Py_ssize_t start
= 0;
3957 Py_ssize_t end
= PY_SSIZE_T_MAX
;
3958 if (!PyArg_ParseTuple(args
, "O|nn:scanner", &string
, &start
, &end
))
3961 /* create scanner object */
3962 self
= PyObject_NEW(ScannerObject
, &Scanner_Type
);
3965 self
->pattern
= NULL
;
3967 string
= state_init(&self
->state
, pattern
, string
, start
, end
);
3974 self
->pattern
= (PyObject
*) pattern
;
3976 return (PyObject
*) self
;
3979 static PyMethodDef _functions
[] = {
3980 {"compile", _compile
, METH_VARARGS
},
3981 {"getcodesize", sre_codesize
, METH_NOARGS
},
3982 {"getlower", sre_getlower
, METH_VARARGS
},
3986 #if PY_VERSION_HEX < 0x02030000
3987 DL_EXPORT(void) init_sre(void)
3989 PyMODINIT_FUNC
init_sre(void)
3996 /* Patch object types */
3997 if (PyType_Ready(&Pattern_Type
) || PyType_Ready(&Match_Type
) ||
3998 PyType_Ready(&Scanner_Type
))
4001 m
= Py_InitModule("_" SRE_MODULE
, _functions
);
4004 d
= PyModule_GetDict(m
);
4006 x
= PyInt_FromLong(SRE_MAGIC
);
4008 PyDict_SetItemString(d
, "MAGIC", x
);
4012 x
= PyInt_FromLong(sizeof(SRE_CODE
));
4014 PyDict_SetItemString(d
, "CODESIZE", x
);
4018 x
= PyLong_FromUnsignedLong(SRE_MAXREPEAT
);
4020 PyDict_SetItemString(d
, "MAXREPEAT", x
);
4024 x
= PyString_FromString(copyright
);
4026 PyDict_SetItemString(d
, "copyright", x
);
4031 #endif /* !defined(SRE_RECURSIVE) */