2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
6 Copyright (c) 2015, Daryl McDaniel. All rights reserved.<BR>
7 Copyright (c) 2011, Intel Corporation. All rights reserved.<BR>
8 This program and the accompanying materials are licensed and made available under
9 the terms and conditions of the BSD License that accompanies this distribution.
10 The full text of the license may be found at
11 http://opensource.org/licenses/bsd-license.
13 THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
14 WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
17 * 1999-10-24 fl created (based on existing template matcher code)
18 * 2000-03-06 fl first alpha, sort of
19 * 2000-08-01 fl fixes for 1.6b1
20 * 2000-08-07 fl use PyOS_CheckStack() if available
21 * 2000-09-20 fl added expand method
22 * 2001-03-20 fl lots of fixes for 2.1b2
23 * 2001-04-15 fl export copyright as Python attribute, not global
24 * 2001-04-28 fl added __copy__ methods (work in progress)
25 * 2001-05-14 fl fixes for 1.5.2 compatibility
26 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
27 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
28 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
29 * 2001-10-21 fl added sub/subn primitive
30 * 2001-10-24 fl added finditer primitive (for 2.2 only)
31 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
32 * 2002-11-09 fl fixed empty sub/subn return type
33 * 2003-04-18 mvl fully support 4-byte codes
34 * 2003-10-17 gn implemented non recursive scheme
36 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
38 * This version of the SRE library can be redistributed under CNRI's
39 * Python 1.6 license. For any other use, please contact Secret Labs
40 * AB (info@pythonware.com).
42 * Portions of this engine have been developed in cooperation with
43 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
44 * other compatibility work.
47 /* Get rid of these macros to prevent collisions between EFI and Python in this file. */
53 static char copyright
[] =
54 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
56 #define PY_SSIZE_T_CLEAN
59 #include "structmember.h" /* offsetof */
65 /* name of this module, minus the leading underscore */
66 #if !defined(SRE_MODULE)
67 #define SRE_MODULE "sre"
70 #define SRE_PY_MODULE "re"
72 /* defining this one enables tracing */
75 #if PY_VERSION_HEX >= 0x01060000
76 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
77 /* defining this enables unicode support (default under 1.6a1 and later) */
82 /* -------------------------------------------------------------------- */
83 /* optional features */
85 /* enables fast searching */
86 #define USE_FAST_SEARCH
88 /* enables aggressive inlining (always on for Visual C) */
91 /* enables copy/deepcopy handling (work in progress) */
92 #undef USE_BUILTIN_COPY
94 #if PY_VERSION_HEX < 0x01060000
95 #define PyObject_DEL(op) PyMem_DEL((op))
98 /* -------------------------------------------------------------------- */
100 #if defined(_MSC_VER)
101 #pragma optimize("gt", on) /* doesn't seem to make much difference... */
102 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
103 /* fastest possible local call under MSVC */
104 #define LOCAL(type) static __inline type __fastcall
105 #elif defined(USE_INLINE)
106 #define LOCAL(type) static inline type
108 #define LOCAL(type) static type
112 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
113 #define SRE_ERROR_STATE -2 /* illegal state */
114 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
115 #define SRE_ERROR_MEMORY -9 /* out of memory */
116 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
119 #define TRACE(v) printf v
124 /* -------------------------------------------------------------------- */
125 /* search engine state */
127 /* default character predicates (run sre_chars.py to regenerate tables) */
129 #define SRE_DIGIT_MASK 1
130 #define SRE_SPACE_MASK 2
131 #define SRE_LINEBREAK_MASK 4
132 #define SRE_ALNUM_MASK 8
133 #define SRE_WORD_MASK 16
135 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
137 static char sre_char_info
[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
138 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
139 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
140 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
141 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
142 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
143 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
145 static char sre_char_lower
[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
146 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
147 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
148 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
149 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
150 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
151 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
152 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
153 120, 121, 122, 123, 124, 125, 126, 127 };
155 #define SRE_IS_DIGIT(ch)\
156 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
157 #define SRE_IS_SPACE(ch)\
158 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
159 #define SRE_IS_LINEBREAK(ch)\
160 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
161 #define SRE_IS_ALNUM(ch)\
162 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
163 #define SRE_IS_WORD(ch)\
164 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
166 static unsigned int sre_lower(unsigned int ch
)
168 return ((ch
) < 128 ? (unsigned int)sre_char_lower
[ch
] : ch
);
171 /* locale-specific character predicates */
172 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
173 * warnings when c's type supports only numbers < N+1 */
174 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
175 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
176 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
177 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
178 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
180 static unsigned int sre_lower_locale(unsigned int ch
)
182 return ((ch
) < 256 ? (unsigned int)tolower((ch
)) : ch
);
185 /* unicode-specific character predicates */
187 #if defined(HAVE_UNICODE)
189 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
190 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
191 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
192 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
193 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
195 static unsigned int sre_lower_unicode(unsigned int ch
)
197 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE
)(ch
));
203 sre_category(SRE_CODE category
, unsigned int ch
)
207 case SRE_CATEGORY_DIGIT
:
208 return SRE_IS_DIGIT(ch
);
209 case SRE_CATEGORY_NOT_DIGIT
:
210 return !SRE_IS_DIGIT(ch
);
211 case SRE_CATEGORY_SPACE
:
212 return SRE_IS_SPACE(ch
);
213 case SRE_CATEGORY_NOT_SPACE
:
214 return !SRE_IS_SPACE(ch
);
215 case SRE_CATEGORY_WORD
:
216 return SRE_IS_WORD(ch
);
217 case SRE_CATEGORY_NOT_WORD
:
218 return !SRE_IS_WORD(ch
);
219 case SRE_CATEGORY_LINEBREAK
:
220 return SRE_IS_LINEBREAK(ch
);
221 case SRE_CATEGORY_NOT_LINEBREAK
:
222 return !SRE_IS_LINEBREAK(ch
);
224 case SRE_CATEGORY_LOC_WORD
:
225 return SRE_LOC_IS_WORD(ch
);
226 case SRE_CATEGORY_LOC_NOT_WORD
:
227 return !SRE_LOC_IS_WORD(ch
);
229 #if defined(HAVE_UNICODE)
230 case SRE_CATEGORY_UNI_DIGIT
:
231 return SRE_UNI_IS_DIGIT(ch
);
232 case SRE_CATEGORY_UNI_NOT_DIGIT
:
233 return !SRE_UNI_IS_DIGIT(ch
);
234 case SRE_CATEGORY_UNI_SPACE
:
235 return SRE_UNI_IS_SPACE(ch
);
236 case SRE_CATEGORY_UNI_NOT_SPACE
:
237 return !SRE_UNI_IS_SPACE(ch
);
238 case SRE_CATEGORY_UNI_WORD
:
239 return SRE_UNI_IS_WORD(ch
);
240 case SRE_CATEGORY_UNI_NOT_WORD
:
241 return !SRE_UNI_IS_WORD(ch
);
242 case SRE_CATEGORY_UNI_LINEBREAK
:
243 return SRE_UNI_IS_LINEBREAK(ch
);
244 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
245 return !SRE_UNI_IS_LINEBREAK(ch
);
247 case SRE_CATEGORY_UNI_DIGIT
:
248 return SRE_IS_DIGIT(ch
);
249 case SRE_CATEGORY_UNI_NOT_DIGIT
:
250 return !SRE_IS_DIGIT(ch
);
251 case SRE_CATEGORY_UNI_SPACE
:
252 return SRE_IS_SPACE(ch
);
253 case SRE_CATEGORY_UNI_NOT_SPACE
:
254 return !SRE_IS_SPACE(ch
);
255 case SRE_CATEGORY_UNI_WORD
:
256 return SRE_LOC_IS_WORD(ch
);
257 case SRE_CATEGORY_UNI_NOT_WORD
:
258 return !SRE_LOC_IS_WORD(ch
);
259 case SRE_CATEGORY_UNI_LINEBREAK
:
260 return SRE_IS_LINEBREAK(ch
);
261 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
262 return !SRE_IS_LINEBREAK(ch
);
271 data_stack_dealloc(SRE_STATE
* state
)
273 if (state
->data_stack
) {
274 PyMem_FREE(state
->data_stack
);
275 state
->data_stack
= NULL
;
277 state
->data_stack_size
= state
->data_stack_base
= 0;
281 data_stack_grow(SRE_STATE
* state
, Py_ssize_t size
)
283 Py_ssize_t minsize
, cursize
;
284 minsize
= state
->data_stack_base
+size
;
285 cursize
= state
->data_stack_size
;
286 if (cursize
< minsize
) {
288 cursize
= minsize
+minsize
/4+1024;
289 TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T
"d\n", cursize
));
290 stack
= PyMem_REALLOC(state
->data_stack
, cursize
);
292 data_stack_dealloc(state
);
293 return SRE_ERROR_MEMORY
;
295 state
->data_stack
= (char *)stack
;
296 state
->data_stack_size
= cursize
;
301 /* generate 8-bit version */
303 #define SRE_CHAR unsigned char
304 #define SRE_AT sre_at
305 #define SRE_COUNT sre_count
306 #define SRE_CHARSET sre_charset
307 #define SRE_INFO sre_info
308 #define SRE_MATCH sre_match
309 #define SRE_MATCH_CONTEXT sre_match_context
310 #define SRE_SEARCH sre_search
311 #define SRE_LITERAL_TEMPLATE sre_literal_template
313 #if defined(HAVE_UNICODE)
315 #define SRE_RECURSIVE
319 #undef SRE_LITERAL_TEMPLATE
322 #undef SRE_MATCH_CONTEXT
329 /* generate 16-bit unicode version */
331 #define SRE_CHAR Py_UNICODE
332 #define SRE_AT sre_uat
333 #define SRE_COUNT sre_ucount
334 #define SRE_CHARSET sre_ucharset
335 #define SRE_INFO sre_uinfo
336 #define SRE_MATCH sre_umatch
337 #define SRE_MATCH_CONTEXT sre_umatch_context
338 #define SRE_SEARCH sre_usearch
339 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
342 #endif /* SRE_RECURSIVE */
344 /* -------------------------------------------------------------------- */
345 /* String matching engine */
347 /* the following section is compiled twice, with different character
351 SRE_AT(SRE_STATE
* state
, SRE_CHAR
* ptr
, SRE_CODE at
)
353 /* check if pointer is at given position */
355 Py_ssize_t thisp
, thatp
;
359 case SRE_AT_BEGINNING
:
360 case SRE_AT_BEGINNING_STRING
:
361 return ((void*) ptr
== state
->beginning
);
363 case SRE_AT_BEGINNING_LINE
:
364 return ((void*) ptr
== state
->beginning
||
365 SRE_IS_LINEBREAK((int) ptr
[-1]));
368 return (((void*) (ptr
+1) == state
->end
&&
369 SRE_IS_LINEBREAK((int) ptr
[0])) ||
370 ((void*) ptr
== state
->end
));
372 case SRE_AT_END_LINE
:
373 return ((void*) ptr
== state
->end
||
374 SRE_IS_LINEBREAK((int) ptr
[0]));
376 case SRE_AT_END_STRING
:
377 return ((void*) ptr
== state
->end
);
379 case SRE_AT_BOUNDARY
:
380 if (state
->beginning
== state
->end
)
382 thatp
= ((void*) ptr
> state
->beginning
) ?
383 SRE_IS_WORD((int) ptr
[-1]) : 0;
384 thisp
= ((void*) ptr
< state
->end
) ?
385 SRE_IS_WORD((int) ptr
[0]) : 0;
386 return thisp
!= thatp
;
388 case SRE_AT_NON_BOUNDARY
:
389 if (state
->beginning
== state
->end
)
391 thatp
= ((void*) ptr
> state
->beginning
) ?
392 SRE_IS_WORD((int) ptr
[-1]) : 0;
393 thisp
= ((void*) ptr
< state
->end
) ?
394 SRE_IS_WORD((int) ptr
[0]) : 0;
395 return thisp
== thatp
;
397 case SRE_AT_LOC_BOUNDARY
:
398 if (state
->beginning
== state
->end
)
400 thatp
= ((void*) ptr
> state
->beginning
) ?
401 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
402 thisp
= ((void*) ptr
< state
->end
) ?
403 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
404 return thisp
!= thatp
;
406 case SRE_AT_LOC_NON_BOUNDARY
:
407 if (state
->beginning
== state
->end
)
409 thatp
= ((void*) ptr
> state
->beginning
) ?
410 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
411 thisp
= ((void*) ptr
< state
->end
) ?
412 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
413 return thisp
== thatp
;
415 #if defined(HAVE_UNICODE)
416 case SRE_AT_UNI_BOUNDARY
:
417 if (state
->beginning
== state
->end
)
419 thatp
= ((void*) ptr
> state
->beginning
) ?
420 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
421 thisp
= ((void*) ptr
< state
->end
) ?
422 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
423 return thisp
!= thatp
;
425 case SRE_AT_UNI_NON_BOUNDARY
:
426 if (state
->beginning
== state
->end
)
428 thatp
= ((void*) ptr
> state
->beginning
) ?
429 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
430 thisp
= ((void*) ptr
< state
->end
) ?
431 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
432 return thisp
== thatp
;
441 SRE_CHARSET(SRE_CODE
* set
, SRE_CODE ch
)
443 /* check if character is a member of the given set */
454 /* <LITERAL> <code> */
460 case SRE_OP_CATEGORY
:
461 /* <CATEGORY> <code> */
462 if (sre_category(set
[0], (int) ch
))
468 if (sizeof(SRE_CODE
) == 2) {
469 /* <CHARSET> <bitmap> (16 bits per code word) */
470 if (ch
< 256 && (set
[ch
>> 4] & (1 << (ch
& 15))))
475 /* <CHARSET> <bitmap> (32 bits per code word) */
476 if (ch
< 256 && (set
[ch
>> 5] & (1u << (ch
& 31))))
483 /* <RANGE> <lower> <upper> */
484 if (set
[0] <= ch
&& ch
<= set
[1])
493 case SRE_OP_BIGCHARSET
:
494 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
496 Py_ssize_t count
, block
;
499 if (sizeof(SRE_CODE
) == 2) {
500 block
= ((unsigned char*)set
)[ch
>> 8];
502 if (set
[block
*16 + ((ch
& 255)>>4)] & (1 << (ch
& 15)))
507 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
508 * warnings when c's type supports only numbers < N+1 */
510 block
= ((unsigned char*)set
)[ch
>> 8];
515 (set
[block
*8 + ((ch
& 255)>>5)] & (1u << (ch
& 31))))
523 /* internal error -- there's not much we can do about it
524 here, so let's just pretend it didn't match... */
530 LOCAL(Py_ssize_t
) SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
);
533 SRE_COUNT(SRE_STATE
* state
, SRE_CODE
* pattern
, Py_ssize_t maxcount
)
536 SRE_CHAR
* ptr
= (SRE_CHAR
*)state
->ptr
;
537 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
541 if (maxcount
< end
- ptr
&& maxcount
!= SRE_MAXREPEAT
)
542 end
= ptr
+ maxcount
;
544 switch (pattern
[0]) {
548 TRACE(("|%p|%p|COUNT IN\n", pattern
, ptr
));
549 while (ptr
< end
&& SRE_CHARSET(pattern
+ 2, *ptr
))
554 /* repeated dot wildcard. */
555 TRACE(("|%p|%p|COUNT ANY\n", pattern
, ptr
));
556 while (ptr
< end
&& !SRE_IS_LINEBREAK(*ptr
))
561 /* repeated dot wildcard. skip to the end of the target
562 string, and backtrack from there */
563 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern
, ptr
));
568 /* repeated literal */
570 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern
, ptr
, chr
));
571 while (ptr
< end
&& (SRE_CODE
) *ptr
== chr
)
575 case SRE_OP_LITERAL_IGNORE
:
576 /* repeated literal */
578 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
579 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) == chr
)
583 case SRE_OP_NOT_LITERAL
:
584 /* repeated non-literal */
586 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern
, ptr
, chr
));
587 while (ptr
< end
&& (SRE_CODE
) *ptr
!= chr
)
591 case SRE_OP_NOT_LITERAL_IGNORE
:
592 /* repeated non-literal */
594 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
595 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) != chr
)
600 /* repeated single character pattern */
601 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern
, ptr
));
602 while ((SRE_CHAR
*) state
->ptr
< end
) {
603 i
= SRE_MATCH(state
, pattern
);
609 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T
"d\n", pattern
, ptr
,
610 (SRE_CHAR
*) state
->ptr
- ptr
));
611 return (SRE_CHAR
*) state
->ptr
- ptr
;
614 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T
"d\n", pattern
, ptr
,
615 ptr
- (SRE_CHAR
*) state
->ptr
));
616 return ptr
- (SRE_CHAR
*) state
->ptr
;
619 #if 0 /* not used in this release */
621 SRE_INFO(SRE_STATE
* state
, SRE_CODE
* pattern
)
623 /* check if an SRE_OP_INFO block matches at the current position.
624 returns the number of SRE_CODE objects to skip if successful, 0
627 SRE_CHAR
* end
= state
->end
;
628 SRE_CHAR
* ptr
= state
->ptr
;
631 /* check minimal length */
632 if (pattern
[3] && (end
- ptr
) < pattern
[3])
635 /* check known prefix */
636 if (pattern
[2] & SRE_INFO_PREFIX
&& pattern
[5] > 1) {
637 /* <length> <skip> <prefix data> <overlap data> */
638 for (i
= 0; i
< pattern
[5]; i
++)
639 if ((SRE_CODE
) ptr
[i
] != pattern
[7 + i
])
641 return pattern
[0] + 2 * pattern
[6];
647 /* The macros below should be used to protect recursive SRE_MATCH()
648 * calls that *failed* and do *not* return immediately (IOW, those
649 * that will backtrack). Explaining:
651 * - Recursive SRE_MATCH() returned true: that's usually a success
652 * (besides atypical cases like ASSERT_NOT), therefore there's no
653 * reason to restore lastmark;
655 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
656 * is returning to the caller: If the current SRE_MATCH() is the
657 * top function of the recursion, returning false will be a matching
658 * failure, and it doesn't matter where lastmark is pointing to.
659 * If it's *not* the top function, it will be a recursive SRE_MATCH()
660 * failure by itself, and the calling SRE_MATCH() will have to deal
661 * with the failure by the same rules explained here (it will restore
662 * lastmark by itself if necessary);
664 * - Recursive SRE_MATCH() returned false, and will continue the
665 * outside 'for' loop: must be protected when breaking, since the next
666 * OP could potentially depend on lastmark;
668 * - Recursive SRE_MATCH() returned false, and will be called again
669 * inside a local for/while loop: must be protected between each
670 * loop iteration, since the recursive SRE_MATCH() could do anything,
671 * and could potentially depend on lastmark.
673 * For more information, check the discussion at SF patch #712900.
675 #define LASTMARK_SAVE() \
677 ctx->lastmark = state->lastmark; \
678 ctx->lastindex = state->lastindex; \
680 #define LASTMARK_RESTORE() \
682 state->lastmark = ctx->lastmark; \
683 state->lastindex = ctx->lastindex; \
686 #define RETURN_ERROR(i) do { return i; } while(0)
687 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
688 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
690 #define RETURN_ON_ERROR(i) \
691 do { if (i < 0) RETURN_ERROR(i); } while (0)
692 #define RETURN_ON_SUCCESS(i) \
693 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
694 #define RETURN_ON_FAILURE(i) \
695 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
699 #define DATA_STACK_ALLOC(state, type, ptr) \
701 alloc_pos = state->data_stack_base; \
702 TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
703 "(%" PY_FORMAT_SIZE_T "d)\n", \
704 SFY(type), alloc_pos, sizeof(type))); \
705 if (sizeof(type) > state->data_stack_size - alloc_pos) { \
706 int j = data_stack_grow(state, sizeof(type)); \
707 if (j < 0) return j; \
709 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
711 ptr = (type*)(state->data_stack+alloc_pos); \
712 state->data_stack_base += sizeof(type); \
715 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
717 TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", SFY(type), pos)); \
718 ptr = (type*)(state->data_stack+pos); \
721 #define DATA_STACK_PUSH(state, data, size) \
723 TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
724 "(%" PY_FORMAT_SIZE_T "d)\n", \
725 data, state->data_stack_base, size)); \
726 if (size > state->data_stack_size - state->data_stack_base) { \
727 int j = data_stack_grow(state, size); \
728 if (j < 0) return j; \
730 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
732 memcpy(state->data_stack+state->data_stack_base, data, size); \
733 state->data_stack_base += size; \
736 #define DATA_STACK_POP(state, data, size, discard) \
738 TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
739 "(%" PY_FORMAT_SIZE_T "d)\n", \
740 data, state->data_stack_base-size, size)); \
741 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
743 state->data_stack_base -= size; \
746 #define DATA_STACK_POP_DISCARD(state, size) \
748 TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
749 "(%" PY_FORMAT_SIZE_T "d)\n", \
750 state->data_stack_base-size, size)); \
751 state->data_stack_base -= size; \
754 #define DATA_PUSH(x) \
755 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
756 #define DATA_POP(x) \
757 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
758 #define DATA_POP_DISCARD(x) \
759 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
760 #define DATA_ALLOC(t,p) \
761 DATA_STACK_ALLOC(state, t, p)
762 #define DATA_LOOKUP_AT(t,p,pos) \
763 DATA_STACK_LOOKUP_AT(state,t,p,pos)
765 #define MARK_PUSH(lastmark) \
766 do if (lastmark > 0) { \
767 i = lastmark; /* ctx->lastmark may change if reallocated */ \
768 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
770 #define MARK_POP(lastmark) \
771 do if (lastmark > 0) { \
772 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
774 #define MARK_POP_KEEP(lastmark) \
775 do if (lastmark > 0) { \
776 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
778 #define MARK_POP_DISCARD(lastmark) \
779 do if (lastmark > 0) { \
780 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
784 #define JUMP_MAX_UNTIL_1 1
785 #define JUMP_MAX_UNTIL_2 2
786 #define JUMP_MAX_UNTIL_3 3
787 #define JUMP_MIN_UNTIL_1 4
788 #define JUMP_MIN_UNTIL_2 5
789 #define JUMP_MIN_UNTIL_3 6
790 #define JUMP_REPEAT 7
791 #define JUMP_REPEAT_ONE_1 8
792 #define JUMP_REPEAT_ONE_2 9
793 #define JUMP_MIN_REPEAT_ONE 10
794 #define JUMP_BRANCH 11
795 #define JUMP_ASSERT 12
796 #define JUMP_ASSERT_NOT 13
798 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
799 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
800 nextctx->last_ctx_pos = ctx_pos; \
801 nextctx->jump = jumpvalue; \
802 nextctx->pattern = nextpattern; \
803 ctx_pos = alloc_pos; \
807 while (0) /* gcc doesn't like labels at end of scopes */ \
810 Py_ssize_t last_ctx_pos
;
816 Py_ssize_t lastindex
;
823 /* check if string matches the given pattern. returns <0 for
824 error, 0 for failure, and 1 for success */
826 SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
828 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
829 Py_ssize_t alloc_pos
, ctx_pos
= -1;
830 Py_ssize_t i
, ret
= 0;
832 unsigned int sigcount
=0;
834 SRE_MATCH_CONTEXT
* ctx
;
835 SRE_MATCH_CONTEXT
* nextctx
;
837 TRACE(("|%p|%p|ENTER\n", pattern
, state
->ptr
));
839 DATA_ALLOC(SRE_MATCH_CONTEXT
, ctx
);
840 ctx
->last_ctx_pos
= -1;
841 ctx
->jump
= JUMP_NONE
;
842 ctx
->pattern
= pattern
;
847 ctx
->ptr
= (SRE_CHAR
*)state
->ptr
;
849 if (ctx
->pattern
[0] == SRE_OP_INFO
) {
850 /* optimization info block */
851 /* <INFO> <1=skip> <2=flags> <3=min> ... */
852 if (ctx
->pattern
[3] && (end
- ctx
->ptr
) < ctx
->pattern
[3]) {
853 TRACE(("reject (got %" PY_FORMAT_SIZE_T
"d chars, "
854 "need %" PY_FORMAT_SIZE_T
"d)\n",
855 (end
- ctx
->ptr
), (Py_ssize_t
) ctx
->pattern
[3]));
858 ctx
->pattern
+= ctx
->pattern
[1] + 1;
863 if ((0 == (sigcount
& 0xfff)) && PyErr_CheckSignals())
864 RETURN_ERROR(SRE_ERROR_INTERRUPTED
);
866 switch (*ctx
->pattern
++) {
871 TRACE(("|%p|%p|MARK %d\n", ctx
->pattern
,
872 ctx
->ptr
, ctx
->pattern
[0]));
875 state
->lastindex
= i
/2 + 1;
876 if (i
> state
->lastmark
) {
877 /* state->lastmark is the highest valid index in the
878 state->mark array. If it is increased by more than 1,
879 the intervening marks must be set to NULL to signal
880 that these marks have not been encountered. */
881 Py_ssize_t j
= state
->lastmark
+ 1;
883 state
->mark
[j
++] = NULL
;
886 state
->mark
[i
] = ctx
->ptr
;
891 /* match literal string */
892 /* <LITERAL> <code> */
893 TRACE(("|%p|%p|LITERAL %d\n", ctx
->pattern
,
894 ctx
->ptr
, *ctx
->pattern
));
895 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] != ctx
->pattern
[0])
901 case SRE_OP_NOT_LITERAL
:
902 /* match anything that is not literal character */
903 /* <NOT_LITERAL> <code> */
904 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx
->pattern
,
905 ctx
->ptr
, *ctx
->pattern
));
906 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] == ctx
->pattern
[0])
914 TRACE(("|%p|%p|SUCCESS\n", ctx
->pattern
, ctx
->ptr
));
915 state
->ptr
= ctx
->ptr
;
919 /* match at given position */
921 TRACE(("|%p|%p|AT %d\n", ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
922 if (!SRE_AT(state
, ctx
->ptr
, *ctx
->pattern
))
927 case SRE_OP_CATEGORY
:
928 /* match at given category */
929 /* <CATEGORY> <code> */
930 TRACE(("|%p|%p|CATEGORY %d\n", ctx
->pattern
,
931 ctx
->ptr
, *ctx
->pattern
));
932 if (ctx
->ptr
>= end
|| !sre_category(ctx
->pattern
[0], ctx
->ptr
[0]))
939 /* match anything (except a newline) */
941 TRACE(("|%p|%p|ANY\n", ctx
->pattern
, ctx
->ptr
));
942 if (ctx
->ptr
>= end
|| SRE_IS_LINEBREAK(ctx
->ptr
[0]))
950 TRACE(("|%p|%p|ANY_ALL\n", ctx
->pattern
, ctx
->ptr
));
957 /* match set member (or non_member) */
958 /* <IN> <skip> <set> */
959 TRACE(("|%p|%p|IN\n", ctx
->pattern
, ctx
->ptr
));
960 if (ctx
->ptr
>= end
|| !SRE_CHARSET(ctx
->pattern
+ 1, *ctx
->ptr
))
962 ctx
->pattern
+= ctx
->pattern
[0];
966 case SRE_OP_LITERAL_IGNORE
:
967 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
968 ctx
->pattern
, ctx
->ptr
, ctx
->pattern
[0]));
969 if (ctx
->ptr
>= end
||
970 state
->lower(*ctx
->ptr
) != state
->lower(*ctx
->pattern
))
976 case SRE_OP_NOT_LITERAL_IGNORE
:
977 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
978 ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
979 if (ctx
->ptr
>= end
||
980 state
->lower(*ctx
->ptr
) == state
->lower(*ctx
->pattern
))
986 case SRE_OP_IN_IGNORE
:
987 TRACE(("|%p|%p|IN_IGNORE\n", ctx
->pattern
, ctx
->ptr
));
989 || !SRE_CHARSET(ctx
->pattern
+1,
990 (SRE_CODE
)state
->lower(*ctx
->ptr
)))
992 ctx
->pattern
+= ctx
->pattern
[0];
999 /* <JUMP> <offset> */
1000 TRACE(("|%p|%p|JUMP %d\n", ctx
->pattern
,
1001 ctx
->ptr
, ctx
->pattern
[0]));
1002 ctx
->pattern
+= ctx
->pattern
[0];
1007 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
1008 TRACE(("|%p|%p|BRANCH\n", ctx
->pattern
, ctx
->ptr
));
1010 ctx
->u
.rep
= state
->repeat
;
1012 MARK_PUSH(ctx
->lastmark
);
1013 for (; ctx
->pattern
[0]; ctx
->pattern
+= ctx
->pattern
[0]) {
1014 if (ctx
->pattern
[1] == SRE_OP_LITERAL
&&
1016 (SRE_CODE
) *ctx
->ptr
!= ctx
->pattern
[2]))
1018 if (ctx
->pattern
[1] == SRE_OP_IN
&&
1020 !SRE_CHARSET(ctx
->pattern
+ 3, (SRE_CODE
) *ctx
->ptr
)))
1022 state
->ptr
= ctx
->ptr
;
1023 DO_JUMP(JUMP_BRANCH
, jump_branch
, ctx
->pattern
+1);
1026 MARK_POP_DISCARD(ctx
->lastmark
);
1027 RETURN_ON_ERROR(ret
);
1031 MARK_POP_KEEP(ctx
->lastmark
);
1035 MARK_POP_DISCARD(ctx
->lastmark
);
1038 case SRE_OP_REPEAT_ONE
:
1039 /* match repeated sequence (maximizing regexp) */
1041 /* this operator only works if the repeated item is
1042 exactly one character wide, and we're not already
1043 collecting backtracking points. for other cases,
1044 use the MAX_REPEAT operator */
1046 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1048 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1049 ctx
->pattern
[1], ctx
->pattern
[2]));
1051 if ((Py_ssize_t
) ctx
->pattern
[1] > end
- ctx
->ptr
)
1052 RETURN_FAILURE
; /* cannot match */
1054 state
->ptr
= ctx
->ptr
;
1056 ret
= SRE_COUNT(state
, ctx
->pattern
+3, ctx
->pattern
[2]);
1057 RETURN_ON_ERROR(ret
);
1058 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1060 ctx
->ptr
+= ctx
->count
;
1062 /* when we arrive here, count contains the number of
1063 matches, and ctx->ptr points to the tail of the target
1064 string. check if the rest of the pattern matches,
1065 and backtrack if not. */
1067 if (ctx
->count
< (Py_ssize_t
) ctx
->pattern
[1])
1070 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1071 /* tail is empty. we're finished */
1072 state
->ptr
= ctx
->ptr
;
1078 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_LITERAL
) {
1079 /* tail starts with a literal. skip positions where
1080 the rest of the pattern cannot possibly match */
1081 ctx
->u
.chr
= ctx
->pattern
[ctx
->pattern
[0]+1];
1083 while (ctx
->count
>= (Py_ssize_t
) ctx
->pattern
[1] &&
1084 (ctx
->ptr
>= end
|| *ctx
->ptr
!= ctx
->u
.chr
)) {
1088 if (ctx
->count
< (Py_ssize_t
) ctx
->pattern
[1])
1090 state
->ptr
= ctx
->ptr
;
1091 DO_JUMP(JUMP_REPEAT_ONE_1
, jump_repeat_one_1
,
1092 ctx
->pattern
+ctx
->pattern
[0]);
1094 RETURN_ON_ERROR(ret
);
1106 while (ctx
->count
>= (Py_ssize_t
) ctx
->pattern
[1]) {
1107 state
->ptr
= ctx
->ptr
;
1108 DO_JUMP(JUMP_REPEAT_ONE_2
, jump_repeat_one_2
,
1109 ctx
->pattern
+ctx
->pattern
[0]);
1111 RETURN_ON_ERROR(ret
);
1121 case SRE_OP_MIN_REPEAT_ONE
:
1122 /* match repeated sequence (minimizing regexp) */
1124 /* this operator only works if the repeated item is
1125 exactly one character wide, and we're not already
1126 collecting backtracking points. for other cases,
1127 use the MIN_REPEAT operator */
1129 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1131 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1132 ctx
->pattern
[1], ctx
->pattern
[2]));
1134 if ((Py_ssize_t
) ctx
->pattern
[1] > end
- ctx
->ptr
)
1135 RETURN_FAILURE
; /* cannot match */
1137 state
->ptr
= ctx
->ptr
;
1139 if (ctx
->pattern
[1] == 0)
1142 /* count using pattern min as the maximum */
1143 ret
= SRE_COUNT(state
, ctx
->pattern
+3, ctx
->pattern
[1]);
1144 RETURN_ON_ERROR(ret
);
1145 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1146 if (ret
< (Py_ssize_t
) ctx
->pattern
[1])
1147 /* didn't match minimum number of times */
1149 /* advance past minimum matches of repeat */
1151 ctx
->ptr
+= ctx
->count
;
1154 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1155 /* tail is empty. we're finished */
1156 state
->ptr
= ctx
->ptr
;
1162 while ((Py_ssize_t
)ctx
->pattern
[2] == SRE_MAXREPEAT
1163 || ctx
->count
<= (Py_ssize_t
)ctx
->pattern
[2]) {
1164 state
->ptr
= ctx
->ptr
;
1165 DO_JUMP(JUMP_MIN_REPEAT_ONE
,jump_min_repeat_one
,
1166 ctx
->pattern
+ctx
->pattern
[0]);
1168 RETURN_ON_ERROR(ret
);
1171 state
->ptr
= ctx
->ptr
;
1172 ret
= SRE_COUNT(state
, ctx
->pattern
+3, 1);
1173 RETURN_ON_ERROR(ret
);
1174 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1186 /* create repeat context. all the hard work is done
1187 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1188 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1189 TRACE(("|%p|%p|REPEAT %d %d\n", ctx
->pattern
, ctx
->ptr
,
1190 ctx
->pattern
[1], ctx
->pattern
[2]));
1192 /* install new repeat context */
1193 ctx
->u
.rep
= (SRE_REPEAT
*) PyObject_MALLOC(sizeof(*ctx
->u
.rep
));
1198 ctx
->u
.rep
->count
= -1;
1199 ctx
->u
.rep
->pattern
= ctx
->pattern
;
1200 ctx
->u
.rep
->prev
= state
->repeat
;
1201 ctx
->u
.rep
->last_ptr
= NULL
;
1202 state
->repeat
= ctx
->u
.rep
;
1204 state
->ptr
= ctx
->ptr
;
1205 DO_JUMP(JUMP_REPEAT
, jump_repeat
, ctx
->pattern
+ctx
->pattern
[0]);
1206 state
->repeat
= ctx
->u
.rep
->prev
;
1207 PyObject_FREE(ctx
->u
.rep
);
1210 RETURN_ON_ERROR(ret
);
1215 case SRE_OP_MAX_UNTIL
:
1216 /* maximizing repeat */
1217 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1219 /* FIXME: we probably need to deal with zero-width
1220 matches in here... */
1222 ctx
->u
.rep
= state
->repeat
;
1224 RETURN_ERROR(SRE_ERROR_STATE
);
1226 state
->ptr
= ctx
->ptr
;
1228 ctx
->count
= ctx
->u
.rep
->count
+1;
1230 TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T
"d\n", ctx
->pattern
,
1231 ctx
->ptr
, ctx
->count
));
1233 if (ctx
->count
< (Py_ssize_t
) ctx
->u
.rep
->pattern
[1]) {
1234 /* not enough matches */
1235 ctx
->u
.rep
->count
= ctx
->count
;
1236 DO_JUMP(JUMP_MAX_UNTIL_1
, jump_max_until_1
,
1237 ctx
->u
.rep
->pattern
+3);
1239 RETURN_ON_ERROR(ret
);
1242 ctx
->u
.rep
->count
= ctx
->count
-1;
1243 state
->ptr
= ctx
->ptr
;
1247 if ((ctx
->count
< (Py_ssize_t
) ctx
->u
.rep
->pattern
[2] ||
1248 ctx
->u
.rep
->pattern
[2] == SRE_MAXREPEAT
) &&
1249 state
->ptr
!= ctx
->u
.rep
->last_ptr
) {
1250 /* we may have enough matches, but if we can
1251 match another item, do so */
1252 ctx
->u
.rep
->count
= ctx
->count
;
1254 MARK_PUSH(ctx
->lastmark
);
1255 /* zero-width match protection */
1256 DATA_PUSH(&ctx
->u
.rep
->last_ptr
);
1257 ctx
->u
.rep
->last_ptr
= state
->ptr
;
1258 DO_JUMP(JUMP_MAX_UNTIL_2
, jump_max_until_2
,
1259 ctx
->u
.rep
->pattern
+3);
1260 DATA_POP(&ctx
->u
.rep
->last_ptr
);
1262 MARK_POP_DISCARD(ctx
->lastmark
);
1263 RETURN_ON_ERROR(ret
);
1266 MARK_POP(ctx
->lastmark
);
1268 ctx
->u
.rep
->count
= ctx
->count
-1;
1269 state
->ptr
= ctx
->ptr
;
1272 /* cannot match more repeated items here. make sure the
1274 state
->repeat
= ctx
->u
.rep
->prev
;
1275 DO_JUMP(JUMP_MAX_UNTIL_3
, jump_max_until_3
, ctx
->pattern
);
1276 RETURN_ON_SUCCESS(ret
);
1277 state
->repeat
= ctx
->u
.rep
;
1278 state
->ptr
= ctx
->ptr
;
1281 case SRE_OP_MIN_UNTIL
:
1282 /* minimizing repeat */
1283 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1285 ctx
->u
.rep
= state
->repeat
;
1287 RETURN_ERROR(SRE_ERROR_STATE
);
1289 state
->ptr
= ctx
->ptr
;
1291 ctx
->count
= ctx
->u
.rep
->count
+1;
1293 TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T
"d %p\n", ctx
->pattern
,
1294 ctx
->ptr
, ctx
->count
, ctx
->u
.rep
->pattern
));
1296 if (ctx
->count
< (Py_ssize_t
) ctx
->u
.rep
->pattern
[1]) {
1297 /* not enough matches */
1298 ctx
->u
.rep
->count
= ctx
->count
;
1299 DO_JUMP(JUMP_MIN_UNTIL_1
, jump_min_until_1
,
1300 ctx
->u
.rep
->pattern
+3);
1302 RETURN_ON_ERROR(ret
);
1305 ctx
->u
.rep
->count
= ctx
->count
-1;
1306 state
->ptr
= ctx
->ptr
;
1312 /* see if the tail matches */
1313 state
->repeat
= ctx
->u
.rep
->prev
;
1314 DO_JUMP(JUMP_MIN_UNTIL_2
, jump_min_until_2
, ctx
->pattern
);
1316 RETURN_ON_ERROR(ret
);
1320 state
->repeat
= ctx
->u
.rep
;
1321 state
->ptr
= ctx
->ptr
;
1325 if ((ctx
->count
>= (Py_ssize_t
) ctx
->u
.rep
->pattern
[2]
1326 && ctx
->u
.rep
->pattern
[2] != SRE_MAXREPEAT
) ||
1327 state
->ptr
== ctx
->u
.rep
->last_ptr
)
1330 ctx
->u
.rep
->count
= ctx
->count
;
1331 /* zero-width match protection */
1332 DATA_PUSH(&ctx
->u
.rep
->last_ptr
);
1333 ctx
->u
.rep
->last_ptr
= state
->ptr
;
1334 DO_JUMP(JUMP_MIN_UNTIL_3
,jump_min_until_3
,
1335 ctx
->u
.rep
->pattern
+3);
1336 DATA_POP(&ctx
->u
.rep
->last_ptr
);
1338 RETURN_ON_ERROR(ret
);
1341 ctx
->u
.rep
->count
= ctx
->count
-1;
1342 state
->ptr
= ctx
->ptr
;
1345 case SRE_OP_GROUPREF
:
1346 /* match backreference */
1347 TRACE(("|%p|%p|GROUPREF %d\n", ctx
->pattern
,
1348 ctx
->ptr
, ctx
->pattern
[0]));
1349 i
= ctx
->pattern
[0];
1351 Py_ssize_t groupref
= i
+i
;
1352 if (groupref
>= state
->lastmark
) {
1355 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1356 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1357 if (!p
|| !e
|| e
< p
)
1360 if (ctx
->ptr
>= end
|| *ctx
->ptr
!= *p
)
1369 case SRE_OP_GROUPREF_IGNORE
:
1370 /* match backreference */
1371 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx
->pattern
,
1372 ctx
->ptr
, ctx
->pattern
[0]));
1373 i
= ctx
->pattern
[0];
1375 Py_ssize_t groupref
= i
+i
;
1376 if (groupref
>= state
->lastmark
) {
1379 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1380 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1381 if (!p
|| !e
|| e
< p
)
1384 if (ctx
->ptr
>= end
||
1385 state
->lower(*ctx
->ptr
) != state
->lower(*p
))
1394 case SRE_OP_GROUPREF_EXISTS
:
1395 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx
->pattern
,
1396 ctx
->ptr
, ctx
->pattern
[0]));
1397 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1398 i
= ctx
->pattern
[0];
1400 Py_ssize_t groupref
= i
+i
;
1401 if (groupref
>= state
->lastmark
) {
1402 ctx
->pattern
+= ctx
->pattern
[1];
1405 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1406 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1407 if (!p
|| !e
|| e
< p
) {
1408 ctx
->pattern
+= ctx
->pattern
[1];
1417 /* assert subpattern */
1418 /* <ASSERT> <skip> <back> <pattern> */
1419 TRACE(("|%p|%p|ASSERT %d\n", ctx
->pattern
,
1420 ctx
->ptr
, ctx
->pattern
[1]));
1421 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1422 if (state
->ptr
< state
->beginning
)
1424 DO_JUMP(JUMP_ASSERT
, jump_assert
, ctx
->pattern
+2);
1425 RETURN_ON_FAILURE(ret
);
1426 ctx
->pattern
+= ctx
->pattern
[0];
1429 case SRE_OP_ASSERT_NOT
:
1430 /* assert not subpattern */
1431 /* <ASSERT_NOT> <skip> <back> <pattern> */
1432 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx
->pattern
,
1433 ctx
->ptr
, ctx
->pattern
[1]));
1434 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1435 if (state
->ptr
>= state
->beginning
) {
1436 DO_JUMP(JUMP_ASSERT_NOT
, jump_assert_not
, ctx
->pattern
+2);
1438 RETURN_ON_ERROR(ret
);
1442 ctx
->pattern
+= ctx
->pattern
[0];
1445 case SRE_OP_FAILURE
:
1446 /* immediate failure */
1447 TRACE(("|%p|%p|FAILURE\n", ctx
->pattern
, ctx
->ptr
));
1451 TRACE(("|%p|%p|UNKNOWN %d\n", ctx
->pattern
, ctx
->ptr
,
1453 RETURN_ERROR(SRE_ERROR_ILLEGAL
);
1458 ctx_pos
= ctx
->last_ctx_pos
;
1460 DATA_POP_DISCARD(ctx
);
1463 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1466 case JUMP_MAX_UNTIL_2
:
1467 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1468 goto jump_max_until_2
;
1469 case JUMP_MAX_UNTIL_3
:
1470 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1471 goto jump_max_until_3
;
1472 case JUMP_MIN_UNTIL_2
:
1473 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1474 goto jump_min_until_2
;
1475 case JUMP_MIN_UNTIL_3
:
1476 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1477 goto jump_min_until_3
;
1479 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx
->pattern
, ctx
->ptr
));
1481 case JUMP_MAX_UNTIL_1
:
1482 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1483 goto jump_max_until_1
;
1484 case JUMP_MIN_UNTIL_1
:
1485 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1486 goto jump_min_until_1
;
1488 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx
->pattern
, ctx
->ptr
));
1490 case JUMP_REPEAT_ONE_1
:
1491 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx
->pattern
, ctx
->ptr
));
1492 goto jump_repeat_one_1
;
1493 case JUMP_REPEAT_ONE_2
:
1494 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx
->pattern
, ctx
->ptr
));
1495 goto jump_repeat_one_2
;
1496 case JUMP_MIN_REPEAT_ONE
:
1497 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx
->pattern
, ctx
->ptr
));
1498 goto jump_min_repeat_one
;
1500 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx
->pattern
, ctx
->ptr
));
1502 case JUMP_ASSERT_NOT
:
1503 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx
->pattern
, ctx
->ptr
));
1504 goto jump_assert_not
;
1506 TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T
"d\n", ctx
->pattern
,
1511 return ret
; /* should never get here */
1515 SRE_SEARCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
1517 SRE_CHAR
* ptr
= (SRE_CHAR
*)state
->start
;
1518 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
1519 Py_ssize_t status
= 0;
1520 Py_ssize_t prefix_len
= 0;
1521 Py_ssize_t prefix_skip
= 0;
1522 SRE_CODE
* prefix
= NULL
;
1523 SRE_CODE
* charset
= NULL
;
1524 SRE_CODE
* overlap
= NULL
;
1527 if (pattern
[0] == SRE_OP_INFO
) {
1528 /* optimization info block */
1529 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1533 if (pattern
[3] > 1) {
1534 /* adjust end point (but make sure we leave at least one
1535 character in there, so literal search will work) */
1536 end
-= pattern
[3]-1;
1541 if (flags
& SRE_INFO_PREFIX
) {
1542 /* pattern starts with a known prefix */
1543 /* <length> <skip> <prefix data> <overlap data> */
1544 prefix_len
= pattern
[5];
1545 prefix_skip
= pattern
[6];
1546 prefix
= pattern
+ 7;
1547 overlap
= prefix
+ prefix_len
- 1;
1548 } else if (flags
& SRE_INFO_CHARSET
)
1549 /* pattern starts with a character from a known set */
1551 charset
= pattern
+ 5;
1553 pattern
+= 1 + pattern
[1];
1556 TRACE(("prefix = %p %" PY_FORMAT_SIZE_T
"d %" PY_FORMAT_SIZE_T
"d\n",
1557 prefix
, prefix_len
, prefix_skip
));
1558 TRACE(("charset = %p\n", charset
));
1560 #if defined(USE_FAST_SEARCH)
1561 if (prefix_len
> 1) {
1562 /* pattern starts with a known prefix. use the overlap
1563 table to skip forward as fast as we possibly can */
1565 end
= (SRE_CHAR
*)state
->end
;
1568 if ((SRE_CODE
) ptr
[0] != prefix
[i
]) {
1574 if (++i
== prefix_len
) {
1575 /* found a potential match */
1576 TRACE(("|%p|%p|SEARCH SCAN\n", pattern
, ptr
));
1577 state
->start
= ptr
+ 1 - prefix_len
;
1578 state
->ptr
= ptr
+ 1 - prefix_len
+ prefix_skip
;
1579 if (flags
& SRE_INFO_LITERAL
)
1580 return 1; /* we got all of it */
1581 status
= SRE_MATCH(state
, pattern
+ 2*prefix_skip
);
1584 /* close but no cigar -- try again */
1596 if (pattern
[0] == SRE_OP_LITERAL
) {
1597 /* pattern starts with a literal character. this is used
1598 for short prefixes, and if fast search is disabled */
1599 SRE_CODE chr
= pattern
[1];
1600 end
= (SRE_CHAR
*)state
->end
;
1602 while (ptr
< end
&& (SRE_CODE
) ptr
[0] != chr
)
1606 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern
, ptr
));
1609 if (flags
& SRE_INFO_LITERAL
)
1610 return 1; /* we got all of it */
1611 status
= SRE_MATCH(state
, pattern
+ 2);
1615 } else if (charset
) {
1616 /* pattern starts with a character from a known set */
1617 end
= (SRE_CHAR
*)state
->end
;
1619 while (ptr
< end
&& !SRE_CHARSET(charset
, ptr
[0]))
1623 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern
, ptr
));
1626 status
= SRE_MATCH(state
, pattern
);
1633 while (ptr
<= end
) {
1634 TRACE(("|%p|%p|SEARCH\n", pattern
, ptr
));
1635 state
->start
= state
->ptr
= ptr
++;
1636 status
= SRE_MATCH(state
, pattern
);
1645 SRE_LITERAL_TEMPLATE(SRE_CHAR
* ptr
, Py_ssize_t len
)
1647 /* check if given string is a literal template (i.e. no escapes) */
1654 #if !defined(SRE_RECURSIVE)
1656 /* -------------------------------------------------------------------- */
1657 /* factories and destructors */
1659 /* see sre.h for object declarations */
1660 static PyObject
*pattern_new_match(PatternObject
*, SRE_STATE
*, int);
1661 static PyObject
*pattern_scanner(PatternObject
*, PyObject
*);
1664 sre_codesize(PyObject
* self
, PyObject
*unused
)
1666 return PyInt_FromSize_t(sizeof(SRE_CODE
));
1670 sre_getlower(PyObject
* self
, PyObject
* args
)
1672 int character
, flags
;
1673 if (!PyArg_ParseTuple(args
, "ii", &character
, &flags
))
1675 if (flags
& SRE_FLAG_LOCALE
)
1676 return Py_BuildValue("i", sre_lower_locale(character
));
1677 if (flags
& SRE_FLAG_UNICODE
)
1678 #if defined(HAVE_UNICODE)
1679 return Py_BuildValue("i", sre_lower_unicode(character
));
1681 return Py_BuildValue("i", sre_lower_locale(character
));
1683 return Py_BuildValue("i", sre_lower(character
));
1687 state_reset(SRE_STATE
* state
)
1689 /* FIXME: dynamic! */
1690 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1692 state
->lastmark
= -1;
1693 state
->lastindex
= -1;
1695 state
->repeat
= NULL
;
1697 data_stack_dealloc(state
);
1701 getstring(PyObject
* string
, Py_ssize_t
* p_length
, int* p_charsize
)
1703 /* given a python object, return a data pointer, a length (in
1704 characters), and a character size. return NULL if the object
1705 is not a string (or not compatible) */
1707 PyBufferProcs
*buffer
;
1708 Py_ssize_t size
, bytes
;
1712 #if defined(HAVE_UNICODE)
1713 if (PyUnicode_Check(string
)) {
1714 /* unicode strings doesn't always support the buffer interface */
1715 ptr
= (void*) PyUnicode_AS_DATA(string
);
1716 /* bytes = PyUnicode_GET_DATA_SIZE(string); */
1717 size
= PyUnicode_GET_SIZE(string
);
1718 charsize
= sizeof(Py_UNICODE
);
1723 /* get pointer to string buffer */
1724 buffer
= Py_TYPE(string
)->tp_as_buffer
;
1725 if (!buffer
|| !buffer
->bf_getreadbuffer
|| !buffer
->bf_getsegcount
||
1726 buffer
->bf_getsegcount(string
, NULL
) != 1) {
1727 PyErr_SetString(PyExc_TypeError
, "expected string or buffer");
1731 /* determine buffer size */
1732 bytes
= buffer
->bf_getreadbuffer(string
, 0, &ptr
);
1734 PyErr_SetString(PyExc_TypeError
, "buffer has negative size");
1738 /* determine character size */
1739 #if PY_VERSION_HEX >= 0x01060000
1740 size
= PyObject_Size(string
);
1742 size
= PyObject_Length(string
);
1745 if (PyString_Check(string
) || bytes
== size
)
1747 #if defined(HAVE_UNICODE)
1748 else if (bytes
== (Py_ssize_t
) (size
* sizeof(Py_UNICODE
)))
1749 charsize
= sizeof(Py_UNICODE
);
1752 PyErr_SetString(PyExc_TypeError
, "buffer size mismatch");
1756 #if defined(HAVE_UNICODE)
1761 *p_charsize
= charsize
;
1767 state_init(SRE_STATE
* state
, PatternObject
* pattern
, PyObject
* string
,
1768 Py_ssize_t start
, Py_ssize_t end
)
1770 /* prepare state object */
1776 memset(state
, 0, sizeof(SRE_STATE
));
1778 state
->lastmark
= -1;
1779 state
->lastindex
= -1;
1781 ptr
= getstring(string
, &length
, &charsize
);
1785 /* adjust boundaries */
1788 else if (start
> length
)
1793 else if (end
> length
)
1796 state
->charsize
= charsize
;
1798 state
->beginning
= ptr
;
1800 state
->start
= (void*) ((char*) ptr
+ start
* state
->charsize
);
1801 state
->end
= (void*) ((char*) ptr
+ end
* state
->charsize
);
1804 state
->string
= string
;
1806 state
->endpos
= end
;
1808 if (pattern
->flags
& SRE_FLAG_LOCALE
)
1809 state
->lower
= sre_lower_locale
;
1810 else if (pattern
->flags
& SRE_FLAG_UNICODE
)
1811 #if defined(HAVE_UNICODE)
1812 state
->lower
= sre_lower_unicode
;
1814 state
->lower
= sre_lower_locale
;
1817 state
->lower
= sre_lower
;
1823 state_fini(SRE_STATE
* state
)
1825 Py_XDECREF(state
->string
);
1826 data_stack_dealloc(state
);
1829 /* calculate offset from start of string */
1830 #define STATE_OFFSET(state, member)\
1831 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1834 state_getslice(SRE_STATE
* state
, Py_ssize_t index
, PyObject
* string
, int empty
)
1838 index
= (index
- 1) * 2;
1840 if (string
== Py_None
|| index
>= state
->lastmark
|| !state
->mark
[index
] || !state
->mark
[index
+1]) {
1842 /* want empty string */
1849 i
= STATE_OFFSET(state
, state
->mark
[index
]);
1850 j
= STATE_OFFSET(state
, state
->mark
[index
+1]);
1853 return PySequence_GetSlice(string
, i
, j
);
1857 pattern_error(int status
)
1860 case SRE_ERROR_RECURSION_LIMIT
:
1863 "maximum recursion limit exceeded"
1866 case SRE_ERROR_MEMORY
:
1869 case SRE_ERROR_INTERRUPTED
:
1870 /* An exception has already been raised, so let it fly */
1873 /* other error codes indicate compiler/engine bugs */
1876 "internal error in regular expression engine"
1882 pattern_dealloc(PatternObject
* self
)
1884 if (self
->weakreflist
!= NULL
)
1885 PyObject_ClearWeakRefs((PyObject
*) self
);
1886 Py_XDECREF(self
->pattern
);
1887 Py_XDECREF(self
->groupindex
);
1888 Py_XDECREF(self
->indexgroup
);
1893 check_args_size(const char *name
, PyObject
* args
, PyObject
* kw
, int n
)
1895 Py_ssize_t m
= PyTuple_GET_SIZE(args
) + (kw
? PyDict_Size(kw
) : 0);
1898 PyErr_Format(PyExc_TypeError
,
1899 "%s() takes at most %d positional arguments (%zd given)",
1905 fix_string_param(PyObject
*string
, PyObject
*string2
, const char *oldname
)
1907 if (string2
!= NULL
) {
1909 if (string
!= NULL
) {
1910 PyErr_Format(PyExc_TypeError
,
1911 "Argument given by name ('%s') and position (1)",
1915 sprintf(buf
, "The '%s' keyword parameter name is deprecated. "
1916 "Use 'string' instead.", oldname
);
1917 if (PyErr_Warn(PyExc_DeprecationWarning
, buf
) < 0)
1921 if (string
== NULL
) {
1922 PyErr_SetString(PyExc_TypeError
,
1923 "Required argument 'string' (pos 1) not found");
1930 pattern_match(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1935 PyObject
*string
= NULL
, *string2
= NULL
;
1936 Py_ssize_t start
= 0;
1937 Py_ssize_t end
= PY_SSIZE_T_MAX
;
1938 static char* kwlist
[] = { "string", "pos", "endpos", "pattern", NULL
};
1939 if (!check_args_size("match", args
, kw
, 3))
1942 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|OnnO:match", kwlist
,
1943 &string
, &start
, &end
, &string2
))
1946 string
= fix_string_param(string
, string2
, "pattern");
1950 string
= state_init(&state
, self
, string
, start
, end
);
1954 state
.ptr
= state
.start
;
1956 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self
), state
.ptr
));
1958 if (state
.charsize
== 1) {
1959 status
= sre_match(&state
, PatternObject_GetCode(self
));
1961 #if defined(HAVE_UNICODE)
1962 status
= sre_umatch(&state
, PatternObject_GetCode(self
));
1966 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
1967 if (PyErr_Occurred())
1972 return pattern_new_match(self
, &state
, status
);
1976 pattern_search(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1981 PyObject
*string
= NULL
, *string2
= NULL
;
1982 Py_ssize_t start
= 0;
1983 Py_ssize_t end
= PY_SSIZE_T_MAX
;
1984 static char* kwlist
[] = { "string", "pos", "endpos", "pattern", NULL
};
1985 if (!check_args_size("search", args
, kw
, 3))
1988 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|OnnO:search", kwlist
,
1989 &string
, &start
, &end
, &string2
))
1992 string
= fix_string_param(string
, string2
, "pattern");
1996 string
= state_init(&state
, self
, string
, start
, end
);
2000 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self
), state
.ptr
));
2002 if (state
.charsize
== 1) {
2003 status
= sre_search(&state
, PatternObject_GetCode(self
));
2005 #if defined(HAVE_UNICODE)
2006 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2010 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
2014 if (PyErr_Occurred())
2017 return pattern_new_match(self
, &state
, status
);
2021 call(char* module
, char* function
, PyObject
* args
)
2030 name
= PyString_FromString(module
);
2033 mod
= PyImport_Import(name
);
2037 func
= PyObject_GetAttrString(mod
, function
);
2041 result
= PyObject_CallObject(func
, args
);
2047 #ifdef USE_BUILTIN_COPY
2049 deepcopy(PyObject
** object
, PyObject
* memo
)
2055 PyTuple_Pack(2, *object
, memo
)
2063 return 1; /* success */
2068 join_list(PyObject
* list
, PyObject
* string
)
2070 /* join list elements */
2073 #if PY_VERSION_HEX >= 0x01060000
2079 joiner
= PySequence_GetSlice(string
, 0, 0);
2083 if (PyList_GET_SIZE(list
) == 0) {
2088 #if PY_VERSION_HEX >= 0x01060000
2089 function
= PyObject_GetAttrString(joiner
, "join");
2094 args
= PyTuple_New(1);
2096 Py_DECREF(function
);
2100 PyTuple_SET_ITEM(args
, 0, list
);
2101 result
= PyObject_CallObject(function
, args
);
2102 Py_DECREF(args
); /* also removes list */
2103 Py_DECREF(function
);
2107 PyTuple_Pack(2, list
, joiner
)
2116 pattern_findall(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2123 PyObject
*string
= NULL
, *string2
= NULL
;
2124 Py_ssize_t start
= 0;
2125 Py_ssize_t end
= PY_SSIZE_T_MAX
;
2126 static char* kwlist
[] = { "string", "pos", "endpos", "source", NULL
};
2127 if (!check_args_size("findall", args
, kw
, 3))
2130 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|OnnO:findall", kwlist
,
2131 &string
, &start
, &end
, &string2
))
2134 string
= fix_string_param(string
, string2
, "source");
2138 string
= state_init(&state
, self
, string
, start
, end
);
2142 list
= PyList_New(0);
2148 while (state
.start
<= state
.end
) {
2152 state_reset(&state
);
2154 state
.ptr
= state
.start
;
2156 if (state
.charsize
== 1) {
2157 status
= sre_search(&state
, PatternObject_GetCode(self
));
2159 #if defined(HAVE_UNICODE)
2160 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2164 if (PyErr_Occurred())
2170 pattern_error(status
);
2174 /* don't bother to build a match object */
2175 switch (self
->groups
) {
2177 b
= STATE_OFFSET(&state
, state
.start
);
2178 e
= STATE_OFFSET(&state
, state
.ptr
);
2179 item
= PySequence_GetSlice(string
, b
, e
);
2184 item
= state_getslice(&state
, 1, string
, 1);
2189 item
= PyTuple_New(self
->groups
);
2192 for (i
= 0; i
< self
->groups
; i
++) {
2193 PyObject
* o
= state_getslice(&state
, i
+1, string
, 1);
2198 PyTuple_SET_ITEM(item
, i
, o
);
2203 status
= PyList_Append(list
, item
);
2208 if (state
.ptr
== state
.start
)
2209 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2211 state
.start
= state
.ptr
;
2225 #if PY_VERSION_HEX >= 0x02020000
2227 pattern_finditer(PatternObject
* pattern
, PyObject
* args
)
2233 scanner
= pattern_scanner(pattern
, args
);
2237 search
= PyObject_GetAttrString(scanner
, "search");
2242 iterator
= PyCallIter_New(search
, Py_None
);
2250 pattern_split(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2260 PyObject
*string
= NULL
, *string2
= NULL
;
2261 Py_ssize_t maxsplit
= 0;
2262 static char* kwlist
[] = { "string", "maxsplit", "source", NULL
};
2263 if (!check_args_size("split", args
, kw
, 2))
2266 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|OnO:split", kwlist
,
2267 &string
, &maxsplit
, &string2
))
2270 string
= fix_string_param(string
, string2
, "source");
2274 string
= state_init(&state
, self
, string
, 0, PY_SSIZE_T_MAX
);
2278 list
= PyList_New(0);
2287 while (!maxsplit
|| n
< maxsplit
) {
2289 state_reset(&state
);
2291 state
.ptr
= state
.start
;
2293 if (state
.charsize
== 1) {
2294 status
= sre_search(&state
, PatternObject_GetCode(self
));
2296 #if defined(HAVE_UNICODE)
2297 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2301 if (PyErr_Occurred())
2307 pattern_error(status
);
2311 if (state
.start
== state
.ptr
) {
2312 if (last
== state
.end
)
2314 /* skip one character */
2315 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2319 /* get segment before this match */
2320 item
= PySequence_GetSlice(
2321 string
, STATE_OFFSET(&state
, last
),
2322 STATE_OFFSET(&state
, state
.start
)
2326 status
= PyList_Append(list
, item
);
2331 /* add groups (if any) */
2332 for (i
= 0; i
< self
->groups
; i
++) {
2333 item
= state_getslice(&state
, i
+1, string
, 0);
2336 status
= PyList_Append(list
, item
);
2344 last
= state
.start
= state
.ptr
;
2348 /* get segment following last match (even if empty) */
2349 item
= PySequence_GetSlice(
2350 string
, STATE_OFFSET(&state
, last
), state
.endpos
2354 status
= PyList_Append(list
, item
);
2370 pattern_subx(PatternObject
* self
, PyObject
* ptemplate
, PyObject
* string
,
2371 Py_ssize_t count
, Py_ssize_t subn
)
2384 int filter_is_callable
;
2386 if (PyCallable_Check(ptemplate
)) {
2387 /* sub/subn takes either a function or a template */
2390 filter_is_callable
= 1;
2392 /* if not callable, check if it's a literal string */
2394 ptr
= getstring(ptemplate
, &n
, &bint
);
2398 literal
= sre_literal_template((unsigned char *)ptr
, n
);
2400 #if defined(HAVE_UNICODE)
2401 literal
= sre_uliteral_template((Py_UNICODE
*)ptr
, n
);
2411 filter_is_callable
= 0;
2413 /* not a literal; hand it over to the template compiler */
2415 SRE_PY_MODULE
, "_subx",
2416 PyTuple_Pack(2, self
, ptemplate
)
2420 filter_is_callable
= PyCallable_Check(filter
);
2424 string
= state_init(&state
, self
, string
, 0, PY_SSIZE_T_MAX
);
2430 list
= PyList_New(0);
2439 while (!count
|| n
< count
) {
2441 state_reset(&state
);
2443 state
.ptr
= state
.start
;
2445 if (state
.charsize
== 1) {
2446 status
= sre_search(&state
, PatternObject_GetCode(self
));
2448 #if defined(HAVE_UNICODE)
2449 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2453 if (PyErr_Occurred())
2459 pattern_error(status
);
2463 b
= STATE_OFFSET(&state
, state
.start
);
2464 e
= STATE_OFFSET(&state
, state
.ptr
);
2467 /* get segment before this match */
2468 item
= PySequence_GetSlice(string
, i
, b
);
2471 status
= PyList_Append(list
, item
);
2476 } else if (i
== b
&& i
== e
&& n
> 0)
2477 /* ignore empty match on latest position */
2480 if (filter_is_callable
) {
2481 /* pass match object through filter */
2482 match
= pattern_new_match(self
, &state
, 1);
2485 args
= PyTuple_Pack(1, match
);
2490 item
= PyObject_CallObject(filter
, args
);
2496 /* filter is literal string */
2502 if (item
!= Py_None
) {
2503 status
= PyList_Append(list
, item
);
2514 if (state
.ptr
== state
.start
)
2515 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2517 state
.start
= state
.ptr
;
2521 /* get segment following last match */
2522 if (i
< state
.endpos
) {
2523 item
= PySequence_GetSlice(string
, i
, state
.endpos
);
2526 status
= PyList_Append(list
, item
);
2536 /* convert list to single string (also removes list) */
2537 item
= join_list(list
, string
);
2543 return Py_BuildValue("Nn", item
, n
);
2556 pattern_sub(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2558 PyObject
* ptemplate
;
2560 Py_ssize_t count
= 0;
2561 static char* kwlist
[] = { "repl", "string", "count", NULL
};
2562 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|n:sub", kwlist
,
2563 &ptemplate
, &string
, &count
))
2566 return pattern_subx(self
, ptemplate
, string
, count
, 0);
2570 pattern_subn(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2572 PyObject
* ptemplate
;
2574 Py_ssize_t count
= 0;
2575 static char* kwlist
[] = { "repl", "string", "count", NULL
};
2576 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|n:subn", kwlist
,
2577 &ptemplate
, &string
, &count
))
2580 return pattern_subx(self
, ptemplate
, string
, count
, 1);
2584 pattern_copy(PatternObject
* self
, PyObject
*unused
)
2586 #ifdef USE_BUILTIN_COPY
2587 PatternObject
* copy
;
2590 copy
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, self
->codesize
);
2594 offset
= offsetof(PatternObject
, groups
);
2596 Py_XINCREF(self
->groupindex
);
2597 Py_XINCREF(self
->indexgroup
);
2598 Py_XINCREF(self
->pattern
);
2600 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
2601 sizeof(PatternObject
) + self
->codesize
* sizeof(SRE_CODE
) - offset
);
2602 copy
->weakreflist
= NULL
;
2604 return (PyObject
*) copy
;
2606 PyErr_SetString(PyExc_TypeError
, "cannot copy this pattern object");
2612 pattern_deepcopy(PatternObject
* self
, PyObject
* memo
)
2614 #ifdef USE_BUILTIN_COPY
2615 PatternObject
* copy
;
2617 copy
= (PatternObject
*) pattern_copy(self
);
2621 if (!deepcopy(©
->groupindex
, memo
) ||
2622 !deepcopy(©
->indexgroup
, memo
) ||
2623 !deepcopy(©
->pattern
, memo
)) {
2629 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this pattern object");
2634 PyDoc_STRVAR(pattern_match_doc
,
2635 "match(string[, pos[, endpos]]) --> match object or None.\n\
2636 Matches zero or more characters at the beginning of the string");
2638 PyDoc_STRVAR(pattern_search_doc
,
2639 "search(string[, pos[, endpos]]) --> match object or None.\n\
2640 Scan through string looking for a match, and return a corresponding\n\
2641 match object instance. Return None if no position in the string matches.");
2643 PyDoc_STRVAR(pattern_split_doc
,
2644 "split(string[, maxsplit = 0]) --> list.\n\
2645 Split string by the occurrences of pattern.");
2647 PyDoc_STRVAR(pattern_findall_doc
,
2648 "findall(string[, pos[, endpos]]) --> list.\n\
2649 Return a list of all non-overlapping matches of pattern in string.");
2651 PyDoc_STRVAR(pattern_finditer_doc
,
2652 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2653 Return an iterator over all non-overlapping matches for the \n\
2654 RE pattern in string. For each match, the iterator returns a\n\
2657 PyDoc_STRVAR(pattern_sub_doc
,
2658 "sub(repl, string[, count = 0]) --> newstring\n\
2659 Return the string obtained by replacing the leftmost non-overlapping\n\
2660 occurrences of pattern in string by the replacement repl.");
2662 PyDoc_STRVAR(pattern_subn_doc
,
2663 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2664 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2665 the leftmost non-overlapping occurrences of pattern with the\n\
2666 replacement repl.");
2668 PyDoc_STRVAR(pattern_doc
, "Compiled regular expression objects");
2670 static PyMethodDef pattern_methods
[] = {
2671 {"match", (PyCFunction
) pattern_match
, METH_VARARGS
|METH_KEYWORDS
,
2673 {"search", (PyCFunction
) pattern_search
, METH_VARARGS
|METH_KEYWORDS
,
2674 pattern_search_doc
},
2675 {"sub", (PyCFunction
) pattern_sub
, METH_VARARGS
|METH_KEYWORDS
,
2677 {"subn", (PyCFunction
) pattern_subn
, METH_VARARGS
|METH_KEYWORDS
,
2679 {"split", (PyCFunction
) pattern_split
, METH_VARARGS
|METH_KEYWORDS
,
2681 {"findall", (PyCFunction
) pattern_findall
, METH_VARARGS
|METH_KEYWORDS
,
2682 pattern_findall_doc
},
2683 #if PY_VERSION_HEX >= 0x02020000
2684 {"finditer", (PyCFunction
) pattern_finditer
, METH_VARARGS
,
2685 pattern_finditer_doc
},
2687 {"scanner", (PyCFunction
) pattern_scanner
, METH_VARARGS
},
2688 {"__copy__", (PyCFunction
) pattern_copy
, METH_NOARGS
},
2689 {"__deepcopy__", (PyCFunction
) pattern_deepcopy
, METH_O
},
2693 #define PAT_OFF(x) offsetof(PatternObject, x)
2694 static PyMemberDef pattern_members
[] = {
2695 {"pattern", T_OBJECT
, PAT_OFF(pattern
), READONLY
},
2696 {"flags", T_INT
, PAT_OFF(flags
), READONLY
},
2697 {"groups", T_PYSSIZET
, PAT_OFF(groups
), READONLY
},
2698 {"groupindex", T_OBJECT
, PAT_OFF(groupindex
), READONLY
},
2699 {NULL
} /* Sentinel */
2702 statichere PyTypeObject Pattern_Type
= {
2703 PyObject_HEAD_INIT(NULL
)
2704 0, "_" SRE_MODULE
".SRE_Pattern",
2705 sizeof(PatternObject
), sizeof(SRE_CODE
),
2706 (destructor
)pattern_dealloc
, /*tp_dealloc*/
2708 0, /* tp_getattrn */
2712 0, /* tp_as_number */
2713 0, /* tp_as_sequence */
2714 0, /* tp_as_mapping */
2718 0, /* tp_getattro */
2719 0, /* tp_setattro */
2720 0, /* tp_as_buffer */
2721 Py_TPFLAGS_DEFAULT
, /* tp_flags */
2722 pattern_doc
, /* tp_doc */
2723 0, /* tp_traverse */
2725 0, /* tp_richcompare */
2726 offsetof(PatternObject
, weakreflist
), /* tp_weaklistoffset */
2728 0, /* tp_iternext */
2729 pattern_methods
, /* tp_methods */
2730 pattern_members
, /* tp_members */
2733 static int _validate(PatternObject
*self
); /* Forward */
2736 _compile(PyObject
* self_
, PyObject
* args
)
2738 /* "compile" pattern descriptor to pattern object */
2740 PatternObject
* self
;
2746 Py_ssize_t groups
= 0;
2747 PyObject
* groupindex
= NULL
;
2748 PyObject
* indexgroup
= NULL
;
2749 if (!PyArg_ParseTuple(args
, "OiO!|nOO", &pattern
, &flags
,
2750 &PyList_Type
, &code
, &groups
,
2751 &groupindex
, &indexgroup
))
2754 n
= PyList_GET_SIZE(code
);
2755 /* coverity[ampersand_in_size] */
2756 self
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, n
);
2759 self
->weakreflist
= NULL
;
2760 self
->pattern
= NULL
;
2761 self
->groupindex
= NULL
;
2762 self
->indexgroup
= NULL
;
2766 for (i
= 0; i
< n
; i
++) {
2767 PyObject
*o
= PyList_GET_ITEM(code
, i
);
2768 unsigned long value
= PyInt_Check(o
) ? (unsigned long)PyInt_AsLong(o
)
2769 : PyLong_AsUnsignedLong(o
);
2770 if (value
== (unsigned long)-1 && PyErr_Occurred()) {
2771 if (PyErr_ExceptionMatches(PyExc_OverflowError
)) {
2772 PyErr_SetString(PyExc_OverflowError
,
2773 "regular expression code size limit exceeded");
2777 self
->code
[i
] = (SRE_CODE
) value
;
2778 if ((unsigned long) self
->code
[i
] != value
) {
2779 PyErr_SetString(PyExc_OverflowError
,
2780 "regular expression code size limit exceeded");
2785 if (PyErr_Occurred()) {
2791 self
->pattern
= pattern
;
2793 self
->flags
= flags
;
2795 self
->groups
= groups
;
2797 Py_XINCREF(groupindex
);
2798 self
->groupindex
= groupindex
;
2800 Py_XINCREF(indexgroup
);
2801 self
->indexgroup
= indexgroup
;
2803 self
->weakreflist
= NULL
;
2805 if (!_validate(self
)) {
2810 return (PyObject
*) self
;
2813 /* -------------------------------------------------------------------- */
2814 /* Code validation */
2816 /* To learn more about this code, have a look at the _compile() function in
2817 Lib/sre_compile.py. The validation functions below checks the code array
2818 for conformance with the code patterns generated there.
2820 The nice thing about the generated code is that it is position-independent:
2821 all jumps are relative jumps forward. Also, jumps don't cross each other:
2822 the target of a later jump is always earlier than the target of an earlier
2823 jump. IOW, this is okay:
2825 J---------J-------T--------T
2827 \______________________/
2831 J---------J-------T--------T
2835 It also helps that SRE_CODE is always an unsigned type.
2838 /* Defining this one enables tracing of the validator */
2841 /* Trace macro for the validator */
2842 #if defined(VVERBOSE)
2843 #define VTRACE(v) printf v
2845 #define VTRACE(v) do {} while(0) /* do nothing */
2848 /* Report failure */
2849 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2851 /* Extract opcode, argument, or skip count from code array */
2854 VTRACE(("%p: ", code)); \
2855 if (code >= end) FAIL; \
2857 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2861 VTRACE(("%p= ", code)); \
2862 if (code >= end) FAIL; \
2864 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2866 #define GET_SKIP_ADJ(adj) \
2868 VTRACE(("%p= ", code)); \
2869 if (code >= end) FAIL; \
2871 VTRACE(("%lu (skip to %p)\n", \
2872 (unsigned long)skip, code+skip)); \
2873 if (skip-adj > end-code) \
2877 #define GET_SKIP GET_SKIP_ADJ(0)
2880 _validate_charset(SRE_CODE
*code
, SRE_CODE
*end
)
2882 /* Some variables are manipulated by the macros above */
2888 while (code
< end
) {
2895 case SRE_OP_LITERAL
:
2904 case SRE_OP_CHARSET
:
2905 offset
= 32/sizeof(SRE_CODE
); /* 32-byte bitmap */
2906 if (offset
> end
-code
)
2911 case SRE_OP_BIGCHARSET
:
2912 GET_ARG
; /* Number of blocks */
2913 offset
= 256/sizeof(SRE_CODE
); /* 256-byte table */
2914 if (offset
> end
-code
)
2916 /* Make sure that each byte points to a valid block */
2917 for (i
= 0; i
< 256; i
++) {
2918 if (((unsigned char *)code
)[i
] >= arg
)
2922 offset
= arg
* 32/sizeof(SRE_CODE
); /* 32-byte bitmap times arg */
2923 if (offset
> end
-code
)
2928 case SRE_OP_CATEGORY
:
2931 case SRE_CATEGORY_DIGIT
:
2932 case SRE_CATEGORY_NOT_DIGIT
:
2933 case SRE_CATEGORY_SPACE
:
2934 case SRE_CATEGORY_NOT_SPACE
:
2935 case SRE_CATEGORY_WORD
:
2936 case SRE_CATEGORY_NOT_WORD
:
2937 case SRE_CATEGORY_LINEBREAK
:
2938 case SRE_CATEGORY_NOT_LINEBREAK
:
2939 case SRE_CATEGORY_LOC_WORD
:
2940 case SRE_CATEGORY_LOC_NOT_WORD
:
2941 case SRE_CATEGORY_UNI_DIGIT
:
2942 case SRE_CATEGORY_UNI_NOT_DIGIT
:
2943 case SRE_CATEGORY_UNI_SPACE
:
2944 case SRE_CATEGORY_UNI_NOT_SPACE
:
2945 case SRE_CATEGORY_UNI_WORD
:
2946 case SRE_CATEGORY_UNI_NOT_WORD
:
2947 case SRE_CATEGORY_UNI_LINEBREAK
:
2948 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
2965 _validate_inner(SRE_CODE
*code
, SRE_CODE
*end
, Py_ssize_t groups
)
2967 /* Some variables are manipulated by the macros above */
2972 VTRACE(("code=%p, end=%p\n", code
, end
));
2977 while (code
< end
) {
2982 /* We don't check whether marks are properly nested; the
2983 sre_match() code is robust even if they don't, and the worst
2984 you can get is nonsensical match results. */
2986 if (arg
> 2*groups
+1) {
2987 VTRACE(("arg=%d, groups=%d\n", (int)arg
, (int)groups
));
2992 case SRE_OP_LITERAL
:
2993 case SRE_OP_NOT_LITERAL
:
2994 case SRE_OP_LITERAL_IGNORE
:
2995 case SRE_OP_NOT_LITERAL_IGNORE
:
2997 /* The arg is just a character, nothing to check */
3000 case SRE_OP_SUCCESS
:
3001 case SRE_OP_FAILURE
:
3002 /* Nothing to check; these normally end the matching process */
3008 case SRE_AT_BEGINNING
:
3009 case SRE_AT_BEGINNING_STRING
:
3010 case SRE_AT_BEGINNING_LINE
:
3012 case SRE_AT_END_LINE
:
3013 case SRE_AT_END_STRING
:
3014 case SRE_AT_BOUNDARY
:
3015 case SRE_AT_NON_BOUNDARY
:
3016 case SRE_AT_LOC_BOUNDARY
:
3017 case SRE_AT_LOC_NON_BOUNDARY
:
3018 case SRE_AT_UNI_BOUNDARY
:
3019 case SRE_AT_UNI_NON_BOUNDARY
:
3027 case SRE_OP_ANY_ALL
:
3028 /* These have no operands */
3032 case SRE_OP_IN_IGNORE
:
3034 /* Stop 1 before the end; we check the FAILURE below */
3035 if (!_validate_charset(code
, code
+skip
-2))
3037 if (code
[skip
-2] != SRE_OP_FAILURE
)
3044 /* A minimal info field is
3045 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
3046 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
3051 newcode
= code
+skip
-1;
3052 GET_ARG
; flags
= arg
;
3055 /* Check that only valid flags are present */
3056 if ((flags
& ~(SRE_INFO_PREFIX
|
3058 SRE_INFO_CHARSET
)) != 0)
3060 /* PREFIX and CHARSET are mutually exclusive */
3061 if ((flags
& SRE_INFO_PREFIX
) &&
3062 (flags
& SRE_INFO_CHARSET
))
3064 /* LITERAL implies PREFIX */
3065 if ((flags
& SRE_INFO_LITERAL
) &&
3066 !(flags
& SRE_INFO_PREFIX
))
3068 /* Validate the prefix */
3069 if (flags
& SRE_INFO_PREFIX
) {
3070 SRE_CODE prefix_len
;
3071 GET_ARG
; prefix_len
= arg
;
3072 GET_ARG
; /* prefix skip */
3073 /* Here comes the prefix string */
3074 if (prefix_len
> newcode
-code
)
3077 /* And here comes the overlap table */
3078 if (prefix_len
> newcode
-code
)
3080 /* Each overlap value should be < prefix_len */
3081 for (i
= 0; i
< prefix_len
; i
++) {
3082 if (code
[i
] >= prefix_len
)
3087 /* Validate the charset */
3088 if (flags
& SRE_INFO_CHARSET
) {
3089 if (!_validate_charset(code
, newcode
-1))
3091 if (newcode
[-1] != SRE_OP_FAILURE
)
3095 else if (code
!= newcode
) {
3096 VTRACE(("code=%p, newcode=%p\n", code
, newcode
));
3104 SRE_CODE
*target
= NULL
;
3109 /* Stop 2 before the end; we check the JUMP below */
3110 if (!_validate_inner(code
, code
+skip
-3, groups
))
3113 /* Check that it ends with a JUMP, and that each JUMP
3114 has the same target */
3116 if (op
!= SRE_OP_JUMP
)
3120 target
= code
+skip
-1;
3121 else if (code
+skip
-1 != target
)
3127 case SRE_OP_REPEAT_ONE
:
3128 case SRE_OP_MIN_REPEAT_ONE
:
3136 if (max
> SRE_MAXREPEAT
)
3138 if (!_validate_inner(code
, code
+skip
-4, groups
))
3142 if (op
!= SRE_OP_SUCCESS
)
3155 if (max
> SRE_MAXREPEAT
)
3157 if (!_validate_inner(code
, code
+skip
-3, groups
))
3161 if (op
!= SRE_OP_MAX_UNTIL
&& op
!= SRE_OP_MIN_UNTIL
)
3166 case SRE_OP_GROUPREF
:
3167 case SRE_OP_GROUPREF_IGNORE
:
3173 case SRE_OP_GROUPREF_EXISTS
:
3174 /* The regex syntax for this is: '(?(group)then|else)', where
3175 'group' is either an integer group number or a group name,
3176 'then' and 'else' are sub-regexes, and 'else' is optional. */
3181 code
--; /* The skip is relative to the first arg! */
3182 /* There are two possibilities here: if there is both a 'then'
3183 part and an 'else' part, the generated code looks like:
3191 (<skipyes> jumps here)
3193 (<skipno> jumps here)
3195 If there is only a 'then' part, it looks like:
3203 There is no direct way to decide which it is, and we don't want
3204 to allow arbitrary jumps anywhere in the code; so we just look
3205 for a JUMP opcode preceding our skip target.
3207 if (skip
>= 3 && skip
-3 < end
-code
&&
3208 code
[skip
-3] == SRE_OP_JUMP
)
3210 VTRACE(("both then and else parts present\n"));
3211 if (!_validate_inner(code
+1, code
+skip
-3, groups
))
3213 code
+= skip
-2; /* Position after JUMP, at <skipno> */
3215 if (!_validate_inner(code
, code
+skip
-1, groups
))
3220 VTRACE(("only a then part present\n"));
3221 if (!_validate_inner(code
+1, code
+skip
-1, groups
))
3228 case SRE_OP_ASSERT_NOT
:
3230 GET_ARG
; /* 0 for lookahead, width for lookbehind */
3231 code
--; /* Back up over arg to simplify math below */
3232 if (arg
& 0x80000000)
3233 FAIL
; /* Width too large */
3234 /* Stop 1 before the end; we check the SUCCESS below */
3235 if (!_validate_inner(code
+1, code
+skip
-2, groups
))
3239 if (op
!= SRE_OP_SUCCESS
)
3254 _validate_outer(SRE_CODE
*code
, SRE_CODE
*end
, Py_ssize_t groups
)
3256 if (groups
< 0 || groups
> 100 || code
>= end
|| end
[-1] != SRE_OP_SUCCESS
)
3258 if (groups
== 0) /* fix for simplejson */
3259 groups
= 100; /* 100 groups should always be safe */
3260 return _validate_inner(code
, end
-1, groups
);
3264 _validate(PatternObject
*self
)
3266 if (!_validate_outer(self
->code
, self
->code
+self
->codesize
, self
->groups
))
3268 PyErr_SetString(PyExc_RuntimeError
, "invalid SRE code");
3272 VTRACE(("Success!\n"));
3276 /* -------------------------------------------------------------------- */
3280 match_dealloc(MatchObject
* self
)
3282 Py_XDECREF(self
->regs
);
3283 Py_XDECREF(self
->string
);
3284 Py_DECREF(self
->pattern
);
3289 match_getslice_by_index(MatchObject
* self
, Py_ssize_t index
, PyObject
* def
)
3291 if (index
< 0 || index
>= self
->groups
) {
3292 /* raise IndexError if we were given a bad group number */
3302 if (self
->string
== Py_None
|| self
->mark
[index
] < 0) {
3303 /* return default value if the string or group is undefined */
3308 return PySequence_GetSlice(
3309 self
->string
, self
->mark
[index
], self
->mark
[index
+1]
3314 match_getindex(MatchObject
* self
, PyObject
* index
)
3318 if (PyInt_Check(index
) || PyLong_Check(index
))
3319 return PyInt_AsSsize_t(index
);
3323 if (self
->pattern
->groupindex
) {
3324 index
= PyObject_GetItem(self
->pattern
->groupindex
, index
);
3326 if (PyInt_Check(index
) || PyLong_Check(index
))
3327 i
= PyInt_AsSsize_t(index
);
3337 match_getslice(MatchObject
* self
, PyObject
* index
, PyObject
* def
)
3339 return match_getslice_by_index(self
, match_getindex(self
, index
), def
);
3343 match_expand(MatchObject
* self
, PyObject
* ptemplate
)
3345 /* delegate to Python code */
3347 SRE_PY_MODULE
, "_expand",
3348 PyTuple_Pack(3, self
->pattern
, self
, ptemplate
)
3353 match_group(MatchObject
* self
, PyObject
* args
)
3358 size
= PyTuple_GET_SIZE(args
);
3362 result
= match_getslice(self
, Py_False
, Py_None
);
3365 result
= match_getslice(self
, PyTuple_GET_ITEM(args
, 0), Py_None
);
3368 /* fetch multiple items */
3369 result
= PyTuple_New(size
);
3372 for (i
= 0; i
< size
; i
++) {
3373 PyObject
* item
= match_getslice(
3374 self
, PyTuple_GET_ITEM(args
, i
), Py_None
3380 PyTuple_SET_ITEM(result
, i
, item
);
3388 match_groups(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
3393 PyObject
* def
= Py_None
;
3394 static char* kwlist
[] = { "default", NULL
};
3395 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groups", kwlist
, &def
))
3398 result
= PyTuple_New(self
->groups
-1);
3402 for (index
= 1; index
< self
->groups
; index
++) {
3404 item
= match_getslice_by_index(self
, index
, def
);
3409 PyTuple_SET_ITEM(result
, index
-1, item
);
3416 match_groupdict(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
3422 PyObject
* def
= Py_None
;
3423 static char* kwlist
[] = { "default", NULL
};
3424 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groupdict", kwlist
, &def
))
3427 result
= PyDict_New();
3428 if (!result
|| !self
->pattern
->groupindex
)
3431 keys
= PyMapping_Keys(self
->pattern
->groupindex
);
3435 for (index
= 0; index
< PyList_GET_SIZE(keys
); index
++) {
3439 key
= PyList_GET_ITEM(keys
, index
);
3442 value
= match_getslice(self
, key
, def
);
3447 status
= PyDict_SetItem(result
, key
, value
);
3464 match_start(MatchObject
* self
, PyObject
* args
)
3468 PyObject
* index_
= Py_False
; /* zero */
3469 if (!PyArg_UnpackTuple(args
, "start", 0, 1, &index_
))
3472 index
= match_getindex(self
, index_
);
3474 if (index
< 0 || index
>= self
->groups
) {
3482 /* mark is -1 if group is undefined */
3483 return PyInt_FromSsize_t(self
->mark
[index
*2]);
3487 match_end(MatchObject
* self
, PyObject
* args
)
3491 PyObject
* index_
= Py_False
; /* zero */
3492 if (!PyArg_UnpackTuple(args
, "end", 0, 1, &index_
))
3495 index
= match_getindex(self
, index_
);
3497 if (index
< 0 || index
>= self
->groups
) {
3505 /* mark is -1 if group is undefined */
3506 return PyInt_FromSsize_t(self
->mark
[index
*2+1]);
3510 _pair(Py_ssize_t i1
, Py_ssize_t i2
)
3515 pair
= PyTuple_New(2);
3519 item
= PyInt_FromSsize_t(i1
);
3522 PyTuple_SET_ITEM(pair
, 0, item
);
3524 item
= PyInt_FromSsize_t(i2
);
3527 PyTuple_SET_ITEM(pair
, 1, item
);
3537 match_span(MatchObject
* self
, PyObject
* args
)
3541 PyObject
* index_
= Py_False
; /* zero */
3542 if (!PyArg_UnpackTuple(args
, "span", 0, 1, &index_
))
3545 index
= match_getindex(self
, index_
);
3547 if (index
< 0 || index
>= self
->groups
) {
3555 /* marks are -1 if group is undefined */
3556 return _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3560 match_regs(MatchObject
* self
)
3566 regs
= PyTuple_New(self
->groups
);
3570 for (index
= 0; index
< self
->groups
; index
++) {
3571 item
= _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3576 PyTuple_SET_ITEM(regs
, index
, item
);
3586 match_copy(MatchObject
* self
, PyObject
*unused
)
3588 #ifdef USE_BUILTIN_COPY
3590 Py_ssize_t slots
, offset
;
3592 slots
= 2 * (self
->pattern
->groups
+1);
3594 copy
= PyObject_NEW_VAR(MatchObject
, &Match_Type
, slots
);
3598 /* this value a constant, but any compiler should be able to
3599 figure that out all by itself */
3600 offset
= offsetof(MatchObject
, string
);
3602 Py_XINCREF(self
->pattern
);
3603 Py_XINCREF(self
->string
);
3604 Py_XINCREF(self
->regs
);
3606 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
3607 sizeof(MatchObject
) + slots
* sizeof(Py_ssize_t
) - offset
);
3609 return (PyObject
*) copy
;
3611 PyErr_SetString(PyExc_TypeError
, "cannot copy this match object");
3617 match_deepcopy(MatchObject
* self
, PyObject
* memo
)
3619 #ifdef USE_BUILTIN_COPY
3622 copy
= (MatchObject
*) match_copy(self
);
3626 if (!deepcopy((PyObject
**) ©
->pattern
, memo
) ||
3627 !deepcopy(©
->string
, memo
) ||
3628 !deepcopy(©
->regs
, memo
)) {
3634 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this match object");
3639 PyDoc_STRVAR(match_doc
,
3640 "The result of re.match() and re.search().\n\
3641 Match objects always have a boolean value of True.");
3643 PyDoc_STRVAR(match_group_doc
,
3644 "group([group1, ...]) -> str or tuple.\n\
3645 Return subgroup(s) of the match by indices or names.\n\
3646 For 0 returns the entire match.");
3648 PyDoc_STRVAR(match_start_doc
,
3649 "start([group=0]) -> int.\n\
3650 Return index of the start of the substring matched by group.");
3652 PyDoc_STRVAR(match_end_doc
,
3653 "end([group=0]) -> int.\n\
3654 Return index of the end of the substring matched by group.");
3656 PyDoc_STRVAR(match_span_doc
,
3657 "span([group]) -> tuple.\n\
3658 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3660 PyDoc_STRVAR(match_groups_doc
,
3661 "groups([default=None]) -> tuple.\n\
3662 Return a tuple containing all the subgroups of the match, from 1.\n\
3663 The default argument is used for groups\n\
3664 that did not participate in the match");
3666 PyDoc_STRVAR(match_groupdict_doc
,
3667 "groupdict([default=None]) -> dict.\n\
3668 Return a dictionary containing all the named subgroups of the match,\n\
3669 keyed by the subgroup name. The default argument is used for groups\n\
3670 that did not participate in the match");
3672 PyDoc_STRVAR(match_expand_doc
,
3673 "expand(template) -> str.\n\
3674 Return the string obtained by doing backslash substitution\n\
3675 on the string template, as done by the sub() method.");
3677 static PyMethodDef match_methods
[] = {
3678 {"group", (PyCFunction
) match_group
, METH_VARARGS
, match_group_doc
},
3679 {"start", (PyCFunction
) match_start
, METH_VARARGS
, match_start_doc
},
3680 {"end", (PyCFunction
) match_end
, METH_VARARGS
, match_end_doc
},
3681 {"span", (PyCFunction
) match_span
, METH_VARARGS
, match_span_doc
},
3682 {"groups", (PyCFunction
) match_groups
, METH_VARARGS
|METH_KEYWORDS
,
3684 {"groupdict", (PyCFunction
) match_groupdict
, METH_VARARGS
|METH_KEYWORDS
,
3685 match_groupdict_doc
},
3686 {"expand", (PyCFunction
) match_expand
, METH_O
, match_expand_doc
},
3687 {"__copy__", (PyCFunction
) match_copy
, METH_NOARGS
},
3688 {"__deepcopy__", (PyCFunction
) match_deepcopy
, METH_O
},
3693 match_lastindex_get(MatchObject
*self
)
3695 if (self
->lastindex
>= 0)
3696 return PyInt_FromSsize_t(self
->lastindex
);
3702 match_lastgroup_get(MatchObject
*self
)
3704 if (self
->pattern
->indexgroup
&& self
->lastindex
>= 0) {
3705 PyObject
* result
= PySequence_GetItem(
3706 self
->pattern
->indexgroup
, self
->lastindex
3717 match_regs_get(MatchObject
*self
)
3720 Py_INCREF(self
->regs
);
3723 return match_regs(self
);
3726 static PyGetSetDef match_getset
[] = {
3727 {"lastindex", (getter
)match_lastindex_get
, (setter
)NULL
},
3728 {"lastgroup", (getter
)match_lastgroup_get
, (setter
)NULL
},
3729 {"regs", (getter
)match_regs_get
, (setter
)NULL
},
3733 #define MATCH_OFF(x) offsetof(MatchObject, x)
3734 static PyMemberDef match_members
[] = {
3735 {"string", T_OBJECT
, MATCH_OFF(string
), READONLY
},
3736 {"re", T_OBJECT
, MATCH_OFF(pattern
), READONLY
},
3737 {"pos", T_PYSSIZET
, MATCH_OFF(pos
), READONLY
},
3738 {"endpos", T_PYSSIZET
, MATCH_OFF(endpos
), READONLY
},
3743 /* FIXME: implement setattr("string", None) as a special case (to
3744 detach the associated string, if any */
3746 static PyTypeObject Match_Type
= {
3747 PyVarObject_HEAD_INIT(NULL
, 0)
3748 "_" SRE_MODULE
".SRE_Match",
3749 sizeof(MatchObject
), sizeof(Py_ssize_t
),
3750 (destructor
)match_dealloc
, /* tp_dealloc */
3756 0, /* tp_as_number */
3757 0, /* tp_as_sequence */
3758 0, /* tp_as_mapping */
3762 0, /* tp_getattro */
3763 0, /* tp_setattro */
3764 0, /* tp_as_buffer */
3766 match_doc
, /* tp_doc */
3767 0, /* tp_traverse */
3769 0, /* tp_richcompare */
3770 0, /* tp_weaklistoffset */
3772 0, /* tp_iternext */
3773 match_methods
, /* tp_methods */
3774 match_members
, /* tp_members */
3775 match_getset
, /* tp_getset */
3779 pattern_new_match(PatternObject
* pattern
, SRE_STATE
* state
, int status
)
3781 /* create match object (from state object) */
3790 /* create match object (with room for extra group marks) */
3791 /* coverity[ampersand_in_size] */
3792 match
= PyObject_NEW_VAR(MatchObject
, &Match_Type
,
3793 2*(pattern
->groups
+1));
3798 match
->pattern
= pattern
;
3800 Py_INCREF(state
->string
);
3801 match
->string
= state
->string
;
3804 match
->groups
= pattern
->groups
+1;
3806 /* fill in group slices */
3808 base
= (char*) state
->beginning
;
3809 n
= state
->charsize
;
3811 match
->mark
[0] = ((char*) state
->start
- base
) / n
;
3812 match
->mark
[1] = ((char*) state
->ptr
- base
) / n
;
3814 for (i
= j
= 0; i
< pattern
->groups
; i
++, j
+=2)
3815 if (j
+1 <= state
->lastmark
&& state
->mark
[j
] && state
->mark
[j
+1]) {
3816 match
->mark
[j
+2] = ((char*) state
->mark
[j
] - base
) / n
;
3817 match
->mark
[j
+3] = ((char*) state
->mark
[j
+1] - base
) / n
;
3819 match
->mark
[j
+2] = match
->mark
[j
+3] = -1; /* undefined */
3821 match
->pos
= state
->pos
;
3822 match
->endpos
= state
->endpos
;
3824 match
->lastindex
= state
->lastindex
;
3826 return (PyObject
*) match
;
3828 } else if (status
== 0) {
3836 /* internal error */
3837 pattern_error(status
);
3842 /* -------------------------------------------------------------------- */
3843 /* scanner methods (experimental) */
3846 scanner_dealloc(ScannerObject
* self
)
3848 state_fini(&self
->state
);
3849 Py_XDECREF(self
->pattern
);
3854 scanner_match(ScannerObject
* self
, PyObject
*unused
)
3856 SRE_STATE
* state
= &self
->state
;
3862 state
->ptr
= state
->start
;
3864 if (state
->charsize
== 1) {
3865 status
= sre_match(state
, PatternObject_GetCode(self
->pattern
));
3867 #if defined(HAVE_UNICODE)
3868 status
= sre_umatch(state
, PatternObject_GetCode(self
->pattern
));
3871 if (PyErr_Occurred())
3874 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3877 if (status
== 0 || state
->ptr
== state
->start
)
3878 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3880 state
->start
= state
->ptr
;
3887 scanner_search(ScannerObject
* self
, PyObject
*unused
)
3889 SRE_STATE
* state
= &self
->state
;
3895 state
->ptr
= state
->start
;
3897 if (state
->charsize
== 1) {
3898 status
= sre_search(state
, PatternObject_GetCode(self
->pattern
));
3900 #if defined(HAVE_UNICODE)
3901 status
= sre_usearch(state
, PatternObject_GetCode(self
->pattern
));
3904 if (PyErr_Occurred())
3907 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3910 if (status
== 0 || state
->ptr
== state
->start
)
3911 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3913 state
->start
= state
->ptr
;
3918 static PyMethodDef scanner_methods
[] = {
3919 {"match", (PyCFunction
) scanner_match
, METH_NOARGS
},
3920 {"search", (PyCFunction
) scanner_search
, METH_NOARGS
},
3924 #define SCAN_OFF(x) offsetof(ScannerObject, x)
3925 static PyMemberDef scanner_members
[] = {
3926 {"pattern", T_OBJECT
, SCAN_OFF(pattern
), READONLY
},
3927 {NULL
} /* Sentinel */
3930 statichere PyTypeObject Scanner_Type
= {
3931 PyObject_HEAD_INIT(NULL
)
3932 0, "_" SRE_MODULE
".SRE_Scanner",
3933 sizeof(ScannerObject
), 0,
3934 (destructor
)scanner_dealloc
, /*tp_dealloc*/
3938 0, /* tp_reserved */
3940 0, /* tp_as_number */
3941 0, /* tp_as_sequence */
3942 0, /* tp_as_mapping */
3946 0, /* tp_getattro */
3947 0, /* tp_setattro */
3948 0, /* tp_as_buffer */
3949 Py_TPFLAGS_DEFAULT
, /* tp_flags */
3951 0, /* tp_traverse */
3953 0, /* tp_richcompare */
3954 0, /* tp_weaklistoffset */
3956 0, /* tp_iternext */
3957 scanner_methods
, /* tp_methods */
3958 scanner_members
, /* tp_members */
3963 pattern_scanner(PatternObject
* pattern
, PyObject
* args
)
3965 /* create search state object */
3967 ScannerObject
* self
;
3970 Py_ssize_t start
= 0;
3971 Py_ssize_t end
= PY_SSIZE_T_MAX
;
3972 if (!PyArg_ParseTuple(args
, "O|nn:scanner", &string
, &start
, &end
))
3975 /* create scanner object */
3976 self
= PyObject_NEW(ScannerObject
, &Scanner_Type
);
3979 self
->pattern
= NULL
;
3981 string
= state_init(&self
->state
, pattern
, string
, start
, end
);
3988 self
->pattern
= (PyObject
*) pattern
;
3990 return (PyObject
*) self
;
3993 static PyMethodDef _functions
[] = {
3994 {"compile", _compile
, METH_VARARGS
},
3995 {"getcodesize", sre_codesize
, METH_NOARGS
},
3996 {"getlower", sre_getlower
, METH_VARARGS
},
4000 #if PY_VERSION_HEX < 0x02030000
4001 DL_EXPORT(void) init_sre(void)
4003 PyMODINIT_FUNC
init_sre(void)
4010 /* Patch object types */
4011 if (PyType_Ready(&Pattern_Type
) || PyType_Ready(&Match_Type
) ||
4012 PyType_Ready(&Scanner_Type
))
4015 m
= Py_InitModule("_" SRE_MODULE
, _functions
);
4018 d
= PyModule_GetDict(m
);
4020 x
= PyInt_FromLong(SRE_MAGIC
);
4022 PyDict_SetItemString(d
, "MAGIC", x
);
4026 x
= PyInt_FromLong(sizeof(SRE_CODE
));
4028 PyDict_SetItemString(d
, "CODESIZE", x
);
4032 x
= PyLong_FromUnsignedLong(SRE_MAXREPEAT
);
4034 PyDict_SetItemString(d
, "MAXREPEAT", x
);
4038 x
= PyString_FromString(copyright
);
4040 PyDict_SetItemString(d
, "copyright", x
);
4045 #endif /* !defined(SRE_RECURSIVE) */