2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
6 Copyright (c) 2011, Intel Corporation. All rights reserved.<BR>
7 This program and the accompanying materials are licensed and made available under
8 the terms and conditions of the BSD License that accompanies this distribution.
9 The full text of the license may be found at
10 http://opensource.org/licenses/bsd-license.
12 THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
13 WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
16 * 1999-10-24 fl created (based on existing template matcher code)
17 * 2000-03-06 fl first alpha, sort of
18 * 2000-08-01 fl fixes for 1.6b1
19 * 2000-08-07 fl use PyOS_CheckStack() if available
20 * 2000-09-20 fl added expand method
21 * 2001-03-20 fl lots of fixes for 2.1b2
22 * 2001-04-15 fl export copyright as Python attribute, not global
23 * 2001-04-28 fl added __copy__ methods (work in progress)
24 * 2001-05-14 fl fixes for 1.5.2 compatibility
25 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
26 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
27 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
28 * 2001-10-21 fl added sub/subn primitive
29 * 2001-10-24 fl added finditer primitive (for 2.2 only)
30 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
31 * 2002-11-09 fl fixed empty sub/subn return type
32 * 2003-04-18 mvl fully support 4-byte codes
33 * 2003-10-17 gn implemented non recursive scheme
35 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
37 * This version of the SRE library can be redistributed under CNRI's
38 * Python 1.6 license. For any other use, please contact Secret Labs
39 * AB (info@pythonware.com).
41 * Portions of this engine have been developed in cooperation with
42 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
43 * other compatibility work.
46 /* Get rid of these macros to prevent collisions between EFI and Python in this file. */
52 static char copyright
[] =
53 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
55 #define PY_SSIZE_T_CLEAN
58 #include "structmember.h" /* offsetof */
64 /* name of this module, minus the leading underscore */
65 #if !defined(SRE_MODULE)
66 #define SRE_MODULE "sre"
69 #define SRE_PY_MODULE "re"
71 /* defining this one enables tracing */
74 #if PY_VERSION_HEX >= 0x01060000
75 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
76 /* defining this enables unicode support (default under 1.6a1 and later) */
81 /* -------------------------------------------------------------------- */
82 /* optional features */
84 /* enables fast searching */
85 #define USE_FAST_SEARCH
87 /* enables aggressive inlining (always on for Visual C) */
90 /* enables copy/deepcopy handling (work in progress) */
91 #undef USE_BUILTIN_COPY
93 #if PY_VERSION_HEX < 0x01060000
94 #define PyObject_DEL(op) PyMem_DEL((op))
97 /* -------------------------------------------------------------------- */
100 #pragma optimize("gt", on) /* doesn't seem to make much difference... */
101 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
102 /* fastest possible local call under MSVC */
103 #define LOCAL(type) static __inline type __fastcall
104 #elif defined(USE_INLINE)
105 #define LOCAL(type) static inline type
107 #define LOCAL(type) static type
111 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
112 #define SRE_ERROR_STATE -2 /* illegal state */
113 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
114 #define SRE_ERROR_MEMORY -9 /* out of memory */
115 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
118 #define TRACE(v) printf v
123 /* -------------------------------------------------------------------- */
124 /* search engine state */
126 /* default character predicates (run sre_chars.py to regenerate tables) */
128 #define SRE_DIGIT_MASK 1
129 #define SRE_SPACE_MASK 2
130 #define SRE_LINEBREAK_MASK 4
131 #define SRE_ALNUM_MASK 8
132 #define SRE_WORD_MASK 16
134 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
136 static char sre_char_info
[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
137 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
138 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
139 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
140 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
141 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
142 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
144 static char sre_char_lower
[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
145 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
146 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
147 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
148 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
149 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
150 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
151 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
152 120, 121, 122, 123, 124, 125, 126, 127 };
154 #define SRE_IS_DIGIT(ch)\
155 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
156 #define SRE_IS_SPACE(ch)\
157 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
158 #define SRE_IS_LINEBREAK(ch)\
159 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
160 #define SRE_IS_ALNUM(ch)\
161 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
162 #define SRE_IS_WORD(ch)\
163 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
165 static unsigned int sre_lower(unsigned int ch
)
167 return ((ch
) < 128 ? (unsigned int)sre_char_lower
[ch
] : ch
);
170 /* locale-specific character predicates */
171 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
172 * warnings when c's type supports only numbers < N+1 */
173 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
174 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
175 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
176 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
177 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
179 static unsigned int sre_lower_locale(unsigned int ch
)
181 return ((ch
) < 256 ? (unsigned int)tolower((ch
)) : ch
);
184 /* unicode-specific character predicates */
186 #if defined(HAVE_UNICODE)
188 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
189 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
190 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
191 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
192 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
194 static unsigned int sre_lower_unicode(unsigned int ch
)
196 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE
)(ch
));
202 sre_category(SRE_CODE category
, unsigned int ch
)
206 case SRE_CATEGORY_DIGIT
:
207 return SRE_IS_DIGIT(ch
);
208 case SRE_CATEGORY_NOT_DIGIT
:
209 return !SRE_IS_DIGIT(ch
);
210 case SRE_CATEGORY_SPACE
:
211 return SRE_IS_SPACE(ch
);
212 case SRE_CATEGORY_NOT_SPACE
:
213 return !SRE_IS_SPACE(ch
);
214 case SRE_CATEGORY_WORD
:
215 return SRE_IS_WORD(ch
);
216 case SRE_CATEGORY_NOT_WORD
:
217 return !SRE_IS_WORD(ch
);
218 case SRE_CATEGORY_LINEBREAK
:
219 return SRE_IS_LINEBREAK(ch
);
220 case SRE_CATEGORY_NOT_LINEBREAK
:
221 return !SRE_IS_LINEBREAK(ch
);
223 case SRE_CATEGORY_LOC_WORD
:
224 return SRE_LOC_IS_WORD(ch
);
225 case SRE_CATEGORY_LOC_NOT_WORD
:
226 return !SRE_LOC_IS_WORD(ch
);
228 #if defined(HAVE_UNICODE)
229 case SRE_CATEGORY_UNI_DIGIT
:
230 return SRE_UNI_IS_DIGIT(ch
);
231 case SRE_CATEGORY_UNI_NOT_DIGIT
:
232 return !SRE_UNI_IS_DIGIT(ch
);
233 case SRE_CATEGORY_UNI_SPACE
:
234 return SRE_UNI_IS_SPACE(ch
);
235 case SRE_CATEGORY_UNI_NOT_SPACE
:
236 return !SRE_UNI_IS_SPACE(ch
);
237 case SRE_CATEGORY_UNI_WORD
:
238 return SRE_UNI_IS_WORD(ch
);
239 case SRE_CATEGORY_UNI_NOT_WORD
:
240 return !SRE_UNI_IS_WORD(ch
);
241 case SRE_CATEGORY_UNI_LINEBREAK
:
242 return SRE_UNI_IS_LINEBREAK(ch
);
243 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
244 return !SRE_UNI_IS_LINEBREAK(ch
);
246 case SRE_CATEGORY_UNI_DIGIT
:
247 return SRE_IS_DIGIT(ch
);
248 case SRE_CATEGORY_UNI_NOT_DIGIT
:
249 return !SRE_IS_DIGIT(ch
);
250 case SRE_CATEGORY_UNI_SPACE
:
251 return SRE_IS_SPACE(ch
);
252 case SRE_CATEGORY_UNI_NOT_SPACE
:
253 return !SRE_IS_SPACE(ch
);
254 case SRE_CATEGORY_UNI_WORD
:
255 return SRE_LOC_IS_WORD(ch
);
256 case SRE_CATEGORY_UNI_NOT_WORD
:
257 return !SRE_LOC_IS_WORD(ch
);
258 case SRE_CATEGORY_UNI_LINEBREAK
:
259 return SRE_IS_LINEBREAK(ch
);
260 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
261 return !SRE_IS_LINEBREAK(ch
);
270 data_stack_dealloc(SRE_STATE
* state
)
272 if (state
->data_stack
) {
273 PyMem_FREE(state
->data_stack
);
274 state
->data_stack
= NULL
;
276 state
->data_stack_size
= state
->data_stack_base
= 0;
280 data_stack_grow(SRE_STATE
* state
, Py_ssize_t size
)
282 Py_ssize_t minsize
, cursize
;
283 minsize
= state
->data_stack_base
+size
;
284 cursize
= state
->data_stack_size
;
285 if (cursize
< minsize
) {
287 cursize
= minsize
+minsize
/4+1024;
288 TRACE(("allocate/grow stack %d\n", cursize
));
289 stack
= PyMem_REALLOC(state
->data_stack
, cursize
);
291 data_stack_dealloc(state
);
292 return SRE_ERROR_MEMORY
;
294 state
->data_stack
= (char *)stack
;
295 state
->data_stack_size
= cursize
;
300 /* generate 8-bit version */
302 #define SRE_CHAR unsigned char
303 #define SRE_AT sre_at
304 #define SRE_COUNT sre_count
305 #define SRE_CHARSET sre_charset
306 #define SRE_INFO sre_info
307 #define SRE_MATCH sre_match
308 #define SRE_MATCH_CONTEXT sre_match_context
309 #define SRE_SEARCH sre_search
310 #define SRE_LITERAL_TEMPLATE sre_literal_template
312 #if defined(HAVE_UNICODE)
314 #define SRE_RECURSIVE
318 #undef SRE_LITERAL_TEMPLATE
321 #undef SRE_MATCH_CONTEXT
328 /* generate 16-bit unicode version */
330 #define SRE_CHAR Py_UNICODE
331 #define SRE_AT sre_uat
332 #define SRE_COUNT sre_ucount
333 #define SRE_CHARSET sre_ucharset
334 #define SRE_INFO sre_uinfo
335 #define SRE_MATCH sre_umatch
336 #define SRE_MATCH_CONTEXT sre_umatch_context
337 #define SRE_SEARCH sre_usearch
338 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
341 #endif /* SRE_RECURSIVE */
343 /* -------------------------------------------------------------------- */
344 /* String matching engine */
346 /* the following section is compiled twice, with different character
350 SRE_AT(SRE_STATE
* state
, SRE_CHAR
* ptr
, SRE_CODE at
)
352 /* check if pointer is at given position */
354 Py_ssize_t thisp
, thatp
;
358 case SRE_AT_BEGINNING
:
359 case SRE_AT_BEGINNING_STRING
:
360 return ((void*) ptr
== state
->beginning
);
362 case SRE_AT_BEGINNING_LINE
:
363 return ((void*) ptr
== state
->beginning
||
364 SRE_IS_LINEBREAK((int) ptr
[-1]));
367 return (((void*) (ptr
+1) == state
->end
&&
368 SRE_IS_LINEBREAK((int) ptr
[0])) ||
369 ((void*) ptr
== state
->end
));
371 case SRE_AT_END_LINE
:
372 return ((void*) ptr
== state
->end
||
373 SRE_IS_LINEBREAK((int) ptr
[0]));
375 case SRE_AT_END_STRING
:
376 return ((void*) ptr
== state
->end
);
378 case SRE_AT_BOUNDARY
:
379 if (state
->beginning
== state
->end
)
381 thatp
= ((void*) ptr
> state
->beginning
) ?
382 SRE_IS_WORD((int) ptr
[-1]) : 0;
383 thisp
= ((void*) ptr
< state
->end
) ?
384 SRE_IS_WORD((int) ptr
[0]) : 0;
385 return thisp
!= thatp
;
387 case SRE_AT_NON_BOUNDARY
:
388 if (state
->beginning
== state
->end
)
390 thatp
= ((void*) ptr
> state
->beginning
) ?
391 SRE_IS_WORD((int) ptr
[-1]) : 0;
392 thisp
= ((void*) ptr
< state
->end
) ?
393 SRE_IS_WORD((int) ptr
[0]) : 0;
394 return thisp
== thatp
;
396 case SRE_AT_LOC_BOUNDARY
:
397 if (state
->beginning
== state
->end
)
399 thatp
= ((void*) ptr
> state
->beginning
) ?
400 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
401 thisp
= ((void*) ptr
< state
->end
) ?
402 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
403 return thisp
!= thatp
;
405 case SRE_AT_LOC_NON_BOUNDARY
:
406 if (state
->beginning
== state
->end
)
408 thatp
= ((void*) ptr
> state
->beginning
) ?
409 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
410 thisp
= ((void*) ptr
< state
->end
) ?
411 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
412 return thisp
== thatp
;
414 #if defined(HAVE_UNICODE)
415 case SRE_AT_UNI_BOUNDARY
:
416 if (state
->beginning
== state
->end
)
418 thatp
= ((void*) ptr
> state
->beginning
) ?
419 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
420 thisp
= ((void*) ptr
< state
->end
) ?
421 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
422 return thisp
!= thatp
;
424 case SRE_AT_UNI_NON_BOUNDARY
:
425 if (state
->beginning
== state
->end
)
427 thatp
= ((void*) ptr
> state
->beginning
) ?
428 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
429 thisp
= ((void*) ptr
< state
->end
) ?
430 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
431 return thisp
== thatp
;
440 SRE_CHARSET(SRE_CODE
* set
, SRE_CODE ch
)
442 /* check if character is a member of the given set */
453 /* <LITERAL> <code> */
459 case SRE_OP_CATEGORY
:
460 /* <CATEGORY> <code> */
461 if (sre_category(set
[0], (int) ch
))
467 if (sizeof(SRE_CODE
) == 2) {
468 /* <CHARSET> <bitmap> (16 bits per code word) */
469 if (ch
< 256 && (set
[ch
>> 4] & (1 << (ch
& 15))))
474 /* <CHARSET> <bitmap> (32 bits per code word) */
475 if (ch
< 256 && (set
[ch
>> 5] & (1 << (ch
& 31))))
482 /* <RANGE> <lower> <upper> */
483 if (set
[0] <= ch
&& ch
<= set
[1])
492 case SRE_OP_BIGCHARSET
:
493 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
495 Py_ssize_t count
, block
;
498 if (sizeof(SRE_CODE
) == 2) {
499 block
= ((unsigned char*)set
)[ch
>> 8];
501 if (set
[block
*16 + ((ch
& 255)>>4)] & (1 << (ch
& 15)))
506 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
507 * warnings when c's type supports only numbers < N+1 */
509 block
= ((unsigned char*)set
)[ch
>> 8];
514 (set
[block
*8 + ((ch
& 255)>>5)] & (1 << (ch
& 31))))
522 /* internal error -- there's not much we can do about it
523 here, so let's just pretend it didn't match... */
529 LOCAL(Py_ssize_t
) SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
);
532 SRE_COUNT(SRE_STATE
* state
, SRE_CODE
* pattern
, Py_ssize_t maxcount
)
535 SRE_CHAR
* ptr
= (SRE_CHAR
*)state
->ptr
;
536 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
540 if (maxcount
< end
- ptr
&& maxcount
!= 65535)
541 end
= ptr
+ maxcount
;
543 switch (pattern
[0]) {
547 TRACE(("|%p|%p|COUNT IN\n", pattern
, ptr
));
548 while (ptr
< end
&& SRE_CHARSET(pattern
+ 2, *ptr
))
553 /* repeated dot wildcard. */
554 TRACE(("|%p|%p|COUNT ANY\n", pattern
, ptr
));
555 while (ptr
< end
&& !SRE_IS_LINEBREAK(*ptr
))
560 /* repeated dot wildcard. skip to the end of the target
561 string, and backtrack from there */
562 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern
, ptr
));
567 /* repeated literal */
569 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern
, ptr
, chr
));
570 while (ptr
< end
&& (SRE_CODE
) *ptr
== chr
)
574 case SRE_OP_LITERAL_IGNORE
:
575 /* repeated literal */
577 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
578 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) == chr
)
582 case SRE_OP_NOT_LITERAL
:
583 /* repeated non-literal */
585 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern
, ptr
, chr
));
586 while (ptr
< end
&& (SRE_CODE
) *ptr
!= chr
)
590 case SRE_OP_NOT_LITERAL_IGNORE
:
591 /* repeated non-literal */
593 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
594 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) != chr
)
599 /* repeated single character pattern */
600 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern
, ptr
));
601 while ((SRE_CHAR
*) state
->ptr
< end
) {
602 i
= SRE_MATCH(state
, pattern
);
608 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
,
609 (SRE_CHAR
*) state
->ptr
- ptr
));
610 return (SRE_CHAR
*) state
->ptr
- ptr
;
613 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
, ptr
- (SRE_CHAR
*) state
->ptr
));
614 return ptr
- (SRE_CHAR
*) state
->ptr
;
617 #if 0 /* not used in this release */
619 SRE_INFO(SRE_STATE
* state
, SRE_CODE
* pattern
)
621 /* check if an SRE_OP_INFO block matches at the current position.
622 returns the number of SRE_CODE objects to skip if successful, 0
625 SRE_CHAR
* end
= state
->end
;
626 SRE_CHAR
* ptr
= state
->ptr
;
629 /* check minimal length */
630 if (pattern
[3] && (end
- ptr
) < pattern
[3])
633 /* check known prefix */
634 if (pattern
[2] & SRE_INFO_PREFIX
&& pattern
[5] > 1) {
635 /* <length> <skip> <prefix data> <overlap data> */
636 for (i
= 0; i
< pattern
[5]; i
++)
637 if ((SRE_CODE
) ptr
[i
] != pattern
[7 + i
])
639 return pattern
[0] + 2 * pattern
[6];
645 /* The macros below should be used to protect recursive SRE_MATCH()
646 * calls that *failed* and do *not* return immediately (IOW, those
647 * that will backtrack). Explaining:
649 * - Recursive SRE_MATCH() returned true: that's usually a success
650 * (besides atypical cases like ASSERT_NOT), therefore there's no
651 * reason to restore lastmark;
653 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
654 * is returning to the caller: If the current SRE_MATCH() is the
655 * top function of the recursion, returning false will be a matching
656 * failure, and it doesn't matter where lastmark is pointing to.
657 * If it's *not* the top function, it will be a recursive SRE_MATCH()
658 * failure by itself, and the calling SRE_MATCH() will have to deal
659 * with the failure by the same rules explained here (it will restore
660 * lastmark by itself if necessary);
662 * - Recursive SRE_MATCH() returned false, and will continue the
663 * outside 'for' loop: must be protected when breaking, since the next
664 * OP could potentially depend on lastmark;
666 * - Recursive SRE_MATCH() returned false, and will be called again
667 * inside a local for/while loop: must be protected between each
668 * loop iteration, since the recursive SRE_MATCH() could do anything,
669 * and could potentially depend on lastmark.
671 * For more information, check the discussion at SF patch #712900.
673 #define LASTMARK_SAVE() \
675 ctx->lastmark = state->lastmark; \
676 ctx->lastindex = state->lastindex; \
678 #define LASTMARK_RESTORE() \
680 state->lastmark = ctx->lastmark; \
681 state->lastindex = ctx->lastindex; \
684 #define RETURN_ERROR(i) do { return i; } while(0)
685 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
686 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
688 #define RETURN_ON_ERROR(i) \
689 do { if (i < 0) RETURN_ERROR(i); } while (0)
690 #define RETURN_ON_SUCCESS(i) \
691 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
692 #define RETURN_ON_FAILURE(i) \
693 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
697 #define DATA_STACK_ALLOC(state, type, ptr) \
699 alloc_pos = state->data_stack_base; \
700 TRACE(("allocating %s in %d (%d)\n", \
701 SFY(type), alloc_pos, sizeof(type))); \
702 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
703 int j = data_stack_grow(state, sizeof(type)); \
704 if (j < 0) return j; \
706 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
708 ptr = (type*)(state->data_stack+alloc_pos); \
709 state->data_stack_base += sizeof(type); \
712 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
714 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
715 ptr = (type*)(state->data_stack+pos); \
718 #define DATA_STACK_PUSH(state, data, size) \
720 TRACE(("copy data in %p to %d (%d)\n", \
721 data, state->data_stack_base, size)); \
722 if (state->data_stack_size < state->data_stack_base+size) { \
723 int j = data_stack_grow(state, size); \
724 if (j < 0) return j; \
726 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
728 memcpy(state->data_stack+state->data_stack_base, data, size); \
729 state->data_stack_base += size; \
732 #define DATA_STACK_POP(state, data, size, discard) \
734 TRACE(("copy data to %p from %d (%d)\n", \
735 data, state->data_stack_base-size, size)); \
736 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
738 state->data_stack_base -= size; \
741 #define DATA_STACK_POP_DISCARD(state, size) \
743 TRACE(("discard data from %d (%d)\n", \
744 state->data_stack_base-size, size)); \
745 state->data_stack_base -= size; \
748 #define DATA_PUSH(x) \
749 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
750 #define DATA_POP(x) \
751 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
752 #define DATA_POP_DISCARD(x) \
753 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
754 #define DATA_ALLOC(t,p) \
755 DATA_STACK_ALLOC(state, t, p)
756 #define DATA_LOOKUP_AT(t,p,pos) \
757 DATA_STACK_LOOKUP_AT(state,t,p,pos)
759 #define MARK_PUSH(lastmark) \
760 do if (lastmark > 0) { \
761 i = lastmark; /* ctx->lastmark may change if reallocated */ \
762 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
764 #define MARK_POP(lastmark) \
765 do if (lastmark > 0) { \
766 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
768 #define MARK_POP_KEEP(lastmark) \
769 do if (lastmark > 0) { \
770 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
772 #define MARK_POP_DISCARD(lastmark) \
773 do if (lastmark > 0) { \
774 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
778 #define JUMP_MAX_UNTIL_1 1
779 #define JUMP_MAX_UNTIL_2 2
780 #define JUMP_MAX_UNTIL_3 3
781 #define JUMP_MIN_UNTIL_1 4
782 #define JUMP_MIN_UNTIL_2 5
783 #define JUMP_MIN_UNTIL_3 6
784 #define JUMP_REPEAT 7
785 #define JUMP_REPEAT_ONE_1 8
786 #define JUMP_REPEAT_ONE_2 9
787 #define JUMP_MIN_REPEAT_ONE 10
788 #define JUMP_BRANCH 11
789 #define JUMP_ASSERT 12
790 #define JUMP_ASSERT_NOT 13
792 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
793 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
794 nextctx->last_ctx_pos = ctx_pos; \
795 nextctx->jump = jumpvalue; \
796 nextctx->pattern = nextpattern; \
797 ctx_pos = alloc_pos; \
801 while (0) /* gcc doesn't like labels at end of scopes */ \
804 Py_ssize_t last_ctx_pos
;
810 Py_ssize_t lastindex
;
817 /* check if string matches the given pattern. returns <0 for
818 error, 0 for failure, and 1 for success */
820 SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
822 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
823 Py_ssize_t alloc_pos
, ctx_pos
= -1;
824 Py_ssize_t i
, ret
= 0;
826 unsigned int sigcount
=0;
828 SRE_MATCH_CONTEXT
* ctx
;
829 SRE_MATCH_CONTEXT
* nextctx
;
831 TRACE(("|%p|%p|ENTER\n", pattern
, state
->ptr
));
833 DATA_ALLOC(SRE_MATCH_CONTEXT
, ctx
);
834 ctx
->last_ctx_pos
= -1;
835 ctx
->jump
= JUMP_NONE
;
836 ctx
->pattern
= pattern
;
841 ctx
->ptr
= (SRE_CHAR
*)state
->ptr
;
843 if (ctx
->pattern
[0] == SRE_OP_INFO
) {
844 /* optimization info block */
845 /* <INFO> <1=skip> <2=flags> <3=min> ... */
846 if (ctx
->pattern
[3] && (end
- ctx
->ptr
) < ctx
->pattern
[3]) {
847 TRACE(("reject (got %d chars, need %d)\n",
848 (end
- ctx
->ptr
), ctx
->pattern
[3]));
851 ctx
->pattern
+= ctx
->pattern
[1] + 1;
856 if ((0 == (sigcount
& 0xfff)) && PyErr_CheckSignals())
857 RETURN_ERROR(SRE_ERROR_INTERRUPTED
);
859 switch (*ctx
->pattern
++) {
864 TRACE(("|%p|%p|MARK %d\n", ctx
->pattern
,
865 ctx
->ptr
, ctx
->pattern
[0]));
868 state
->lastindex
= i
/2 + 1;
869 if (i
> state
->lastmark
) {
870 /* state->lastmark is the highest valid index in the
871 state->mark array. If it is increased by more than 1,
872 the intervening marks must be set to NULL to signal
873 that these marks have not been encountered. */
874 Py_ssize_t j
= state
->lastmark
+ 1;
876 state
->mark
[j
++] = NULL
;
879 state
->mark
[i
] = ctx
->ptr
;
884 /* match literal string */
885 /* <LITERAL> <code> */
886 TRACE(("|%p|%p|LITERAL %d\n", ctx
->pattern
,
887 ctx
->ptr
, *ctx
->pattern
));
888 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] != ctx
->pattern
[0])
894 case SRE_OP_NOT_LITERAL
:
895 /* match anything that is not literal character */
896 /* <NOT_LITERAL> <code> */
897 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx
->pattern
,
898 ctx
->ptr
, *ctx
->pattern
));
899 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] == ctx
->pattern
[0])
907 TRACE(("|%p|%p|SUCCESS\n", ctx
->pattern
, ctx
->ptr
));
908 state
->ptr
= ctx
->ptr
;
912 /* match at given position */
914 TRACE(("|%p|%p|AT %d\n", ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
915 if (!SRE_AT(state
, ctx
->ptr
, *ctx
->pattern
))
920 case SRE_OP_CATEGORY
:
921 /* match at given category */
922 /* <CATEGORY> <code> */
923 TRACE(("|%p|%p|CATEGORY %d\n", ctx
->pattern
,
924 ctx
->ptr
, *ctx
->pattern
));
925 if (ctx
->ptr
>= end
|| !sre_category(ctx
->pattern
[0], ctx
->ptr
[0]))
932 /* match anything (except a newline) */
934 TRACE(("|%p|%p|ANY\n", ctx
->pattern
, ctx
->ptr
));
935 if (ctx
->ptr
>= end
|| SRE_IS_LINEBREAK(ctx
->ptr
[0]))
943 TRACE(("|%p|%p|ANY_ALL\n", ctx
->pattern
, ctx
->ptr
));
950 /* match set member (or non_member) */
951 /* <IN> <skip> <set> */
952 TRACE(("|%p|%p|IN\n", ctx
->pattern
, ctx
->ptr
));
953 if (ctx
->ptr
>= end
|| !SRE_CHARSET(ctx
->pattern
+ 1, *ctx
->ptr
))
955 ctx
->pattern
+= ctx
->pattern
[0];
959 case SRE_OP_LITERAL_IGNORE
:
960 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
961 ctx
->pattern
, ctx
->ptr
, ctx
->pattern
[0]));
962 if (ctx
->ptr
>= end
||
963 state
->lower(*ctx
->ptr
) != state
->lower(*ctx
->pattern
))
969 case SRE_OP_NOT_LITERAL_IGNORE
:
970 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
971 ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
972 if (ctx
->ptr
>= end
||
973 state
->lower(*ctx
->ptr
) == state
->lower(*ctx
->pattern
))
979 case SRE_OP_IN_IGNORE
:
980 TRACE(("|%p|%p|IN_IGNORE\n", ctx
->pattern
, ctx
->ptr
));
982 || !SRE_CHARSET(ctx
->pattern
+1,
983 (SRE_CODE
)state
->lower(*ctx
->ptr
)))
985 ctx
->pattern
+= ctx
->pattern
[0];
992 /* <JUMP> <offset> */
993 TRACE(("|%p|%p|JUMP %d\n", ctx
->pattern
,
994 ctx
->ptr
, ctx
->pattern
[0]));
995 ctx
->pattern
+= ctx
->pattern
[0];
1000 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
1001 TRACE(("|%p|%p|BRANCH\n", ctx
->pattern
, ctx
->ptr
));
1003 ctx
->u
.rep
= state
->repeat
;
1005 MARK_PUSH(ctx
->lastmark
);
1006 for (; ctx
->pattern
[0]; ctx
->pattern
+= ctx
->pattern
[0]) {
1007 if (ctx
->pattern
[1] == SRE_OP_LITERAL
&&
1009 (SRE_CODE
) *ctx
->ptr
!= ctx
->pattern
[2]))
1011 if (ctx
->pattern
[1] == SRE_OP_IN
&&
1013 !SRE_CHARSET(ctx
->pattern
+ 3, (SRE_CODE
) *ctx
->ptr
)))
1015 state
->ptr
= ctx
->ptr
;
1016 DO_JUMP(JUMP_BRANCH
, jump_branch
, ctx
->pattern
+1);
1019 MARK_POP_DISCARD(ctx
->lastmark
);
1020 RETURN_ON_ERROR(ret
);
1024 MARK_POP_KEEP(ctx
->lastmark
);
1028 MARK_POP_DISCARD(ctx
->lastmark
);
1031 case SRE_OP_REPEAT_ONE
:
1032 /* match repeated sequence (maximizing regexp) */
1034 /* this operator only works if the repeated item is
1035 exactly one character wide, and we're not already
1036 collecting backtracking points. for other cases,
1037 use the MAX_REPEAT operator */
1039 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1041 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1042 ctx
->pattern
[1], ctx
->pattern
[2]));
1044 if (ctx
->ptr
+ ctx
->pattern
[1] > end
)
1045 RETURN_FAILURE
; /* cannot match */
1047 state
->ptr
= ctx
->ptr
;
1049 ret
= SRE_COUNT(state
, ctx
->pattern
+3, ctx
->pattern
[2]);
1050 RETURN_ON_ERROR(ret
);
1051 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1053 ctx
->ptr
+= ctx
->count
;
1055 /* when we arrive here, count contains the number of
1056 matches, and ctx->ptr points to the tail of the target
1057 string. check if the rest of the pattern matches,
1058 and backtrack if not. */
1060 if (ctx
->count
< (Py_ssize_t
) ctx
->pattern
[1])
1063 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1064 /* tail is empty. we're finished */
1065 state
->ptr
= ctx
->ptr
;
1071 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_LITERAL
) {
1072 /* tail starts with a literal. skip positions where
1073 the rest of the pattern cannot possibly match */
1074 ctx
->u
.chr
= ctx
->pattern
[ctx
->pattern
[0]+1];
1076 while (ctx
->count
>= (Py_ssize_t
) ctx
->pattern
[1] &&
1077 (ctx
->ptr
>= end
|| *ctx
->ptr
!= ctx
->u
.chr
)) {
1081 if (ctx
->count
< (Py_ssize_t
) ctx
->pattern
[1])
1083 state
->ptr
= ctx
->ptr
;
1084 DO_JUMP(JUMP_REPEAT_ONE_1
, jump_repeat_one_1
,
1085 ctx
->pattern
+ctx
->pattern
[0]);
1087 RETURN_ON_ERROR(ret
);
1099 while (ctx
->count
>= (Py_ssize_t
) ctx
->pattern
[1]) {
1100 state
->ptr
= ctx
->ptr
;
1101 DO_JUMP(JUMP_REPEAT_ONE_2
, jump_repeat_one_2
,
1102 ctx
->pattern
+ctx
->pattern
[0]);
1104 RETURN_ON_ERROR(ret
);
1114 case SRE_OP_MIN_REPEAT_ONE
:
1115 /* match repeated sequence (minimizing regexp) */
1117 /* this operator only works if the repeated item is
1118 exactly one character wide, and we're not already
1119 collecting backtracking points. for other cases,
1120 use the MIN_REPEAT operator */
1122 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1124 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1125 ctx
->pattern
[1], ctx
->pattern
[2]));
1127 if (ctx
->ptr
+ ctx
->pattern
[1] > end
)
1128 RETURN_FAILURE
; /* cannot match */
1130 state
->ptr
= ctx
->ptr
;
1132 if (ctx
->pattern
[1] == 0)
1135 /* count using pattern min as the maximum */
1136 ret
= SRE_COUNT(state
, ctx
->pattern
+3, ctx
->pattern
[1]);
1137 RETURN_ON_ERROR(ret
);
1138 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1139 if (ret
< (Py_ssize_t
) ctx
->pattern
[1])
1140 /* didn't match minimum number of times */
1142 /* advance past minimum matches of repeat */
1144 ctx
->ptr
+= ctx
->count
;
1147 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1148 /* tail is empty. we're finished */
1149 state
->ptr
= ctx
->ptr
;
1155 while ((Py_ssize_t
)ctx
->pattern
[2] == 65535
1156 || ctx
->count
<= (Py_ssize_t
)ctx
->pattern
[2]) {
1157 state
->ptr
= ctx
->ptr
;
1158 DO_JUMP(JUMP_MIN_REPEAT_ONE
,jump_min_repeat_one
,
1159 ctx
->pattern
+ctx
->pattern
[0]);
1161 RETURN_ON_ERROR(ret
);
1164 state
->ptr
= ctx
->ptr
;
1165 ret
= SRE_COUNT(state
, ctx
->pattern
+3, 1);
1166 RETURN_ON_ERROR(ret
);
1167 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1179 /* create repeat context. all the hard work is done
1180 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1181 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1182 TRACE(("|%p|%p|REPEAT %d %d\n", ctx
->pattern
, ctx
->ptr
,
1183 ctx
->pattern
[1], ctx
->pattern
[2]));
1185 /* install new repeat context */
1186 ctx
->u
.rep
= (SRE_REPEAT
*) PyObject_MALLOC(sizeof(*ctx
->u
.rep
));
1191 ctx
->u
.rep
->count
= -1;
1192 ctx
->u
.rep
->pattern
= ctx
->pattern
;
1193 ctx
->u
.rep
->prev
= state
->repeat
;
1194 ctx
->u
.rep
->last_ptr
= NULL
;
1195 state
->repeat
= ctx
->u
.rep
;
1197 state
->ptr
= ctx
->ptr
;
1198 DO_JUMP(JUMP_REPEAT
, jump_repeat
, ctx
->pattern
+ctx
->pattern
[0]);
1199 state
->repeat
= ctx
->u
.rep
->prev
;
1200 PyObject_FREE(ctx
->u
.rep
);
1203 RETURN_ON_ERROR(ret
);
1208 case SRE_OP_MAX_UNTIL
:
1209 /* maximizing repeat */
1210 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1212 /* FIXME: we probably need to deal with zero-width
1213 matches in here... */
1215 ctx
->u
.rep
= state
->repeat
;
1217 RETURN_ERROR(SRE_ERROR_STATE
);
1219 state
->ptr
= ctx
->ptr
;
1221 ctx
->count
= ctx
->u
.rep
->count
+1;
1223 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx
->pattern
,
1224 ctx
->ptr
, ctx
->count
));
1226 if (ctx
->count
< ctx
->u
.rep
->pattern
[1]) {
1227 /* not enough matches */
1228 ctx
->u
.rep
->count
= ctx
->count
;
1229 DO_JUMP(JUMP_MAX_UNTIL_1
, jump_max_until_1
,
1230 ctx
->u
.rep
->pattern
+3);
1232 RETURN_ON_ERROR(ret
);
1235 ctx
->u
.rep
->count
= ctx
->count
-1;
1236 state
->ptr
= ctx
->ptr
;
1240 if ((ctx
->count
< ctx
->u
.rep
->pattern
[2] ||
1241 ctx
->u
.rep
->pattern
[2] == 65535) &&
1242 state
->ptr
!= ctx
->u
.rep
->last_ptr
) {
1243 /* we may have enough matches, but if we can
1244 match another item, do so */
1245 ctx
->u
.rep
->count
= ctx
->count
;
1247 MARK_PUSH(ctx
->lastmark
);
1248 /* zero-width match protection */
1249 DATA_PUSH(&ctx
->u
.rep
->last_ptr
);
1250 ctx
->u
.rep
->last_ptr
= state
->ptr
;
1251 DO_JUMP(JUMP_MAX_UNTIL_2
, jump_max_until_2
,
1252 ctx
->u
.rep
->pattern
+3);
1253 DATA_POP(&ctx
->u
.rep
->last_ptr
);
1255 MARK_POP_DISCARD(ctx
->lastmark
);
1256 RETURN_ON_ERROR(ret
);
1259 MARK_POP(ctx
->lastmark
);
1261 ctx
->u
.rep
->count
= ctx
->count
-1;
1262 state
->ptr
= ctx
->ptr
;
1265 /* cannot match more repeated items here. make sure the
1267 state
->repeat
= ctx
->u
.rep
->prev
;
1268 DO_JUMP(JUMP_MAX_UNTIL_3
, jump_max_until_3
, ctx
->pattern
);
1269 RETURN_ON_SUCCESS(ret
);
1270 state
->repeat
= ctx
->u
.rep
;
1271 state
->ptr
= ctx
->ptr
;
1274 case SRE_OP_MIN_UNTIL
:
1275 /* minimizing repeat */
1276 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1278 ctx
->u
.rep
= state
->repeat
;
1280 RETURN_ERROR(SRE_ERROR_STATE
);
1282 state
->ptr
= ctx
->ptr
;
1284 ctx
->count
= ctx
->u
.rep
->count
+1;
1286 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx
->pattern
,
1287 ctx
->ptr
, ctx
->count
, ctx
->u
.rep
->pattern
));
1289 if (ctx
->count
< ctx
->u
.rep
->pattern
[1]) {
1290 /* not enough matches */
1291 ctx
->u
.rep
->count
= ctx
->count
;
1292 DO_JUMP(JUMP_MIN_UNTIL_1
, jump_min_until_1
,
1293 ctx
->u
.rep
->pattern
+3);
1295 RETURN_ON_ERROR(ret
);
1298 ctx
->u
.rep
->count
= ctx
->count
-1;
1299 state
->ptr
= ctx
->ptr
;
1305 /* see if the tail matches */
1306 state
->repeat
= ctx
->u
.rep
->prev
;
1307 DO_JUMP(JUMP_MIN_UNTIL_2
, jump_min_until_2
, ctx
->pattern
);
1309 RETURN_ON_ERROR(ret
);
1313 state
->repeat
= ctx
->u
.rep
;
1314 state
->ptr
= ctx
->ptr
;
1318 if (ctx
->count
>= ctx
->u
.rep
->pattern
[2]
1319 && ctx
->u
.rep
->pattern
[2] != 65535)
1322 ctx
->u
.rep
->count
= ctx
->count
;
1323 DO_JUMP(JUMP_MIN_UNTIL_3
,jump_min_until_3
,
1324 ctx
->u
.rep
->pattern
+3);
1326 RETURN_ON_ERROR(ret
);
1329 ctx
->u
.rep
->count
= ctx
->count
-1;
1330 state
->ptr
= ctx
->ptr
;
1333 case SRE_OP_GROUPREF
:
1334 /* match backreference */
1335 TRACE(("|%p|%p|GROUPREF %d\n", ctx
->pattern
,
1336 ctx
->ptr
, ctx
->pattern
[0]));
1337 i
= ctx
->pattern
[0];
1339 Py_ssize_t groupref
= i
+i
;
1340 if (groupref
>= state
->lastmark
) {
1343 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1344 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1345 if (!p
|| !e
|| e
< p
)
1348 if (ctx
->ptr
>= end
|| *ctx
->ptr
!= *p
)
1357 case SRE_OP_GROUPREF_IGNORE
:
1358 /* match backreference */
1359 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx
->pattern
,
1360 ctx
->ptr
, ctx
->pattern
[0]));
1361 i
= ctx
->pattern
[0];
1363 Py_ssize_t groupref
= i
+i
;
1364 if (groupref
>= state
->lastmark
) {
1367 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1368 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1369 if (!p
|| !e
|| e
< p
)
1372 if (ctx
->ptr
>= end
||
1373 state
->lower(*ctx
->ptr
) != state
->lower(*p
))
1382 case SRE_OP_GROUPREF_EXISTS
:
1383 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx
->pattern
,
1384 ctx
->ptr
, ctx
->pattern
[0]));
1385 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1386 i
= ctx
->pattern
[0];
1388 Py_ssize_t groupref
= i
+i
;
1389 if (groupref
>= state
->lastmark
) {
1390 ctx
->pattern
+= ctx
->pattern
[1];
1393 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1394 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1395 if (!p
|| !e
|| e
< p
) {
1396 ctx
->pattern
+= ctx
->pattern
[1];
1405 /* assert subpattern */
1406 /* <ASSERT> <skip> <back> <pattern> */
1407 TRACE(("|%p|%p|ASSERT %d\n", ctx
->pattern
,
1408 ctx
->ptr
, ctx
->pattern
[1]));
1409 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1410 if (state
->ptr
< state
->beginning
)
1412 DO_JUMP(JUMP_ASSERT
, jump_assert
, ctx
->pattern
+2);
1413 RETURN_ON_FAILURE(ret
);
1414 ctx
->pattern
+= ctx
->pattern
[0];
1417 case SRE_OP_ASSERT_NOT
:
1418 /* assert not subpattern */
1419 /* <ASSERT_NOT> <skip> <back> <pattern> */
1420 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx
->pattern
,
1421 ctx
->ptr
, ctx
->pattern
[1]));
1422 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1423 if (state
->ptr
>= state
->beginning
) {
1424 DO_JUMP(JUMP_ASSERT_NOT
, jump_assert_not
, ctx
->pattern
+2);
1426 RETURN_ON_ERROR(ret
);
1430 ctx
->pattern
+= ctx
->pattern
[0];
1433 case SRE_OP_FAILURE
:
1434 /* immediate failure */
1435 TRACE(("|%p|%p|FAILURE\n", ctx
->pattern
, ctx
->ptr
));
1439 TRACE(("|%p|%p|UNKNOWN %d\n", ctx
->pattern
, ctx
->ptr
,
1441 RETURN_ERROR(SRE_ERROR_ILLEGAL
);
1446 ctx_pos
= ctx
->last_ctx_pos
;
1448 DATA_POP_DISCARD(ctx
);
1451 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1454 case JUMP_MAX_UNTIL_2
:
1455 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1456 goto jump_max_until_2
;
1457 case JUMP_MAX_UNTIL_3
:
1458 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1459 goto jump_max_until_3
;
1460 case JUMP_MIN_UNTIL_2
:
1461 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1462 goto jump_min_until_2
;
1463 case JUMP_MIN_UNTIL_3
:
1464 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1465 goto jump_min_until_3
;
1467 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx
->pattern
, ctx
->ptr
));
1469 case JUMP_MAX_UNTIL_1
:
1470 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1471 goto jump_max_until_1
;
1472 case JUMP_MIN_UNTIL_1
:
1473 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1474 goto jump_min_until_1
;
1476 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx
->pattern
, ctx
->ptr
));
1478 case JUMP_REPEAT_ONE_1
:
1479 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx
->pattern
, ctx
->ptr
));
1480 goto jump_repeat_one_1
;
1481 case JUMP_REPEAT_ONE_2
:
1482 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx
->pattern
, ctx
->ptr
));
1483 goto jump_repeat_one_2
;
1484 case JUMP_MIN_REPEAT_ONE
:
1485 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx
->pattern
, ctx
->ptr
));
1486 goto jump_min_repeat_one
;
1488 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx
->pattern
, ctx
->ptr
));
1490 case JUMP_ASSERT_NOT
:
1491 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx
->pattern
, ctx
->ptr
));
1492 goto jump_assert_not
;
1494 TRACE(("|%p|%p|RETURN %d\n", ctx
->pattern
, ctx
->ptr
, ret
));
1498 return ret
; /* should never get here */
1502 SRE_SEARCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
1504 SRE_CHAR
* ptr
= (SRE_CHAR
*)state
->start
;
1505 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
1506 Py_ssize_t status
= 0;
1507 Py_ssize_t prefix_len
= 0;
1508 Py_ssize_t prefix_skip
= 0;
1509 SRE_CODE
* prefix
= NULL
;
1510 SRE_CODE
* charset
= NULL
;
1511 SRE_CODE
* overlap
= NULL
;
1514 if (pattern
[0] == SRE_OP_INFO
) {
1515 /* optimization info block */
1516 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1520 if (pattern
[3] > 1) {
1521 /* adjust end point (but make sure we leave at least one
1522 character in there, so literal search will work) */
1523 end
-= pattern
[3]-1;
1528 if (flags
& SRE_INFO_PREFIX
) {
1529 /* pattern starts with a known prefix */
1530 /* <length> <skip> <prefix data> <overlap data> */
1531 prefix_len
= pattern
[5];
1532 prefix_skip
= pattern
[6];
1533 prefix
= pattern
+ 7;
1534 overlap
= prefix
+ prefix_len
- 1;
1535 } else if (flags
& SRE_INFO_CHARSET
)
1536 /* pattern starts with a character from a known set */
1538 charset
= pattern
+ 5;
1540 pattern
+= 1 + pattern
[1];
1543 TRACE(("prefix = %p %d %d\n", prefix
, prefix_len
, prefix_skip
));
1544 TRACE(("charset = %p\n", charset
));
1546 #if defined(USE_FAST_SEARCH)
1547 if (prefix_len
> 1) {
1548 /* pattern starts with a known prefix. use the overlap
1549 table to skip forward as fast as we possibly can */
1551 end
= (SRE_CHAR
*)state
->end
;
1554 if ((SRE_CODE
) ptr
[0] != prefix
[i
]) {
1560 if (++i
== prefix_len
) {
1561 /* found a potential match */
1562 TRACE(("|%p|%p|SEARCH SCAN\n", pattern
, ptr
));
1563 state
->start
= ptr
+ 1 - prefix_len
;
1564 state
->ptr
= ptr
+ 1 - prefix_len
+ prefix_skip
;
1565 if (flags
& SRE_INFO_LITERAL
)
1566 return 1; /* we got all of it */
1567 status
= SRE_MATCH(state
, pattern
+ 2*prefix_skip
);
1570 /* close but no cigar -- try again */
1582 if (pattern
[0] == SRE_OP_LITERAL
) {
1583 /* pattern starts with a literal character. this is used
1584 for short prefixes, and if fast search is disabled */
1585 SRE_CODE chr
= pattern
[1];
1586 end
= (SRE_CHAR
*)state
->end
;
1588 while (ptr
< end
&& (SRE_CODE
) ptr
[0] != chr
)
1592 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern
, ptr
));
1595 if (flags
& SRE_INFO_LITERAL
)
1596 return 1; /* we got all of it */
1597 status
= SRE_MATCH(state
, pattern
+ 2);
1601 } else if (charset
) {
1602 /* pattern starts with a character from a known set */
1603 end
= (SRE_CHAR
*)state
->end
;
1605 while (ptr
< end
&& !SRE_CHARSET(charset
, ptr
[0]))
1609 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern
, ptr
));
1612 status
= SRE_MATCH(state
, pattern
);
1619 while (ptr
<= end
) {
1620 TRACE(("|%p|%p|SEARCH\n", pattern
, ptr
));
1621 state
->start
= state
->ptr
= ptr
++;
1622 status
= SRE_MATCH(state
, pattern
);
1631 SRE_LITERAL_TEMPLATE(SRE_CHAR
* ptr
, Py_ssize_t len
)
1633 /* check if given string is a literal template (i.e. no escapes) */
1640 #if !defined(SRE_RECURSIVE)
1642 /* -------------------------------------------------------------------- */
1643 /* factories and destructors */
1645 /* see sre.h for object declarations */
1646 static PyObject
*pattern_new_match(PatternObject
*, SRE_STATE
*, int);
1647 static PyObject
*pattern_scanner(PatternObject
*, PyObject
*);
1650 sre_codesize(PyObject
* self
, PyObject
*unused
)
1652 return Py_BuildValue("l", sizeof(SRE_CODE
));
1656 sre_getlower(PyObject
* self
, PyObject
* args
)
1658 int character
, flags
;
1659 if (!PyArg_ParseTuple(args
, "ii", &character
, &flags
))
1661 if (flags
& SRE_FLAG_LOCALE
)
1662 return Py_BuildValue("i", sre_lower_locale(character
));
1663 if (flags
& SRE_FLAG_UNICODE
)
1664 #if defined(HAVE_UNICODE)
1665 return Py_BuildValue("i", sre_lower_unicode(character
));
1667 return Py_BuildValue("i", sre_lower_locale(character
));
1669 return Py_BuildValue("i", sre_lower(character
));
1673 state_reset(SRE_STATE
* state
)
1675 /* FIXME: dynamic! */
1676 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1678 state
->lastmark
= -1;
1679 state
->lastindex
= -1;
1681 state
->repeat
= NULL
;
1683 data_stack_dealloc(state
);
1687 getstring(PyObject
* string
, Py_ssize_t
* p_length
, int* p_charsize
)
1689 /* given a python object, return a data pointer, a length (in
1690 characters), and a character size. return NULL if the object
1691 is not a string (or not compatible) */
1693 PyBufferProcs
*buffer
;
1694 Py_ssize_t size
, bytes
;
1698 #if defined(HAVE_UNICODE)
1699 if (PyUnicode_Check(string
)) {
1700 /* unicode strings doesn't always support the buffer interface */
1701 ptr
= (void*) PyUnicode_AS_DATA(string
);
1702 /* bytes = PyUnicode_GET_DATA_SIZE(string); */
1703 size
= PyUnicode_GET_SIZE(string
);
1704 charsize
= sizeof(Py_UNICODE
);
1709 /* get pointer to string buffer */
1710 buffer
= Py_TYPE(string
)->tp_as_buffer
;
1711 if (!buffer
|| !buffer
->bf_getreadbuffer
|| !buffer
->bf_getsegcount
||
1712 buffer
->bf_getsegcount(string
, NULL
) != 1) {
1713 PyErr_SetString(PyExc_TypeError
, "expected string or buffer");
1717 /* determine buffer size */
1718 bytes
= buffer
->bf_getreadbuffer(string
, 0, &ptr
);
1720 PyErr_SetString(PyExc_TypeError
, "buffer has negative size");
1724 /* determine character size */
1725 #if PY_VERSION_HEX >= 0x01060000
1726 size
= PyObject_Size(string
);
1728 size
= PyObject_Length(string
);
1731 if (PyString_Check(string
) || bytes
== size
)
1733 #if defined(HAVE_UNICODE)
1734 else if (bytes
== (Py_ssize_t
) (size
* sizeof(Py_UNICODE
)))
1735 charsize
= sizeof(Py_UNICODE
);
1738 PyErr_SetString(PyExc_TypeError
, "buffer size mismatch");
1742 #if defined(HAVE_UNICODE)
1747 *p_charsize
= charsize
;
1753 state_init(SRE_STATE
* state
, PatternObject
* pattern
, PyObject
* string
,
1754 Py_ssize_t start
, Py_ssize_t end
)
1756 /* prepare state object */
1762 memset(state
, 0, sizeof(SRE_STATE
));
1764 state
->lastmark
= -1;
1765 state
->lastindex
= -1;
1767 ptr
= getstring(string
, &length
, &charsize
);
1771 /* adjust boundaries */
1774 else if (start
> length
)
1779 else if (end
> length
)
1782 state
->charsize
= charsize
;
1784 state
->beginning
= ptr
;
1786 state
->start
= (void*) ((char*) ptr
+ start
* state
->charsize
);
1787 state
->end
= (void*) ((char*) ptr
+ end
* state
->charsize
);
1790 state
->string
= string
;
1792 state
->endpos
= end
;
1794 if (pattern
->flags
& SRE_FLAG_LOCALE
)
1795 state
->lower
= sre_lower_locale
;
1796 else if (pattern
->flags
& SRE_FLAG_UNICODE
)
1797 #if defined(HAVE_UNICODE)
1798 state
->lower
= sre_lower_unicode
;
1800 state
->lower
= sre_lower_locale
;
1803 state
->lower
= sre_lower
;
1809 state_fini(SRE_STATE
* state
)
1811 Py_XDECREF(state
->string
);
1812 data_stack_dealloc(state
);
1815 /* calculate offset from start of string */
1816 #define STATE_OFFSET(state, member)\
1817 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1820 state_getslice(SRE_STATE
* state
, Py_ssize_t index
, PyObject
* string
, int empty
)
1824 index
= (index
- 1) * 2;
1826 if (string
== Py_None
|| index
>= state
->lastmark
|| !state
->mark
[index
] || !state
->mark
[index
+1]) {
1828 /* want empty string */
1835 i
= STATE_OFFSET(state
, state
->mark
[index
]);
1836 j
= STATE_OFFSET(state
, state
->mark
[index
+1]);
1839 return PySequence_GetSlice(string
, i
, j
);
1843 pattern_error(int status
)
1846 case SRE_ERROR_RECURSION_LIMIT
:
1849 "maximum recursion limit exceeded"
1852 case SRE_ERROR_MEMORY
:
1855 case SRE_ERROR_INTERRUPTED
:
1856 /* An exception has already been raised, so let it fly */
1859 /* other error codes indicate compiler/engine bugs */
1862 "internal error in regular expression engine"
1868 pattern_dealloc(PatternObject
* self
)
1870 if (self
->weakreflist
!= NULL
)
1871 PyObject_ClearWeakRefs((PyObject
*) self
);
1872 Py_XDECREF(self
->pattern
);
1873 Py_XDECREF(self
->groupindex
);
1874 Py_XDECREF(self
->indexgroup
);
1879 pattern_match(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1885 Py_ssize_t start
= 0;
1886 Py_ssize_t end
= PY_SSIZE_T_MAX
;
1887 static char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
1888 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|nn:match", kwlist
,
1889 &string
, &start
, &end
))
1892 string
= state_init(&state
, self
, string
, start
, end
);
1896 state
.ptr
= state
.start
;
1898 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self
), state
.ptr
));
1900 if (state
.charsize
== 1) {
1901 status
= sre_match(&state
, PatternObject_GetCode(self
));
1903 #if defined(HAVE_UNICODE)
1904 status
= sre_umatch(&state
, PatternObject_GetCode(self
));
1908 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
1909 if (PyErr_Occurred())
1914 return pattern_new_match(self
, &state
, status
);
1918 pattern_search(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1924 Py_ssize_t start
= 0;
1925 Py_ssize_t end
= PY_SSIZE_T_MAX
;
1926 static char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
1927 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|nn:search", kwlist
,
1928 &string
, &start
, &end
))
1931 string
= state_init(&state
, self
, string
, start
, end
);
1935 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self
), state
.ptr
));
1937 if (state
.charsize
== 1) {
1938 status
= sre_search(&state
, PatternObject_GetCode(self
));
1940 #if defined(HAVE_UNICODE)
1941 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
1945 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
1949 if (PyErr_Occurred())
1952 return pattern_new_match(self
, &state
, status
);
1956 call(char* module
, char* function
, PyObject
* args
)
1965 name
= PyString_FromString(module
);
1968 mod
= PyImport_Import(name
);
1972 func
= PyObject_GetAttrString(mod
, function
);
1976 result
= PyObject_CallObject(func
, args
);
1982 #ifdef USE_BUILTIN_COPY
1984 deepcopy(PyObject
** object
, PyObject
* memo
)
1990 PyTuple_Pack(2, *object
, memo
)
1998 return 1; /* success */
2003 join_list(PyObject
* list
, PyObject
* string
)
2005 /* join list elements */
2008 #if PY_VERSION_HEX >= 0x01060000
2014 joiner
= PySequence_GetSlice(string
, 0, 0);
2018 if (PyList_GET_SIZE(list
) == 0) {
2023 #if PY_VERSION_HEX >= 0x01060000
2024 function
= PyObject_GetAttrString(joiner
, "join");
2029 args
= PyTuple_New(1);
2031 Py_DECREF(function
);
2035 PyTuple_SET_ITEM(args
, 0, list
);
2036 result
= PyObject_CallObject(function
, args
);
2037 Py_DECREF(args
); /* also removes list */
2038 Py_DECREF(function
);
2042 PyTuple_Pack(2, list
, joiner
)
2051 pattern_findall(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2059 Py_ssize_t start
= 0;
2060 Py_ssize_t end
= PY_SSIZE_T_MAX
;
2061 static char* kwlist
[] = { "source", "pos", "endpos", NULL
};
2062 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|nn:findall", kwlist
,
2063 &string
, &start
, &end
))
2066 string
= state_init(&state
, self
, string
, start
, end
);
2070 list
= PyList_New(0);
2076 while (state
.start
<= state
.end
) {
2080 state_reset(&state
);
2082 state
.ptr
= state
.start
;
2084 if (state
.charsize
== 1) {
2085 status
= sre_search(&state
, PatternObject_GetCode(self
));
2087 #if defined(HAVE_UNICODE)
2088 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2092 if (PyErr_Occurred())
2098 pattern_error(status
);
2102 /* don't bother to build a match object */
2103 switch (self
->groups
) {
2105 b
= STATE_OFFSET(&state
, state
.start
);
2106 e
= STATE_OFFSET(&state
, state
.ptr
);
2107 item
= PySequence_GetSlice(string
, b
, e
);
2112 item
= state_getslice(&state
, 1, string
, 1);
2117 item
= PyTuple_New(self
->groups
);
2120 for (i
= 0; i
< self
->groups
; i
++) {
2121 PyObject
* o
= state_getslice(&state
, i
+1, string
, 1);
2126 PyTuple_SET_ITEM(item
, i
, o
);
2131 status
= PyList_Append(list
, item
);
2136 if (state
.ptr
== state
.start
)
2137 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2139 state
.start
= state
.ptr
;
2153 #if PY_VERSION_HEX >= 0x02020000
2155 pattern_finditer(PatternObject
* pattern
, PyObject
* args
)
2161 scanner
= pattern_scanner(pattern
, args
);
2165 search
= PyObject_GetAttrString(scanner
, "search");
2170 iterator
= PyCallIter_New(search
, Py_None
);
2178 pattern_split(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2189 Py_ssize_t maxsplit
= 0;
2190 static char* kwlist
[] = { "source", "maxsplit", NULL
};
2191 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|n:split", kwlist
,
2192 &string
, &maxsplit
))
2195 string
= state_init(&state
, self
, string
, 0, PY_SSIZE_T_MAX
);
2199 list
= PyList_New(0);
2208 while (!maxsplit
|| n
< maxsplit
) {
2210 state_reset(&state
);
2212 state
.ptr
= state
.start
;
2214 if (state
.charsize
== 1) {
2215 status
= sre_search(&state
, PatternObject_GetCode(self
));
2217 #if defined(HAVE_UNICODE)
2218 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2222 if (PyErr_Occurred())
2228 pattern_error(status
);
2232 if (state
.start
== state
.ptr
) {
2233 if (last
== state
.end
)
2235 /* skip one character */
2236 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2240 /* get segment before this match */
2241 item
= PySequence_GetSlice(
2242 string
, STATE_OFFSET(&state
, last
),
2243 STATE_OFFSET(&state
, state
.start
)
2247 status
= PyList_Append(list
, item
);
2252 /* add groups (if any) */
2253 for (i
= 0; i
< self
->groups
; i
++) {
2254 item
= state_getslice(&state
, i
+1, string
, 0);
2257 status
= PyList_Append(list
, item
);
2265 last
= state
.start
= state
.ptr
;
2269 /* get segment following last match (even if empty) */
2270 item
= PySequence_GetSlice(
2271 string
, STATE_OFFSET(&state
, last
), state
.endpos
2275 status
= PyList_Append(list
, item
);
2291 pattern_subx(PatternObject
* self
, PyObject
* ptemplate
, PyObject
* string
,
2292 Py_ssize_t count
, Py_ssize_t subn
)
2305 int filter_is_callable
;
2307 if (PyCallable_Check(ptemplate
)) {
2308 /* sub/subn takes either a function or a template */
2311 filter_is_callable
= 1;
2313 /* if not callable, check if it's a literal string */
2315 ptr
= getstring(ptemplate
, &n
, &bint
);
2319 literal
= sre_literal_template((unsigned char *)ptr
, n
);
2321 #if defined(HAVE_UNICODE)
2322 literal
= sre_uliteral_template((Py_UNICODE
*)ptr
, n
);
2332 filter_is_callable
= 0;
2334 /* not a literal; hand it over to the template compiler */
2336 SRE_PY_MODULE
, "_subx",
2337 PyTuple_Pack(2, self
, ptemplate
)
2341 filter_is_callable
= PyCallable_Check(filter
);
2345 string
= state_init(&state
, self
, string
, 0, PY_SSIZE_T_MAX
);
2351 list
= PyList_New(0);
2360 while (!count
|| n
< count
) {
2362 state_reset(&state
);
2364 state
.ptr
= state
.start
;
2366 if (state
.charsize
== 1) {
2367 status
= sre_search(&state
, PatternObject_GetCode(self
));
2369 #if defined(HAVE_UNICODE)
2370 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2374 if (PyErr_Occurred())
2380 pattern_error(status
);
2384 b
= STATE_OFFSET(&state
, state
.start
);
2385 e
= STATE_OFFSET(&state
, state
.ptr
);
2388 /* get segment before this match */
2389 item
= PySequence_GetSlice(string
, i
, b
);
2392 status
= PyList_Append(list
, item
);
2397 } else if (i
== b
&& i
== e
&& n
> 0)
2398 /* ignore empty match on latest position */
2401 if (filter_is_callable
) {
2402 /* pass match object through filter */
2403 match
= pattern_new_match(self
, &state
, 1);
2406 args
= PyTuple_Pack(1, match
);
2411 item
= PyObject_CallObject(filter
, args
);
2417 /* filter is literal string */
2423 if (item
!= Py_None
) {
2424 status
= PyList_Append(list
, item
);
2435 if (state
.ptr
== state
.start
)
2436 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2438 state
.start
= state
.ptr
;
2442 /* get segment following last match */
2443 if (i
< state
.endpos
) {
2444 item
= PySequence_GetSlice(string
, i
, state
.endpos
);
2447 status
= PyList_Append(list
, item
);
2457 /* convert list to single string (also removes list) */
2458 item
= join_list(list
, string
);
2464 return Py_BuildValue("Ni", item
, n
);
2477 pattern_sub(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2479 PyObject
* ptemplate
;
2481 Py_ssize_t count
= 0;
2482 static char* kwlist
[] = { "repl", "string", "count", NULL
};
2483 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|n:sub", kwlist
,
2484 &ptemplate
, &string
, &count
))
2487 return pattern_subx(self
, ptemplate
, string
, count
, 0);
2491 pattern_subn(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2493 PyObject
* ptemplate
;
2495 Py_ssize_t count
= 0;
2496 static char* kwlist
[] = { "repl", "string", "count", NULL
};
2497 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|n:subn", kwlist
,
2498 &ptemplate
, &string
, &count
))
2501 return pattern_subx(self
, ptemplate
, string
, count
, 1);
2505 pattern_copy(PatternObject
* self
, PyObject
*unused
)
2507 #ifdef USE_BUILTIN_COPY
2508 PatternObject
* copy
;
2511 copy
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, self
->codesize
);
2515 offset
= offsetof(PatternObject
, groups
);
2517 Py_XINCREF(self
->groupindex
);
2518 Py_XINCREF(self
->indexgroup
);
2519 Py_XINCREF(self
->pattern
);
2521 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
2522 sizeof(PatternObject
) + self
->codesize
* sizeof(SRE_CODE
) - offset
);
2523 copy
->weakreflist
= NULL
;
2525 return (PyObject
*) copy
;
2527 PyErr_SetString(PyExc_TypeError
, "cannot copy this pattern object");
2533 pattern_deepcopy(PatternObject
* self
, PyObject
* memo
)
2535 #ifdef USE_BUILTIN_COPY
2536 PatternObject
* copy
;
2538 copy
= (PatternObject
*) pattern_copy(self
);
2542 if (!deepcopy(©
->groupindex
, memo
) ||
2543 !deepcopy(©
->indexgroup
, memo
) ||
2544 !deepcopy(©
->pattern
, memo
)) {
2550 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this pattern object");
2555 PyDoc_STRVAR(pattern_match_doc
,
2556 "match(string[, pos[, endpos]]) --> match object or None.\n\
2557 Matches zero or more characters at the beginning of the string");
2559 PyDoc_STRVAR(pattern_search_doc
,
2560 "search(string[, pos[, endpos]]) --> match object or None.\n\
2561 Scan through string looking for a match, and return a corresponding\n\
2562 MatchObject instance. Return None if no position in the string matches.");
2564 PyDoc_STRVAR(pattern_split_doc
,
2565 "split(string[, maxsplit = 0]) --> list.\n\
2566 Split string by the occurrences of pattern.");
2568 PyDoc_STRVAR(pattern_findall_doc
,
2569 "findall(string[, pos[, endpos]]) --> list.\n\
2570 Return a list of all non-overlapping matches of pattern in string.");
2572 PyDoc_STRVAR(pattern_finditer_doc
,
2573 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2574 Return an iterator over all non-overlapping matches for the \n\
2575 RE pattern in string. For each match, the iterator returns a\n\
2578 PyDoc_STRVAR(pattern_sub_doc
,
2579 "sub(repl, string[, count = 0]) --> newstring\n\
2580 Return the string obtained by replacing the leftmost non-overlapping\n\
2581 occurrences of pattern in string by the replacement repl.");
2583 PyDoc_STRVAR(pattern_subn_doc
,
2584 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2585 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2586 the leftmost non-overlapping occurrences of pattern with the\n\
2587 replacement repl.");
2589 PyDoc_STRVAR(pattern_doc
, "Compiled regular expression objects");
2591 static PyMethodDef pattern_methods
[] = {
2592 {"match", (PyCFunction
) pattern_match
, METH_VARARGS
|METH_KEYWORDS
,
2594 {"search", (PyCFunction
) pattern_search
, METH_VARARGS
|METH_KEYWORDS
,
2595 pattern_search_doc
},
2596 {"sub", (PyCFunction
) pattern_sub
, METH_VARARGS
|METH_KEYWORDS
,
2598 {"subn", (PyCFunction
) pattern_subn
, METH_VARARGS
|METH_KEYWORDS
,
2600 {"split", (PyCFunction
) pattern_split
, METH_VARARGS
|METH_KEYWORDS
,
2602 {"findall", (PyCFunction
) pattern_findall
, METH_VARARGS
|METH_KEYWORDS
,
2603 pattern_findall_doc
},
2604 #if PY_VERSION_HEX >= 0x02020000
2605 {"finditer", (PyCFunction
) pattern_finditer
, METH_VARARGS
,
2606 pattern_finditer_doc
},
2608 {"scanner", (PyCFunction
) pattern_scanner
, METH_VARARGS
},
2609 {"__copy__", (PyCFunction
) pattern_copy
, METH_NOARGS
},
2610 {"__deepcopy__", (PyCFunction
) pattern_deepcopy
, METH_O
},
2614 #define PAT_OFF(x) offsetof(PatternObject, x)
2615 static PyMemberDef pattern_members
[] = {
2616 {"pattern", T_OBJECT
, PAT_OFF(pattern
), READONLY
},
2617 {"flags", T_INT
, PAT_OFF(flags
), READONLY
},
2618 {"groups", T_PYSSIZET
, PAT_OFF(groups
), READONLY
},
2619 {"groupindex", T_OBJECT
, PAT_OFF(groupindex
), READONLY
},
2620 {NULL
} /* Sentinel */
2623 statichere PyTypeObject Pattern_Type
= {
2624 PyObject_HEAD_INIT(NULL
)
2625 0, "_" SRE_MODULE
".SRE_Pattern",
2626 sizeof(PatternObject
), sizeof(SRE_CODE
),
2627 (destructor
)pattern_dealloc
, /*tp_dealloc*/
2629 0, /* tp_getattrn */
2633 0, /* tp_as_number */
2634 0, /* tp_as_sequence */
2635 0, /* tp_as_mapping */
2639 0, /* tp_getattro */
2640 0, /* tp_setattro */
2641 0, /* tp_as_buffer */
2642 Py_TPFLAGS_DEFAULT
, /* tp_flags */
2643 pattern_doc
, /* tp_doc */
2644 0, /* tp_traverse */
2646 0, /* tp_richcompare */
2647 offsetof(PatternObject
, weakreflist
), /* tp_weaklistoffset */
2649 0, /* tp_iternext */
2650 pattern_methods
, /* tp_methods */
2651 pattern_members
, /* tp_members */
2654 static int _validate(PatternObject
*self
); /* Forward */
2657 _compile(PyObject
* self_
, PyObject
* args
)
2659 /* "compile" pattern descriptor to pattern object */
2661 PatternObject
* self
;
2667 Py_ssize_t groups
= 0;
2668 PyObject
* groupindex
= NULL
;
2669 PyObject
* indexgroup
= NULL
;
2670 if (!PyArg_ParseTuple(args
, "OiO!|nOO", &pattern
, &flags
,
2671 &PyList_Type
, &code
, &groups
,
2672 &groupindex
, &indexgroup
))
2675 n
= PyList_GET_SIZE(code
);
2676 /* coverity[ampersand_in_size] */
2677 self
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, n
);
2680 self
->weakreflist
= NULL
;
2681 self
->pattern
= NULL
;
2682 self
->groupindex
= NULL
;
2683 self
->indexgroup
= NULL
;
2687 for (i
= 0; i
< n
; i
++) {
2688 PyObject
*o
= PyList_GET_ITEM(code
, i
);
2689 unsigned long value
= PyInt_Check(o
) ? (unsigned long)PyInt_AsLong(o
)
2690 : PyLong_AsUnsignedLong(o
);
2691 self
->code
[i
] = (SRE_CODE
) value
;
2692 if ((unsigned long) self
->code
[i
] != value
) {
2693 PyErr_SetString(PyExc_OverflowError
,
2694 "regular expression code size limit exceeded");
2699 if (PyErr_Occurred()) {
2705 self
->pattern
= pattern
;
2707 self
->flags
= flags
;
2709 self
->groups
= groups
;
2711 Py_XINCREF(groupindex
);
2712 self
->groupindex
= groupindex
;
2714 Py_XINCREF(indexgroup
);
2715 self
->indexgroup
= indexgroup
;
2717 self
->weakreflist
= NULL
;
2719 if (!_validate(self
)) {
2724 return (PyObject
*) self
;
2727 /* -------------------------------------------------------------------- */
2728 /* Code validation */
2730 /* To learn more about this code, have a look at the _compile() function in
2731 Lib/sre_compile.py. The validation functions below checks the code array
2732 for conformance with the code patterns generated there.
2734 The nice thing about the generated code is that it is position-independent:
2735 all jumps are relative jumps forward. Also, jumps don't cross each other:
2736 the target of a later jump is always earlier than the target of an earlier
2737 jump. IOW, this is okay:
2739 J---------J-------T--------T
2741 \______________________/
2745 J---------J-------T--------T
2749 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2750 bytes wide (the latter if Python is compiled for "wide" unicode support).
2753 /* Defining this one enables tracing of the validator */
2756 /* Trace macro for the validator */
2757 #if defined(VVERBOSE)
2758 #define VTRACE(v) printf v
2763 /* Report failure */
2764 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2766 /* Extract opcode, argument, or skip count from code array */
2769 VTRACE(("%p: ", code)); \
2770 if (code >= end) FAIL; \
2772 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2776 VTRACE(("%p= ", code)); \
2777 if (code >= end) FAIL; \
2779 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2781 #define GET_SKIP_ADJ(adj) \
2783 VTRACE(("%p= ", code)); \
2784 if (code >= end) FAIL; \
2786 VTRACE(("%lu (skip to %p)\n", \
2787 (unsigned long)skip, code+skip)); \
2788 if (code+skip-adj < code || code+skip-adj > end)\
2792 #define GET_SKIP GET_SKIP_ADJ(0)
2795 _validate_charset(SRE_CODE
*code
, SRE_CODE
*end
)
2797 /* Some variables are manipulated by the macros above */
2803 while (code
< end
) {
2810 case SRE_OP_LITERAL
:
2819 case SRE_OP_CHARSET
:
2820 offset
= 32/sizeof(SRE_CODE
); /* 32-byte bitmap */
2821 if (code
+offset
< code
|| code
+offset
> end
)
2826 case SRE_OP_BIGCHARSET
:
2827 GET_ARG
; /* Number of blocks */
2828 offset
= 256/sizeof(SRE_CODE
); /* 256-byte table */
2829 if (code
+offset
< code
|| code
+offset
> end
)
2831 /* Make sure that each byte points to a valid block */
2832 for (i
= 0; i
< 256; i
++) {
2833 if (((unsigned char *)code
)[i
] >= arg
)
2837 offset
= arg
* 32/sizeof(SRE_CODE
); /* 32-byte bitmap times arg */
2838 if (code
+offset
< code
|| code
+offset
> end
)
2843 case SRE_OP_CATEGORY
:
2846 case SRE_CATEGORY_DIGIT
:
2847 case SRE_CATEGORY_NOT_DIGIT
:
2848 case SRE_CATEGORY_SPACE
:
2849 case SRE_CATEGORY_NOT_SPACE
:
2850 case SRE_CATEGORY_WORD
:
2851 case SRE_CATEGORY_NOT_WORD
:
2852 case SRE_CATEGORY_LINEBREAK
:
2853 case SRE_CATEGORY_NOT_LINEBREAK
:
2854 case SRE_CATEGORY_LOC_WORD
:
2855 case SRE_CATEGORY_LOC_NOT_WORD
:
2856 case SRE_CATEGORY_UNI_DIGIT
:
2857 case SRE_CATEGORY_UNI_NOT_DIGIT
:
2858 case SRE_CATEGORY_UNI_SPACE
:
2859 case SRE_CATEGORY_UNI_NOT_SPACE
:
2860 case SRE_CATEGORY_UNI_WORD
:
2861 case SRE_CATEGORY_UNI_NOT_WORD
:
2862 case SRE_CATEGORY_UNI_LINEBREAK
:
2863 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
2880 _validate_inner(SRE_CODE
*code
, SRE_CODE
*end
, Py_ssize_t groups
)
2882 /* Some variables are manipulated by the macros above */
2887 VTRACE(("code=%p, end=%p\n", code
, end
));
2892 while (code
< end
) {
2897 /* We don't check whether marks are properly nested; the
2898 sre_match() code is robust even if they don't, and the worst
2899 you can get is nonsensical match results. */
2901 if (arg
> 2*groups
+1) {
2902 VTRACE(("arg=%d, groups=%d\n", (int)arg
, (int)groups
));
2907 case SRE_OP_LITERAL
:
2908 case SRE_OP_NOT_LITERAL
:
2909 case SRE_OP_LITERAL_IGNORE
:
2910 case SRE_OP_NOT_LITERAL_IGNORE
:
2912 /* The arg is just a character, nothing to check */
2915 case SRE_OP_SUCCESS
:
2916 case SRE_OP_FAILURE
:
2917 /* Nothing to check; these normally end the matching process */
2923 case SRE_AT_BEGINNING
:
2924 case SRE_AT_BEGINNING_STRING
:
2925 case SRE_AT_BEGINNING_LINE
:
2927 case SRE_AT_END_LINE
:
2928 case SRE_AT_END_STRING
:
2929 case SRE_AT_BOUNDARY
:
2930 case SRE_AT_NON_BOUNDARY
:
2931 case SRE_AT_LOC_BOUNDARY
:
2932 case SRE_AT_LOC_NON_BOUNDARY
:
2933 case SRE_AT_UNI_BOUNDARY
:
2934 case SRE_AT_UNI_NON_BOUNDARY
:
2942 case SRE_OP_ANY_ALL
:
2943 /* These have no operands */
2947 case SRE_OP_IN_IGNORE
:
2949 /* Stop 1 before the end; we check the FAILURE below */
2950 if (!_validate_charset(code
, code
+skip
-2))
2952 if (code
[skip
-2] != SRE_OP_FAILURE
)
2959 /* A minimal info field is
2960 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2961 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2966 newcode
= code
+skip
-1;
2967 GET_ARG
; flags
= arg
;
2970 /* Check that only valid flags are present */
2971 if ((flags
& ~(SRE_INFO_PREFIX
|
2973 SRE_INFO_CHARSET
)) != 0)
2975 /* PREFIX and CHARSET are mutually exclusive */
2976 if ((flags
& SRE_INFO_PREFIX
) &&
2977 (flags
& SRE_INFO_CHARSET
))
2979 /* LITERAL implies PREFIX */
2980 if ((flags
& SRE_INFO_LITERAL
) &&
2981 !(flags
& SRE_INFO_PREFIX
))
2983 /* Validate the prefix */
2984 if (flags
& SRE_INFO_PREFIX
) {
2985 SRE_CODE prefix_len
;
2986 GET_ARG
; prefix_len
= arg
;
2987 GET_ARG
; /* prefix skip */
2988 /* Here comes the prefix string */
2989 if (code
+prefix_len
< code
|| code
+prefix_len
> newcode
)
2992 /* And here comes the overlap table */
2993 if (code
+prefix_len
< code
|| code
+prefix_len
> newcode
)
2995 /* Each overlap value should be < prefix_len */
2996 for (i
= 0; i
< prefix_len
; i
++) {
2997 if (code
[i
] >= prefix_len
)
3002 /* Validate the charset */
3003 if (flags
& SRE_INFO_CHARSET
) {
3004 if (!_validate_charset(code
, newcode
-1))
3006 if (newcode
[-1] != SRE_OP_FAILURE
)
3010 else if (code
!= newcode
) {
3011 VTRACE(("code=%p, newcode=%p\n", code
, newcode
));
3019 SRE_CODE
*target
= NULL
;
3024 /* Stop 2 before the end; we check the JUMP below */
3025 if (!_validate_inner(code
, code
+skip
-3, groups
))
3028 /* Check that it ends with a JUMP, and that each JUMP
3029 has the same target */
3031 if (op
!= SRE_OP_JUMP
)
3035 target
= code
+skip
-1;
3036 else if (code
+skip
-1 != target
)
3042 case SRE_OP_REPEAT_ONE
:
3043 case SRE_OP_MIN_REPEAT_ONE
:
3051 #ifdef Py_UNICODE_WIDE
3055 if (!_validate_inner(code
, code
+skip
-4, groups
))
3059 if (op
!= SRE_OP_SUCCESS
)
3072 #ifdef Py_UNICODE_WIDE
3076 if (!_validate_inner(code
, code
+skip
-3, groups
))
3080 if (op
!= SRE_OP_MAX_UNTIL
&& op
!= SRE_OP_MIN_UNTIL
)
3085 case SRE_OP_GROUPREF
:
3086 case SRE_OP_GROUPREF_IGNORE
:
3092 case SRE_OP_GROUPREF_EXISTS
:
3093 /* The regex syntax for this is: '(?(group)then|else)', where
3094 'group' is either an integer group number or a group name,
3095 'then' and 'else' are sub-regexes, and 'else' is optional. */
3100 code
--; /* The skip is relative to the first arg! */
3101 /* There are two possibilities here: if there is both a 'then'
3102 part and an 'else' part, the generated code looks like:
3110 (<skipyes> jumps here)
3112 (<skipno> jumps here)
3114 If there is only a 'then' part, it looks like:
3122 There is no direct way to decide which it is, and we don't want
3123 to allow arbitrary jumps anywhere in the code; so we just look
3124 for a JUMP opcode preceding our skip target.
3126 if (skip
>= 3 && code
+skip
-3 >= code
&&
3127 code
[skip
-3] == SRE_OP_JUMP
)
3129 VTRACE(("both then and else parts present\n"));
3130 if (!_validate_inner(code
+1, code
+skip
-3, groups
))
3132 code
+= skip
-2; /* Position after JUMP, at <skipno> */
3134 if (!_validate_inner(code
, code
+skip
-1, groups
))
3139 VTRACE(("only a then part present\n"));
3140 if (!_validate_inner(code
+1, code
+skip
-1, groups
))
3147 case SRE_OP_ASSERT_NOT
:
3149 GET_ARG
; /* 0 for lookahead, width for lookbehind */
3150 code
--; /* Back up over arg to simplify math below */
3151 if (arg
& 0x80000000)
3152 FAIL
; /* Width too large */
3153 /* Stop 1 before the end; we check the SUCCESS below */
3154 if (!_validate_inner(code
+1, code
+skip
-2, groups
))
3158 if (op
!= SRE_OP_SUCCESS
)
3173 _validate_outer(SRE_CODE
*code
, SRE_CODE
*end
, Py_ssize_t groups
)
3175 if (groups
< 0 || groups
> 100 || code
>= end
|| end
[-1] != SRE_OP_SUCCESS
)
3177 if (groups
== 0) /* fix for simplejson */
3178 groups
= 100; /* 100 groups should always be safe */
3179 return _validate_inner(code
, end
-1, groups
);
3183 _validate(PatternObject
*self
)
3185 if (!_validate_outer(self
->code
, self
->code
+self
->codesize
, self
->groups
))
3187 PyErr_SetString(PyExc_RuntimeError
, "invalid SRE code");
3191 VTRACE(("Success!\n"));
3195 /* -------------------------------------------------------------------- */
3199 match_dealloc(MatchObject
* self
)
3201 Py_XDECREF(self
->regs
);
3202 Py_XDECREF(self
->string
);
3203 Py_DECREF(self
->pattern
);
3208 match_getslice_by_index(MatchObject
* self
, Py_ssize_t index
, PyObject
* def
)
3210 if (index
< 0 || index
>= self
->groups
) {
3211 /* raise IndexError if we were given a bad group number */
3221 if (self
->string
== Py_None
|| self
->mark
[index
] < 0) {
3222 /* return default value if the string or group is undefined */
3227 return PySequence_GetSlice(
3228 self
->string
, self
->mark
[index
], self
->mark
[index
+1]
3233 match_getindex(MatchObject
* self
, PyObject
* index
)
3237 if (PyInt_Check(index
))
3238 return PyInt_AsSsize_t(index
);
3242 if (self
->pattern
->groupindex
) {
3243 index
= PyObject_GetItem(self
->pattern
->groupindex
, index
);
3245 if (PyInt_Check(index
) || PyLong_Check(index
))
3246 i
= PyInt_AsSsize_t(index
);
3256 match_getslice(MatchObject
* self
, PyObject
* index
, PyObject
* def
)
3258 return match_getslice_by_index(self
, match_getindex(self
, index
), def
);
3262 match_expand(MatchObject
* self
, PyObject
* ptemplate
)
3264 /* delegate to Python code */
3266 SRE_PY_MODULE
, "_expand",
3267 PyTuple_Pack(3, self
->pattern
, self
, ptemplate
)
3272 match_group(MatchObject
* self
, PyObject
* args
)
3277 size
= PyTuple_GET_SIZE(args
);
3281 result
= match_getslice(self
, Py_False
, Py_None
);
3284 result
= match_getslice(self
, PyTuple_GET_ITEM(args
, 0), Py_None
);
3287 /* fetch multiple items */
3288 result
= PyTuple_New(size
);
3291 for (i
= 0; i
< size
; i
++) {
3292 PyObject
* item
= match_getslice(
3293 self
, PyTuple_GET_ITEM(args
, i
), Py_None
3299 PyTuple_SET_ITEM(result
, i
, item
);
3307 match_groups(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
3312 PyObject
* def
= Py_None
;
3313 static char* kwlist
[] = { "default", NULL
};
3314 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groups", kwlist
, &def
))
3317 result
= PyTuple_New(self
->groups
-1);
3321 for (index
= 1; index
< self
->groups
; index
++) {
3323 item
= match_getslice_by_index(self
, index
, def
);
3328 PyTuple_SET_ITEM(result
, index
-1, item
);
3335 match_groupdict(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
3341 PyObject
* def
= Py_None
;
3342 static char* kwlist
[] = { "default", NULL
};
3343 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groupdict", kwlist
, &def
))
3346 result
= PyDict_New();
3347 if (!result
|| !self
->pattern
->groupindex
)
3350 keys
= PyMapping_Keys(self
->pattern
->groupindex
);
3354 for (index
= 0; index
< PyList_GET_SIZE(keys
); index
++) {
3358 key
= PyList_GET_ITEM(keys
, index
);
3361 value
= match_getslice(self
, key
, def
);
3366 status
= PyDict_SetItem(result
, key
, value
);
3383 match_start(MatchObject
* self
, PyObject
* args
)
3387 PyObject
* index_
= Py_False
; /* zero */
3388 if (!PyArg_UnpackTuple(args
, "start", 0, 1, &index_
))
3391 index
= match_getindex(self
, index_
);
3393 if (index
< 0 || index
>= self
->groups
) {
3401 /* mark is -1 if group is undefined */
3402 return Py_BuildValue("i", self
->mark
[index
*2]);
3406 match_end(MatchObject
* self
, PyObject
* args
)
3410 PyObject
* index_
= Py_False
; /* zero */
3411 if (!PyArg_UnpackTuple(args
, "end", 0, 1, &index_
))
3414 index
= match_getindex(self
, index_
);
3416 if (index
< 0 || index
>= self
->groups
) {
3424 /* mark is -1 if group is undefined */
3425 return Py_BuildValue("i", self
->mark
[index
*2+1]);
3429 _pair(Py_ssize_t i1
, Py_ssize_t i2
)
3434 pair
= PyTuple_New(2);
3438 item
= PyInt_FromSsize_t(i1
);
3441 PyTuple_SET_ITEM(pair
, 0, item
);
3443 item
= PyInt_FromSsize_t(i2
);
3446 PyTuple_SET_ITEM(pair
, 1, item
);
3456 match_span(MatchObject
* self
, PyObject
* args
)
3460 PyObject
* index_
= Py_False
; /* zero */
3461 if (!PyArg_UnpackTuple(args
, "span", 0, 1, &index_
))
3464 index
= match_getindex(self
, index_
);
3466 if (index
< 0 || index
>= self
->groups
) {
3474 /* marks are -1 if group is undefined */
3475 return _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3479 match_regs(MatchObject
* self
)
3485 regs
= PyTuple_New(self
->groups
);
3489 for (index
= 0; index
< self
->groups
; index
++) {
3490 item
= _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3495 PyTuple_SET_ITEM(regs
, index
, item
);
3505 match_copy(MatchObject
* self
, PyObject
*unused
)
3507 #ifdef USE_BUILTIN_COPY
3509 Py_ssize_t slots
, offset
;
3511 slots
= 2 * (self
->pattern
->groups
+1);
3513 copy
= PyObject_NEW_VAR(MatchObject
, &Match_Type
, slots
);
3517 /* this value a constant, but any compiler should be able to
3518 figure that out all by itself */
3519 offset
= offsetof(MatchObject
, string
);
3521 Py_XINCREF(self
->pattern
);
3522 Py_XINCREF(self
->string
);
3523 Py_XINCREF(self
->regs
);
3525 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
3526 sizeof(MatchObject
) + slots
* sizeof(Py_ssize_t
) - offset
);
3528 return (PyObject
*) copy
;
3530 PyErr_SetString(PyExc_TypeError
, "cannot copy this match object");
3536 match_deepcopy(MatchObject
* self
, PyObject
* memo
)
3538 #ifdef USE_BUILTIN_COPY
3541 copy
= (MatchObject
*) match_copy(self
);
3545 if (!deepcopy((PyObject
**) ©
->pattern
, memo
) ||
3546 !deepcopy(©
->string
, memo
) ||
3547 !deepcopy(©
->regs
, memo
)) {
3553 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this match object");
3558 static struct PyMethodDef match_methods
[] = {
3559 {"group", (PyCFunction
) match_group
, METH_VARARGS
},
3560 {"start", (PyCFunction
) match_start
, METH_VARARGS
},
3561 {"end", (PyCFunction
) match_end
, METH_VARARGS
},
3562 {"span", (PyCFunction
) match_span
, METH_VARARGS
},
3563 {"groups", (PyCFunction
) match_groups
, METH_VARARGS
|METH_KEYWORDS
},
3564 {"groupdict", (PyCFunction
) match_groupdict
, METH_VARARGS
|METH_KEYWORDS
},
3565 {"expand", (PyCFunction
) match_expand
, METH_O
},
3566 {"__copy__", (PyCFunction
) match_copy
, METH_NOARGS
},
3567 {"__deepcopy__", (PyCFunction
) match_deepcopy
, METH_O
},
3572 match_lastindex_get(MatchObject
*self
)
3574 if (self
->lastindex
>= 0)
3575 return Py_BuildValue("i", self
->lastindex
);
3581 match_lastgroup_get(MatchObject
*self
)
3583 if (self
->pattern
->indexgroup
&& self
->lastindex
>= 0) {
3584 PyObject
* result
= PySequence_GetItem(
3585 self
->pattern
->indexgroup
, self
->lastindex
3596 match_regs_get(MatchObject
*self
)
3599 Py_INCREF(self
->regs
);
3602 return match_regs(self
);
3605 static PyGetSetDef match_getset
[] = {
3606 {"lastindex", (getter
)match_lastindex_get
, (setter
)NULL
},
3607 {"lastgroup", (getter
)match_lastgroup_get
, (setter
)NULL
},
3608 {"regs", (getter
)match_regs_get
, (setter
)NULL
},
3612 #define MATCH_OFF(x) offsetof(MatchObject, x)
3613 static PyMemberDef match_members
[] = {
3614 {"string", T_OBJECT
, MATCH_OFF(string
), READONLY
},
3615 {"re", T_OBJECT
, MATCH_OFF(pattern
), READONLY
},
3616 {"pos", T_PYSSIZET
, MATCH_OFF(pos
), READONLY
},
3617 {"endpos", T_PYSSIZET
, MATCH_OFF(endpos
), READONLY
},
3622 /* FIXME: implement setattr("string", None) as a special case (to
3623 detach the associated string, if any */
3625 static PyTypeObject Match_Type
= {
3626 PyVarObject_HEAD_INIT(NULL
, 0)
3627 "_" SRE_MODULE
".SRE_Match",
3628 sizeof(MatchObject
), sizeof(Py_ssize_t
),
3629 (destructor
)match_dealloc
, /* tp_dealloc */
3635 0, /* tp_as_number */
3636 0, /* tp_as_sequence */
3637 0, /* tp_as_mapping */
3641 0, /* tp_getattro */
3642 0, /* tp_setattro */
3643 0, /* tp_as_buffer */
3646 0, /* tp_traverse */
3648 0, /* tp_richcompare */
3649 0, /* tp_weaklistoffset */
3651 0, /* tp_iternext */
3652 match_methods
, /* tp_methods */
3653 match_members
, /* tp_members */
3654 match_getset
, /* tp_getset */
3658 pattern_new_match(PatternObject
* pattern
, SRE_STATE
* state
, int status
)
3660 /* create match object (from state object) */
3669 /* create match object (with room for extra group marks) */
3670 /* coverity[ampersand_in_size] */
3671 match
= PyObject_NEW_VAR(MatchObject
, &Match_Type
,
3672 2*(pattern
->groups
+1));
3677 match
->pattern
= pattern
;
3679 Py_INCREF(state
->string
);
3680 match
->string
= state
->string
;
3683 match
->groups
= pattern
->groups
+1;
3685 /* fill in group slices */
3687 base
= (char*) state
->beginning
;
3688 n
= state
->charsize
;
3690 match
->mark
[0] = ((char*) state
->start
- base
) / n
;
3691 match
->mark
[1] = ((char*) state
->ptr
- base
) / n
;
3693 for (i
= j
= 0; i
< pattern
->groups
; i
++, j
+=2)
3694 if (j
+1 <= state
->lastmark
&& state
->mark
[j
] && state
->mark
[j
+1]) {
3695 match
->mark
[j
+2] = ((char*) state
->mark
[j
] - base
) / n
;
3696 match
->mark
[j
+3] = ((char*) state
->mark
[j
+1] - base
) / n
;
3698 match
->mark
[j
+2] = match
->mark
[j
+3] = -1; /* undefined */
3700 match
->pos
= state
->pos
;
3701 match
->endpos
= state
->endpos
;
3703 match
->lastindex
= state
->lastindex
;
3705 return (PyObject
*) match
;
3707 } else if (status
== 0) {
3715 /* internal error */
3716 pattern_error(status
);
3721 /* -------------------------------------------------------------------- */
3722 /* scanner methods (experimental) */
3725 scanner_dealloc(ScannerObject
* self
)
3727 state_fini(&self
->state
);
3728 Py_XDECREF(self
->pattern
);
3733 scanner_match(ScannerObject
* self
, PyObject
*unused
)
3735 SRE_STATE
* state
= &self
->state
;
3741 state
->ptr
= state
->start
;
3743 if (state
->charsize
== 1) {
3744 status
= sre_match(state
, PatternObject_GetCode(self
->pattern
));
3746 #if defined(HAVE_UNICODE)
3747 status
= sre_umatch(state
, PatternObject_GetCode(self
->pattern
));
3750 if (PyErr_Occurred())
3753 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3756 if (status
== 0 || state
->ptr
== state
->start
)
3757 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3759 state
->start
= state
->ptr
;
3766 scanner_search(ScannerObject
* self
, PyObject
*unused
)
3768 SRE_STATE
* state
= &self
->state
;
3774 state
->ptr
= state
->start
;
3776 if (state
->charsize
== 1) {
3777 status
= sre_search(state
, PatternObject_GetCode(self
->pattern
));
3779 #if defined(HAVE_UNICODE)
3780 status
= sre_usearch(state
, PatternObject_GetCode(self
->pattern
));
3783 if (PyErr_Occurred())
3786 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3789 if (status
== 0 || state
->ptr
== state
->start
)
3790 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3792 state
->start
= state
->ptr
;
3797 static PyMethodDef scanner_methods
[] = {
3798 {"match", (PyCFunction
) scanner_match
, METH_NOARGS
},
3799 {"search", (PyCFunction
) scanner_search
, METH_NOARGS
},
3803 #define SCAN_OFF(x) offsetof(ScannerObject, x)
3804 static PyMemberDef scanner_members
[] = {
3805 {"pattern", T_OBJECT
, SCAN_OFF(pattern
), READONLY
},
3806 {NULL
} /* Sentinel */
3809 statichere PyTypeObject Scanner_Type
= {
3810 PyObject_HEAD_INIT(NULL
)
3811 0, "_" SRE_MODULE
".SRE_Scanner",
3812 sizeof(ScannerObject
), 0,
3813 (destructor
)scanner_dealloc
, /*tp_dealloc*/
3817 0, /* tp_reserved */
3819 0, /* tp_as_number */
3820 0, /* tp_as_sequence */
3821 0, /* tp_as_mapping */
3825 0, /* tp_getattro */
3826 0, /* tp_setattro */
3827 0, /* tp_as_buffer */
3828 Py_TPFLAGS_DEFAULT
, /* tp_flags */
3830 0, /* tp_traverse */
3832 0, /* tp_richcompare */
3833 0, /* tp_weaklistoffset */
3835 0, /* tp_iternext */
3836 scanner_methods
, /* tp_methods */
3837 scanner_members
, /* tp_members */
3842 pattern_scanner(PatternObject
* pattern
, PyObject
* args
)
3844 /* create search state object */
3846 ScannerObject
* self
;
3849 Py_ssize_t start
= 0;
3850 Py_ssize_t end
= PY_SSIZE_T_MAX
;
3851 if (!PyArg_ParseTuple(args
, "O|nn:scanner", &string
, &start
, &end
))
3854 /* create scanner object */
3855 self
= PyObject_NEW(ScannerObject
, &Scanner_Type
);
3858 self
->pattern
= NULL
;
3860 string
= state_init(&self
->state
, pattern
, string
, start
, end
);
3867 self
->pattern
= (PyObject
*) pattern
;
3869 return (PyObject
*) self
;
3872 static PyMethodDef _functions
[] = {
3873 {"compile", _compile
, METH_VARARGS
},
3874 {"getcodesize", sre_codesize
, METH_NOARGS
},
3875 {"getlower", sre_getlower
, METH_VARARGS
},
3879 #if PY_VERSION_HEX < 0x02030000
3880 DL_EXPORT(void) init_sre(void)
3882 PyMODINIT_FUNC
init_sre(void)
3889 /* Patch object types */
3890 if (PyType_Ready(&Pattern_Type
) || PyType_Ready(&Match_Type
) ||
3891 PyType_Ready(&Scanner_Type
))
3894 m
= Py_InitModule("_" SRE_MODULE
, _functions
);
3897 d
= PyModule_GetDict(m
);
3899 x
= PyInt_FromLong(SRE_MAGIC
);
3901 PyDict_SetItemString(d
, "MAGIC", x
);
3905 x
= PyInt_FromLong(sizeof(SRE_CODE
));
3907 PyDict_SetItemString(d
, "CODESIZE", x
);
3911 x
= PyString_FromString(copyright
);
3913 PyDict_SetItemString(d
, "copyright", x
);
3918 #endif /* !defined(SRE_RECURSIVE) */