1 /**********************************************************************
2 regcomp.c - Oniguruma (regular expression library)
3 **********************************************************************/
5 * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 OnigCaseFoldType OnigDefaultCaseFoldFlag
= ONIGENC_CASE_FOLD_MIN
;
42 make_int_stack(int_stack
** rs
, int init_size
)
49 s
= xmalloc(sizeof(*s
));
50 if (IS_NULL(s
)) return ONIGERR_MEMORY
;
52 v
= (int* )xmalloc(sizeof(int) * init_size
);
55 return ONIGERR_MEMORY
;
67 free_int_stack(int_stack
* s
)
70 if (IS_NOT_NULL(s
->v
))
77 int_stack_push(int_stack
* s
, int v
)
79 if (s
->n
>= s
->alloc
) {
80 int new_size
= s
->alloc
* 2;
81 int* nv
= (int* )xrealloc(s
->v
, sizeof(int) * new_size
);
82 if (IS_NULL(nv
)) return ONIGERR_MEMORY
;
94 int_stack_pop(int_stack
* s
)
100 fprintf(stderr
, "int_stack_pop: fail empty. %p\n", s
);
111 extern OnigCaseFoldType
112 onig_get_default_case_fold_flag(void)
114 return OnigDefaultCaseFoldFlag
;
118 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag
)
120 OnigDefaultCaseFoldFlag
= case_fold_flag
;
125 int_multiply_cmp(int x
, int y
, int v
)
127 if (x
== 0 || y
== 0) return -1;
129 if (x
< INT_MAX
/ y
) {
131 if (xy
> v
) return 1;
133 if (xy
== v
) return 0;
142 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
143 static unsigned char PadBuf
[WORD_ALIGNMENT_SIZE
];
147 swap_node(Node
* a
, Node
* b
)
151 c
= *a
; *a
= *b
; *b
= c
;
153 if (NODE_TYPE(a
) == NODE_STRING
) {
154 StrNode
* sn
= STR_(a
);
156 int len
= (int )(sn
->end
- sn
->s
);
158 sn
->end
= sn
->s
+ len
;
162 if (NODE_TYPE(b
) == NODE_STRING
) {
163 StrNode
* sn
= STR_(b
);
165 int len
= (int )(sn
->end
- sn
->s
);
167 sn
->end
= sn
->s
+ len
;
173 distance_add(OnigLen d1
, OnigLen d2
)
175 if (d1
== INFINITE_LEN
|| d2
== INFINITE_LEN
)
178 if (d1
<= INFINITE_LEN
- d2
) return d1
+ d2
;
179 else return INFINITE_LEN
;
184 distance_multiply(OnigLen d
, int m
)
186 if (m
== 0) return 0;
188 if (d
< INFINITE_LEN
/ m
)
195 bitset_is_empty(BitSetRef bs
)
199 for (i
= 0; i
< (int )BITSET_SIZE
; i
++) {
200 if (bs
[i
] != 0) return 0;
206 onig_bbuf_init(BBuf
* buf
, int size
)
213 buf
->p
= (UChar
* )xmalloc(size
);
214 if (IS_NULL(buf
->p
)) return(ONIGERR_MEMORY
);
226 unset_addr_list_init(UnsetAddrList
* list
, int size
)
228 UnsetAddr
* p
= (UnsetAddr
* )xmalloc(sizeof(UnsetAddr
)* size
);
229 CHECK_NULL_RETURN_MEMERR(p
);
238 unset_addr_list_end(UnsetAddrList
* list
)
240 if (IS_NOT_NULL(list
->us
))
245 unset_addr_list_add(UnsetAddrList
* list
, int offset
, struct _Node
* node
)
250 if (list
->num
>= list
->alloc
) {
251 size
= list
->alloc
* 2;
252 p
= (UnsetAddr
* )xrealloc(list
->us
, sizeof(UnsetAddr
) * size
, sizeof(UnsetAddr
)* list
->alloc
);
253 CHECK_NULL_RETURN_MEMERR(p
);
258 list
->us
[list
->num
].offset
= offset
;
259 list
->us
[list
->num
].target
= node
;
263 #endif /* USE_CALL */
267 add_opcode(regex_t
* reg
, int opcode
)
269 BB_ADD1(reg
, opcode
);
274 add_rel_addr(regex_t
* reg
, int addr
)
276 RelAddrType ra
= (RelAddrType
)addr
;
278 BB_ADD(reg
, &ra
, SIZE_RELADDR
);
283 add_abs_addr(regex_t
* reg
, int addr
)
285 AbsAddrType ra
= (AbsAddrType
)addr
;
287 BB_ADD(reg
, &ra
, SIZE_ABSADDR
);
292 add_length(regex_t
* reg
, int len
)
294 LengthType l
= (LengthType
)len
;
296 BB_ADD(reg
, &l
, SIZE_LENGTH
);
301 add_mem_num(regex_t
* reg
, int num
)
303 MemNumType n
= (MemNumType
)num
;
305 BB_ADD(reg
, &n
, SIZE_MEMNUM
);
311 add_pointer(regex_t
* reg
, void* addr
)
313 PointerType ptr
= (PointerType
)addr
;
315 BB_ADD(reg
, &ptr
, SIZE_POINTER
);
321 add_option(regex_t
* reg
, OnigOptionType option
)
323 BB_ADD(reg
, &option
, SIZE_OPTION
);
328 add_save_type(regex_t
* reg
, enum SaveType type
)
330 SaveType t
= (SaveType
)type
;
332 BB_ADD(reg
, &t
, SIZE_SAVE_TYPE
);
337 add_update_var_type(regex_t
* reg
, enum UpdateVarType type
)
339 UpdateVarType t
= (UpdateVarType
)type
;
341 BB_ADD(reg
, &t
, SIZE_UPDATE_VAR_TYPE
);
346 add_mode(regex_t
* reg
, ModeType mode
)
348 BB_ADD(reg
, &mode
, SIZE_MODE
);
353 add_opcode_rel_addr(regex_t
* reg
, int opcode
, int addr
)
357 r
= add_opcode(reg
, opcode
);
358 if (r
!= 0) return r
;
359 r
= add_rel_addr(reg
, addr
);
364 add_bytes(regex_t
* reg
, UChar
* bytes
, int len
)
366 BB_ADD(reg
, bytes
, len
);
371 add_bitset(regex_t
* reg
, BitSetRef bs
)
373 BB_ADD(reg
, bs
, SIZE_BITSET
);
377 static int compile_length_tree(Node
* node
, regex_t
* reg
);
378 static int compile_tree(Node
* node
, regex_t
* reg
, ScanEnv
* env
);
381 #define IS_NEED_STR_LEN_OP_EXACT(op) \
382 ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
383 (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
386 select_str_opcode(int mb_len
, int str_len
, int ignore_case
)
392 case 1: op
= OP_EXACT1_IC
; break;
393 default: op
= OP_EXACTN_IC
; break;
400 case 1: op
= OP_EXACT1
; break;
401 case 2: op
= OP_EXACT2
; break;
402 case 3: op
= OP_EXACT3
; break;
403 case 4: op
= OP_EXACT4
; break;
404 case 5: op
= OP_EXACT5
; break;
405 default: op
= OP_EXACTN
; break;
411 case 1: op
= OP_EXACTMB2N1
; break;
412 case 2: op
= OP_EXACTMB2N2
; break;
413 case 3: op
= OP_EXACTMB2N3
; break;
414 default: op
= OP_EXACTMB2N
; break;
431 compile_tree_empty_check(Node
* node
, regex_t
* reg
, int empty_info
, ScanEnv
* env
)
434 int saved_num_null_check
= reg
->num_null_check
;
436 if (empty_info
!= QUANT_BODY_IS_NOT_EMPTY
) {
437 r
= add_opcode(reg
, OP_EMPTY_CHECK_START
);
438 if (r
!= 0) return r
;
439 r
= add_mem_num(reg
, reg
->num_null_check
); /* NULL CHECK ID */
440 if (r
!= 0) return r
;
441 reg
->num_null_check
++;
444 r
= compile_tree(node
, reg
, env
);
445 if (r
!= 0) return r
;
447 if (empty_info
!= QUANT_BODY_IS_NOT_EMPTY
) {
448 if (empty_info
== QUANT_BODY_IS_EMPTY
)
449 r
= add_opcode(reg
, OP_EMPTY_CHECK_END
);
450 else if (empty_info
== QUANT_BODY_IS_EMPTY_MEM
)
451 r
= add_opcode(reg
, OP_EMPTY_CHECK_END_MEMST
);
452 else if (empty_info
== QUANT_BODY_IS_EMPTY_REC
)
453 r
= add_opcode(reg
, OP_EMPTY_CHECK_END_MEMST_PUSH
);
455 if (r
!= 0) return r
;
456 r
= add_mem_num(reg
, saved_num_null_check
); /* NULL CHECK ID */
463 compile_call(CallNode
* node
, regex_t
* reg
, ScanEnv
* env
)
467 r
= add_opcode(reg
, OP_CALL
);
468 if (r
!= 0) return r
;
469 r
= unset_addr_list_add(env
->unset_addr_list
, BB_GET_OFFSET_POS(reg
),
470 NODE_CALL_BODY(node
));
471 if (r
!= 0) return r
;
472 r
= add_abs_addr(reg
, 0 /*dummy addr.*/);
478 compile_tree_n_times(Node
* node
, int n
, regex_t
* reg
, ScanEnv
* env
)
482 for (i
= 0; i
< n
; i
++) {
483 r
= compile_tree(node
, reg
, env
);
484 if (r
!= 0) return r
;
490 add_compile_string_length(UChar
* s ARG_UNUSED
, int mb_len
, int str_len
,
491 regex_t
* reg ARG_UNUSED
, int ignore_case
)
494 int op
= select_str_opcode(mb_len
, str_len
, ignore_case
);
498 if (op
== OP_EXACTMBN
) len
+= SIZE_LENGTH
;
499 if (IS_NEED_STR_LEN_OP_EXACT(op
))
502 len
+= mb_len
* str_len
;
507 add_compile_string(UChar
* s
, int mb_len
, int str_len
,
508 regex_t
* reg
, int ignore_case
)
510 int op
= select_str_opcode(mb_len
, str_len
, ignore_case
);
513 if (op
== OP_EXACTMBN
)
514 add_length(reg
, mb_len
);
516 if (IS_NEED_STR_LEN_OP_EXACT(op
)) {
517 if (op
== OP_EXACTN_IC
)
518 add_length(reg
, mb_len
* str_len
);
520 add_length(reg
, str_len
);
523 add_bytes(reg
, s
, mb_len
* str_len
);
529 compile_length_string_node(Node
* node
, regex_t
* reg
)
531 int rlen
, r
, len
, prev_len
, slen
, ambig
;
534 OnigEncoding enc
= reg
->enc
;
537 if (sn
->end
<= sn
->s
)
540 ambig
= NODE_STRING_IS_AMBIG(node
);
543 prev_len
= enclen(enc
, p
);
548 for (; p
< sn
->end
; ) {
549 len
= enclen(enc
, p
);
550 if (len
== prev_len
) {
554 r
= add_compile_string_length(prev
, prev_len
, slen
, reg
, ambig
);
563 r
= add_compile_string_length(prev
, prev_len
, slen
, reg
, ambig
);
569 compile_length_string_raw_node(StrNode
* sn
, regex_t
* reg
)
571 if (sn
->end
<= sn
->s
)
574 return add_compile_string_length(sn
->s
, 1 /* sb */, (int )(sn
->end
- sn
->s
),
579 compile_string_node(Node
* node
, regex_t
* reg
)
581 int r
, len
, prev_len
, slen
, ambig
;
582 UChar
*p
, *prev
, *end
;
584 OnigEncoding enc
= reg
->enc
;
587 if (sn
->end
<= sn
->s
)
591 ambig
= NODE_STRING_IS_AMBIG(node
);
594 prev_len
= enclen(enc
, p
);
599 len
= enclen(enc
, p
);
600 if (len
== prev_len
) {
604 r
= add_compile_string(prev
, prev_len
, slen
, reg
, ambig
);
605 if (r
!= 0) return r
;
615 return add_compile_string(prev
, prev_len
, slen
, reg
, ambig
);
619 compile_string_raw_node(StrNode
* sn
, regex_t
* reg
)
621 if (sn
->end
<= sn
->s
)
624 return add_compile_string(sn
->s
, 1 /* sb */, (int )(sn
->end
- sn
->s
), reg
, 0);
628 add_multi_byte_cclass(BBuf
* mbuf
, regex_t
* reg
)
630 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
631 add_length(reg
, mbuf
->used
);
632 return add_bytes(reg
, mbuf
->p
, mbuf
->used
);
635 UChar
* p
= BB_GET_ADD_ADDRESS(reg
) + SIZE_LENGTH
;
637 GET_ALIGNMENT_PAD_SIZE(p
, pad_size
);
638 add_length(reg
, mbuf
->used
+ (WORD_ALIGNMENT_SIZE
- 1));
639 if (pad_size
!= 0) add_bytes(reg
, PadBuf
, pad_size
);
641 r
= add_bytes(reg
, mbuf
->p
, mbuf
->used
);
643 /* padding for return value from compile_length_cclass_node() to be fix. */
644 pad_size
= (WORD_ALIGNMENT_SIZE
- 1) - pad_size
;
645 if (pad_size
!= 0) add_bytes(reg
, PadBuf
, pad_size
);
651 compile_length_cclass_node(CClassNode
* cc
, regex_t
* reg
)
655 if (IS_NULL(cc
->mbuf
)) {
656 len
= SIZE_OPCODE
+ SIZE_BITSET
;
659 if (ONIGENC_MBC_MINLEN(reg
->enc
) > 1 || bitset_is_empty(cc
->bs
)) {
663 len
= SIZE_OPCODE
+ SIZE_BITSET
;
665 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
666 len
+= SIZE_LENGTH
+ cc
->mbuf
->used
;
668 len
+= SIZE_LENGTH
+ cc
->mbuf
->used
+ (WORD_ALIGNMENT_SIZE
- 1);
676 compile_cclass_node(CClassNode
* cc
, regex_t
* reg
)
680 if (IS_NULL(cc
->mbuf
)) {
681 if (IS_NCCLASS_NOT(cc
))
682 add_opcode(reg
, OP_CCLASS_NOT
);
684 add_opcode(reg
, OP_CCLASS
);
686 r
= add_bitset(reg
, cc
->bs
);
689 if (ONIGENC_MBC_MINLEN(reg
->enc
) > 1 || bitset_is_empty(cc
->bs
)) {
690 if (IS_NCCLASS_NOT(cc
))
691 add_opcode(reg
, OP_CCLASS_MB_NOT
);
693 add_opcode(reg
, OP_CCLASS_MB
);
695 r
= add_multi_byte_cclass(cc
->mbuf
, reg
);
698 if (IS_NCCLASS_NOT(cc
))
699 add_opcode(reg
, OP_CCLASS_MIX_NOT
);
701 add_opcode(reg
, OP_CCLASS_MIX
);
703 r
= add_bitset(reg
, cc
->bs
);
704 if (r
!= 0) return r
;
705 r
= add_multi_byte_cclass(cc
->mbuf
, reg
);
713 entry_repeat_range(regex_t
* reg
, int id
, int lower
, int upper
)
715 #define REPEAT_RANGE_ALLOC 4
719 if (reg
->repeat_range_alloc
== 0) {
720 p
= (OnigRepeatRange
* )xmalloc(sizeof(OnigRepeatRange
) * REPEAT_RANGE_ALLOC
);
721 CHECK_NULL_RETURN_MEMERR(p
);
722 reg
->repeat_range
= p
;
723 reg
->repeat_range_alloc
= REPEAT_RANGE_ALLOC
;
725 else if (reg
->repeat_range_alloc
<= id
) {
727 n
= reg
->repeat_range_alloc
+ REPEAT_RANGE_ALLOC
;
728 p
= (OnigRepeatRange
* )xrealloc(reg
->repeat_range
,
729 sizeof(OnigRepeatRange
) * n
,
730 sizeof(OnigRepeatRange
) * reg
->repeat_range_alloc
);
731 CHECK_NULL_RETURN_MEMERR(p
);
732 reg
->repeat_range
= p
;
733 reg
->repeat_range_alloc
= n
;
736 p
= reg
->repeat_range
;
740 p
[id
].upper
= (IS_REPEAT_INFINITE(upper
) ? 0x7fffffff : upper
);
745 compile_range_repeat_node(QuantNode
* qn
, int target_len
, int empty_info
,
746 regex_t
* reg
, ScanEnv
* env
)
749 int num_repeat
= reg
->num_repeat
;
751 r
= add_opcode(reg
, qn
->greedy
? OP_REPEAT
: OP_REPEAT_NG
);
752 if (r
!= 0) return r
;
753 r
= add_mem_num(reg
, num_repeat
); /* OP_REPEAT ID */
755 if (r
!= 0) return r
;
756 r
= add_rel_addr(reg
, target_len
+ SIZE_OP_REPEAT_INC
);
757 if (r
!= 0) return r
;
759 r
= entry_repeat_range(reg
, num_repeat
, qn
->lower
, qn
->upper
);
760 if (r
!= 0) return r
;
762 r
= compile_tree_empty_check(NODE_QUANT_BODY(qn
), reg
, empty_info
, env
);
763 if (r
!= 0) return r
;
767 NODE_IS_IN_MULTI_ENTRY(qn
) ||
769 NODE_IS_IN_REAL_REPEAT(qn
)) {
770 r
= add_opcode(reg
, qn
->greedy
? OP_REPEAT_INC_SG
: OP_REPEAT_INC_NG_SG
);
773 r
= add_opcode(reg
, qn
->greedy
? OP_REPEAT_INC
: OP_REPEAT_INC_NG
);
775 if (r
!= 0) return r
;
776 r
= add_mem_num(reg
, num_repeat
); /* OP_REPEAT ID */
781 is_anychar_infinite_greedy(QuantNode
* qn
)
783 if (qn
->greedy
&& IS_REPEAT_INFINITE(qn
->upper
) &&
784 NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn
)))
790 #define QUANTIFIER_EXPAND_LIMIT_SIZE 50
791 #define CKN_ON (ckn > 0)
794 compile_length_quantifier_node(QuantNode
* qn
, regex_t
* reg
)
797 int infinite
= IS_REPEAT_INFINITE(qn
->upper
);
798 enum QuantBodyEmpty empty_info
= qn
->body_empty_info
;
799 int tlen
= compile_length_tree(NODE_QUANT_BODY(qn
), reg
);
801 if (tlen
< 0) return tlen
;
802 if (tlen
== 0) return 0;
805 if (is_anychar_infinite_greedy(qn
)) {
806 if (qn
->lower
<= 1 ||
807 int_multiply_cmp(tlen
, qn
->lower
, QUANTIFIER_EXPAND_LIMIT_SIZE
) <= 0) {
808 if (IS_NOT_NULL(qn
->next_head_exact
))
809 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT
+ tlen
* qn
->lower
;
811 return SIZE_OP_ANYCHAR_STAR
+ tlen
* qn
->lower
;
815 if (empty_info
== QUANT_BODY_IS_NOT_EMPTY
)
818 mod_tlen
= tlen
+ (SIZE_OP_EMPTY_CHECK_START
+ SIZE_OP_EMPTY_CHECK_END
);
822 int_multiply_cmp(tlen
, qn
->lower
, QUANTIFIER_EXPAND_LIMIT_SIZE
) <= 0)) {
823 if (qn
->lower
== 1 && tlen
> QUANTIFIER_EXPAND_LIMIT_SIZE
) {
827 len
= tlen
* qn
->lower
;
831 if (IS_NOT_NULL(qn
->head_exact
))
832 len
+= SIZE_OP_PUSH_OR_JUMP_EXACT1
+ mod_tlen
+ SIZE_OP_JUMP
;
833 else if (IS_NOT_NULL(qn
->next_head_exact
))
834 len
+= SIZE_OP_PUSH_IF_PEEK_NEXT
+ mod_tlen
+ SIZE_OP_JUMP
;
836 len
+= SIZE_OP_PUSH
+ mod_tlen
+ SIZE_OP_JUMP
;
839 len
+= SIZE_OP_JUMP
+ mod_tlen
+ SIZE_OP_PUSH
;
841 else if (qn
->upper
== 0 && qn
->is_refered
!= 0) { /* /(?<n>..){0}/ */
842 len
= SIZE_OP_JUMP
+ tlen
;
844 else if (!infinite
&& qn
->greedy
&&
846 int_multiply_cmp(tlen
+ SIZE_OP_PUSH
, qn
->upper
,
847 QUANTIFIER_EXPAND_LIMIT_SIZE
) <= 0)) {
848 len
= tlen
* qn
->lower
;
849 len
+= (SIZE_OP_PUSH
+ tlen
) * (qn
->upper
- qn
->lower
);
851 else if (!qn
->greedy
&& qn
->upper
== 1 && qn
->lower
== 0) { /* '??' */
852 len
= SIZE_OP_PUSH
+ SIZE_OP_JUMP
+ tlen
;
855 len
= SIZE_OP_REPEAT_INC
856 + mod_tlen
+ SIZE_OPCODE
+ SIZE_RELADDR
+ SIZE_MEMNUM
;
863 compile_quantifier_node(QuantNode
* qn
, regex_t
* reg
, ScanEnv
* env
)
866 int infinite
= IS_REPEAT_INFINITE(qn
->upper
);
867 enum QuantBodyEmpty empty_info
= qn
->body_empty_info
;
868 int tlen
= compile_length_tree(NODE_QUANT_BODY(qn
), reg
);
870 if (tlen
< 0) return tlen
;
871 if (tlen
== 0) return 0;
873 if (is_anychar_infinite_greedy(qn
) &&
875 int_multiply_cmp(tlen
, qn
->lower
, QUANTIFIER_EXPAND_LIMIT_SIZE
) <= 0)) {
876 r
= compile_tree_n_times(NODE_QUANT_BODY(qn
), qn
->lower
, reg
, env
);
877 if (r
!= 0) return r
;
878 if (IS_NOT_NULL(qn
->next_head_exact
)) {
879 if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn
), reg
)))
880 r
= add_opcode(reg
, OP_ANYCHAR_ML_STAR_PEEK_NEXT
);
882 r
= add_opcode(reg
, OP_ANYCHAR_STAR_PEEK_NEXT
);
883 if (r
!= 0) return r
;
884 return add_bytes(reg
, STR_(qn
->next_head_exact
)->s
, 1);
887 if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn
), reg
)))
888 return add_opcode(reg
, OP_ANYCHAR_ML_STAR
);
890 return add_opcode(reg
, OP_ANYCHAR_STAR
);
894 if (empty_info
== QUANT_BODY_IS_NOT_EMPTY
)
897 mod_tlen
= tlen
+ (SIZE_OP_EMPTY_CHECK_START
+ SIZE_OP_EMPTY_CHECK_END
);
901 int_multiply_cmp(tlen
, qn
->lower
, QUANTIFIER_EXPAND_LIMIT_SIZE
) <= 0)) {
902 if (qn
->lower
== 1 && tlen
> QUANTIFIER_EXPAND_LIMIT_SIZE
) {
904 if (IS_NOT_NULL(qn
->head_exact
))
905 r
= add_opcode_rel_addr(reg
, OP_JUMP
, SIZE_OP_PUSH_OR_JUMP_EXACT1
);
906 else if (IS_NOT_NULL(qn
->next_head_exact
))
907 r
= add_opcode_rel_addr(reg
, OP_JUMP
, SIZE_OP_PUSH_IF_PEEK_NEXT
);
909 r
= add_opcode_rel_addr(reg
, OP_JUMP
, SIZE_OP_PUSH
);
912 r
= add_opcode_rel_addr(reg
, OP_JUMP
, SIZE_OP_JUMP
);
914 if (r
!= 0) return r
;
917 r
= compile_tree_n_times(NODE_QUANT_BODY(qn
), qn
->lower
, reg
, env
);
918 if (r
!= 0) return r
;
922 if (IS_NOT_NULL(qn
->head_exact
)) {
923 r
= add_opcode_rel_addr(reg
, OP_PUSH_OR_JUMP_EXACT1
,
924 mod_tlen
+ SIZE_OP_JUMP
);
925 if (r
!= 0) return r
;
926 add_bytes(reg
, STR_(qn
->head_exact
)->s
, 1);
927 r
= compile_tree_empty_check(NODE_QUANT_BODY(qn
), reg
, empty_info
, env
);
928 if (r
!= 0) return r
;
929 r
= add_opcode_rel_addr(reg
, OP_JUMP
,
930 -(mod_tlen
+ (int )SIZE_OP_JUMP
+ (int )SIZE_OP_PUSH_OR_JUMP_EXACT1
));
932 else if (IS_NOT_NULL(qn
->next_head_exact
)) {
933 r
= add_opcode_rel_addr(reg
, OP_PUSH_IF_PEEK_NEXT
,
934 mod_tlen
+ SIZE_OP_JUMP
);
935 if (r
!= 0) return r
;
936 add_bytes(reg
, STR_(qn
->next_head_exact
)->s
, 1);
937 r
= compile_tree_empty_check(NODE_QUANT_BODY(qn
), reg
, empty_info
, env
);
938 if (r
!= 0) return r
;
939 r
= add_opcode_rel_addr(reg
, OP_JUMP
,
940 -(mod_tlen
+ (int )SIZE_OP_JUMP
+ (int )SIZE_OP_PUSH_IF_PEEK_NEXT
));
943 r
= add_opcode_rel_addr(reg
, OP_PUSH
, mod_tlen
+ SIZE_OP_JUMP
);
944 if (r
!= 0) return r
;
945 r
= compile_tree_empty_check(NODE_QUANT_BODY(qn
), reg
, empty_info
, env
);
946 if (r
!= 0) return r
;
947 r
= add_opcode_rel_addr(reg
, OP_JUMP
,
948 -(mod_tlen
+ (int )SIZE_OP_JUMP
+ (int )SIZE_OP_PUSH
));
952 r
= add_opcode_rel_addr(reg
, OP_JUMP
, mod_tlen
);
953 if (r
!= 0) return r
;
954 r
= compile_tree_empty_check(NODE_QUANT_BODY(qn
), reg
, empty_info
, env
);
955 if (r
!= 0) return r
;
956 r
= add_opcode_rel_addr(reg
, OP_PUSH
, -(mod_tlen
+ (int )SIZE_OP_PUSH
));
959 else if (qn
->upper
== 0 && qn
->is_refered
!= 0) { /* /(?<n>..){0}/ */
960 r
= add_opcode_rel_addr(reg
, OP_JUMP
, tlen
);
961 if (r
!= 0) return r
;
962 r
= compile_tree(NODE_QUANT_BODY(qn
), reg
, env
);
964 else if (! infinite
&& qn
->greedy
&&
966 int_multiply_cmp(tlen
+ SIZE_OP_PUSH
, qn
->upper
,
967 QUANTIFIER_EXPAND_LIMIT_SIZE
) <= 0)) {
968 int n
= qn
->upper
- qn
->lower
;
970 r
= compile_tree_n_times(NODE_QUANT_BODY(qn
), qn
->lower
, reg
, env
);
971 if (r
!= 0) return r
;
973 for (i
= 0; i
< n
; i
++) {
974 r
= add_opcode_rel_addr(reg
, OP_PUSH
,
975 (n
- i
) * tlen
+ (n
- i
- 1) * SIZE_OP_PUSH
);
976 if (r
!= 0) return r
;
977 r
= compile_tree(NODE_QUANT_BODY(qn
), reg
, env
);
978 if (r
!= 0) return r
;
981 else if (! qn
->greedy
&& qn
->upper
== 1 && qn
->lower
== 0) { /* '??' */
982 r
= add_opcode_rel_addr(reg
, OP_PUSH
, SIZE_OP_JUMP
);
983 if (r
!= 0) return r
;
984 r
= add_opcode_rel_addr(reg
, OP_JUMP
, tlen
);
985 if (r
!= 0) return r
;
986 r
= compile_tree(NODE_QUANT_BODY(qn
), reg
, env
);
989 r
= compile_range_repeat_node(qn
, mod_tlen
, empty_info
, reg
, env
);
995 compile_length_option_node(EnclosureNode
* node
, regex_t
* reg
)
998 OnigOptionType prev
= reg
->options
;
1000 reg
->options
= node
->o
.options
;
1001 tlen
= compile_length_tree(NODE_ENCLOSURE_BODY(node
), reg
);
1002 reg
->options
= prev
;
1008 compile_option_node(EnclosureNode
* node
, regex_t
* reg
, ScanEnv
* env
)
1011 OnigOptionType prev
= reg
->options
;
1013 reg
->options
= node
->o
.options
;
1014 r
= compile_tree(NODE_ENCLOSURE_BODY(node
), reg
, env
);
1015 reg
->options
= prev
;
1021 compile_length_enclosure_node(EnclosureNode
* node
, regex_t
* reg
)
1026 if (node
->type
== ENCLOSURE_OPTION
)
1027 return compile_length_option_node(node
, reg
);
1029 if (NODE_ENCLOSURE_BODY(node
)) {
1030 tlen
= compile_length_tree(NODE_ENCLOSURE_BODY(node
), reg
);
1031 if (tlen
< 0) return tlen
;
1036 switch (node
->type
) {
1037 case ENCLOSURE_MEMORY
:
1040 if (node
->m
.regnum
== 0 && NODE_IS_CALLED(node
)) {
1041 len
= tlen
+ SIZE_OP_CALL
+ SIZE_OP_JUMP
+ SIZE_OP_RETURN
;
1045 if (NODE_IS_CALLED(node
)) {
1046 len
= SIZE_OP_MEMORY_START_PUSH
+ tlen
1047 + SIZE_OP_CALL
+ SIZE_OP_JUMP
+ SIZE_OP_RETURN
;
1048 if (MEM_STATUS_AT0(reg
->bt_mem_end
, node
->m
.regnum
))
1049 len
+= (NODE_IS_RECURSION(node
)
1050 ? SIZE_OP_MEMORY_END_PUSH_REC
: SIZE_OP_MEMORY_END_PUSH
);
1052 len
+= (NODE_IS_RECURSION(node
)
1053 ? SIZE_OP_MEMORY_END_REC
: SIZE_OP_MEMORY_END
);
1055 else if (NODE_IS_RECURSION(node
)) {
1056 len
= SIZE_OP_MEMORY_START_PUSH
;
1057 len
+= tlen
+ (MEM_STATUS_AT0(reg
->bt_mem_end
, node
->m
.regnum
)
1058 ? SIZE_OP_MEMORY_END_PUSH_REC
: SIZE_OP_MEMORY_END_REC
);
1063 if (MEM_STATUS_AT0(reg
->bt_mem_start
, node
->m
.regnum
))
1064 len
= SIZE_OP_MEMORY_START_PUSH
;
1066 len
= SIZE_OP_MEMORY_START
;
1068 len
+= tlen
+ (MEM_STATUS_AT0(reg
->bt_mem_end
, node
->m
.regnum
)
1069 ? SIZE_OP_MEMORY_END_PUSH
: SIZE_OP_MEMORY_END
);
1073 case ENCLOSURE_STOP_BACKTRACK
:
1074 if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node
)) {
1075 QuantNode
* qn
= QUANT_(NODE_ENCLOSURE_BODY(node
));
1076 tlen
= compile_length_tree(NODE_QUANT_BODY(qn
), reg
);
1077 if (tlen
< 0) return tlen
;
1079 len
= tlen
* qn
->lower
1080 + SIZE_OP_PUSH
+ tlen
+ SIZE_OP_POP_OUT
+ SIZE_OP_JUMP
;
1083 len
= SIZE_OP_ATOMIC_START
+ tlen
+ SIZE_OP_ATOMIC_END
;
1087 case ENCLOSURE_IF_ELSE
:
1089 Node
* cond
= NODE_ENCLOSURE_BODY(node
);
1090 Node
* Then
= node
->te
.Then
;
1091 Node
* Else
= node
->te
.Else
;
1093 len
= compile_length_tree(cond
, reg
);
1094 if (len
< 0) return len
;
1095 len
+= SIZE_OP_PUSH
;
1096 len
+= SIZE_OP_ATOMIC_START
+ SIZE_OP_ATOMIC_END
;
1098 if (IS_NOT_NULL(Then
)) {
1099 tlen
= compile_length_tree(Then
, reg
);
1100 if (tlen
< 0) return tlen
;
1104 if (IS_NOT_NULL(Else
)) {
1105 len
+= SIZE_OP_JUMP
;
1106 tlen
= compile_length_tree(Else
, reg
);
1107 if (tlen
< 0) return tlen
;
1114 return ONIGERR_TYPE_BUG
;
1121 static int get_char_length_tree(Node
* node
, regex_t
* reg
, int* len
);
1124 compile_enclosure_memory_node(EnclosureNode
* node
, regex_t
* reg
, ScanEnv
* env
)
1130 if (node
->m
.regnum
== 0 && NODE_IS_CALLED(node
)) {
1131 r
= add_opcode(reg
, OP_CALL
);
1132 if (r
!= 0) return r
;
1133 node
->m
.called_addr
= BB_GET_OFFSET_POS(reg
) + SIZE_ABSADDR
+ SIZE_OP_JUMP
;
1134 NODE_STATUS_ADD(node
, ADDR_FIXED
);
1135 r
= add_abs_addr(reg
, (int )node
->m
.called_addr
);
1136 if (r
!= 0) return r
;
1137 len
= compile_length_tree(NODE_ENCLOSURE_BODY(node
), reg
);
1138 len
+= SIZE_OP_RETURN
;
1139 r
= add_opcode_rel_addr(reg
, OP_JUMP
, len
);
1140 if (r
!= 0) return r
;
1142 r
= compile_tree(NODE_ENCLOSURE_BODY(node
), reg
, env
);
1143 if (r
!= 0) return r
;
1144 r
= add_opcode(reg
, OP_RETURN
);
1148 if (NODE_IS_CALLED(node
)) {
1149 r
= add_opcode(reg
, OP_CALL
);
1150 if (r
!= 0) return r
;
1151 node
->m
.called_addr
= BB_GET_OFFSET_POS(reg
) + SIZE_ABSADDR
+ SIZE_OP_JUMP
;
1152 NODE_STATUS_ADD(node
, ADDR_FIXED
);
1153 r
= add_abs_addr(reg
, (int )node
->m
.called_addr
);
1154 if (r
!= 0) return r
;
1155 len
= compile_length_tree(NODE_ENCLOSURE_BODY(node
), reg
);
1156 len
+= (SIZE_OP_MEMORY_START_PUSH
+ SIZE_OP_RETURN
);
1157 if (MEM_STATUS_AT0(reg
->bt_mem_end
, node
->m
.regnum
))
1158 len
+= (NODE_IS_RECURSION(node
)
1159 ? SIZE_OP_MEMORY_END_PUSH_REC
: SIZE_OP_MEMORY_END_PUSH
);
1161 len
+= (NODE_IS_RECURSION(node
)
1162 ? SIZE_OP_MEMORY_END_REC
: SIZE_OP_MEMORY_END
);
1164 r
= add_opcode_rel_addr(reg
, OP_JUMP
, len
);
1165 if (r
!= 0) return r
;
1169 if (MEM_STATUS_AT0(reg
->bt_mem_start
, node
->m
.regnum
))
1170 r
= add_opcode(reg
, OP_MEMORY_START_PUSH
);
1172 r
= add_opcode(reg
, OP_MEMORY_START
);
1173 if (r
!= 0) return r
;
1174 r
= add_mem_num(reg
, node
->m
.regnum
);
1175 if (r
!= 0) return r
;
1176 r
= compile_tree(NODE_ENCLOSURE_BODY(node
), reg
, env
);
1177 if (r
!= 0) return r
;
1180 if (MEM_STATUS_AT0(reg
->bt_mem_end
, node
->m
.regnum
))
1181 r
= add_opcode(reg
, (NODE_IS_RECURSION(node
)
1182 ? OP_MEMORY_END_PUSH_REC
: OP_MEMORY_END_PUSH
));
1184 r
= add_opcode(reg
, (NODE_IS_RECURSION(node
)
1185 ? OP_MEMORY_END_REC
: OP_MEMORY_END
));
1186 if (r
!= 0) return r
;
1187 r
= add_mem_num(reg
, node
->m
.regnum
);
1188 if (NODE_IS_CALLED(node
)) {
1189 if (r
!= 0) return r
;
1190 r
= add_opcode(reg
, OP_RETURN
);
1193 if (MEM_STATUS_AT0(reg
->bt_mem_end
, node
->m
.regnum
))
1194 r
= add_opcode(reg
, OP_MEMORY_END_PUSH
);
1196 r
= add_opcode(reg
, OP_MEMORY_END
);
1197 if (r
!= 0) return r
;
1198 r
= add_mem_num(reg
, node
->m
.regnum
);
1205 compile_enclosure_node(EnclosureNode
* node
, regex_t
* reg
, ScanEnv
* env
)
1209 switch (node
->type
) {
1210 case ENCLOSURE_MEMORY
:
1211 r
= compile_enclosure_memory_node(node
, reg
, env
);
1214 case ENCLOSURE_OPTION
:
1215 r
= compile_option_node(node
, reg
, env
);
1218 case ENCLOSURE_STOP_BACKTRACK
:
1219 if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node
)) {
1220 QuantNode
* qn
= QUANT_(NODE_ENCLOSURE_BODY(node
));
1221 r
= compile_tree_n_times(NODE_QUANT_BODY(qn
), qn
->lower
, reg
, env
);
1222 if (r
!= 0) return r
;
1224 len
= compile_length_tree(NODE_QUANT_BODY(qn
), reg
);
1225 if (len
< 0) return len
;
1227 r
= add_opcode_rel_addr(reg
, OP_PUSH
, len
+ SIZE_OP_POP_OUT
+ SIZE_OP_JUMP
);
1228 if (r
!= 0) return r
;
1229 r
= compile_tree(NODE_QUANT_BODY(qn
), reg
, env
);
1230 if (r
!= 0) return r
;
1231 r
= add_opcode(reg
, OP_POP_OUT
);
1232 if (r
!= 0) return r
;
1233 r
= add_opcode_rel_addr(reg
, OP_JUMP
,
1234 -((int )SIZE_OP_PUSH
+ len
+ (int )SIZE_OP_POP_OUT
+ (int )SIZE_OP_JUMP
));
1237 r
= add_opcode(reg
, OP_ATOMIC_START
);
1238 if (r
!= 0) return r
;
1239 r
= compile_tree(NODE_ENCLOSURE_BODY(node
), reg
, env
);
1240 if (r
!= 0) return r
;
1241 r
= add_opcode(reg
, OP_ATOMIC_END
);
1245 case ENCLOSURE_IF_ELSE
:
1247 int cond_len
, then_len
, jump_len
;
1248 Node
* cond
= NODE_ENCLOSURE_BODY(node
);
1249 Node
* Then
= node
->te
.Then
;
1250 Node
* Else
= node
->te
.Else
;
1252 r
= add_opcode(reg
, OP_ATOMIC_START
);
1253 if (r
!= 0) return r
;
1255 cond_len
= compile_length_tree(cond
, reg
);
1256 if (cond_len
< 0) return cond_len
;
1257 if (IS_NOT_NULL(Then
)) {
1258 then_len
= compile_length_tree(Then
, reg
);
1259 if (then_len
< 0) return then_len
;
1264 jump_len
= cond_len
+ then_len
+ SIZE_OP_ATOMIC_END
;
1265 if (IS_NOT_NULL(Else
)) jump_len
+= SIZE_OP_JUMP
;
1267 r
= add_opcode_rel_addr(reg
, OP_PUSH
, jump_len
);
1268 if (r
!= 0) return r
;
1269 r
= compile_tree(cond
, reg
, env
);
1270 if (r
!= 0) return r
;
1271 r
= add_opcode(reg
, OP_ATOMIC_END
);
1272 if (r
!= 0) return r
;
1274 if (IS_NOT_NULL(Then
)) {
1275 r
= compile_tree(Then
, reg
, env
);
1276 if (r
!= 0) return r
;
1279 if (IS_NOT_NULL(Else
)) {
1280 int else_len
= compile_length_tree(Else
, reg
);
1281 r
= add_opcode_rel_addr(reg
, OP_JUMP
, else_len
);
1282 if (r
!= 0) return r
;
1283 r
= compile_tree(Else
, reg
, env
);
1289 return ONIGERR_TYPE_BUG
;
1297 compile_length_anchor_node(AnchorNode
* node
, regex_t
* reg
)
1302 if (IS_NOT_NULL(NODE_ANCHOR_BODY(node
))) {
1303 tlen
= compile_length_tree(NODE_ANCHOR_BODY(node
), reg
);
1304 if (tlen
< 0) return tlen
;
1307 switch (node
->type
) {
1308 case ANCHOR_PREC_READ
:
1309 len
= SIZE_OP_PREC_READ_START
+ tlen
+ SIZE_OP_PREC_READ_END
;
1311 case ANCHOR_PREC_READ_NOT
:
1312 len
= SIZE_OP_PREC_READ_NOT_START
+ tlen
+ SIZE_OP_PREC_READ_NOT_END
;
1314 case ANCHOR_LOOK_BEHIND
:
1315 len
= SIZE_OP_LOOK_BEHIND
+ tlen
;
1317 case ANCHOR_LOOK_BEHIND_NOT
:
1318 len
= SIZE_OP_LOOK_BEHIND_NOT_START
+ tlen
+ SIZE_OP_LOOK_BEHIND_NOT_END
;
1321 case ANCHOR_WORD_BOUNDARY
:
1322 case ANCHOR_NO_WORD_BOUNDARY
:
1323 #ifdef USE_WORD_BEGIN_END
1324 case ANCHOR_WORD_BEGIN
:
1325 case ANCHOR_WORD_END
:
1327 len
= SIZE_OP_WORD_BOUNDARY
;
1330 case ANCHOR_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY
:
1331 case ANCHOR_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY
:
1344 compile_anchor_node(AnchorNode
* node
, regex_t
* reg
, ScanEnv
* env
)
1349 switch (node
->type
) {
1350 case ANCHOR_BEGIN_BUF
: r
= add_opcode(reg
, OP_BEGIN_BUF
); break;
1351 case ANCHOR_END_BUF
: r
= add_opcode(reg
, OP_END_BUF
); break;
1352 case ANCHOR_BEGIN_LINE
: r
= add_opcode(reg
, OP_BEGIN_LINE
); break;
1353 case ANCHOR_END_LINE
: r
= add_opcode(reg
, OP_END_LINE
); break;
1354 case ANCHOR_SEMI_END_BUF
: r
= add_opcode(reg
, OP_SEMI_END_BUF
); break;
1355 case ANCHOR_BEGIN_POSITION
: r
= add_opcode(reg
, OP_BEGIN_POSITION
); break;
1357 case ANCHOR_WORD_BOUNDARY
:
1358 op
= OP_WORD_BOUNDARY
;
1360 r
= add_opcode(reg
, op
);
1361 if (r
!= 0) return r
;
1362 r
= add_mode(reg
, (ModeType
)node
->ascii_mode
);
1365 case ANCHOR_NO_WORD_BOUNDARY
:
1366 op
= OP_NO_WORD_BOUNDARY
; goto word
;
1368 #ifdef USE_WORD_BEGIN_END
1369 case ANCHOR_WORD_BEGIN
:
1370 op
= OP_WORD_BEGIN
; goto word
;
1372 case ANCHOR_WORD_END
:
1373 op
= OP_WORD_END
; goto word
;
1377 case ANCHOR_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY
:
1378 r
= add_opcode(reg
, OP_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY
);
1381 case ANCHOR_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY
:
1382 r
= add_opcode(reg
, OP_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY
);
1385 case ANCHOR_PREC_READ
:
1386 r
= add_opcode(reg
, OP_PREC_READ_START
);
1387 if (r
!= 0) return r
;
1388 r
= compile_tree(NODE_ANCHOR_BODY(node
), reg
, env
);
1389 if (r
!= 0) return r
;
1390 r
= add_opcode(reg
, OP_PREC_READ_END
);
1393 case ANCHOR_PREC_READ_NOT
:
1394 len
= compile_length_tree(NODE_ANCHOR_BODY(node
), reg
);
1395 if (len
< 0) return len
;
1396 r
= add_opcode_rel_addr(reg
, OP_PREC_READ_NOT_START
, len
+ SIZE_OP_PREC_READ_NOT_END
);
1397 if (r
!= 0) return r
;
1398 r
= compile_tree(NODE_ANCHOR_BODY(node
), reg
, env
);
1399 if (r
!= 0) return r
;
1400 r
= add_opcode(reg
, OP_PREC_READ_NOT_END
);
1403 case ANCHOR_LOOK_BEHIND
:
1406 r
= add_opcode(reg
, OP_LOOK_BEHIND
);
1407 if (r
!= 0) return r
;
1408 if (node
->char_len
< 0) {
1409 r
= get_char_length_tree(NODE_ANCHOR_BODY(node
), reg
, &n
);
1410 if (r
!= 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN
;
1415 r
= add_length(reg
, n
);
1416 if (r
!= 0) return r
;
1417 r
= compile_tree(NODE_ANCHOR_BODY(node
), reg
, env
);
1421 case ANCHOR_LOOK_BEHIND_NOT
:
1425 len
= compile_length_tree(NODE_ANCHOR_BODY(node
), reg
);
1426 r
= add_opcode_rel_addr(reg
, OP_LOOK_BEHIND_NOT_START
,
1427 len
+ SIZE_OP_LOOK_BEHIND_NOT_END
);
1428 if (r
!= 0) return r
;
1429 if (node
->char_len
< 0) {
1430 r
= get_char_length_tree(NODE_ANCHOR_BODY(node
), reg
, &n
);
1431 if (r
!= 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN
;
1435 r
= add_length(reg
, n
);
1436 if (r
!= 0) return r
;
1437 r
= compile_tree(NODE_ANCHOR_BODY(node
), reg
, env
);
1438 if (r
!= 0) return r
;
1439 r
= add_opcode(reg
, OP_LOOK_BEHIND_NOT_END
);
1444 return ONIGERR_TYPE_BUG
;
1452 compile_gimmick_node(GimmickNode
* node
, regex_t
* reg
)
1456 switch (node
->type
) {
1458 r
= add_opcode(reg
, OP_FAIL
);
1462 r
= add_opcode(reg
, OP_PUSH_SAVE_VAL
);
1463 if (r
!= 0) return r
;
1464 r
= add_save_type(reg
, SAVE_KEEP
);
1465 if (r
!= 0) return r
;
1466 r
= add_mem_num(reg
, node
->id
);
1470 r
= add_opcode(reg
, OP_PUSH_SAVE_VAL
);
1471 if (r
!= 0) return r
;
1472 r
= add_save_type(reg
, node
->detail_type
);
1473 if (r
!= 0) return r
;
1474 r
= add_mem_num(reg
, node
->id
);
1477 case GIMMICK_UPDATE_VAR
:
1478 r
= add_opcode(reg
, OP_UPDATE_VAR
);
1479 if (r
!= 0) return r
;
1480 r
= add_update_var_type(reg
, node
->detail_type
);
1481 if (r
!= 0) return r
;
1482 r
= add_mem_num(reg
, node
->id
);
1486 case GIMMICK_CALLOUT
:
1487 switch (node
->detail_type
) {
1488 case ONIG_CALLOUT_OF_CONTENTS
:
1489 case ONIG_CALLOUT_OF_NAME
:
1491 r
= add_opcode(reg
, (node
->detail_type
== ONIG_CALLOUT_OF_CONTENTS
) ?
1492 OP_CALLOUT_CONTENTS
: OP_CALLOUT_NAME
);
1493 if (r
!= 0) return r
;
1494 if (node
->detail_type
== ONIG_CALLOUT_OF_NAME
) {
1495 r
= add_mem_num(reg
, node
->id
);
1496 if (r
!= 0) return r
;
1498 r
= add_mem_num(reg
, node
->num
);
1499 if (r
!= 0) return r
;
1504 r
= ONIGERR_TYPE_BUG
;
1514 compile_length_gimmick_node(GimmickNode
* node
, regex_t
* reg
)
1518 switch (node
->type
) {
1525 len
= SIZE_OP_PUSH_SAVE_VAL
;
1528 case GIMMICK_UPDATE_VAR
:
1529 len
= SIZE_OP_UPDATE_VAR
;
1533 case GIMMICK_CALLOUT
:
1534 switch (node
->detail_type
) {
1535 case ONIG_CALLOUT_OF_CONTENTS
:
1536 len
= SIZE_OP_CALLOUT_CONTENTS
;
1538 case ONIG_CALLOUT_OF_NAME
:
1539 len
= SIZE_OP_CALLOUT_NAME
;
1543 len
= ONIGERR_TYPE_BUG
;
1554 compile_length_tree(Node
* node
, regex_t
* reg
)
1558 switch (NODE_TYPE(node
)) {
1562 r
= compile_length_tree(NODE_CAR(node
), reg
);
1563 if (r
< 0) return r
;
1565 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
1575 r
+= compile_length_tree(NODE_CAR(node
), reg
);
1577 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
1578 r
+= (SIZE_OP_PUSH
+ SIZE_OP_JUMP
) * (n
- 1);
1583 if (NODE_STRING_IS_RAW(node
))
1584 r
= compile_length_string_raw_node(STR_(node
), reg
);
1586 r
= compile_length_string_node(node
, reg
);
1590 r
= compile_length_cclass_node(CCLASS_(node
), reg
);
1599 BackRefNode
* br
= BACKREF_(node
);
1601 if (NODE_IS_CHECKER(node
)) {
1602 #ifdef USE_BACKREF_WITH_LEVEL
1603 if (NODE_IS_NEST_LEVEL(node
)) {
1604 r
= SIZE_OPCODE
+ SIZE_LENGTH
+ SIZE_LENGTH
+ (SIZE_MEMNUM
* br
->back_num
);
1608 r
= SIZE_OPCODE
+ SIZE_LENGTH
+ (SIZE_MEMNUM
* br
->back_num
);
1611 #ifdef USE_BACKREF_WITH_LEVEL
1612 if (NODE_IS_NEST_LEVEL(node
)) {
1613 r
= SIZE_OPCODE
+ SIZE_OPTION
+ SIZE_LENGTH
+
1614 SIZE_LENGTH
+ (SIZE_MEMNUM
* br
->back_num
);
1618 if (br
->back_num
== 1) {
1619 r
= ((!IS_IGNORECASE(reg
->options
) && br
->back_static
[0] <= 2)
1620 ? SIZE_OPCODE
: (SIZE_OPCODE
+ SIZE_MEMNUM
));
1623 r
= SIZE_OPCODE
+ SIZE_LENGTH
+ (SIZE_MEMNUM
* br
->back_num
);
1636 r
= compile_length_quantifier_node(QUANT_(node
), reg
);
1639 case NODE_ENCLOSURE
:
1640 r
= compile_length_enclosure_node(ENCLOSURE_(node
), reg
);
1644 r
= compile_length_anchor_node(ANCHOR_(node
), reg
);
1648 r
= compile_length_gimmick_node(GIMMICK_(node
), reg
);
1652 return ONIGERR_TYPE_BUG
;
1660 compile_tree(Node
* node
, regex_t
* reg
, ScanEnv
* env
)
1662 int n
, len
, pos
, r
= 0;
1664 switch (NODE_TYPE(node
)) {
1667 r
= compile_tree(NODE_CAR(node
), reg
, env
);
1668 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
1676 len
+= compile_length_tree(NODE_CAR(x
), reg
);
1677 if (IS_NOT_NULL(NODE_CDR(x
))) {
1678 len
+= SIZE_OP_PUSH
+ SIZE_OP_JUMP
;
1680 } while (IS_NOT_NULL(x
= NODE_CDR(x
)));
1681 pos
= reg
->used
+ len
; /* goal position */
1684 len
= compile_length_tree(NODE_CAR(node
), reg
);
1685 if (IS_NOT_NULL(NODE_CDR(node
))) {
1686 enum OpCode push
= NODE_IS_SUPER(node
) ? OP_PUSH_SUPER
: OP_PUSH
;
1687 r
= add_opcode_rel_addr(reg
, push
, len
+ SIZE_OP_JUMP
);
1690 r
= compile_tree(NODE_CAR(node
), reg
, env
);
1692 if (IS_NOT_NULL(NODE_CDR(node
))) {
1693 len
= pos
- (reg
->used
+ SIZE_OP_JUMP
);
1694 r
= add_opcode_rel_addr(reg
, OP_JUMP
, len
);
1697 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
1702 if (NODE_STRING_IS_RAW(node
))
1703 r
= compile_string_raw_node(STR_(node
), reg
);
1705 r
= compile_string_node(node
, reg
);
1709 r
= compile_cclass_node(CCLASS_(node
), reg
);
1716 switch (CTYPE_(node
)->ctype
) {
1718 if (IS_MULTILINE(CTYPE_OPTION(node
, reg
)))
1719 r
= add_opcode(reg
, OP_ANYCHAR_ML
);
1721 r
= add_opcode(reg
, OP_ANYCHAR
);
1724 case ONIGENC_CTYPE_WORD
:
1725 if (CTYPE_(node
)->ascii_mode
== 0) {
1726 op
= CTYPE_(node
)->not != 0 ? OP_NO_WORD
: OP_WORD
;
1729 op
= CTYPE_(node
)->not != 0 ? OP_NO_WORD_ASCII
: OP_WORD_ASCII
;
1731 r
= add_opcode(reg
, op
);
1735 return ONIGERR_TYPE_BUG
;
1743 BackRefNode
* br
= BACKREF_(node
);
1745 if (NODE_IS_CHECKER(node
)) {
1746 #ifdef USE_BACKREF_WITH_LEVEL
1747 if (NODE_IS_NEST_LEVEL(node
)) {
1748 r
= add_opcode(reg
, OP_BACKREF_CHECK_WITH_LEVEL
);
1749 if (r
!= 0) return r
;
1750 r
= add_length(reg
, br
->nest_level
);
1751 if (r
!= 0) return r
;
1756 r
= add_opcode(reg
, OP_BACKREF_CHECK
);
1757 if (r
!= 0) return r
;
1760 goto add_bacref_mems
;
1763 #ifdef USE_BACKREF_WITH_LEVEL
1764 if (NODE_IS_NEST_LEVEL(node
)) {
1765 r
= add_opcode(reg
, OP_BACKREF_WITH_LEVEL
);
1766 if (r
!= 0) return r
;
1767 r
= add_option(reg
, (reg
->options
& ONIG_OPTION_IGNORECASE
));
1768 if (r
!= 0) return r
;
1769 r
= add_length(reg
, br
->nest_level
);
1770 if (r
!= 0) return r
;
1772 goto add_bacref_mems
;
1776 if (br
->back_num
== 1) {
1777 n
= br
->back_static
[0];
1778 if (IS_IGNORECASE(reg
->options
)) {
1779 r
= add_opcode(reg
, OP_BACKREF_N_IC
);
1780 if (r
!= 0) return r
;
1781 r
= add_mem_num(reg
, n
);
1785 case 1: r
= add_opcode(reg
, OP_BACKREF1
); break;
1786 case 2: r
= add_opcode(reg
, OP_BACKREF2
); break;
1788 r
= add_opcode(reg
, OP_BACKREF_N
);
1789 if (r
!= 0) return r
;
1790 r
= add_mem_num(reg
, n
);
1799 if (IS_IGNORECASE(reg
->options
)) {
1800 r
= add_opcode(reg
, OP_BACKREF_MULTI_IC
);
1803 r
= add_opcode(reg
, OP_BACKREF_MULTI
);
1805 if (r
!= 0) return r
;
1808 r
= add_length(reg
, br
->back_num
);
1809 if (r
!= 0) return r
;
1811 for (i
= br
->back_num
- 1; i
>= 0; i
--) {
1812 r
= add_mem_num(reg
, p
[i
]);
1813 if (r
!= 0) return r
;
1822 r
= compile_call(CALL_(node
), reg
, env
);
1827 r
= compile_quantifier_node(QUANT_(node
), reg
, env
);
1830 case NODE_ENCLOSURE
:
1831 r
= compile_enclosure_node(ENCLOSURE_(node
), reg
, env
);
1835 r
= compile_anchor_node(ANCHOR_(node
), reg
, env
);
1839 r
= compile_gimmick_node(GIMMICK_(node
), reg
);
1844 fprintf(stderr
, "compile_tree: undefined node type %d\n", NODE_TYPE(node
));
1853 noname_disable_map(Node
** plink
, GroupNumRemap
* map
, int* counter
)
1856 Node
* node
= *plink
;
1858 switch (NODE_TYPE(node
)) {
1862 r
= noname_disable_map(&(NODE_CAR(node
)), map
, counter
);
1863 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
1868 Node
** ptarget
= &(NODE_BODY(node
));
1869 Node
* old
= *ptarget
;
1870 r
= noname_disable_map(ptarget
, map
, counter
);
1871 if (*ptarget
!= old
&& NODE_TYPE(*ptarget
) == NODE_QUANT
) {
1872 onig_reduce_nested_quantifier(node
, *ptarget
);
1877 case NODE_ENCLOSURE
:
1879 EnclosureNode
* en
= ENCLOSURE_(node
);
1880 if (en
->type
== ENCLOSURE_MEMORY
) {
1881 if (NODE_IS_NAMED_GROUP(node
)) {
1883 map
[en
->m
.regnum
].new_val
= *counter
;
1884 en
->m
.regnum
= *counter
;
1885 r
= noname_disable_map(&(NODE_BODY(node
)), map
, counter
);
1888 *plink
= NODE_BODY(node
);
1889 NODE_BODY(node
) = NULL_NODE
;
1890 onig_node_free(node
);
1891 r
= noname_disable_map(plink
, map
, counter
);
1894 else if (en
->type
== ENCLOSURE_IF_ELSE
) {
1895 r
= noname_disable_map(&(NODE_ENCLOSURE_BODY(en
)), map
, counter
);
1896 if (r
!= 0) return r
;
1897 if (IS_NOT_NULL(en
->te
.Then
)) {
1898 r
= noname_disable_map(&(en
->te
.Then
), map
, counter
);
1899 if (r
!= 0) return r
;
1901 if (IS_NOT_NULL(en
->te
.Else
)) {
1902 r
= noname_disable_map(&(en
->te
.Else
), map
, counter
);
1903 if (r
!= 0) return r
;
1907 r
= noname_disable_map(&(NODE_BODY(node
)), map
, counter
);
1912 if (IS_NOT_NULL(NODE_BODY(node
)))
1913 r
= noname_disable_map(&(NODE_BODY(node
)), map
, counter
);
1924 renumber_node_backref(Node
* node
, GroupNumRemap
* map
)
1926 int i
, pos
, n
, old_num
;
1928 BackRefNode
* bn
= BACKREF_(node
);
1930 if (! NODE_IS_BY_NAME(node
))
1931 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED
;
1933 old_num
= bn
->back_num
;
1934 if (IS_NULL(bn
->back_dynamic
))
1935 backs
= bn
->back_static
;
1937 backs
= bn
->back_dynamic
;
1939 for (i
= 0, pos
= 0; i
< old_num
; i
++) {
1940 n
= map
[backs
[i
]].new_val
;
1952 renumber_by_map(Node
* node
, GroupNumRemap
* map
)
1956 switch (NODE_TYPE(node
)) {
1960 r
= renumber_by_map(NODE_CAR(node
), map
);
1961 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
1965 r
= renumber_by_map(NODE_BODY(node
), map
);
1968 case NODE_ENCLOSURE
:
1970 EnclosureNode
* en
= ENCLOSURE_(node
);
1972 r
= renumber_by_map(NODE_BODY(node
), map
);
1973 if (r
!= 0) return r
;
1975 if (en
->type
== ENCLOSURE_IF_ELSE
) {
1976 if (IS_NOT_NULL(en
->te
.Then
)) {
1977 r
= renumber_by_map(en
->te
.Then
, map
);
1978 if (r
!= 0) return r
;
1980 if (IS_NOT_NULL(en
->te
.Else
)) {
1981 r
= renumber_by_map(en
->te
.Else
, map
);
1982 if (r
!= 0) return r
;
1989 r
= renumber_node_backref(node
, map
);
1993 if (IS_NOT_NULL(NODE_BODY(node
)))
1994 r
= renumber_by_map(NODE_BODY(node
), map
);
2005 numbered_ref_check(Node
* node
)
2009 switch (NODE_TYPE(node
)) {
2013 r
= numbered_ref_check(NODE_CAR(node
));
2014 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
2018 if (IS_NULL(NODE_BODY(node
)))
2022 r
= numbered_ref_check(NODE_BODY(node
));
2025 case NODE_ENCLOSURE
:
2027 EnclosureNode
* en
= ENCLOSURE_(node
);
2029 r
= numbered_ref_check(NODE_BODY(node
));
2030 if (r
!= 0) return r
;
2032 if (en
->type
== ENCLOSURE_IF_ELSE
) {
2033 if (IS_NOT_NULL(en
->te
.Then
)) {
2034 r
= numbered_ref_check(en
->te
.Then
);
2035 if (r
!= 0) return r
;
2037 if (IS_NOT_NULL(en
->te
.Else
)) {
2038 r
= numbered_ref_check(en
->te
.Else
);
2039 if (r
!= 0) return r
;
2047 if (! NODE_IS_BY_NAME(node
))
2048 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED
;
2059 disable_noname_group_capture(Node
** root
, regex_t
* reg
, ScanEnv
* env
)
2061 int r
, i
, pos
, counter
;
2066 map
= (GroupNumRemap
* )xmalloc(sizeof(GroupNumRemap
) * (env
->num_mem
+ 1));
2067 CHECK_NULL_RETURN_MEMERR(map
);
2068 for (i
= 1; i
<= env
->num_mem
; i
++) {
2072 r
= noname_disable_map(root
, map
, &counter
);
2073 if (r
!= 0) return r
;
2075 r
= renumber_by_map(*root
, map
);
2076 if (r
!= 0) return r
;
2078 for (i
= 1, pos
= 1; i
<= env
->num_mem
; i
++) {
2079 if (map
[i
].new_val
> 0) {
2080 SCANENV_MEMENV(env
)[pos
] = SCANENV_MEMENV(env
)[i
];
2085 loc
= env
->capture_history
;
2086 MEM_STATUS_CLEAR(env
->capture_history
);
2087 for (i
= 1; i
<= ONIG_MAX_CAPTURE_HISTORY_GROUP
; i
++) {
2088 if (MEM_STATUS_AT(loc
, i
)) {
2089 MEM_STATUS_ON_SIMPLE(env
->capture_history
, map
[i
].new_val
);
2093 env
->num_mem
= env
->num_named
;
2094 reg
->num_mem
= env
->num_named
;
2095 result
= onig_renumber_name_table(reg
, map
);
2102 fix_unset_addr_list(UnsetAddrList
* uslist
, regex_t
* reg
)
2108 for (i
= 0; i
< uslist
->num
; i
++) {
2109 if (! NODE_IS_ADDR_FIXED(uslist
->us
[i
].target
))
2110 return ONIGERR_PARSER_BUG
;
2112 en
= ENCLOSURE_(uslist
->us
[i
].target
);
2113 addr
= en
->m
.called_addr
;
2114 offset
= uslist
->us
[i
].offset
;
2116 BB_WRITE(reg
, offset
, &addr
, SIZE_ABSADDR
);
2123 #define GET_CHAR_LEN_VARLEN -1
2124 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2126 /* fixed size pattern node only */
2128 get_char_length_tree1(Node
* node
, regex_t
* reg
, int* len
, int level
)
2135 switch (NODE_TYPE(node
)) {
2138 r
= get_char_length_tree1(NODE_CAR(node
), reg
, &tlen
, level
);
2140 *len
= distance_add(*len
, tlen
);
2141 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
2149 r
= get_char_length_tree1(NODE_CAR(node
), reg
, &tlen
, level
);
2150 while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
))) {
2151 r
= get_char_length_tree1(NODE_CAR(node
), reg
, &tlen2
, level
);
2160 r
= GET_CHAR_LEN_TOP_ALT_VARLEN
;
2162 r
= GET_CHAR_LEN_VARLEN
;
2172 StrNode
* sn
= STR_(node
);
2175 while (s
< sn
->end
) {
2176 s
+= enclen(reg
->enc
, s
);
2184 QuantNode
* qn
= QUANT_(node
);
2186 if (qn
->lower
== qn
->upper
) {
2187 if (qn
->upper
== 0) {
2191 r
= get_char_length_tree1(NODE_BODY(node
), reg
, &tlen
, level
);
2193 *len
= distance_multiply(tlen
, qn
->lower
);
2197 r
= GET_CHAR_LEN_VARLEN
;
2203 if (! NODE_IS_RECURSION(node
))
2204 r
= get_char_length_tree1(NODE_BODY(node
), reg
, len
, level
);
2206 r
= GET_CHAR_LEN_VARLEN
;
2215 case NODE_ENCLOSURE
:
2217 EnclosureNode
* en
= ENCLOSURE_(node
);
2220 case ENCLOSURE_MEMORY
:
2222 if (NODE_IS_CLEN_FIXED(node
))
2223 *len
= en
->char_len
;
2225 r
= get_char_length_tree1(NODE_BODY(node
), reg
, len
, level
);
2227 en
->char_len
= *len
;
2228 NODE_STATUS_ADD(node
, CLEN_FIXED
);
2233 case ENCLOSURE_OPTION
:
2234 case ENCLOSURE_STOP_BACKTRACK
:
2235 r
= get_char_length_tree1(NODE_BODY(node
), reg
, len
, level
);
2237 case ENCLOSURE_IF_ELSE
:
2241 r
= get_char_length_tree1(NODE_BODY(node
), reg
, &clen
, level
);
2243 if (IS_NOT_NULL(en
->te
.Then
)) {
2244 r
= get_char_length_tree1(en
->te
.Then
, reg
, &tlen
, level
);
2248 if (IS_NOT_NULL(en
->te
.Else
)) {
2249 r
= get_char_length_tree1(en
->te
.Else
, reg
, &elen
, level
);
2254 if (clen
+ tlen
!= elen
) {
2255 r
= GET_CHAR_LEN_VARLEN
;
2275 if (NODE_IS_CHECKER(node
))
2279 r
= GET_CHAR_LEN_VARLEN
;
2287 get_char_length_tree(Node
* node
, regex_t
* reg
, int* len
)
2289 return get_char_length_tree1(node
, reg
, len
, 0);
2292 /* x is not included y ==> 1 : 0 */
2294 is_exclusive(Node
* x
, Node
* y
, regex_t
* reg
)
2302 ytype
= NODE_TYPE(y
);
2303 switch (NODE_TYPE(x
)) {
2306 if (CTYPE_(x
)->ctype
== CTYPE_ANYCHAR
||
2307 CTYPE_(y
)->ctype
== CTYPE_ANYCHAR
)
2312 if (CTYPE_(y
)->ctype
== CTYPE_(x
)->ctype
&&
2313 CTYPE_(y
)->not != CTYPE_(x
)->not &&
2314 CTYPE_(y
)->ascii_mode
== CTYPE_(x
)->ascii_mode
)
2324 tmp
= x
; x
= y
; y
= tmp
;
2342 CClassNode
* xc
= CCLASS_(x
);
2346 switch (CTYPE_(y
)->ctype
) {
2351 case ONIGENC_CTYPE_WORD
:
2352 if (CTYPE_(y
)->not == 0) {
2353 if (IS_NULL(xc
->mbuf
) && !IS_NCCLASS_NOT(xc
)) {
2354 range
= CTYPE_(y
)->ascii_mode
!= 0 ? 128 : SINGLE_BYTE_SIZE
;
2355 for (i
= 0; i
< range
; i
++) {
2356 if (BITSET_AT(xc
->bs
, i
)) {
2357 if (ONIGENC_IS_CODE_WORD(reg
->enc
, i
)) return 0;
2365 if (IS_NOT_NULL(xc
->mbuf
)) return 0;
2366 if (IS_NCCLASS_NOT(xc
)) return 0;
2368 range
= CTYPE_(y
)->ascii_mode
!= 0 ? 128 : SINGLE_BYTE_SIZE
;
2369 for (i
= 0; i
< range
; i
++) {
2370 if (! ONIGENC_IS_CODE_WORD(reg
->enc
, i
)) {
2371 if (BITSET_AT(xc
->bs
, i
))
2375 for (i
= range
; i
< SINGLE_BYTE_SIZE
; i
++) {
2376 if (BITSET_AT(xc
->bs
, i
)) return 0;
2390 CClassNode
* yc
= CCLASS_(y
);
2392 for (i
= 0; i
< SINGLE_BYTE_SIZE
; i
++) {
2393 v
= BITSET_AT(xc
->bs
, i
);
2394 if ((v
!= 0 && !IS_NCCLASS_NOT(xc
)) || (v
== 0 && IS_NCCLASS_NOT(xc
))) {
2395 v
= BITSET_AT(yc
->bs
, i
);
2396 if ((v
!= 0 && !IS_NCCLASS_NOT(yc
)) ||
2397 (v
== 0 && IS_NCCLASS_NOT(yc
)))
2401 if ((IS_NULL(xc
->mbuf
) && !IS_NCCLASS_NOT(xc
)) ||
2402 (IS_NULL(yc
->mbuf
) && !IS_NCCLASS_NOT(yc
)))
2420 StrNode
* xs
= STR_(x
);
2422 if (NODE_STRING_LEN(x
) == 0)
2427 switch (CTYPE_(y
)->ctype
) {
2431 case ONIGENC_CTYPE_WORD
:
2432 if (CTYPE_(y
)->ascii_mode
== 0) {
2433 if (ONIGENC_IS_MBC_WORD(reg
->enc
, xs
->s
, xs
->end
))
2434 return CTYPE_(y
)->not;
2436 return !(CTYPE_(y
)->not);
2439 if (ONIGENC_IS_MBC_WORD_ASCII(reg
->enc
, xs
->s
, xs
->end
))
2440 return CTYPE_(y
)->not;
2442 return !(CTYPE_(y
)->not);
2452 CClassNode
* cc
= CCLASS_(y
);
2454 code
= ONIGENC_MBC_TO_CODE(reg
->enc
, xs
->s
,
2455 xs
->s
+ ONIGENC_MBC_MAXLEN(reg
->enc
));
2456 return (onig_is_code_in_cc(reg
->enc
, code
, cc
) != 0 ? 0 : 1);
2463 StrNode
* ys
= STR_(y
);
2465 len
= NODE_STRING_LEN(x
);
2466 if (len
> NODE_STRING_LEN(y
)) len
= NODE_STRING_LEN(y
);
2467 if (NODE_STRING_IS_AMBIG(x
) || NODE_STRING_IS_AMBIG(y
)) {
2472 for (i
= 0, p
= ys
->s
, q
= xs
->s
; i
< len
; i
++, p
++, q
++) {
2473 if (*p
!= *q
) return 1;
2493 get_head_value_node(Node
* node
, int exact
, regex_t
* reg
)
2495 Node
* n
= NULL_NODE
;
2497 switch (NODE_TYPE(node
)) {
2506 if (CTYPE_(node
)->ctype
== CTYPE_ANYCHAR
)
2516 n
= get_head_value_node(NODE_CAR(node
), exact
, reg
);
2521 StrNode
* sn
= STR_(node
);
2523 if (sn
->end
<= sn
->s
)
2527 !NODE_STRING_IS_RAW(node
) && IS_IGNORECASE(reg
->options
)) {
2537 QuantNode
* qn
= QUANT_(node
);
2538 if (qn
->lower
> 0) {
2539 if (IS_NOT_NULL(qn
->head_exact
))
2542 n
= get_head_value_node(NODE_BODY(node
), exact
, reg
);
2547 case NODE_ENCLOSURE
:
2549 EnclosureNode
* en
= ENCLOSURE_(node
);
2551 case ENCLOSURE_OPTION
:
2553 OnigOptionType options
= reg
->options
;
2555 reg
->options
= ENCLOSURE_(node
)->o
.options
;
2556 n
= get_head_value_node(NODE_BODY(node
), exact
, reg
);
2557 reg
->options
= options
;
2561 case ENCLOSURE_MEMORY
:
2562 case ENCLOSURE_STOP_BACKTRACK
:
2563 case ENCLOSURE_IF_ELSE
:
2564 n
= get_head_value_node(NODE_BODY(node
), exact
, reg
);
2571 if (ANCHOR_(node
)->type
== ANCHOR_PREC_READ
)
2572 n
= get_head_value_node(NODE_BODY(node
), exact
, reg
);
2584 check_type_tree(Node
* node
, int type_mask
, int enclosure_mask
, int anchor_mask
)
2589 type
= NODE_TYPE(node
);
2590 if ((NODE_TYPE2BIT(type
) & type_mask
) == 0)
2597 r
= check_type_tree(NODE_CAR(node
), type_mask
, enclosure_mask
,
2599 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
2603 r
= check_type_tree(NODE_BODY(node
), type_mask
, enclosure_mask
, anchor_mask
);
2606 case NODE_ENCLOSURE
:
2608 EnclosureNode
* en
= ENCLOSURE_(node
);
2609 if (((1<<en
->type
) & enclosure_mask
) == 0)
2612 r
= check_type_tree(NODE_BODY(node
), type_mask
, enclosure_mask
, anchor_mask
);
2613 if (r
== 0 && en
->type
== ENCLOSURE_IF_ELSE
) {
2614 if (IS_NOT_NULL(en
->te
.Then
)) {
2615 r
= check_type_tree(en
->te
.Then
, type_mask
, enclosure_mask
, anchor_mask
);
2618 if (IS_NOT_NULL(en
->te
.Else
)) {
2619 r
= check_type_tree(en
->te
.Else
, type_mask
, enclosure_mask
, anchor_mask
);
2626 type
= ANCHOR_(node
)->type
;
2627 if ((type
& anchor_mask
) == 0)
2630 if (IS_NOT_NULL(NODE_BODY(node
)))
2631 r
= check_type_tree(NODE_BODY(node
), type_mask
, enclosure_mask
, anchor_mask
);
2642 tree_min_len(Node
* node
, ScanEnv
* env
)
2648 switch (NODE_TYPE(node
)) {
2650 if (! NODE_IS_CHECKER(node
)) {
2653 MemEnv
* mem_env
= SCANENV_MEMENV(env
);
2654 BackRefNode
* br
= BACKREF_(node
);
2655 if (NODE_IS_RECURSION(node
)) break;
2657 backs
= BACKREFS_P(br
);
2658 len
= tree_min_len(mem_env
[backs
[0]].node
, env
);
2659 for (i
= 1; i
< br
->back_num
; i
++) {
2660 tmin
= tree_min_len(mem_env
[backs
[i
]].node
, env
);
2661 if (len
> tmin
) len
= tmin
;
2669 Node
* t
= NODE_BODY(node
);
2670 if (NODE_IS_RECURSION(node
)) {
2671 if (NODE_IS_MIN_FIXED(t
))
2672 len
= ENCLOSURE_(t
)->min_len
;
2675 len
= tree_min_len(t
, env
);
2682 tmin
= tree_min_len(NODE_CAR(node
), env
);
2683 len
= distance_add(len
, tmin
);
2684 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
2693 tmin
= tree_min_len(x
, env
);
2694 if (y
== node
) len
= tmin
;
2695 else if (len
> tmin
) len
= tmin
;
2696 } while (IS_NOT_NULL(y
= NODE_CDR(y
)));
2702 StrNode
* sn
= STR_(node
);
2703 len
= (int )(sn
->end
- sn
->s
);
2709 len
= ONIGENC_MBC_MINLEN(env
->enc
);
2714 QuantNode
* qn
= QUANT_(node
);
2716 if (qn
->lower
> 0) {
2717 len
= tree_min_len(NODE_BODY(node
), env
);
2718 len
= distance_multiply(len
, qn
->lower
);
2723 case NODE_ENCLOSURE
:
2725 EnclosureNode
* en
= ENCLOSURE_(node
);
2727 case ENCLOSURE_MEMORY
:
2728 if (NODE_IS_MIN_FIXED(node
))
2731 if (NODE_IS_MARK1(node
))
2732 len
= 0; /* recursive */
2734 NODE_STATUS_ADD(node
, MARK1
);
2735 len
= tree_min_len(NODE_BODY(node
), env
);
2736 NODE_STATUS_REMOVE(node
, MARK1
);
2739 NODE_STATUS_ADD(node
, MIN_FIXED
);
2744 case ENCLOSURE_OPTION
:
2745 case ENCLOSURE_STOP_BACKTRACK
:
2746 len
= tree_min_len(NODE_BODY(node
), env
);
2748 case ENCLOSURE_IF_ELSE
:
2752 len
= tree_min_len(NODE_BODY(node
), env
);
2753 if (IS_NOT_NULL(en
->te
.Then
))
2754 len
+= tree_min_len(en
->te
.Then
, env
);
2755 if (IS_NOT_NULL(en
->te
.Else
))
2756 elen
= tree_min_len(en
->te
.Else
, env
);
2759 if (elen
< len
) len
= elen
;
2768 GimmickNode
* g
= GIMMICK_(node
);
2769 if (g
->type
== GIMMICK_FAIL
) {
2784 tree_max_len(Node
* node
, ScanEnv
* env
)
2790 switch (NODE_TYPE(node
)) {
2793 tmax
= tree_max_len(NODE_CAR(node
), env
);
2794 len
= distance_add(len
, tmax
);
2795 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
2800 tmax
= tree_max_len(NODE_CAR(node
), env
);
2801 if (len
< tmax
) len
= tmax
;
2802 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
2807 StrNode
* sn
= STR_(node
);
2808 len
= (OnigLen
)(sn
->end
- sn
->s
);
2814 len
= ONIGENC_MBC_MAXLEN_DIST(env
->enc
);
2818 if (! NODE_IS_CHECKER(node
)) {
2821 MemEnv
* mem_env
= SCANENV_MEMENV(env
);
2822 BackRefNode
* br
= BACKREF_(node
);
2823 if (NODE_IS_RECURSION(node
)) {
2827 backs
= BACKREFS_P(br
);
2828 for (i
= 0; i
< br
->back_num
; i
++) {
2829 tmax
= tree_max_len(mem_env
[backs
[i
]].node
, env
);
2830 if (len
< tmax
) len
= tmax
;
2837 if (! NODE_IS_RECURSION(node
))
2838 len
= tree_max_len(NODE_BODY(node
), env
);
2846 QuantNode
* qn
= QUANT_(node
);
2848 if (qn
->upper
!= 0) {
2849 len
= tree_max_len(NODE_BODY(node
), env
);
2851 if (! IS_REPEAT_INFINITE(qn
->upper
))
2852 len
= distance_multiply(len
, qn
->upper
);
2860 case NODE_ENCLOSURE
:
2862 EnclosureNode
* en
= ENCLOSURE_(node
);
2864 case ENCLOSURE_MEMORY
:
2865 if (NODE_IS_MAX_FIXED(node
))
2868 if (NODE_IS_MARK1(node
))
2871 NODE_STATUS_ADD(node
, MARK1
);
2872 len
= tree_max_len(NODE_BODY(node
), env
);
2873 NODE_STATUS_REMOVE(node
, MARK1
);
2876 NODE_STATUS_ADD(node
, MAX_FIXED
);
2881 case ENCLOSURE_OPTION
:
2882 case ENCLOSURE_STOP_BACKTRACK
:
2883 len
= tree_max_len(NODE_BODY(node
), env
);
2885 case ENCLOSURE_IF_ELSE
:
2889 len
= tree_max_len(NODE_BODY(node
), env
);
2890 if (IS_NOT_NULL(en
->te
.Then
)) {
2891 tlen
= tree_max_len(en
->te
.Then
, env
);
2892 len
= distance_add(len
, tlen
);
2894 if (IS_NOT_NULL(en
->te
.Else
))
2895 elen
= tree_max_len(en
->te
.Else
, env
);
2898 if (elen
> len
) len
= elen
;
2915 check_backrefs(Node
* node
, ScanEnv
* env
)
2919 switch (NODE_TYPE(node
)) {
2923 r
= check_backrefs(NODE_CAR(node
), env
);
2924 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
2928 if (! ANCHOR_HAS_BODY(ANCHOR_(node
))) {
2934 r
= check_backrefs(NODE_BODY(node
), env
);
2937 case NODE_ENCLOSURE
:
2938 r
= check_backrefs(NODE_BODY(node
), env
);
2940 EnclosureNode
* en
= ENCLOSURE_(node
);
2942 if (en
->type
== ENCLOSURE_IF_ELSE
) {
2943 if (r
!= 0) return r
;
2944 if (IS_NOT_NULL(en
->te
.Then
)) {
2945 r
= check_backrefs(en
->te
.Then
, env
);
2946 if (r
!= 0) return r
;
2948 if (IS_NOT_NULL(en
->te
.Else
)) {
2949 r
= check_backrefs(en
->te
.Else
, env
);
2958 BackRefNode
* br
= BACKREF_(node
);
2959 int* backs
= BACKREFS_P(br
);
2960 MemEnv
* mem_env
= SCANENV_MEMENV(env
);
2962 for (i
= 0; i
< br
->back_num
; i
++) {
2963 if (backs
[i
] > env
->num_mem
)
2964 return ONIGERR_INVALID_BACKREF
;
2966 NODE_STATUS_ADD(mem_env
[backs
[i
]].node
, BACKREF
);
2983 #define RECURSION_EXIST (1<<0)
2984 #define RECURSION_MUST (1<<1)
2985 #define RECURSION_INFINITE (1<<2)
2988 infinite_recursive_call_check(Node
* node
, ScanEnv
* env
, int head
)
2993 switch (NODE_TYPE(node
)) {
3001 ret
= infinite_recursive_call_check(NODE_CAR(x
), env
, head
);
3002 if (ret
< 0 || (ret
& RECURSION_INFINITE
) != 0) return ret
;
3005 min
= tree_min_len(NODE_CAR(x
), env
);
3006 if (min
!= 0) head
= 0;
3008 } while (IS_NOT_NULL(x
= NODE_CDR(x
)));
3016 must
= RECURSION_MUST
;
3018 ret
= infinite_recursive_call_check(NODE_CAR(node
), env
, head
);
3019 if (ret
< 0 || (ret
& RECURSION_INFINITE
) != 0) return ret
;
3021 r
|= (ret
& RECURSION_EXIST
);
3023 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
3029 r
= infinite_recursive_call_check(NODE_BODY(node
), env
, head
);
3030 if (r
< 0) return r
;
3031 if ((r
& RECURSION_MUST
) != 0) {
3032 if (QUANT_(node
)->lower
== 0)
3033 r
&= ~RECURSION_MUST
;
3038 if (! ANCHOR_HAS_BODY(ANCHOR_(node
)))
3042 r
= infinite_recursive_call_check(NODE_BODY(node
), env
, head
);
3045 case NODE_ENCLOSURE
:
3047 EnclosureNode
* en
= ENCLOSURE_(node
);
3049 if (en
->type
== ENCLOSURE_MEMORY
) {
3050 if (NODE_IS_MARK2(node
))
3052 else if (NODE_IS_MARK1(node
))
3053 return (head
== 0 ? RECURSION_EXIST
| RECURSION_MUST
3054 : RECURSION_EXIST
| RECURSION_MUST
| RECURSION_INFINITE
);
3056 NODE_STATUS_ADD(node
, MARK2
);
3057 r
= infinite_recursive_call_check(NODE_BODY(node
), env
, head
);
3058 NODE_STATUS_REMOVE(node
, MARK2
);
3061 else if (en
->type
== ENCLOSURE_IF_ELSE
) {
3064 ret
= infinite_recursive_call_check(NODE_BODY(node
), env
, head
);
3065 if (ret
< 0 || (ret
& RECURSION_INFINITE
) != 0) return ret
;
3067 if (IS_NOT_NULL(en
->te
.Then
)) {
3070 min
= tree_min_len(NODE_BODY(node
), env
);
3074 ret
= infinite_recursive_call_check(en
->te
.Then
, env
, min
!= 0 ? 0:head
);
3075 if (ret
< 0 || (ret
& RECURSION_INFINITE
) != 0) return ret
;
3078 if (IS_NOT_NULL(en
->te
.Else
)) {
3079 eret
= infinite_recursive_call_check(en
->te
.Else
, env
, head
);
3080 if (eret
< 0 || (eret
& RECURSION_INFINITE
) != 0) return eret
;
3081 r
|= (eret
& RECURSION_EXIST
);
3082 if ((eret
& RECURSION_MUST
) == 0)
3083 r
&= ~RECURSION_MUST
;
3087 r
= infinite_recursive_call_check(NODE_BODY(node
), env
, head
);
3100 infinite_recursive_call_check_trav(Node
* node
, ScanEnv
* env
)
3104 switch (NODE_TYPE(node
)) {
3108 r
= infinite_recursive_call_check_trav(NODE_CAR(node
), env
);
3109 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
3113 if (! ANCHOR_HAS_BODY(ANCHOR_(node
))) {
3119 r
= infinite_recursive_call_check_trav(NODE_BODY(node
), env
);
3122 case NODE_ENCLOSURE
:
3124 EnclosureNode
* en
= ENCLOSURE_(node
);
3126 if (en
->type
== ENCLOSURE_MEMORY
) {
3127 if (NODE_IS_RECURSION(node
) && NODE_IS_CALLED(node
)) {
3130 NODE_STATUS_ADD(node
, MARK1
);
3132 ret
= infinite_recursive_call_check(NODE_BODY(node
), env
, 1);
3133 if (ret
< 0) return ret
;
3134 else if ((ret
& (RECURSION_MUST
| RECURSION_INFINITE
)) != 0)
3135 return ONIGERR_NEVER_ENDING_RECURSION
;
3137 NODE_STATUS_REMOVE(node
, MARK1
);
3140 else if (en
->type
== ENCLOSURE_IF_ELSE
) {
3141 if (IS_NOT_NULL(en
->te
.Then
)) {
3142 r
= infinite_recursive_call_check_trav(en
->te
.Then
, env
);
3143 if (r
!= 0) return r
;
3145 if (IS_NOT_NULL(en
->te
.Else
)) {
3146 r
= infinite_recursive_call_check_trav(en
->te
.Else
, env
);
3147 if (r
!= 0) return r
;
3152 r
= infinite_recursive_call_check_trav(NODE_BODY(node
), env
);
3164 recursive_call_check(Node
* node
)
3168 switch (NODE_TYPE(node
)) {
3173 r
|= recursive_call_check(NODE_CAR(node
));
3174 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
3178 if (! ANCHOR_HAS_BODY(ANCHOR_(node
))) {
3184 r
= recursive_call_check(NODE_BODY(node
));
3188 r
= recursive_call_check(NODE_BODY(node
));
3190 if (NODE_IS_MARK1(NODE_BODY(node
)))
3191 NODE_STATUS_ADD(node
, RECURSION
);
3195 case NODE_ENCLOSURE
:
3197 EnclosureNode
* en
= ENCLOSURE_(node
);
3199 if (en
->type
== ENCLOSURE_MEMORY
) {
3200 if (NODE_IS_MARK2(node
))
3202 else if (NODE_IS_MARK1(node
))
3203 return 1; /* recursion */
3205 NODE_STATUS_ADD(node
, MARK2
);
3206 r
= recursive_call_check(NODE_BODY(node
));
3207 NODE_STATUS_REMOVE(node
, MARK2
);
3210 else if (en
->type
== ENCLOSURE_IF_ELSE
) {
3212 if (IS_NOT_NULL(en
->te
.Then
)) {
3213 r
|= recursive_call_check(en
->te
.Then
);
3215 if (IS_NOT_NULL(en
->te
.Else
)) {
3216 r
|= recursive_call_check(en
->te
.Else
);
3218 r
|= recursive_call_check(NODE_BODY(node
));
3221 r
= recursive_call_check(NODE_BODY(node
));
3234 #define IN_RECURSION (1<<0)
3235 #define FOUND_CALLED_NODE 1
3238 recursive_call_check_trav(Node
* node
, ScanEnv
* env
, int state
)
3242 switch (NODE_TYPE(node
)) {
3248 ret
= recursive_call_check_trav(NODE_CAR(node
), env
, state
);
3249 if (ret
== FOUND_CALLED_NODE
) r
= FOUND_CALLED_NODE
;
3250 else if (ret
< 0) return ret
;
3251 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
3256 r
= recursive_call_check_trav(NODE_BODY(node
), env
, state
);
3257 if (QUANT_(node
)->upper
== 0) {
3258 if (r
== FOUND_CALLED_NODE
)
3259 QUANT_(node
)->is_refered
= 1;
3265 AnchorNode
* an
= ANCHOR_(node
);
3266 if (ANCHOR_HAS_BODY(an
))
3267 r
= recursive_call_check_trav(NODE_ANCHOR_BODY(an
), env
, state
);
3271 case NODE_ENCLOSURE
:
3275 EnclosureNode
* en
= ENCLOSURE_(node
);
3277 if (en
->type
== ENCLOSURE_MEMORY
) {
3278 if (NODE_IS_CALLED(node
) || (state
& IN_RECURSION
) != 0) {
3279 if (! NODE_IS_RECURSION(node
)) {
3280 NODE_STATUS_ADD(node
, MARK1
);
3281 r
= recursive_call_check(NODE_BODY(node
));
3283 NODE_STATUS_ADD(node
, RECURSION
);
3284 NODE_STATUS_REMOVE(node
, MARK1
);
3287 if (NODE_IS_CALLED(node
))
3288 r
= FOUND_CALLED_NODE
;
3293 if (NODE_IS_RECURSION(node
))
3294 state1
|= IN_RECURSION
;
3296 ret
= recursive_call_check_trav(NODE_BODY(node
), env
, state1
);
3297 if (ret
== FOUND_CALLED_NODE
)
3298 r
= FOUND_CALLED_NODE
;
3300 if (en
->type
== ENCLOSURE_IF_ELSE
) {
3301 if (IS_NOT_NULL(en
->te
.Then
)) {
3302 ret
= recursive_call_check_trav(en
->te
.Then
, env
, state1
);
3303 if (ret
== FOUND_CALLED_NODE
)
3304 r
= FOUND_CALLED_NODE
;
3306 if (IS_NOT_NULL(en
->te
.Else
)) {
3307 ret
= recursive_call_check_trav(en
->te
.Else
, env
, state1
);
3308 if (ret
== FOUND_CALLED_NODE
)
3309 r
= FOUND_CALLED_NODE
;
3324 /* divide different length alternatives in look-behind.
3325 (?<=A|B) ==> (?<=A)|(?<=B)
3326 (?<!A|B) ==> (?<!A)(?<!B)
3329 divide_look_behind_alternatives(Node
* node
)
3331 Node
*head
, *np
, *insert_node
;
3332 AnchorNode
* an
= ANCHOR_(node
);
3333 int anc_type
= an
->type
;
3335 head
= NODE_ANCHOR_BODY(an
);
3336 np
= NODE_CAR(head
);
3337 swap_node(node
, head
);
3338 NODE_CAR(node
) = head
;
3339 NODE_BODY(head
) = np
;
3342 while (IS_NOT_NULL(np
= NODE_CDR(np
))) {
3343 insert_node
= onig_node_new_anchor(anc_type
, an
->ascii_mode
);
3344 CHECK_NULL_RETURN_MEMERR(insert_node
);
3345 NODE_BODY(insert_node
) = NODE_CAR(np
);
3346 NODE_CAR(np
) = insert_node
;
3349 if (anc_type
== ANCHOR_LOOK_BEHIND_NOT
) {
3352 NODE_SET_TYPE(np
, NODE_LIST
); /* alt -> list */
3353 } while (IS_NOT_NULL(np
= NODE_CDR(np
)));
3359 setup_look_behind(Node
* node
, regex_t
* reg
, ScanEnv
* env
)
3362 AnchorNode
* an
= ANCHOR_(node
);
3364 r
= get_char_length_tree(NODE_ANCHOR_BODY(an
), reg
, &len
);
3367 else if (r
== GET_CHAR_LEN_VARLEN
)
3368 r
= ONIGERR_INVALID_LOOK_BEHIND_PATTERN
;
3369 else if (r
== GET_CHAR_LEN_TOP_ALT_VARLEN
) {
3370 if (IS_SYNTAX_BV(env
->syntax
, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND
))
3371 r
= divide_look_behind_alternatives(node
);
3373 r
= ONIGERR_INVALID_LOOK_BEHIND_PATTERN
;
3380 next_setup(Node
* node
, Node
* next_node
, regex_t
* reg
)
3385 type
= NODE_TYPE(node
);
3386 if (type
== NODE_QUANT
) {
3387 QuantNode
* qn
= QUANT_(node
);
3388 if (qn
->greedy
&& IS_REPEAT_INFINITE(qn
->upper
)) {
3389 #ifdef USE_QUANT_PEEK_NEXT
3390 Node
* n
= get_head_value_node(next_node
, 1, reg
);
3391 /* '\0': for UTF-16BE etc... */
3392 if (IS_NOT_NULL(n
) && STR_(n
)->s
[0] != '\0') {
3393 qn
->next_head_exact
= n
;
3396 /* automatic posseivation a*b ==> (?>a*)b */
3397 if (qn
->lower
<= 1) {
3398 if (NODE_IS_SIMPLE_TYPE(NODE_BODY(node
))) {
3400 x
= get_head_value_node(NODE_BODY(node
), 0, reg
);
3401 if (IS_NOT_NULL(x
)) {
3402 y
= get_head_value_node(next_node
, 0, reg
);
3403 if (IS_NOT_NULL(y
) && is_exclusive(x
, y
, reg
)) {
3404 Node
* en
= onig_node_new_enclosure(ENCLOSURE_STOP_BACKTRACK
);
3405 CHECK_NULL_RETURN_MEMERR(en
);
3406 NODE_STATUS_ADD(en
, STOP_BT_SIMPLE_REPEAT
);
3407 swap_node(node
, en
);
3408 NODE_BODY(node
) = en
;
3415 else if (type
== NODE_ENCLOSURE
) {
3416 EnclosureNode
* en
= ENCLOSURE_(node
);
3417 if (en
->type
== ENCLOSURE_MEMORY
) {
3418 node
= NODE_BODY(node
);
3427 update_string_node_case_fold(regex_t
* reg
, Node
*node
)
3429 UChar
*p
, *end
, buf
[ONIGENC_MBC_CASE_FOLD_MAXLEN
];
3430 UChar
*sbuf
, *ebuf
, *sp
;
3431 int r
, i
, len
, sbuf_size
;
3432 StrNode
* sn
= STR_(node
);
3435 sbuf_size
= (int )(end
- sn
->s
) * 2;
3436 sbuf
= (UChar
* )xmalloc(sbuf_size
);
3437 CHECK_NULL_RETURN_MEMERR(sbuf
);
3438 ebuf
= sbuf
+ sbuf_size
;
3443 len
= ONIGENC_MBC_CASE_FOLD(reg
->enc
, reg
->case_fold_flag
, &p
, end
, buf
);
3444 for (i
= 0; i
< len
; i
++) {
3446 sbuf
= (UChar
* )xrealloc(sbuf
, sbuf_size
* 2, sbuf_size
);
3447 CHECK_NULL_RETURN_MEMERR(sbuf
);
3448 sp
= sbuf
+ sbuf_size
;
3450 ebuf
= sbuf
+ sbuf_size
;
3457 r
= onig_node_str_set(node
, sbuf
, sp
);
3468 expand_case_fold_make_rem_string(Node
** rnode
, UChar
*s
, UChar
*end
, regex_t
* reg
)
3473 node
= onig_node_new_str(s
, end
);
3474 if (IS_NULL(node
)) return ONIGERR_MEMORY
;
3476 r
= update_string_node_case_fold(reg
, node
);
3478 onig_node_free(node
);
3482 NODE_STRING_SET_AMBIG(node
);
3483 NODE_STRING_SET_DONT_GET_OPT_INFO(node
);
3489 expand_case_fold_string_alt(int item_num
, OnigCaseFoldCodeItem items
[], UChar
*p
,
3490 int slen
, UChar
*end
, regex_t
* reg
, Node
**rnode
)
3495 Node
*anode
, *var_anode
, *snode
, *xnode
, *an
;
3496 UChar buf
[ONIGENC_CODE_TO_MBC_MAXLEN
];
3498 *rnode
= var_anode
= NULL_NODE
;
3501 for (i
= 0; i
< item_num
; i
++) {
3502 if (items
[i
].byte_len
!= slen
) {
3509 *rnode
= var_anode
= onig_node_new_alt(NULL_NODE
, NULL_NODE
);
3510 if (IS_NULL(var_anode
)) return ONIGERR_MEMORY
;
3512 xnode
= onig_node_new_list(NULL
, NULL
);
3513 if (IS_NULL(xnode
)) goto mem_err
;
3514 NODE_CAR(var_anode
) = xnode
;
3516 anode
= onig_node_new_alt(NULL_NODE
, NULL_NODE
);
3517 if (IS_NULL(anode
)) goto mem_err
;
3518 NODE_CAR(xnode
) = anode
;
3521 *rnode
= anode
= onig_node_new_alt(NULL_NODE
, NULL_NODE
);
3522 if (IS_NULL(anode
)) return ONIGERR_MEMORY
;
3525 snode
= onig_node_new_str(p
, p
+ slen
);
3526 if (IS_NULL(snode
)) goto mem_err
;
3528 NODE_CAR(anode
) = snode
;
3530 for (i
= 0; i
< item_num
; i
++) {
3531 snode
= onig_node_new_str(NULL
, NULL
);
3532 if (IS_NULL(snode
)) goto mem_err
;
3534 for (j
= 0; j
< items
[i
].code_len
; j
++) {
3535 len
= ONIGENC_CODE_TO_MBC(reg
->enc
, items
[i
].code
[j
], buf
);
3541 r
= onig_node_str_cat(snode
, buf
, buf
+ len
);
3542 if (r
!= 0) goto mem_err2
;
3545 an
= onig_node_new_alt(NULL_NODE
, NULL_NODE
);
3549 //The NULL pointer check is not necessary. It is added just for pass static
3550 //analysis. When condition "items[i].byte_len != slen" is true, "varlen = 1"
3551 //in line 3503 will be reached ,so that "if (IS_NULL(var_anode)) return ONIGERR_MEMORY"
3552 //in line 3510 will be executed, so the null pointer has been checked before
3553 //deferenced in line 3584.
3554 if (items
[i
].byte_len
!= slen
&& IS_NOT_NULL(var_anode
)) {
3556 UChar
*q
= p
+ items
[i
].byte_len
;
3559 r
= expand_case_fold_make_rem_string(&rem
, q
, end
, reg
);
3565 xnode
= onig_node_list_add(NULL_NODE
, snode
);
3566 if (IS_NULL(xnode
)) {
3568 onig_node_free(rem
);
3571 if (IS_NULL(onig_node_list_add(xnode
, rem
))) {
3573 onig_node_free(xnode
);
3574 onig_node_free(rem
);
3578 NODE_CAR(an
) = xnode
;
3581 NODE_CAR(an
) = snode
;
3584 NODE_CDR(var_anode
) = an
;
3588 NODE_CAR(an
) = snode
;
3589 NODE_CDR(anode
) = an
;
3597 onig_node_free(snode
);
3600 onig_node_free(*rnode
);
3602 return ONIGERR_MEMORY
;
3606 expand_case_fold_string(Node
* node
, regex_t
* reg
)
3608 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
3610 int r
, n
, len
, alt_num
;
3611 UChar
*start
, *end
, *p
;
3612 Node
*top_root
, *root
, *snode
, *prev_node
;
3613 OnigCaseFoldCodeItem items
[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM
];
3614 StrNode
* sn
= STR_(node
);
3616 if (NODE_STRING_IS_AMBIG(node
)) return 0;
3620 if (start
>= end
) return 0;
3623 top_root
= root
= prev_node
= snode
= NULL_NODE
;
3627 n
= ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg
->enc
, reg
->case_fold_flag
, p
, end
,
3634 len
= enclen(reg
->enc
, p
);
3637 if (IS_NULL(snode
)) {
3638 if (IS_NULL(root
) && IS_NOT_NULL(prev_node
)) {
3639 top_root
= root
= onig_node_list_add(NULL_NODE
, prev_node
);
3640 if (IS_NULL(root
)) {
3641 onig_node_free(prev_node
);
3646 prev_node
= snode
= onig_node_new_str(NULL
, NULL
);
3647 if (IS_NULL(snode
)) goto mem_err
;
3648 if (IS_NOT_NULL(root
)) {
3649 if (IS_NULL(onig_node_list_add(root
, snode
))) {
3650 onig_node_free(snode
);
3656 r
= onig_node_str_cat(snode
, p
, p
+ len
);
3657 if (r
!= 0) goto err
;
3661 if (alt_num
> THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION
) break;
3663 if (IS_NULL(root
) && IS_NOT_NULL(prev_node
)) {
3664 top_root
= root
= onig_node_list_add(NULL_NODE
, prev_node
);
3665 if (IS_NULL(root
)) {
3666 onig_node_free(prev_node
);
3671 r
= expand_case_fold_string_alt(n
, items
, p
, len
, end
, reg
, &prev_node
);
3672 if (r
< 0) goto mem_err
;
3674 if (IS_NULL(root
)) {
3675 top_root
= prev_node
;
3678 if (IS_NULL(onig_node_list_add(root
, prev_node
))) {
3679 onig_node_free(prev_node
);
3684 root
= NODE_CAR(prev_node
);
3687 if (IS_NOT_NULL(root
)) {
3688 if (IS_NULL(onig_node_list_add(root
, prev_node
))) {
3689 onig_node_free(prev_node
);
3704 r
= expand_case_fold_make_rem_string(&srem
, p
, end
, reg
);
3705 if (r
!= 0) goto mem_err
;
3707 if (IS_NOT_NULL(prev_node
) && IS_NULL(root
)) {
3708 top_root
= root
= onig_node_list_add(NULL_NODE
, prev_node
);
3709 if (IS_NULL(root
)) {
3710 onig_node_free(srem
);
3711 onig_node_free(prev_node
);
3716 if (IS_NULL(root
)) {
3720 if (IS_NULL(onig_node_list_add(root
, srem
))) {
3721 onig_node_free(srem
);
3728 top_root
= (IS_NOT_NULL(top_root
) ? top_root
: prev_node
);
3729 swap_node(node
, top_root
);
3730 onig_node_free(top_root
);
3737 onig_node_free(top_root
);
3741 #ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT
3742 static enum QuantBodyEmpty
3743 quantifiers_memory_node_info(Node
* node
)
3745 int r
= QUANT_BODY_IS_EMPTY
;
3747 switch (NODE_TYPE(node
)) {
3753 v
= quantifiers_memory_node_info(NODE_CAR(node
));
3755 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
3761 if (NODE_IS_RECURSION(node
)) {
3762 return QUANT_BODY_IS_EMPTY_REC
; /* tiny version */
3765 r
= quantifiers_memory_node_info(NODE_BODY(node
));
3771 QuantNode
* qn
= QUANT_(node
);
3772 if (qn
->upper
!= 0) {
3773 r
= quantifiers_memory_node_info(NODE_BODY(node
));
3778 case NODE_ENCLOSURE
:
3780 EnclosureNode
* en
= ENCLOSURE_(node
);
3782 case ENCLOSURE_MEMORY
:
3783 if (NODE_IS_RECURSION(node
)) {
3784 return QUANT_BODY_IS_EMPTY_REC
;
3786 return QUANT_BODY_IS_EMPTY_MEM
;
3789 case ENCLOSURE_OPTION
:
3790 case ENCLOSURE_STOP_BACKTRACK
:
3791 r
= quantifiers_memory_node_info(NODE_BODY(node
));
3793 case ENCLOSURE_IF_ELSE
:
3796 r
= quantifiers_memory_node_info(NODE_BODY(node
));
3797 if (IS_NOT_NULL(en
->te
.Then
)) {
3798 v
= quantifiers_memory_node_info(en
->te
.Then
);
3801 if (IS_NOT_NULL(en
->te
.Else
)) {
3802 v
= quantifiers_memory_node_info(en
->te
.Else
);
3825 #endif /* USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT */
3828 #define IN_ALT (1<<0)
3829 #define IN_NOT (1<<1)
3830 #define IN_REAL_REPEAT (1<<2)
3831 #define IN_VAR_REPEAT (1<<3)
3832 #define IN_ZERO_REPEAT (1<<4)
3833 #define IN_MULTI_ENTRY (1<<5)
3841 setup_call_node_call(CallNode
* cn
, ScanEnv
* env
, int state
)
3843 MemEnv
* mem_env
= SCANENV_MEMENV(env
);
3845 if (cn
->by_number
!= 0) {
3846 int gnum
= cn
->group_num
;
3848 if (env
->num_named
> 0 &&
3849 IS_SYNTAX_BV(env
->syntax
, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP
) &&
3850 ! ONIG_IS_OPTION_ON(env
->options
, ONIG_OPTION_CAPTURE_GROUP
)) {
3851 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED
;
3854 if (gnum
> env
->num_mem
) {
3855 onig_scan_env_set_error_string(env
, ONIGERR_UNDEFINED_GROUP_REFERENCE
,
3856 cn
->name
, cn
->name_end
);
3857 return ONIGERR_UNDEFINED_GROUP_REFERENCE
;
3861 NODE_CALL_BODY(cn
) = mem_env
[cn
->group_num
].node
;
3862 if (IS_NULL(NODE_CALL_BODY(cn
))) {
3863 onig_scan_env_set_error_string(env
, ONIGERR_UNDEFINED_NAME_REFERENCE
,
3864 cn
->name
, cn
->name_end
);
3865 return ONIGERR_UNDEFINED_NAME_REFERENCE
;
3871 int n
= onig_name_to_group_numbers(env
->reg
, cn
->name
, cn
->name_end
, &refs
);
3873 onig_scan_env_set_error_string(env
, ONIGERR_UNDEFINED_NAME_REFERENCE
,
3874 cn
->name
, cn
->name_end
);
3875 return ONIGERR_UNDEFINED_NAME_REFERENCE
;
3878 onig_scan_env_set_error_string(env
, ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL
,
3879 cn
->name
, cn
->name_end
);
3880 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL
;
3883 cn
->group_num
= refs
[0];
3892 setup_call2_call(Node
* node
)
3894 switch (NODE_TYPE(node
)) {
3898 setup_call2_call(NODE_CAR(node
));
3899 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
3903 setup_call2_call(NODE_BODY(node
));
3907 if (ANCHOR_HAS_BODY(ANCHOR_(node
)))
3908 setup_call2_call(NODE_BODY(node
));
3911 case NODE_ENCLOSURE
:
3913 EnclosureNode
* en
= ENCLOSURE_(node
);
3915 if (en
->type
== ENCLOSURE_MEMORY
) {
3916 if (! NODE_IS_MARK1(node
)) {
3917 NODE_STATUS_ADD(node
, MARK1
);
3918 setup_call2_call(NODE_BODY(node
));
3919 NODE_STATUS_REMOVE(node
, MARK1
);
3922 else if (en
->type
== ENCLOSURE_IF_ELSE
) {
3923 setup_call2_call(NODE_BODY(node
));
3924 if (IS_NOT_NULL(en
->te
.Then
))
3925 setup_call2_call(en
->te
.Then
);
3926 if (IS_NOT_NULL(en
->te
.Else
))
3927 setup_call2_call(en
->te
.Else
);
3930 setup_call2_call(NODE_BODY(node
));
3936 if (! NODE_IS_MARK1(node
)) {
3937 NODE_STATUS_ADD(node
, MARK1
);
3939 CallNode
* cn
= CALL_(node
);
3940 Node
* called
= NODE_CALL_BODY(cn
);
3944 NODE_STATUS_ADD(called
, CALLED
);
3945 ENCLOSURE_(called
)->m
.entry_count
++;
3946 setup_call2_call(called
);
3948 NODE_STATUS_REMOVE(node
, MARK1
);
3958 setup_call(Node
* node
, ScanEnv
* env
, int state
)
3962 switch (NODE_TYPE(node
)) {
3966 r
= setup_call(NODE_CAR(node
), env
, state
);
3967 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
3971 if (QUANT_(node
)->upper
== 0)
3972 state
|= IN_ZERO_REPEAT
;
3974 r
= setup_call(NODE_BODY(node
), env
, state
);
3978 if (ANCHOR_HAS_BODY(ANCHOR_(node
)))
3979 r
= setup_call(NODE_BODY(node
), env
, state
);
3984 case NODE_ENCLOSURE
:
3986 EnclosureNode
* en
= ENCLOSURE_(node
);
3988 if (en
->type
== ENCLOSURE_MEMORY
) {
3989 if ((state
& IN_ZERO_REPEAT
) != 0) {
3990 NODE_STATUS_ADD(node
, IN_ZERO_REPEAT
);
3991 ENCLOSURE_(node
)->m
.entry_count
--;
3993 r
= setup_call(NODE_BODY(node
), env
, state
);
3995 else if (en
->type
== ENCLOSURE_IF_ELSE
) {
3996 r
= setup_call(NODE_BODY(node
), env
, state
);
3997 if (r
!= 0) return r
;
3998 if (IS_NOT_NULL(en
->te
.Then
)) {
3999 r
= setup_call(en
->te
.Then
, env
, state
);
4000 if (r
!= 0) return r
;
4002 if (IS_NOT_NULL(en
->te
.Else
))
4003 r
= setup_call(en
->te
.Else
, env
, state
);
4006 r
= setup_call(NODE_BODY(node
), env
, state
);
4011 if ((state
& IN_ZERO_REPEAT
) != 0) {
4012 NODE_STATUS_ADD(node
, IN_ZERO_REPEAT
);
4013 CALL_(node
)->entry_count
--;
4016 r
= setup_call_node_call(CALL_(node
), env
, state
);
4028 setup_call2(Node
* node
)
4032 switch (NODE_TYPE(node
)) {
4036 r
= setup_call2(NODE_CAR(node
));
4037 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
4041 if (QUANT_(node
)->upper
!= 0)
4042 r
= setup_call2(NODE_BODY(node
));
4046 if (ANCHOR_HAS_BODY(ANCHOR_(node
)))
4047 r
= setup_call2(NODE_BODY(node
));
4050 case NODE_ENCLOSURE
:
4051 if (! NODE_IS_IN_ZERO_REPEAT(node
))
4052 r
= setup_call2(NODE_BODY(node
));
4055 EnclosureNode
* en
= ENCLOSURE_(node
);
4057 if (r
!= 0) return r
;
4058 if (en
->type
== ENCLOSURE_IF_ELSE
) {
4059 if (IS_NOT_NULL(en
->te
.Then
)) {
4060 r
= setup_call2(en
->te
.Then
);
4061 if (r
!= 0) return r
;
4063 if (IS_NOT_NULL(en
->te
.Else
))
4064 r
= setup_call2(en
->te
.Else
);
4070 if (! NODE_IS_IN_ZERO_REPEAT(node
)) {
4071 setup_call2_call(node
);
4084 setup_called_state_call(Node
* node
, int state
)
4086 switch (NODE_TYPE(node
)) {
4092 setup_called_state_call(NODE_CAR(node
), state
);
4093 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
4098 QuantNode
* qn
= QUANT_(node
);
4100 if (IS_REPEAT_INFINITE(qn
->upper
) || qn
->upper
>= 2)
4101 state
|= IN_REAL_REPEAT
;
4102 if (qn
->lower
!= qn
->upper
)
4103 state
|= IN_VAR_REPEAT
;
4105 setup_called_state_call(NODE_QUANT_BODY(qn
), state
);
4111 AnchorNode
* an
= ANCHOR_(node
);
4114 case ANCHOR_PREC_READ_NOT
:
4115 case ANCHOR_LOOK_BEHIND_NOT
:
4118 case ANCHOR_PREC_READ
:
4119 case ANCHOR_LOOK_BEHIND
:
4120 setup_called_state_call(NODE_ANCHOR_BODY(an
), state
);
4128 case NODE_ENCLOSURE
:
4130 EnclosureNode
* en
= ENCLOSURE_(node
);
4132 if (en
->type
== ENCLOSURE_MEMORY
) {
4133 if (NODE_IS_MARK1(node
)) {
4134 if ((~en
->m
.called_state
& state
) != 0) {
4135 en
->m
.called_state
|= state
;
4136 setup_called_state_call(NODE_BODY(node
), state
);
4140 NODE_STATUS_ADD(node
, MARK1
);
4141 en
->m
.called_state
|= state
;
4142 setup_called_state_call(NODE_BODY(node
), state
);
4143 NODE_STATUS_REMOVE(node
, MARK1
);
4146 else if (en
->type
== ENCLOSURE_IF_ELSE
) {
4147 if (IS_NOT_NULL(en
->te
.Then
)) {
4148 setup_called_state_call(en
->te
.Then
, state
);
4150 if (IS_NOT_NULL(en
->te
.Else
))
4151 setup_called_state_call(en
->te
.Else
, state
);
4154 setup_called_state_call(NODE_BODY(node
), state
);
4160 setup_called_state_call(NODE_BODY(node
), state
);
4169 setup_called_state(Node
* node
, int state
)
4171 switch (NODE_TYPE(node
)) {
4177 setup_called_state(NODE_CAR(node
), state
);
4178 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
4183 setup_called_state_call(node
, state
);
4187 case NODE_ENCLOSURE
:
4189 EnclosureNode
* en
= ENCLOSURE_(node
);
4192 case ENCLOSURE_MEMORY
:
4193 if (en
->m
.entry_count
> 1)
4194 state
|= IN_MULTI_ENTRY
;
4196 en
->m
.called_state
|= state
;
4198 case ENCLOSURE_OPTION
:
4199 case ENCLOSURE_STOP_BACKTRACK
:
4200 setup_called_state(NODE_BODY(node
), state
);
4202 case ENCLOSURE_IF_ELSE
:
4203 setup_called_state(NODE_BODY(node
), state
);
4204 if (IS_NOT_NULL(en
->te
.Then
))
4205 setup_called_state(en
->te
.Then
, state
);
4206 if (IS_NOT_NULL(en
->te
.Else
))
4207 setup_called_state(en
->te
.Else
, state
);
4215 QuantNode
* qn
= QUANT_(node
);
4217 if (IS_REPEAT_INFINITE(qn
->upper
) || qn
->upper
>= 2)
4218 state
|= IN_REAL_REPEAT
;
4219 if (qn
->lower
!= qn
->upper
)
4220 state
|= IN_VAR_REPEAT
;
4222 setup_called_state(NODE_QUANT_BODY(qn
), state
);
4228 AnchorNode
* an
= ANCHOR_(node
);
4231 case ANCHOR_PREC_READ_NOT
:
4232 case ANCHOR_LOOK_BEHIND_NOT
:
4235 case ANCHOR_PREC_READ
:
4236 case ANCHOR_LOOK_BEHIND
:
4237 setup_called_state(NODE_ANCHOR_BODY(an
), state
);
4255 #endif /* USE_CALL */
4258 static int setup_tree(Node
* node
, regex_t
* reg
, int state
, ScanEnv
* env
);
4264 setup_anchor(Node
* node
, regex_t
* reg
, int state
, ScanEnv
* env
)
4266 /* allowed node types in look-behind */
4267 #define ALLOWED_TYPE_IN_LB \
4268 ( NODE_BIT_LIST | NODE_BIT_ALT | NODE_BIT_STRING | NODE_BIT_CCLASS \
4269 | NODE_BIT_CTYPE | NODE_BIT_ANCHOR | NODE_BIT_ENCLOSURE | NODE_BIT_QUANT \
4270 | NODE_BIT_CALL | NODE_BIT_GIMMICK)
4272 #define ALLOWED_ENCLOSURE_IN_LB ( 1<<ENCLOSURE_MEMORY | 1<<ENCLOSURE_OPTION )
4273 #define ALLOWED_ENCLOSURE_IN_LB_NOT (1<<ENCLOSURE_OPTION)
4275 #define ALLOWED_ANCHOR_IN_LB \
4276 ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF \
4277 | ANCHOR_BEGIN_POSITION | ANCHOR_WORD_BOUNDARY | ANCHOR_NO_WORD_BOUNDARY \
4278 | ANCHOR_WORD_BEGIN | ANCHOR_WORD_END \
4279 | ANCHOR_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY \
4280 | ANCHOR_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY )
4282 #define ALLOWED_ANCHOR_IN_LB_NOT \
4283 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE \
4284 | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_WORD_BOUNDARY \
4285 | ANCHOR_NO_WORD_BOUNDARY | ANCHOR_WORD_BEGIN | ANCHOR_WORD_END \
4286 | ANCHOR_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY \
4287 | ANCHOR_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY )
4290 AnchorNode
* an
= ANCHOR_(node
);
4293 case ANCHOR_PREC_READ
:
4294 r
= setup_tree(NODE_ANCHOR_BODY(an
), reg
, state
, env
);
4296 case ANCHOR_PREC_READ_NOT
:
4297 r
= setup_tree(NODE_ANCHOR_BODY(an
), reg
, (state
| IN_NOT
), env
);
4300 case ANCHOR_LOOK_BEHIND
:
4302 r
= check_type_tree(NODE_ANCHOR_BODY(an
), ALLOWED_TYPE_IN_LB
,
4303 ALLOWED_ENCLOSURE_IN_LB
, ALLOWED_ANCHOR_IN_LB
);
4304 if (r
< 0) return r
;
4305 if (r
> 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN
;
4306 r
= setup_tree(NODE_ANCHOR_BODY(an
), reg
, state
, env
);
4307 if (r
!= 0) return r
;
4308 r
= setup_look_behind(node
, reg
, env
);
4312 case ANCHOR_LOOK_BEHIND_NOT
:
4314 r
= check_type_tree(NODE_ANCHOR_BODY(an
), ALLOWED_TYPE_IN_LB
,
4315 ALLOWED_ENCLOSURE_IN_LB_NOT
, ALLOWED_ANCHOR_IN_LB_NOT
);
4316 if (r
< 0) return r
;
4317 if (r
> 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN
;
4318 r
= setup_tree(NODE_ANCHOR_BODY(an
), reg
, (state
| IN_NOT
), env
);
4319 if (r
!= 0) return r
;
4320 r
= setup_look_behind(node
, reg
, env
);
4336 setup_quant(Node
* node
, regex_t
* reg
, int state
, ScanEnv
* env
)
4340 QuantNode
* qn
= QUANT_(node
);
4341 Node
* body
= NODE_BODY(node
);
4343 if ((state
& IN_REAL_REPEAT
) != 0) {
4344 NODE_STATUS_ADD(node
, IN_REAL_REPEAT
);
4346 if ((state
& IN_MULTI_ENTRY
) != 0) {
4347 NODE_STATUS_ADD(node
, IN_MULTI_ENTRY
);
4350 if (IS_REPEAT_INFINITE(qn
->upper
) || qn
->upper
>= 1) {
4351 d
= tree_min_len(body
, env
);
4353 #ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT
4354 qn
->body_empty_info
= quantifiers_memory_node_info(body
);
4355 if (qn
->body_empty_info
== QUANT_BODY_IS_EMPTY_REC
) {
4356 if (NODE_TYPE(body
) == NODE_ENCLOSURE
&&
4357 ENCLOSURE_(body
)->type
== ENCLOSURE_MEMORY
) {
4358 MEM_STATUS_ON(env
->bt_mem_end
, ENCLOSURE_(body
)->m
.regnum
);
4362 qn
->body_empty_info
= QUANT_BODY_IS_EMPTY
;
4367 if (IS_REPEAT_INFINITE(qn
->upper
) || qn
->upper
>= 2)
4368 state
|= IN_REAL_REPEAT
;
4369 if (qn
->lower
!= qn
->upper
)
4370 state
|= IN_VAR_REPEAT
;
4372 r
= setup_tree(body
, reg
, state
, env
);
4373 if (r
!= 0) return r
;
4376 #define EXPAND_STRING_MAX_LENGTH 100
4377 if (NODE_TYPE(body
) == NODE_STRING
) {
4378 if (!IS_REPEAT_INFINITE(qn
->lower
) && qn
->lower
== qn
->upper
&&
4379 qn
->lower
> 1 && qn
->lower
<= EXPAND_STRING_MAX_LENGTH
) {
4380 int len
= NODE_STRING_LEN(body
);
4381 StrNode
* sn
= STR_(body
);
4383 if (len
* qn
->lower
<= EXPAND_STRING_MAX_LENGTH
) {
4384 int i
, n
= qn
->lower
;
4385 onig_node_conv_to_str_node(node
, STR_(body
)->flag
);
4386 for (i
= 0; i
< n
; i
++) {
4387 r
= onig_node_str_cat(node
, sn
->s
, sn
->end
);
4388 if (r
!= 0) return r
;
4390 onig_node_free(body
);
4396 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
4397 if (qn
->greedy
&& (qn
->body_empty_info
!= QUANT_BODY_IS_NOT_EMPTY
)) {
4398 if (NODE_TYPE(body
) == NODE_QUANT
) {
4399 QuantNode
* tqn
= QUANT_(body
);
4400 if (IS_NOT_NULL(tqn
->head_exact
)) {
4401 qn
->head_exact
= tqn
->head_exact
;
4402 tqn
->head_exact
= NULL
;
4406 qn
->head_exact
= get_head_value_node(NODE_BODY(node
), 1, reg
);
4414 /* setup_tree does the following work.
4415 1. check empty loop. (set qn->body_empty_info)
4416 2. expand ignore-case in char class.
4417 3. set memory status bit flags. (reg->mem_stats)
4418 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
4419 5. find invalid patterns in look-behind.
4420 6. expand repeated string.
4423 setup_tree(Node
* node
, regex_t
* reg
, int state
, ScanEnv
* env
)
4427 switch (NODE_TYPE(node
)) {
4430 Node
* prev
= NULL_NODE
;
4432 r
= setup_tree(NODE_CAR(node
), reg
, state
, env
);
4433 if (IS_NOT_NULL(prev
) && r
== 0) {
4434 r
= next_setup(prev
, NODE_CAR(node
), reg
);
4436 prev
= NODE_CAR(node
);
4437 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
4443 r
= setup_tree(NODE_CAR(node
), reg
, (state
| IN_ALT
), env
);
4444 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
4448 if (IS_IGNORECASE(reg
->options
) && !NODE_STRING_IS_RAW(node
)) {
4449 r
= expand_case_fold_string(node
, reg
);
4457 BackRefNode
* br
= BACKREF_(node
);
4459 for (i
= 0; i
< br
->back_num
; i
++) {
4460 if (p
[i
] > env
->num_mem
) return ONIGERR_INVALID_BACKREF
;
4461 MEM_STATUS_ON(env
->backrefed_mem
, p
[i
]);
4462 MEM_STATUS_ON(env
->bt_mem_start
, p
[i
]);
4463 #ifdef USE_BACKREF_WITH_LEVEL
4464 if (NODE_IS_NEST_LEVEL(node
)) {
4465 MEM_STATUS_ON(env
->bt_mem_end
, p
[i
]);
4472 case NODE_ENCLOSURE
:
4474 EnclosureNode
* en
= ENCLOSURE_(node
);
4477 case ENCLOSURE_OPTION
:
4479 OnigOptionType options
= reg
->options
;
4480 reg
->options
= ENCLOSURE_(node
)->o
.options
;
4481 r
= setup_tree(NODE_BODY(node
), reg
, state
, env
);
4482 reg
->options
= options
;
4486 case ENCLOSURE_MEMORY
:
4488 state
|= en
->m
.called_state
;
4491 if ((state
& (IN_ALT
| IN_NOT
| IN_VAR_REPEAT
| IN_MULTI_ENTRY
)) != 0
4492 || NODE_IS_RECURSION(node
)) {
4493 MEM_STATUS_ON(env
->bt_mem_start
, en
->m
.regnum
);
4495 r
= setup_tree(NODE_BODY(node
), reg
, state
, env
);
4498 case ENCLOSURE_STOP_BACKTRACK
:
4500 Node
* target
= NODE_BODY(node
);
4501 r
= setup_tree(target
, reg
, state
, env
);
4502 if (NODE_TYPE(target
) == NODE_QUANT
) {
4503 QuantNode
* tqn
= QUANT_(target
);
4504 if (IS_REPEAT_INFINITE(tqn
->upper
) && tqn
->lower
<= 1 &&
4505 tqn
->greedy
!= 0) { /* (?>a*), a*+ etc... */
4506 if (NODE_IS_SIMPLE_TYPE(NODE_BODY(target
)))
4507 NODE_STATUS_ADD(node
, STOP_BT_SIMPLE_REPEAT
);
4513 case ENCLOSURE_IF_ELSE
:
4514 r
= setup_tree(NODE_BODY(node
), reg
, (state
| IN_ALT
), env
);
4515 if (r
!= 0) return r
;
4516 if (IS_NOT_NULL(en
->te
.Then
)) {
4517 r
= setup_tree(en
->te
.Then
, reg
, (state
| IN_ALT
), env
);
4518 if (r
!= 0) return r
;
4520 if (IS_NOT_NULL(en
->te
.Else
))
4521 r
= setup_tree(en
->te
.Else
, reg
, (state
| IN_ALT
), env
);
4528 r
= setup_quant(node
, reg
, state
, env
);
4532 r
= setup_anchor(node
, reg
, state
, env
);
4548 /* set skip map for Boyer-Moore search */
4550 set_bm_skip(UChar
* s
, UChar
* end
, OnigEncoding enc ARG_UNUSED
,
4551 UChar skip
[], int** int_skip
)
4555 len
= (int )(end
- s
);
4556 if (len
< ONIG_CHAR_TABLE_SIZE
) {
4557 for (i
= 0; i
< ONIG_CHAR_TABLE_SIZE
; i
++) skip
[i
] = len
;
4559 for (i
= 0; i
< len
- 1; i
++)
4560 skip
[s
[i
]] = len
- 1 - i
;
4563 if (IS_NULL(*int_skip
)) {
4564 *int_skip
= (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE
);
4565 if (IS_NULL(*int_skip
)) return ONIGERR_MEMORY
;
4567 for (i
= 0; i
< ONIG_CHAR_TABLE_SIZE
; i
++) (*int_skip
)[i
] = len
;
4569 for (i
= 0; i
< len
- 1; i
++)
4570 (*int_skip
)[s
[i
]] = len
- 1 - i
;
4575 #define OPT_EXACT_MAXLEN 24
4578 OnigLen min
; /* min byte length */
4579 OnigLen max
; /* max byte length */
4585 OnigOptionType options
;
4586 OnigCaseFoldType case_fold_flag
;
4596 MinMax mmd
; /* position */
4601 UChar s
[OPT_EXACT_MAXLEN
];
4605 MinMax mmd
; /* position */
4607 int value
; /* weighted value */
4608 UChar map
[ONIG_CHAR_TABLE_SIZE
];
4614 OptExact exb
; /* boundary */
4615 OptExact exm
; /* middle */
4616 OptExact expr
; /* prec read (?=...) */
4617 OptMap map
; /* boundary */
4622 map_position_value(OnigEncoding enc
, int i
)
4624 static const short int Vals
[] = {
4625 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
4626 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4627 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
4628 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
4629 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4630 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
4631 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4632 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
4635 if (i
< (int )(sizeof(Vals
)/sizeof(Vals
[0]))) {
4636 if (i
== 0 && ONIGENC_MBC_MINLEN(enc
) > 1)
4639 return (int )Vals
[i
];
4642 return 4; /* Take it easy. */
4646 distance_value(MinMax
* mm
)
4648 /* 1000 / (min-max-dist + 1) */
4649 static const short int dist_vals
[] = {
4650 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
4651 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
4652 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
4653 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
4654 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
4655 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
4656 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
4657 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
4658 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
4659 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
4664 if (mm
->max
== INFINITE_LEN
) return 0;
4666 d
= mm
->max
- mm
->min
;
4667 if (d
< (OnigLen
)(sizeof(dist_vals
)/sizeof(dist_vals
[0])))
4668 /* return dist_vals[d] * 16 / (mm->min + 12); */
4669 return (int )dist_vals
[d
];
4675 comp_distance_value(MinMax
* d1
, MinMax
* d2
, int v1
, int v2
)
4677 if (v2
<= 0) return -1;
4678 if (v1
<= 0) return 1;
4680 v1
*= distance_value(d1
);
4681 v2
*= distance_value(d2
);
4683 if (v2
> v1
) return 1;
4684 if (v2
< v1
) return -1;
4686 if (d2
->min
< d1
->min
) return 1;
4687 if (d2
->min
> d1
->min
) return -1;
4692 is_equal_mml(MinMax
* a
, MinMax
* b
)
4694 return (a
->min
== b
->min
&& a
->max
== b
->max
) ? 1 : 0;
4698 set_mml(MinMax
* l
, OnigLen min
, OnigLen max
)
4705 clear_mml(MinMax
* l
)
4707 l
->min
= l
->max
= 0;
4711 copy_mml(MinMax
* to
, MinMax
* from
)
4713 to
->min
= from
->min
;
4714 to
->max
= from
->max
;
4718 add_mml(MinMax
* to
, MinMax
* from
)
4720 to
->min
= distance_add(to
->min
, from
->min
);
4721 to
->max
= distance_add(to
->max
, from
->max
);
4725 alt_merge_mml(MinMax
* to
, MinMax
* from
)
4727 if (to
->min
> from
->min
) to
->min
= from
->min
;
4728 if (to
->max
< from
->max
) to
->max
= from
->max
;
4732 copy_opt_env(OptEnv
* to
, OptEnv
* from
)
4738 clear_opt_anc_info(OptAnc
* a
)
4745 copy_opt_anc_info(OptAnc
* to
, OptAnc
* from
)
4751 concat_opt_anc_info(OptAnc
* to
, OptAnc
* left
, OptAnc
* right
,
4752 OnigLen left_len
, OnigLen right_len
)
4754 clear_opt_anc_info(to
);
4756 to
->left
= left
->left
;
4757 if (left_len
== 0) {
4758 to
->left
|= right
->left
;
4761 to
->right
= right
->right
;
4762 if (right_len
== 0) {
4763 to
->right
|= left
->right
;
4766 to
->right
|= (left
->right
& ANCHOR_PREC_READ_NOT
);
4773 if (a
== ANCHOR_END_BUF
|| a
== ANCHOR_SEMI_END_BUF
||
4774 a
== ANCHOR_END_LINE
|| a
== ANCHOR_PREC_READ
|| a
== ANCHOR_PREC_READ_NOT
)
4781 is_set_opt_anc_info(OptAnc
* to
, int anc
)
4783 if ((to
->left
& anc
) != 0) return 1;
4785 return ((to
->right
& anc
) != 0 ? 1 : 0);
4789 add_opt_anc_info(OptAnc
* to
, int anc
)
4798 remove_opt_anc_info(OptAnc
* to
, int anc
)
4807 alt_merge_opt_anc_info(OptAnc
* to
, OptAnc
* add
)
4809 to
->left
&= add
->left
;
4810 to
->right
&= add
->right
;
4814 is_full_opt_exact(OptExact
* e
)
4816 return (e
->len
>= OPT_EXACT_MAXLEN
? 1 : 0);
4820 clear_opt_exact(OptExact
* e
)
4823 clear_opt_anc_info(&e
->anc
);
4831 copy_opt_exact(OptExact
* to
, OptExact
* from
)
4837 concat_opt_exact(OptExact
* to
, OptExact
* add
, OnigEncoding enc
)
4843 if (! to
->ignore_case
&& add
->ignore_case
) {
4844 if (to
->len
>= add
->len
) return 0; /* avoid */
4846 to
->ignore_case
= 1;
4852 for (i
= to
->len
; p
< end
; ) {
4853 len
= enclen(enc
, p
);
4854 if (i
+ len
> OPT_EXACT_MAXLEN
) {
4858 for (j
= 0; j
< len
&& p
< end
; j
++)
4863 to
->reach_end
= (p
== end
? add
->reach_end
: 0);
4865 concat_opt_anc_info(&tanc
, &to
->anc
, &add
->anc
, 1, 1);
4866 if (! to
->reach_end
) tanc
.right
= 0;
4867 copy_opt_anc_info(&to
->anc
, &tanc
);
4873 concat_opt_exact_str(OptExact
* to
, UChar
* s
, UChar
* end
, OnigEncoding enc
)
4878 for (i
= to
->len
, p
= s
; p
< end
&& i
< OPT_EXACT_MAXLEN
; ) {
4879 len
= enclen(enc
, p
);
4880 if (i
+ len
> OPT_EXACT_MAXLEN
) break;
4881 for (j
= 0; j
< len
&& p
< end
; j
++)
4889 alt_merge_opt_exact(OptExact
* to
, OptExact
* add
, OptEnv
* env
)
4893 if (add
->len
== 0 || to
->len
== 0) {
4894 clear_opt_exact(to
);
4898 if (! is_equal_mml(&to
->mmd
, &add
->mmd
)) {
4899 clear_opt_exact(to
);
4903 for (i
= 0; i
< to
->len
&& i
< add
->len
; ) {
4904 if (to
->s
[i
] != add
->s
[i
]) break;
4905 len
= enclen(env
->enc
, to
->s
+ i
);
4907 for (j
= 1; j
< len
; j
++) {
4908 if (to
->s
[i
+j
] != add
->s
[i
+j
]) break;
4914 if (! add
->reach_end
|| i
< add
->len
|| i
< to
->len
) {
4918 to
->ignore_case
|= add
->ignore_case
;
4920 alt_merge_opt_anc_info(&to
->anc
, &add
->anc
);
4921 if (! to
->reach_end
) to
->anc
.right
= 0;
4925 select_opt_exact(OnigEncoding enc
, OptExact
* now
, OptExact
* alt
)
4936 copy_opt_exact(now
, alt
);
4939 else if (vn
<= 2 && va
<= 2) {
4940 /* ByteValTable[x] is big value --> low price */
4941 va
= map_position_value(enc
, now
->s
[0]);
4942 vn
= map_position_value(enc
, alt
->s
[0]);
4944 if (now
->len
> 1) vn
+= 5;
4945 if (alt
->len
> 1) va
+= 5;
4948 if (now
->ignore_case
== 0) vn
*= 2;
4949 if (alt
->ignore_case
== 0) va
*= 2;
4951 if (comp_distance_value(&now
->mmd
, &alt
->mmd
, vn
, va
) > 0)
4952 copy_opt_exact(now
, alt
);
4956 clear_opt_map(OptMap
* map
)
4958 static const OptMap clean_info
= {
4961 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4962 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4963 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4964 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4965 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4966 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4967 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4968 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4969 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4970 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4971 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4972 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4973 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4974 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4975 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4976 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4980 xmemcpy(map
, &clean_info
, sizeof(OptMap
));
4984 copy_opt_map(OptMap
* to
, OptMap
* from
)
4986 xmemcpy(to
,from
,sizeof(OptMap
));
4990 add_char_opt_map(OptMap
* m
, UChar c
, OnigEncoding enc
)
4992 if (m
->map
[c
] == 0) {
4994 m
->value
+= map_position_value(enc
, c
);
4999 add_char_amb_opt_map(OptMap
* map
, UChar
* p
, UChar
* end
,
5000 OnigEncoding enc
, OnigCaseFoldType fold_flag
)
5002 OnigCaseFoldCodeItem items
[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM
];
5003 UChar buf
[ONIGENC_CODE_TO_MBC_MAXLEN
];
5006 add_char_opt_map(map
, p
[0], enc
);
5008 fold_flag
= DISABLE_CASE_FOLD_MULTI_CHAR(fold_flag
);
5009 n
= ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc
, fold_flag
, p
, end
, items
);
5010 if (n
< 0) return n
;
5012 for (i
= 0; i
< n
; i
++) {
5013 ONIGENC_CODE_TO_MBC(enc
, items
[i
].code
[0], buf
);
5014 add_char_opt_map(map
, buf
[0], enc
);
5021 select_opt_map(OptMap
* now
, OptMap
* alt
)
5023 static int z
= 1<<15; /* 32768: something big value */
5027 if (alt
->value
== 0) return ;
5028 if (now
->value
== 0) {
5029 copy_opt_map(now
, alt
);
5033 vn
= z
/ now
->value
;
5034 va
= z
/ alt
->value
;
5035 if (comp_distance_value(&now
->mmd
, &alt
->mmd
, vn
, va
) > 0)
5036 copy_opt_map(now
, alt
);
5040 comp_opt_exact_or_map(OptExact
* e
, OptMap
* m
)
5042 #define COMP_EM_BASE 20
5045 if (m
->value
<= 0) return -1;
5047 ae
= COMP_EM_BASE
* e
->len
* (e
->ignore_case
? 1 : 2);
5048 am
= COMP_EM_BASE
* 5 * 2 / m
->value
;
5049 return comp_distance_value(&e
->mmd
, &m
->mmd
, ae
, am
);
5053 alt_merge_opt_map(OnigEncoding enc
, OptMap
* to
, OptMap
* add
)
5057 /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
5058 if (to
->value
== 0) return ;
5059 if (add
->value
== 0 || to
->mmd
.max
< add
->mmd
.min
) {
5064 alt_merge_mml(&to
->mmd
, &add
->mmd
);
5067 for (i
= 0; i
< ONIG_CHAR_TABLE_SIZE
; i
++) {
5072 val
+= map_position_value(enc
, i
);
5076 alt_merge_opt_anc_info(&to
->anc
, &add
->anc
);
5080 set_bound_node_opt_info(NodeOpt
* opt
, MinMax
* plen
)
5082 copy_mml(&(opt
->exb
.mmd
), plen
);
5083 copy_mml(&(opt
->expr
.mmd
), plen
);
5084 copy_mml(&(opt
->map
.mmd
), plen
);
5088 clear_node_opt_info(NodeOpt
* opt
)
5090 clear_mml(&opt
->len
);
5091 clear_opt_anc_info(&opt
->anc
);
5092 clear_opt_exact(&opt
->exb
);
5093 clear_opt_exact(&opt
->exm
);
5094 clear_opt_exact(&opt
->expr
);
5095 clear_opt_map(&opt
->map
);
5099 copy_node_opt_info(NodeOpt
* to
, NodeOpt
* from
)
5101 xmemcpy(to
,from
,sizeof(NodeOpt
));
5105 concat_left_node_opt_info(OnigEncoding enc
, NodeOpt
* to
, NodeOpt
* add
)
5107 int exb_reach
, exm_reach
;
5110 concat_opt_anc_info(&tanc
, &to
->anc
, &add
->anc
, to
->len
.max
, add
->len
.max
);
5111 copy_opt_anc_info(&to
->anc
, &tanc
);
5113 if (add
->exb
.len
> 0 && to
->len
.max
== 0) {
5114 concat_opt_anc_info(&tanc
, &to
->anc
, &add
->exb
.anc
, to
->len
.max
, add
->len
.max
);
5115 copy_opt_anc_info(&add
->exb
.anc
, &tanc
);
5118 if (add
->map
.value
> 0 && to
->len
.max
== 0) {
5119 if (add
->map
.mmd
.max
== 0)
5120 add
->map
.anc
.left
|= to
->anc
.left
;
5123 exb_reach
= to
->exb
.reach_end
;
5124 exm_reach
= to
->exm
.reach_end
;
5126 if (add
->len
.max
!= 0)
5127 to
->exb
.reach_end
= to
->exm
.reach_end
= 0;
5129 if (add
->exb
.len
> 0) {
5131 concat_opt_exact(&to
->exb
, &add
->exb
, enc
);
5132 clear_opt_exact(&add
->exb
);
5134 else if (exm_reach
) {
5135 concat_opt_exact(&to
->exm
, &add
->exb
, enc
);
5136 clear_opt_exact(&add
->exb
);
5139 select_opt_exact(enc
, &to
->exm
, &add
->exb
);
5140 select_opt_exact(enc
, &to
->exm
, &add
->exm
);
5142 if (to
->expr
.len
> 0) {
5143 if (add
->len
.max
> 0) {
5144 if (to
->expr
.len
> (int )add
->len
.max
)
5145 to
->expr
.len
= add
->len
.max
;
5147 if (to
->expr
.mmd
.max
== 0)
5148 select_opt_exact(enc
, &to
->exb
, &to
->expr
);
5150 select_opt_exact(enc
, &to
->exm
, &to
->expr
);
5153 else if (add
->expr
.len
> 0) {
5154 copy_opt_exact(&to
->expr
, &add
->expr
);
5157 select_opt_map(&to
->map
, &add
->map
);
5158 add_mml(&to
->len
, &add
->len
);
5162 alt_merge_node_opt_info(NodeOpt
* to
, NodeOpt
* add
, OptEnv
* env
)
5164 alt_merge_opt_anc_info(&to
->anc
, &add
->anc
);
5165 alt_merge_opt_exact(&to
->exb
, &add
->exb
, env
);
5166 alt_merge_opt_exact(&to
->exm
, &add
->exm
, env
);
5167 alt_merge_opt_exact(&to
->expr
, &add
->expr
, env
);
5168 alt_merge_opt_map(env
->enc
, &to
->map
, &add
->map
);
5170 alt_merge_mml(&to
->len
, &add
->len
);
5174 #define MAX_NODE_OPT_INFO_REF_COUNT 5
5177 optimize_nodes(Node
* node
, NodeOpt
* opt
, OptEnv
* env
)
5186 clear_node_opt_info(opt
);
5187 set_bound_node_opt_info(opt
, &env
->mmd
);
5189 switch (NODE_TYPE(node
)) {
5195 copy_opt_env(&nenv
, env
);
5197 r
= optimize_nodes(NODE_CAR(nd
), &xo
, &nenv
);
5199 add_mml(&nenv
.mmd
, &xo
.len
);
5200 concat_left_node_opt_info(enc
, opt
, &xo
);
5202 } while (r
== 0 && IS_NOT_NULL(nd
= NODE_CDR(nd
)));
5211 r
= optimize_nodes(NODE_CAR(nd
), &xo
, env
);
5213 if (nd
== node
) copy_node_opt_info(opt
, &xo
);
5214 else alt_merge_node_opt_info(opt
, &xo
, env
);
5216 } while ((r
== 0) && IS_NOT_NULL(nd
= NODE_CDR(nd
)));
5222 StrNode
* sn
= STR_(node
);
5223 int slen
= (int )(sn
->end
- sn
->s
);
5224 /* int is_raw = NODE_STRING_IS_RAW(node); */
5226 if (! NODE_STRING_IS_AMBIG(node
)) {
5227 concat_opt_exact_str(&opt
->exb
, sn
->s
, sn
->end
, enc
);
5229 add_char_opt_map(&opt
->map
, *(sn
->s
), enc
);
5231 set_mml(&opt
->len
, slen
, slen
);
5236 if (NODE_STRING_IS_DONT_GET_OPT_INFO(node
)) {
5237 int n
= onigenc_strlen(enc
, sn
->s
, sn
->end
);
5238 max
= ONIGENC_MBC_MAXLEN_DIST(enc
) * n
;
5241 concat_opt_exact_str(&opt
->exb
, sn
->s
, sn
->end
, enc
);
5242 opt
->exb
.ignore_case
= 1;
5245 r
= add_char_amb_opt_map(&opt
->map
, sn
->s
, sn
->end
,
5246 enc
, env
->case_fold_flag
);
5253 set_mml(&opt
->len
, slen
, max
);
5256 if (opt
->exb
.len
== slen
)
5257 opt
->exb
.reach_end
= 1;
5264 CClassNode
* cc
= CCLASS_(node
);
5266 /* no need to check ignore case. (set in setup_tree()) */
5268 if (IS_NOT_NULL(cc
->mbuf
) || IS_NCCLASS_NOT(cc
)) {
5269 OnigLen min
= ONIGENC_MBC_MINLEN(enc
);
5270 OnigLen max
= ONIGENC_MBC_MAXLEN_DIST(enc
);
5272 set_mml(&opt
->len
, min
, max
);
5275 for (i
= 0; i
< SINGLE_BYTE_SIZE
; i
++) {
5276 z
= BITSET_AT(cc
->bs
, i
);
5277 if ((z
&& ! IS_NCCLASS_NOT(cc
)) || (! z
&& IS_NCCLASS_NOT(cc
))) {
5278 add_char_opt_map(&opt
->map
, (UChar
)i
, enc
);
5281 set_mml(&opt
->len
, 1, 1);
5291 max
= ONIGENC_MBC_MAXLEN_DIST(enc
);
5296 switch (CTYPE_(node
)->ctype
) {
5300 case ONIGENC_CTYPE_WORD
:
5301 range
= CTYPE_(node
)->ascii_mode
!= 0 ? 128 : SINGLE_BYTE_SIZE
;
5302 if (CTYPE_(node
)->not != 0) {
5303 for (i
= 0; i
< range
; i
++) {
5304 if (! ONIGENC_IS_CODE_WORD(enc
, i
)) {
5305 add_char_opt_map(&opt
->map
, (UChar
)i
, enc
);
5308 for (i
= range
; i
< SINGLE_BYTE_SIZE
; i
++) {
5309 add_char_opt_map(&opt
->map
, (UChar
)i
, enc
);
5313 for (i
= 0; i
< range
; i
++) {
5314 if (ONIGENC_IS_CODE_WORD(enc
, i
)) {
5315 add_char_opt_map(&opt
->map
, (UChar
)i
, enc
);
5323 min
= ONIGENC_MBC_MINLEN(enc
);
5325 set_mml(&opt
->len
, min
, max
);
5330 switch (ANCHOR_(node
)->type
) {
5331 case ANCHOR_BEGIN_BUF
:
5332 case ANCHOR_BEGIN_POSITION
:
5333 case ANCHOR_BEGIN_LINE
:
5334 case ANCHOR_END_BUF
:
5335 case ANCHOR_SEMI_END_BUF
:
5336 case ANCHOR_END_LINE
:
5337 case ANCHOR_PREC_READ_NOT
:
5338 case ANCHOR_LOOK_BEHIND
:
5339 add_opt_anc_info(&opt
->anc
, ANCHOR_(node
)->type
);
5342 case ANCHOR_PREC_READ
:
5344 r
= optimize_nodes(NODE_BODY(node
), &xo
, env
);
5347 copy_opt_exact(&opt
->expr
, &xo
.exb
);
5348 else if (xo
.exm
.len
> 0)
5349 copy_opt_exact(&opt
->expr
, &xo
.exm
);
5351 opt
->expr
.reach_end
= 0;
5353 if (xo
.map
.value
> 0)
5354 copy_opt_map(&opt
->map
, &xo
.map
);
5359 case ANCHOR_LOOK_BEHIND_NOT
:
5365 if (! NODE_IS_CHECKER(node
)) {
5367 OnigLen min
, max
, tmin
, tmax
;
5368 MemEnv
* mem_env
= SCANENV_MEMENV(env
->scan_env
);
5369 BackRefNode
* br
= BACKREF_(node
);
5371 if (NODE_IS_RECURSION(node
)) {
5372 set_mml(&opt
->len
, 0, INFINITE_LEN
);
5375 backs
= BACKREFS_P(br
);
5376 min
= tree_min_len(mem_env
[backs
[0]].node
, env
->scan_env
);
5377 max
= tree_max_len(mem_env
[backs
[0]].node
, env
->scan_env
);
5378 for (i
= 1; i
< br
->back_num
; i
++) {
5379 tmin
= tree_min_len(mem_env
[backs
[i
]].node
, env
->scan_env
);
5380 tmax
= tree_max_len(mem_env
[backs
[i
]].node
, env
->scan_env
);
5381 if (min
> tmin
) min
= tmin
;
5382 if (max
< tmax
) max
= tmax
;
5384 set_mml(&opt
->len
, min
, max
);
5390 if (NODE_IS_RECURSION(node
))
5391 set_mml(&opt
->len
, 0, INFINITE_LEN
);
5393 OnigOptionType save
= env
->options
;
5394 env
->options
= ENCLOSURE_(NODE_BODY(node
))->o
.options
;
5395 r
= optimize_nodes(NODE_BODY(node
), opt
, env
);
5396 env
->options
= save
;
5404 QuantNode
* qn
= QUANT_(node
);
5406 r
= optimize_nodes(NODE_BODY(node
), &xo
, env
);
5409 if (qn
->lower
> 0) {
5410 copy_node_opt_info(opt
, &xo
);
5411 if (xo
.exb
.len
> 0) {
5412 if (xo
.exb
.reach_end
) {
5413 for (i
= 2; i
<= qn
->lower
&& ! is_full_opt_exact(&opt
->exb
); i
++) {
5414 int rc
= concat_opt_exact(&opt
->exb
, &xo
.exb
, enc
);
5417 if (i
< qn
->lower
) opt
->exb
.reach_end
= 0;
5421 if (qn
->lower
!= qn
->upper
) {
5422 opt
->exb
.reach_end
= 0;
5423 opt
->exm
.reach_end
= 0;
5426 opt
->exm
.reach_end
= 0;
5429 if (IS_REPEAT_INFINITE(qn
->upper
)) {
5430 if (env
->mmd
.max
== 0 &&
5431 NODE_IS_ANYCHAR(NODE_BODY(node
)) && qn
->greedy
!= 0) {
5432 if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn
), env
)))
5433 add_opt_anc_info(&opt
->anc
, ANCHOR_ANYCHAR_INF_ML
);
5435 add_opt_anc_info(&opt
->anc
, ANCHOR_ANYCHAR_INF
);
5438 max
= (xo
.len
.max
> 0 ? INFINITE_LEN
: 0);
5441 max
= distance_multiply(xo
.len
.max
, qn
->upper
);
5444 min
= distance_multiply(xo
.len
.min
, qn
->lower
);
5445 set_mml(&opt
->len
, min
, max
);
5449 case NODE_ENCLOSURE
:
5451 EnclosureNode
* en
= ENCLOSURE_(node
);
5454 case ENCLOSURE_OPTION
:
5456 OnigOptionType save
= env
->options
;
5458 env
->options
= en
->o
.options
;
5459 r
= optimize_nodes(NODE_BODY(node
), opt
, env
);
5460 env
->options
= save
;
5464 case ENCLOSURE_MEMORY
:
5467 if (en
->opt_count
> MAX_NODE_OPT_INFO_REF_COUNT
) {
5472 if (NODE_IS_MIN_FIXED(node
)) min
= en
->min_len
;
5473 if (NODE_IS_MAX_FIXED(node
)) max
= en
->max_len
;
5474 set_mml(&opt
->len
, min
, max
);
5479 r
= optimize_nodes(NODE_BODY(node
), opt
, env
);
5480 if (is_set_opt_anc_info(&opt
->anc
, ANCHOR_ANYCHAR_INF_MASK
)) {
5481 if (MEM_STATUS_AT0(env
->scan_env
->backrefed_mem
, en
->m
.regnum
))
5482 remove_opt_anc_info(&opt
->anc
, ANCHOR_ANYCHAR_INF_MASK
);
5487 case ENCLOSURE_STOP_BACKTRACK
:
5488 r
= optimize_nodes(NODE_BODY(node
), opt
, env
);
5491 case ENCLOSURE_IF_ELSE
:
5495 copy_opt_env(&nenv
, env
);
5496 r
= optimize_nodes(NODE_ENCLOSURE_BODY(en
), &xo
, &nenv
);
5498 add_mml(&nenv
.mmd
, &xo
.len
);
5499 concat_left_node_opt_info(enc
, opt
, &xo
);
5500 if (IS_NOT_NULL(en
->te
.Then
)) {
5501 r
= optimize_nodes(en
->te
.Then
, &xo
, &nenv
);
5503 concat_left_node_opt_info(enc
, opt
, &xo
);
5507 if (IS_NOT_NULL(en
->te
.Else
)) {
5508 r
= optimize_nodes(en
->te
.Else
, &xo
, env
);
5510 alt_merge_node_opt_info(opt
, &xo
, env
);
5524 fprintf(stderr
, "optimize_nodes: undefined node type %d\n", NODE_TYPE(node
));
5526 r
= ONIGERR_TYPE_BUG
;
5534 set_optimize_exact(regex_t
* reg
, OptExact
* e
)
5538 if (e
->len
== 0) return 0;
5540 if (e
->ignore_case
) {
5541 reg
->exact
= (UChar
* )xmalloc(e
->len
);
5542 CHECK_NULL_RETURN_MEMERR(reg
->exact
);
5543 xmemcpy(reg
->exact
, e
->s
, e
->len
);
5544 reg
->exact_end
= reg
->exact
+ e
->len
;
5545 reg
->optimize
= OPTIMIZE_EXACT_IC
;
5550 reg
->exact
= onigenc_strdup(reg
->enc
, e
->s
, e
->s
+ e
->len
);
5551 CHECK_NULL_RETURN_MEMERR(reg
->exact
);
5552 reg
->exact_end
= reg
->exact
+ e
->len
;
5555 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg
->enc
, reg
->exact
, reg
->exact_end
);
5557 if (e
->len
>= 3 || (e
->len
>= 2 && allow_reverse
)) {
5558 r
= set_bm_skip(reg
->exact
, reg
->exact_end
, reg
->enc
,
5559 reg
->map
, &(reg
->int_map
));
5560 if (r
!= 0) return r
;
5562 reg
->optimize
= (allow_reverse
!= 0
5563 ? OPTIMIZE_EXACT_BM
: OPTIMIZE_EXACT_BM_NO_REV
);
5566 reg
->optimize
= OPTIMIZE_EXACT
;
5570 reg
->dmin
= e
->mmd
.min
;
5571 reg
->dmax
= e
->mmd
.max
;
5573 if (reg
->dmin
!= INFINITE_LEN
) {
5574 reg
->threshold_len
= reg
->dmin
+ (int )(reg
->exact_end
- reg
->exact
);
5581 set_optimize_map(regex_t
* reg
, OptMap
* m
)
5585 for (i
= 0; i
< ONIG_CHAR_TABLE_SIZE
; i
++)
5586 reg
->map
[i
] = m
->map
[i
];
5588 reg
->optimize
= OPTIMIZE_MAP
;
5589 reg
->dmin
= m
->mmd
.min
;
5590 reg
->dmax
= m
->mmd
.max
;
5592 if (reg
->dmin
!= INFINITE_LEN
) {
5593 reg
->threshold_len
= reg
->dmin
+ 1;
5598 set_sub_anchor(regex_t
* reg
, OptAnc
* anc
)
5600 reg
->sub_anchor
|= anc
->left
& ANCHOR_BEGIN_LINE
;
5601 reg
->sub_anchor
|= anc
->right
& ANCHOR_END_LINE
;
5604 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5605 static void print_optimize_info(FILE* f
, regex_t
* reg
);
5609 set_optimize_info_from_tree(Node
* node
, regex_t
* reg
, ScanEnv
* scan_env
)
5616 env
.options
= reg
->options
;
5617 env
.case_fold_flag
= reg
->case_fold_flag
;
5618 env
.scan_env
= scan_env
;
5619 clear_mml(&env
.mmd
);
5621 r
= optimize_nodes(node
, &opt
, &env
);
5622 if (r
!= 0) return r
;
5624 reg
->anchor
= opt
.anc
.left
& (ANCHOR_BEGIN_BUF
|
5625 ANCHOR_BEGIN_POSITION
| ANCHOR_ANYCHAR_INF
| ANCHOR_ANYCHAR_INF_ML
|
5626 ANCHOR_LOOK_BEHIND
);
5628 if ((opt
.anc
.left
& (ANCHOR_LOOK_BEHIND
| ANCHOR_PREC_READ_NOT
)) != 0)
5629 reg
->anchor
&= ~ANCHOR_ANYCHAR_INF_ML
;
5631 reg
->anchor
|= opt
.anc
.right
& (ANCHOR_END_BUF
| ANCHOR_SEMI_END_BUF
|
5632 ANCHOR_PREC_READ_NOT
);
5634 if (reg
->anchor
& (ANCHOR_END_BUF
| ANCHOR_SEMI_END_BUF
)) {
5635 reg
->anchor_dmin
= opt
.len
.min
;
5636 reg
->anchor_dmax
= opt
.len
.max
;
5639 if (opt
.exb
.len
> 0 || opt
.exm
.len
> 0) {
5640 select_opt_exact(reg
->enc
, &opt
.exb
, &opt
.exm
);
5641 if (opt
.map
.value
> 0 && comp_opt_exact_or_map(&opt
.exb
, &opt
.map
) > 0) {
5645 r
= set_optimize_exact(reg
, &opt
.exb
);
5646 set_sub_anchor(reg
, &opt
.exb
.anc
);
5649 else if (opt
.map
.value
> 0) {
5651 set_optimize_map(reg
, &opt
.map
);
5652 set_sub_anchor(reg
, &opt
.map
.anc
);
5655 reg
->sub_anchor
|= opt
.anc
.left
& ANCHOR_BEGIN_LINE
;
5656 if (opt
.len
.max
== 0)
5657 reg
->sub_anchor
|= opt
.anc
.right
& ANCHOR_END_LINE
;
5660 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5661 print_optimize_info(stderr
, reg
);
5667 clear_optimize_info(regex_t
* reg
)
5669 reg
->optimize
= OPTIMIZE_NONE
;
5671 reg
->anchor_dmin
= 0;
5672 reg
->anchor_dmax
= 0;
5673 reg
->sub_anchor
= 0;
5674 reg
->exact_end
= (UChar
* )NULL
;
5675 reg
->threshold_len
= 0;
5676 if (IS_NOT_NULL(reg
->exact
)) {
5678 reg
->exact
= (UChar
* )NULL
;
5684 static void print_enc_string(FILE* fp
, OnigEncoding enc
,
5685 const UChar
*s
, const UChar
*end
)
5687 fprintf(fp
, "\nPATTERN: /");
5689 if (ONIGENC_MBC_MINLEN(enc
) > 1) {
5695 code
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
5697 fprintf(fp
, " 0x%04x ", (int )code
);
5700 fputc((int )code
, fp
);
5703 p
+= enclen(enc
, p
);
5708 fputc((int )*s
, fp
);
5716 #endif /* ONIG_DEBUG */
5718 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5721 print_distance_range(FILE* f
, OnigLen a
, OnigLen b
)
5723 if (a
== INFINITE_LEN
)
5726 fprintf(f
, "(%u)", a
);
5730 if (b
== INFINITE_LEN
)
5733 fprintf(f
, "(%u)", b
);
5737 print_anchor(FILE* f
, int anchor
)
5743 if (anchor
& ANCHOR_BEGIN_BUF
) {
5744 fprintf(f
, "begin-buf");
5747 if (anchor
& ANCHOR_BEGIN_LINE
) {
5748 if (q
) fprintf(f
, ", ");
5750 fprintf(f
, "begin-line");
5752 if (anchor
& ANCHOR_BEGIN_POSITION
) {
5753 if (q
) fprintf(f
, ", ");
5755 fprintf(f
, "begin-pos");
5757 if (anchor
& ANCHOR_END_BUF
) {
5758 if (q
) fprintf(f
, ", ");
5760 fprintf(f
, "end-buf");
5762 if (anchor
& ANCHOR_SEMI_END_BUF
) {
5763 if (q
) fprintf(f
, ", ");
5765 fprintf(f
, "semi-end-buf");
5767 if (anchor
& ANCHOR_END_LINE
) {
5768 if (q
) fprintf(f
, ", ");
5770 fprintf(f
, "end-line");
5772 if (anchor
& ANCHOR_ANYCHAR_INF
) {
5773 if (q
) fprintf(f
, ", ");
5775 fprintf(f
, "anychar-inf");
5777 if (anchor
& ANCHOR_ANYCHAR_INF_ML
) {
5778 if (q
) fprintf(f
, ", ");
5779 fprintf(f
, "anychar-inf-ml");
5786 print_optimize_info(FILE* f
, regex_t
* reg
)
5788 static const char* on
[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
5789 "EXACT_IC", "MAP" };
5791 fprintf(f
, "optimize: %s\n", on
[reg
->optimize
]);
5792 fprintf(f
, " anchor: "); print_anchor(f
, reg
->anchor
);
5793 if ((reg
->anchor
& ANCHOR_END_BUF_MASK
) != 0)
5794 print_distance_range(f
, reg
->anchor_dmin
, reg
->anchor_dmax
);
5797 if (reg
->optimize
) {
5798 fprintf(f
, " sub anchor: "); print_anchor(f
, reg
->sub_anchor
);
5805 fprintf(f
, "exact: [");
5806 for (p
= reg
->exact
; p
< reg
->exact_end
; p
++) {
5809 fprintf(f
, "]: length: %ld\n", (reg
->exact_end
- reg
->exact
));
5811 else if (reg
->optimize
& OPTIMIZE_MAP
) {
5814 for (i
= 0; i
< ONIG_CHAR_TABLE_SIZE
; i
++)
5815 if (reg
->map
[i
]) n
++;
5817 fprintf(f
, "map: n=%d\n", n
);
5821 for (i
= 0; i
< ONIG_CHAR_TABLE_SIZE
; i
++) {
5822 if (reg
->map
[i
] != 0) {
5823 if (c
> 0) fputs(", ", f
);
5825 if (ONIGENC_MBC_MAXLEN(reg
->enc
) == 1 &&
5826 ONIGENC_IS_CODE_PRINT(reg
->enc
, (OnigCodePoint
)i
))
5829 fprintf(f
, "%d", i
);
5840 onig_get_regex_ext(regex_t
* reg
)
5842 if (IS_NULL(REG_EXTP(reg
))) {
5843 RegexExt
* ext
= (RegexExt
* )xmalloc(sizeof(*ext
));
5844 if (IS_NULL(ext
)) return 0;
5847 ext
->pattern_end
= 0;
5850 ext
->callout_num
= 0;
5851 ext
->callout_list_alloc
= 0;
5852 ext
->callout_list
= 0;
5855 REG_EXTPL(reg
) = (void* )ext
;
5858 return REG_EXTP(reg
);
5862 free_regex_ext(RegexExt
* ext
)
5864 if (IS_NOT_NULL(ext
)) {
5865 if (IS_NOT_NULL(ext
->pattern
))
5866 xfree((void* )ext
->pattern
);
5869 if (IS_NOT_NULL(ext
->tag_table
))
5870 onig_callout_tag_table_free(ext
->tag_table
);
5872 if (IS_NOT_NULL(ext
->callout_list
))
5873 onig_free_reg_callout_list(ext
->callout_num
, ext
->callout_list
);
5881 onig_ext_set_pattern(regex_t
* reg
, const UChar
* pattern
, const UChar
* pattern_end
)
5886 ext
= onig_get_regex_ext(reg
);
5887 CHECK_NULL_RETURN_MEMERR(ext
);
5889 s
= onigenc_strdup(reg
->enc
, pattern
, pattern_end
);
5890 CHECK_NULL_RETURN_MEMERR(s
);
5893 ext
->pattern_end
= s
+ (pattern_end
- pattern
);
5900 onig_free_body(regex_t
* reg
)
5902 if (IS_NOT_NULL(reg
)) {
5903 if (IS_NOT_NULL(reg
->p
)) xfree(reg
->p
);
5904 if (IS_NOT_NULL(reg
->exact
)) xfree(reg
->exact
);
5905 if (IS_NOT_NULL(reg
->int_map
)) xfree(reg
->int_map
);
5906 if (IS_NOT_NULL(reg
->int_map_backward
)) xfree(reg
->int_map_backward
);
5907 if (IS_NOT_NULL(reg
->repeat_range
)) xfree(reg
->repeat_range
);
5908 if (IS_NOT_NULL(REG_EXTP(reg
))) {
5909 free_regex_ext(REG_EXTP(reg
));
5913 onig_names_free(reg
);
5918 onig_free(regex_t
* reg
)
5920 if (IS_NOT_NULL(reg
)) {
5921 onig_free_body(reg
);
5926 #define REGEX_TRANSFER(to,from) do {\
5927 onig_free_body(to);\
5928 xmemcpy(to, from, sizeof(regex_t));\
5933 onig_transfer(regex_t
* to
, regex_t
* from
)
5935 REGEX_TRANSFER(to
, from
);
5939 #ifdef ONIG_DEBUG_PARSE
5940 static void print_tree
P_((FILE* f
, Node
* node
));
5944 onig_compile(regex_t
* reg
, const UChar
* pattern
, const UChar
* pattern_end
,
5945 OnigErrorInfo
* einfo
)
5947 #define COMPILE_INIT_SIZE 20
5953 UnsetAddrList uslist
;
5957 if (IS_NOT_NULL(einfo
)) {
5958 einfo
->enc
= reg
->enc
;
5959 einfo
->par
= (UChar
* )NULL
;
5963 print_enc_string(stderr
, reg
->enc
, pattern
, pattern_end
);
5966 if (reg
->alloc
== 0) {
5967 init_size
= (int )(pattern_end
- pattern
) * 2;
5968 if (init_size
<= 0) init_size
= COMPILE_INIT_SIZE
;
5969 r
= BB_INIT(reg
, init_size
);
5970 if (r
!= 0) goto end
;
5976 reg
->num_repeat
= 0;
5977 reg
->num_null_check
= 0;
5978 reg
->repeat_range_alloc
= 0;
5979 reg
->repeat_range
= (OnigRepeatRange
* )NULL
;
5981 r
= onig_parse_tree(&root
, pattern
, pattern_end
, reg
, &scan_env
);
5982 if (r
!= 0) goto err
;
5984 /* mixed use named group and no-named group */
5985 if (scan_env
.num_named
> 0 &&
5986 IS_SYNTAX_BV(scan_env
.syntax
, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP
) &&
5987 ! ONIG_IS_OPTION_ON(reg
->options
, ONIG_OPTION_CAPTURE_GROUP
)) {
5988 if (scan_env
.num_named
!= scan_env
.num_mem
)
5989 r
= disable_noname_group_capture(&root
, reg
, &scan_env
);
5991 r
= numbered_ref_check(root
);
5993 if (r
!= 0) goto err
;
5996 r
= check_backrefs(root
, &scan_env
);
5997 if (r
!= 0) goto err
;
6000 if (scan_env
.num_call
> 0) {
6001 r
= unset_addr_list_init(&uslist
, scan_env
.num_call
);
6002 if (r
!= 0) goto err
;
6003 scan_env
.unset_addr_list
= &uslist
;
6004 r
= setup_call(root
, &scan_env
, 0);
6005 if (r
!= 0) goto err_unset
;
6006 r
= setup_call2(root
);
6007 if (r
!= 0) goto err_unset
;
6008 r
= recursive_call_check_trav(root
, &scan_env
, 0);
6009 if (r
< 0) goto err_unset
;
6010 r
= infinite_recursive_call_check_trav(root
, &scan_env
);
6011 if (r
!= 0) goto err_unset
;
6013 setup_called_state(root
, 0);
6016 reg
->num_call
= scan_env
.num_call
;
6019 r
= setup_tree(root
, reg
, 0, &scan_env
);
6020 if (r
!= 0) goto err_unset
;
6022 #ifdef ONIG_DEBUG_PARSE
6023 print_tree(stderr
, root
);
6026 reg
->capture_history
= scan_env
.capture_history
;
6027 reg
->bt_mem_start
= scan_env
.bt_mem_start
;
6028 reg
->bt_mem_start
|= reg
->capture_history
;
6029 if (IS_FIND_CONDITION(reg
->options
))
6030 MEM_STATUS_ON_ALL(reg
->bt_mem_end
);
6032 reg
->bt_mem_end
= scan_env
.bt_mem_end
;
6033 reg
->bt_mem_end
|= reg
->capture_history
;
6035 reg
->bt_mem_start
|= reg
->bt_mem_end
;
6037 clear_optimize_info(reg
);
6038 #ifndef ONIG_DONT_OPTIMIZE
6039 r
= set_optimize_info_from_tree(root
, reg
, &scan_env
);
6040 if (r
!= 0) goto err_unset
;
6043 if (IS_NOT_NULL(scan_env
.mem_env_dynamic
)) {
6044 xfree(scan_env
.mem_env_dynamic
);
6045 scan_env
.mem_env_dynamic
= (MemEnv
* )NULL
;
6048 r
= compile_tree(root
, reg
, &scan_env
);
6050 if (scan_env
.keep_num
> 0) {
6051 r
= add_opcode(reg
, OP_UPDATE_VAR
);
6052 if (r
!= 0) goto err
;
6053 r
= add_update_var_type(reg
, UPDATE_VAR_KEEP_FROM_STACK_LAST
);
6054 if (r
!= 0) goto err
;
6055 r
= add_mem_num(reg
, 0 /* not used */);
6056 if (r
!= 0) goto err
;
6059 r
= add_opcode(reg
, OP_END
);
6061 if (scan_env
.num_call
> 0) {
6062 r
= fix_unset_addr_list(&uslist
, reg
);
6063 unset_addr_list_end(&uslist
);
6064 if (r
!= 0) goto err
;
6068 if ((reg
->num_repeat
!= 0) || (reg
->bt_mem_end
!= 0)
6070 || (IS_NOT_NULL(REG_EXTP(reg
)) && REG_EXTP(reg
)->callout_num
!= 0)
6073 reg
->stack_pop_level
= STACK_POP_LEVEL_ALL
;
6075 if (reg
->bt_mem_start
!= 0)
6076 reg
->stack_pop_level
= STACK_POP_LEVEL_MEM_START
;
6078 reg
->stack_pop_level
= STACK_POP_LEVEL_FREE
;
6082 else if (scan_env
.num_call
> 0) {
6083 unset_addr_list_end(&uslist
);
6086 onig_node_free(root
);
6088 #ifdef ONIG_DEBUG_COMPILE
6089 onig_print_names(stderr
, reg
);
6090 onig_print_compiled_byte_code_list(stderr
, reg
);
6098 if (scan_env
.num_call
> 0) {
6099 unset_addr_list_end(&uslist
);
6103 if (IS_NOT_NULL(scan_env
.error
)) {
6104 if (IS_NOT_NULL(einfo
)) {
6105 einfo
->par
= scan_env
.error
;
6106 einfo
->par_end
= scan_env
.error_end
;
6110 onig_node_free(root
);
6111 if (IS_NOT_NULL(scan_env
.mem_env_dynamic
))
6112 xfree(scan_env
.mem_env_dynamic
);
6117 static int onig_inited
= 0;
6120 onig_reg_init(regex_t
* reg
, OnigOptionType option
, OnigCaseFoldType case_fold_flag
,
6121 OnigEncoding enc
, OnigSyntaxType
* syntax
)
6125 xmemset(reg
, 0, sizeof(*reg
));
6127 if (onig_inited
== 0) {
6129 return ONIGERR_LIBRARY_IS_NOT_INITIALIZED
;
6131 r
= onig_initialize(&enc
, 1);
6133 return ONIGERR_FAIL_TO_INITIALIZE
;
6135 onig_warning("You didn't call onig_initialize() explicitly");
6140 return ONIGERR_INVALID_ARGUMENT
;
6142 if (ONIGENC_IS_UNDEF(enc
))
6143 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED
;
6145 if ((option
& (ONIG_OPTION_DONT_CAPTURE_GROUP
|ONIG_OPTION_CAPTURE_GROUP
))
6146 == (ONIG_OPTION_DONT_CAPTURE_GROUP
|ONIG_OPTION_CAPTURE_GROUP
)) {
6147 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS
;
6150 if ((option
& ONIG_OPTION_NEGATE_SINGLELINE
) != 0) {
6151 option
|= syntax
->options
;
6152 option
&= ~ONIG_OPTION_SINGLELINE
;
6155 option
|= syntax
->options
;
6158 (reg
)->options
= option
;
6159 (reg
)->syntax
= syntax
;
6160 (reg
)->optimize
= 0;
6161 (reg
)->exact
= (UChar
* )NULL
;
6162 (reg
)->int_map
= (int* )NULL
;
6163 (reg
)->int_map_backward
= (int* )NULL
;
6164 REG_EXTPL(reg
) = NULL
;
6166 (reg
)->p
= (UChar
* )NULL
;
6169 (reg
)->name_table
= (void* )NULL
;
6171 (reg
)->case_fold_flag
= case_fold_flag
;
6176 onig_new_without_alloc(regex_t
* reg
,
6177 const UChar
* pattern
, const UChar
* pattern_end
,
6178 OnigOptionType option
, OnigEncoding enc
,
6179 OnigSyntaxType
* syntax
, OnigErrorInfo
* einfo
)
6183 r
= onig_reg_init(reg
, option
, ONIGENC_CASE_FOLD_DEFAULT
, enc
, syntax
);
6184 if (r
!= 0) return r
;
6186 r
= onig_compile(reg
, pattern
, pattern_end
, einfo
);
6191 onig_new(regex_t
** reg
, const UChar
* pattern
, const UChar
* pattern_end
,
6192 OnigOptionType option
, OnigEncoding enc
, OnigSyntaxType
* syntax
,
6193 OnigErrorInfo
* einfo
)
6197 *reg
= (regex_t
* )xmalloc(sizeof(regex_t
));
6198 if (IS_NULL(*reg
)) return ONIGERR_MEMORY
;
6200 r
= onig_reg_init(*reg
, option
, ONIGENC_CASE_FOLD_DEFAULT
, enc
, syntax
);
6201 if (r
!= 0) goto err
;
6203 r
= onig_compile(*reg
, pattern
, pattern_end
, einfo
);
6213 onig_initialize(OnigEncoding encodings
[], int n
)
6218 if (onig_inited
!= 0)
6225 for (i
= 0; i
< n
; i
++) {
6226 OnigEncoding enc
= encodings
[i
];
6227 r
= onig_initialize_encoding(enc
);
6235 typedef struct EndCallListItem
{
6236 struct EndCallListItem
* next
;
6238 } EndCallListItemType
;
6240 static EndCallListItemType
* EndCallTop
;
6242 extern void onig_add_end_call(void (*func
)(void))
6244 EndCallListItemType
* item
;
6246 item
= (EndCallListItemType
* )xmalloc(sizeof(*item
));
6247 if (item
== 0) return ;
6249 item
->next
= EndCallTop
;
6256 exec_end_call_list(void)
6258 EndCallListItemType
* prev
;
6261 while (EndCallTop
!= 0) {
6262 func
= EndCallTop
->func
;
6266 EndCallTop
= EndCallTop
->next
;
6274 exec_end_call_list();
6277 onig_global_callout_names_free();
6288 onig_is_in_code_range(const UChar
* p
, OnigCodePoint code
)
6290 OnigCodePoint n
, *data
;
6291 OnigCodePoint low
, high
, x
;
6293 GET_CODE_POINT(n
, p
);
6294 data
= (OnigCodePoint
* )p
;
6297 for (low
= 0, high
= n
; low
< high
; ) {
6298 x
= (low
+ high
) >> 1;
6299 if (code
> data
[x
* 2 + 1])
6305 return ((low
< n
&& code
>= data
[low
* 2]) ? 1 : 0);
6309 onig_is_code_in_cc_len(int elen
, OnigCodePoint code
, /* CClassNode* */ void* cc_arg
)
6312 CClassNode
* cc
= (CClassNode
* )cc_arg
;
6314 if (elen
> 1 || (code
>= SINGLE_BYTE_SIZE
)) {
6315 if (IS_NULL(cc
->mbuf
)) {
6319 found
= (onig_is_in_code_range(cc
->mbuf
->p
, code
) != 0 ? 1 : 0);
6323 found
= (BITSET_AT(cc
->bs
, code
) == 0 ? 0 : 1);
6326 if (IS_NCCLASS_NOT(cc
))
6333 onig_is_code_in_cc(OnigEncoding enc
, OnigCodePoint code
, CClassNode
* cc
)
6337 if (ONIGENC_MBC_MINLEN(enc
) > 1) {
6341 len
= ONIGENC_CODE_TO_MBCLEN(enc
, code
);
6343 return onig_is_code_in_cc_len(len
, code
, cc
);
6347 #ifdef ONIG_DEBUG_PARSE
6350 p_string(FILE* f
, int len
, UChar
* s
)
6353 while (len
-- > 0) { fputc(*s
++, f
); }
6357 Indent(FILE* f
, int indent
)
6360 for (i
= 0; i
< indent
; i
++) putc(' ', f
);
6364 print_indent_tree(FILE* f
, Node
* node
, int indent
)
6372 if (IS_NULL(node
)) {
6373 fprintf(f
, "ERROR: null node!!!\n");
6377 type
= NODE_TYPE(node
);
6381 if (type
== NODE_LIST
)
6382 fprintf(f
, "<list:%p>\n", node
);
6384 fprintf(f
, "<alt:%p>\n", node
);
6386 print_indent_tree(f
, NODE_CAR(node
), indent
+ add
);
6387 while (IS_NOT_NULL(node
= NODE_CDR(node
))) {
6388 if (NODE_TYPE(node
) != type
) {
6389 fprintf(f
, "ERROR: list/alt right is not a cons. %d\n", NODE_TYPE(node
));
6392 print_indent_tree(f
, NODE_CAR(node
), indent
+ add
);
6397 fprintf(f
, "<string%s:%p>", (NODE_STRING_IS_RAW(node
) ? "-raw" : ""), node
);
6398 for (p
= STR_(node
)->s
; p
< STR_(node
)->end
; p
++) {
6399 if (*p
>= 0x20 && *p
< 0x7f)
6402 fprintf(f
, " 0x%02x", *p
);
6408 fprintf(f
, "<cclass:%p>", node
);
6409 if (IS_NCCLASS_NOT(CCLASS_(node
))) fputs(" not", f
);
6410 if (CCLASS_(node
)->mbuf
) {
6411 BBuf
* bbuf
= CCLASS_(node
)->mbuf
;
6412 for (i
= 0; i
< bbuf
->used
; i
++) {
6413 if (i
> 0) fprintf(f
, ",");
6414 fprintf(f
, "%0x", bbuf
->p
[i
]);
6420 fprintf(f
, "<ctype:%p> ", node
);
6421 switch (CTYPE_(node
)->ctype
) {
6423 fprintf(f
, "<anychar:%p>", node
);
6426 case ONIGENC_CTYPE_WORD
:
6427 if (CTYPE_(node
)->not != 0)
6428 fputs("not word", f
);
6432 if (CTYPE_(node
)->ascii_mode
!= 0)
6433 fputs(" (ascii)", f
);
6438 fprintf(f
, "ERROR: undefined ctype.\n");
6444 fprintf(f
, "<anchor:%p> ", node
);
6445 switch (ANCHOR_(node
)->type
) {
6446 case ANCHOR_BEGIN_BUF
: fputs("begin buf", f
); break;
6447 case ANCHOR_END_BUF
: fputs("end buf", f
); break;
6448 case ANCHOR_BEGIN_LINE
: fputs("begin line", f
); break;
6449 case ANCHOR_END_LINE
: fputs("end line", f
); break;
6450 case ANCHOR_SEMI_END_BUF
: fputs("semi end buf", f
); break;
6451 case ANCHOR_BEGIN_POSITION
: fputs("begin position", f
); break;
6453 case ANCHOR_WORD_BOUNDARY
: fputs("word boundary", f
); break;
6454 case ANCHOR_NO_WORD_BOUNDARY
: fputs("not word boundary", f
); break;
6455 #ifdef USE_WORD_BEGIN_END
6456 case ANCHOR_WORD_BEGIN
: fputs("word begin", f
); break;
6457 case ANCHOR_WORD_END
: fputs("word end", f
); break;
6459 case ANCHOR_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY
:
6460 fputs("extended-grapheme-cluster boundary", f
); break;
6461 case ANCHOR_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY
:
6462 fputs("no-extended-grapheme-cluster boundary", f
); break;
6463 case ANCHOR_PREC_READ
:
6464 fprintf(f
, "prec read\n");
6465 print_indent_tree(f
, NODE_BODY(node
), indent
+ add
);
6467 case ANCHOR_PREC_READ_NOT
:
6468 fprintf(f
, "prec read not\n");
6469 print_indent_tree(f
, NODE_BODY(node
), indent
+ add
);
6471 case ANCHOR_LOOK_BEHIND
:
6472 fprintf(f
, "look behind\n");
6473 print_indent_tree(f
, NODE_BODY(node
), indent
+ add
);
6475 case ANCHOR_LOOK_BEHIND_NOT
:
6476 fprintf(f
, "look behind not\n");
6477 print_indent_tree(f
, NODE_BODY(node
), indent
+ add
);
6481 fprintf(f
, "ERROR: undefined anchor type.\n");
6489 BackRefNode
* br
= BACKREF_(node
);
6491 fprintf(f
, "<backref%s:%p>", NODE_IS_CHECKER(node
) ? "-checker" : "", node
);
6492 for (i
= 0; i
< br
->back_num
; i
++) {
6493 if (i
> 0) fputs(", ", f
);
6494 fprintf(f
, "%d", p
[i
]);
6502 CallNode
* cn
= CALL_(node
);
6503 fprintf(f
, "<call:%p>", node
);
6504 p_string(f
, cn
->name_end
- cn
->name
, cn
->name
);
6510 fprintf(f
, "<quantifier:%p>{%d,%d}%s\n", node
,
6511 QUANT_(node
)->lower
, QUANT_(node
)->upper
,
6512 (QUANT_(node
)->greedy
? "" : "?"));
6513 print_indent_tree(f
, NODE_BODY(node
), indent
+ add
);
6516 case NODE_ENCLOSURE
:
6517 fprintf(f
, "<enclosure:%p> ", node
);
6518 switch (ENCLOSURE_(node
)->type
) {
6519 case ENCLOSURE_OPTION
:
6520 fprintf(f
, "option:%d", ENCLOSURE_(node
)->o
.options
);
6522 case ENCLOSURE_MEMORY
:
6523 fprintf(f
, "memory:%d", ENCLOSURE_(node
)->m
.regnum
);
6525 case ENCLOSURE_STOP_BACKTRACK
:
6526 fprintf(f
, "stop-bt");
6533 print_indent_tree(f
, NODE_BODY(node
), indent
+ add
);
6537 fprintf(f
, "<gimmick:%p> ", node
);
6538 switch (GIMMICK_(node
)->type
) {
6543 fprintf(f
, "keep:%d", GIMMICK_(node
)->id
);
6546 fprintf(f
, "save:%d:%d", GIMMICK_(node
)->detail_type
, GIMMICK_(node
)->id
);
6548 case GIMMICK_UPDATE_VAR
:
6549 fprintf(f
, "update_var:%d:%d", GIMMICK_(node
)->detail_type
, GIMMICK_(node
)->id
);
6552 case GIMMICK_CALLOUT
:
6553 switch (GIMMICK_(node
)->detail_type
) {
6554 case ONIG_CALLOUT_OF_CONTENTS
:
6555 fprintf(f
, "callout:contents:%d", GIMMICK_(node
)->num
);
6557 case ONIG_CALLOUT_OF_NAME
:
6558 fprintf(f
, "callout:name:%d:%d", GIMMICK_(node
)->id
, GIMMICK_(node
)->num
);
6566 fprintf(f
, "print_indent_tree: undefined node type %d\n", NODE_TYPE(node
));
6570 if (type
!= NODE_LIST
&& type
!= NODE_ALT
&& type
!= NODE_QUANT
&&
6571 type
!= NODE_ENCLOSURE
)
6577 print_tree(FILE* f
, Node
* node
)
6579 print_indent_tree(f
, node
, 0);