1 /**********************************************************************
2 regcomp.c - Oniguruma (regular expression library)
3 **********************************************************************/
5 * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 #define OPS_INIT_SIZE 8
34 OnigCaseFoldType OnigDefaultCaseFoldFlag
= ONIGENC_CASE_FOLD_MIN
;
44 make_int_stack(int_stack
** rs
, int init_size
)
51 s
= xmalloc(sizeof(*s
));
52 if (IS_NULL(s
)) return ONIGERR_MEMORY
;
54 v
= (int* )xmalloc(sizeof(int) * init_size
);
57 return ONIGERR_MEMORY
;
69 free_int_stack(int_stack
* s
)
72 if (IS_NOT_NULL(s
->v
))
79 int_stack_push(int_stack
* s
, int v
)
81 if (s
->n
>= s
->alloc
) {
82 int new_size
= s
->alloc
* 2;
83 int* nv
= (int* )xrealloc(s
->v
, sizeof(int) * new_size
, sizeof(int) * s
->alloc
);
84 if (IS_NULL(nv
)) return ONIGERR_MEMORY
;
96 int_stack_pop(int_stack
* s
)
102 fprintf(stderr
, "int_stack_pop: fail empty. %p\n", s
);
114 ops_init(regex_t
* reg
, int init_alloc_size
)
119 if (init_alloc_size
> 0) {
120 size
= sizeof(Operation
) * init_alloc_size
;
121 p
= (Operation
* )xmalloc(size
);
122 CHECK_NULL_RETURN_MEMERR(p
);
123 #ifdef USE_DIRECT_THREADED_CODE
126 size
= sizeof(enum OpCode
) * init_alloc_size
;
127 cp
= (enum OpCode
* )xmalloc(size
);
128 CHECK_NULL_RETURN_MEMERR(cp
);
135 #ifdef USE_DIRECT_THREADED_CODE
136 reg
->ocs
= (enum OpCode
* )0;
141 reg
->ops_curr
= 0; /* !!! not yet done ops_new() */
142 reg
->ops_alloc
= init_alloc_size
;
149 ops_expand(regex_t
* reg
, int n
)
151 #define MIN_OPS_EXPAND_SIZE 4
153 #ifdef USE_DIRECT_THREADED_CODE
159 if (n
<= 0) n
= MIN_OPS_EXPAND_SIZE
;
163 size
= sizeof(Operation
) * n
;
164 p
= (Operation
* )xrealloc(reg
->ops
, size
, sizeof(Operation
) * reg
->ops_alloc
);
165 CHECK_NULL_RETURN_MEMERR(p
);
167 #ifdef USE_DIRECT_THREADED_CODE
168 size
= sizeof(enum OpCode
) * n
;
169 cp
= (enum OpCode
* )xrealloc(reg
->ocs
, size
, sizeof(enum OpCode
) * reg
->ops_alloc
);
170 CHECK_NULL_RETURN_MEMERR(cp
);
176 if (reg
->ops_used
== 0)
179 reg
->ops_curr
= reg
->ops
+ (reg
->ops_used
- 1);
185 ops_new(regex_t
* reg
)
189 if (reg
->ops_used
>= reg
->ops_alloc
) {
190 r
= ops_expand(reg
, reg
->ops_alloc
);
191 if (r
!= ONIG_NORMAL
) return r
;
194 reg
->ops_curr
= reg
->ops
+ reg
->ops_used
;
197 xmemset(reg
->ops_curr
, 0, sizeof(Operation
));
202 is_in_string_pool(regex_t
* reg
, UChar
* s
)
204 return (s
>= reg
->string_pool
&& s
< reg
->string_pool_end
);
208 ops_free(regex_t
* reg
)
212 if (IS_NULL(reg
->ops
)) return ;
214 for (i
= 0; i
< (int )reg
->ops_used
; i
++) {
220 #ifdef USE_DIRECT_THREADED_CODE
221 opcode
= *(reg
->ocs
+ i
);
228 if (! is_in_string_pool(reg
, op
->exact_len_n
.s
))
229 xfree(op
->exact_len_n
.s
);
231 case OP_EXACTN
: case OP_EXACTMB2N
: case OP_EXACTMB3N
: case OP_EXACTN_IC
:
232 if (! is_in_string_pool(reg
, op
->exact_n
.s
))
233 xfree(op
->exact_n
.s
);
235 case OP_EXACT1
: case OP_EXACT2
: case OP_EXACT3
: case OP_EXACT4
:
236 case OP_EXACT5
: case OP_EXACTMB2N1
: case OP_EXACTMB2N2
:
237 case OP_EXACTMB2N3
: case OP_EXACT1_IC
:
240 case OP_CCLASS_NOT
: case OP_CCLASS
:
241 xfree(op
->cclass
.bsp
);
244 case OP_CCLASS_MB_NOT
: case OP_CCLASS_MB
:
245 xfree(op
->cclass_mb
.mb
);
247 case OP_CCLASS_MIX_NOT
: case OP_CCLASS_MIX
:
248 xfree(op
->cclass_mix
.mb
);
249 xfree(op
->cclass_mix
.bsp
);
252 case OP_BACKREF1
: case OP_BACKREF2
: case OP_BACKREF_N
: case OP_BACKREF_N_IC
:
254 case OP_BACKREF_MULTI
: case OP_BACKREF_MULTI_IC
:
255 case OP_BACKREF_WITH_LEVEL
:
256 case OP_BACKREF_WITH_LEVEL_IC
:
257 case OP_BACKREF_CHECK
:
258 case OP_BACKREF_CHECK_WITH_LEVEL
:
259 if (op
->backref_general
.num
!= 1)
260 xfree(op
->backref_general
.ns
);
269 #ifdef USE_DIRECT_THREADED_CODE
281 ops_calc_size_of_string_pool(regex_t
* reg
)
286 if (IS_NULL(reg
->ops
)) return 0;
289 for (i
= 0; i
< (int )reg
->ops_used
; i
++) {
294 #ifdef USE_DIRECT_THREADED_CODE
295 opcode
= *(reg
->ocs
+ i
);
302 total
+= op
->exact_len_n
.len
* op
->exact_len_n
.n
;
306 total
+= op
->exact_n
.n
;
309 total
+= op
->exact_n
.n
* 2;
312 total
+= op
->exact_n
.n
* 3;
324 ops_make_string_pool(regex_t
* reg
)
332 size
= ops_calc_size_of_string_pool(reg
);
337 curr
= pool
= (UChar
* )xmalloc((size_t )size
);
338 CHECK_NULL_RETURN_MEMERR(pool
);
340 for (i
= 0; i
< (int )reg
->ops_used
; i
++) {
345 #ifdef USE_DIRECT_THREADED_CODE
346 opcode
= *(reg
->ocs
+ i
);
353 len
= op
->exact_len_n
.len
* op
->exact_len_n
.n
;
354 xmemcpy(curr
, op
->exact_len_n
.s
, len
);
355 xfree(op
->exact_len_n
.s
);
356 op
->exact_len_n
.s
= curr
;
363 xmemcpy(curr
, op
->exact_n
.s
, len
);
364 xfree(op
->exact_n
.s
);
365 op
->exact_n
.s
= curr
;
369 len
= op
->exact_n
.n
* 2;
373 len
= op
->exact_n
.n
* 3;
382 reg
->string_pool
= pool
;
383 reg
->string_pool_end
= pool
+ size
;
387 extern OnigCaseFoldType
388 onig_get_default_case_fold_flag(void)
390 return OnigDefaultCaseFoldFlag
;
394 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag
)
396 OnigDefaultCaseFoldFlag
= case_fold_flag
;
401 int_multiply_cmp(int x
, int y
, int v
)
403 if (x
== 0 || y
== 0) return -1;
405 if (x
< INT_MAX
/ y
) {
407 if (xy
> v
) return 1;
409 if (xy
== v
) return 0;
418 onig_positive_int_multiply(int x
, int y
)
420 if (x
== 0 || y
== 0) return 0;
430 swap_node(Node
* a
, Node
* b
)
434 c
= *a
; *a
= *b
; *b
= c
;
436 if (NODE_TYPE(a
) == NODE_STRING
) {
437 StrNode
* sn
= STR_(a
);
438 if (sn
->capacity
== 0) {
439 int len
= (int )(sn
->end
- sn
->s
);
441 sn
->end
= sn
->s
+ len
;
445 if (NODE_TYPE(b
) == NODE_STRING
) {
446 StrNode
* sn
= STR_(b
);
447 if (sn
->capacity
== 0) {
448 int len
= (int )(sn
->end
- sn
->s
);
450 sn
->end
= sn
->s
+ len
;
456 distance_add(OnigLen d1
, OnigLen d2
)
458 if (d1
== INFINITE_LEN
|| d2
== INFINITE_LEN
)
461 if (d1
<= INFINITE_LEN
- d2
) return d1
+ d2
;
462 else return INFINITE_LEN
;
467 distance_multiply(OnigLen d
, int m
)
469 if (m
== 0) return 0;
471 if (d
< INFINITE_LEN
/ m
)
478 bitset_is_empty(BitSetRef bs
)
482 for (i
= 0; i
< (int )BITSET_SIZE
; i
++) {
483 if (bs
[i
] != 0) return 0;
491 unset_addr_list_init(UnsetAddrList
* list
, int size
)
493 UnsetAddr
* p
= (UnsetAddr
* )xmalloc(sizeof(UnsetAddr
)* size
);
494 CHECK_NULL_RETURN_MEMERR(p
);
503 unset_addr_list_end(UnsetAddrList
* list
)
505 if (IS_NOT_NULL(list
->us
))
510 unset_addr_list_add(UnsetAddrList
* list
, int offset
, struct _Node
* node
)
515 if (list
->num
>= list
->alloc
) {
516 size
= list
->alloc
* 2;
517 p
= (UnsetAddr
* )xrealloc(list
->us
, sizeof(UnsetAddr
) * size
, sizeof(UnsetAddr
)* list
->alloc
);
518 CHECK_NULL_RETURN_MEMERR(p
);
523 list
->us
[list
->num
].offset
= offset
;
524 list
->us
[list
->num
].target
= node
;
528 #endif /* USE_CALL */
532 add_op(regex_t
* reg
, int opcode
)
537 if (r
!= ONIG_NORMAL
) return r
;
539 #ifdef USE_DIRECT_THREADED_CODE
540 *(reg
->ocs
+ (reg
->ops_curr
- reg
->ops
)) = opcode
;
542 reg
->ops_curr
->opcode
= opcode
;
548 static int compile_length_tree(Node
* node
, regex_t
* reg
);
549 static int compile_tree(Node
* node
, regex_t
* reg
, ScanEnv
* env
);
552 #define IS_NEED_STR_LEN_OP_EXACT(op) \
553 ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
554 (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
557 select_str_opcode(int mb_len
, int str_len
, int ignore_case
)
563 case 1: op
= OP_EXACT1_IC
; break;
564 default: op
= OP_EXACTN_IC
; break;
571 case 1: op
= OP_EXACT1
; break;
572 case 2: op
= OP_EXACT2
; break;
573 case 3: op
= OP_EXACT3
; break;
574 case 4: op
= OP_EXACT4
; break;
575 case 5: op
= OP_EXACT5
; break;
576 default: op
= OP_EXACTN
; break;
582 case 1: op
= OP_EXACTMB2N1
; break;
583 case 2: op
= OP_EXACTMB2N2
; break;
584 case 3: op
= OP_EXACTMB2N3
; break;
585 default: op
= OP_EXACTMB2N
; break;
602 is_strict_real_node(Node
* node
)
604 switch (NODE_TYPE(node
)) {
607 StrNode
* sn
= STR_(node
);
608 return (sn
->end
!= sn
->s
);
624 compile_tree_empty_check(Node
* node
, regex_t
* reg
, int emptiness
, ScanEnv
* env
)
627 int saved_num_null_check
= reg
->num_null_check
;
629 if (emptiness
!= BODY_IS_NOT_EMPTY
) {
630 r
= add_op(reg
, OP_EMPTY_CHECK_START
);
631 if (r
!= 0) return r
;
632 COP(reg
)->empty_check_start
.mem
= reg
->num_null_check
; /* NULL CHECK ID */
633 reg
->num_null_check
++;
636 r
= compile_tree(node
, reg
, env
);
637 if (r
!= 0) return r
;
639 if (emptiness
!= BODY_IS_NOT_EMPTY
) {
640 if (emptiness
== BODY_IS_EMPTY_POSSIBILITY
)
641 r
= add_op(reg
, OP_EMPTY_CHECK_END
);
642 else if (emptiness
== BODY_IS_EMPTY_POSSIBILITY_MEM
)
643 r
= add_op(reg
, OP_EMPTY_CHECK_END_MEMST
);
644 else if (emptiness
== BODY_IS_EMPTY_POSSIBILITY_REC
)
645 r
= add_op(reg
, OP_EMPTY_CHECK_END_MEMST_PUSH
);
647 if (r
!= 0) return r
;
648 COP(reg
)->empty_check_end
.mem
= saved_num_null_check
; /* NULL CHECK ID */
655 compile_call(CallNode
* node
, regex_t
* reg
, ScanEnv
* env
)
660 r
= add_op(reg
, OP_CALL
);
661 if (r
!= 0) return r
;
663 COP(reg
)->call
.addr
= 0; /* dummy addr. */
665 offset
= COP_CURR_OFFSET_BYTES(reg
, call
.addr
);
666 r
= unset_addr_list_add(env
->unset_addr_list
, offset
, NODE_CALL_BODY(node
));
672 compile_tree_n_times(Node
* node
, int n
, regex_t
* reg
, ScanEnv
* env
)
676 for (i
= 0; i
< n
; i
++) {
677 r
= compile_tree(node
, reg
, env
);
678 if (r
!= 0) return r
;
684 add_compile_string_length(UChar
* s ARG_UNUSED
, int mb_len
, int str_len
,
685 regex_t
* reg ARG_UNUSED
, int ignore_case
)
691 add_compile_string(UChar
* s
, int mb_len
, int str_len
,
692 regex_t
* reg
, int ignore_case
)
700 op
= select_str_opcode(mb_len
, str_len
, ignore_case
);
702 if (r
!= 0) return r
;
704 byte_len
= mb_len
* str_len
;
707 if (op
== OP_EXACTMBN
) {
708 p
= onigenc_strdup(reg
->enc
, s
, end
);
709 CHECK_NULL_RETURN_MEMERR(p
);
711 COP(reg
)->exact_len_n
.len
= mb_len
;
712 COP(reg
)->exact_len_n
.n
= str_len
;
713 COP(reg
)->exact_len_n
.s
= p
;
715 else if (IS_NEED_STR_LEN_OP_EXACT(op
)) {
716 p
= onigenc_strdup(reg
->enc
, s
, end
);
717 CHECK_NULL_RETURN_MEMERR(p
);
719 if (op
== OP_EXACTN_IC
)
720 COP(reg
)->exact_n
.n
= byte_len
;
722 COP(reg
)->exact_n
.n
= str_len
;
724 COP(reg
)->exact_n
.s
= p
;
727 xmemcpy(COP(reg
)->exact
.s
, s
, (size_t )byte_len
);
728 COP(reg
)->exact
.s
[byte_len
] = '\0';
735 compile_length_string_node(Node
* node
, regex_t
* reg
)
737 int rlen
, r
, len
, prev_len
, slen
, ambig
;
740 OnigEncoding enc
= reg
->enc
;
743 if (sn
->end
<= sn
->s
)
746 ambig
= NODE_STRING_IS_AMBIG(node
);
749 prev_len
= enclen(enc
, p
);
754 for (; p
< sn
->end
; ) {
755 len
= enclen(enc
, p
);
756 if (len
== prev_len
) {
760 r
= add_compile_string_length(prev
, prev_len
, slen
, reg
, ambig
);
769 r
= add_compile_string_length(prev
, prev_len
, slen
, reg
, ambig
);
775 compile_length_string_raw_node(StrNode
* sn
, regex_t
* reg
)
777 if (sn
->end
<= sn
->s
)
780 return add_compile_string_length(sn
->s
, 1 /* sb */, (int )(sn
->end
- sn
->s
),
785 compile_string_node(Node
* node
, regex_t
* reg
)
787 int r
, len
, prev_len
, slen
, ambig
;
788 UChar
*p
, *prev
, *end
;
790 OnigEncoding enc
= reg
->enc
;
793 if (sn
->end
<= sn
->s
)
797 ambig
= NODE_STRING_IS_AMBIG(node
);
800 prev_len
= enclen(enc
, p
);
805 len
= enclen(enc
, p
);
806 if (len
== prev_len
) {
810 r
= add_compile_string(prev
, prev_len
, slen
, reg
, ambig
);
811 if (r
!= 0) return r
;
821 return add_compile_string(prev
, prev_len
, slen
, reg
, ambig
);
825 compile_string_raw_node(StrNode
* sn
, regex_t
* reg
)
827 if (sn
->end
<= sn
->s
)
830 return add_compile_string(sn
->s
, 1 /* sb */, (int )(sn
->end
- sn
->s
), reg
, 0);
834 set_multi_byte_cclass(BBuf
* mbuf
, regex_t
* reg
)
839 len
= (size_t )mbuf
->used
;
841 if (IS_NULL(p
)) return NULL
;
843 xmemcpy(p
, mbuf
->p
, len
);
848 compile_length_cclass_node(CClassNode
* cc
, regex_t
* reg
)
854 compile_cclass_node(CClassNode
* cc
, regex_t
* reg
)
858 if (IS_NULL(cc
->mbuf
)) {
859 r
= add_op(reg
, IS_NCCLASS_NOT(cc
) ? OP_CCLASS_NOT
: OP_CCLASS
);
860 if (r
!= 0) return r
;
862 COP(reg
)->cclass
.bsp
= xmalloc(SIZE_BITSET
);
863 CHECK_NULL_RETURN_MEMERR(COP(reg
)->cclass
.bsp
);
864 xmemcpy(COP(reg
)->cclass
.bsp
, cc
->bs
, SIZE_BITSET
);
869 if (ONIGENC_MBC_MINLEN(reg
->enc
) > 1 || bitset_is_empty(cc
->bs
)) {
870 r
= add_op(reg
, IS_NCCLASS_NOT(cc
) ? OP_CCLASS_MB_NOT
: OP_CCLASS_MB
);
871 if (r
!= 0) return r
;
873 p
= set_multi_byte_cclass(cc
->mbuf
, reg
);
874 CHECK_NULL_RETURN_MEMERR(p
);
875 COP(reg
)->cclass_mb
.mb
= p
;
878 r
= add_op(reg
, IS_NCCLASS_NOT(cc
) ? OP_CCLASS_MIX_NOT
: OP_CCLASS_MIX
);
879 if (r
!= 0) return r
;
881 COP(reg
)->cclass_mix
.bsp
= xmalloc(SIZE_BITSET
);
882 CHECK_NULL_RETURN_MEMERR(COP(reg
)->cclass_mix
.bsp
);
883 xmemcpy(COP(reg
)->cclass_mix
.bsp
, cc
->bs
, SIZE_BITSET
);
885 p
= set_multi_byte_cclass(cc
->mbuf
, reg
);
886 CHECK_NULL_RETURN_MEMERR(p
);
887 COP(reg
)->cclass_mix
.mb
= p
;
895 entry_repeat_range(regex_t
* reg
, int id
, int lower
, int upper
)
897 #define REPEAT_RANGE_ALLOC 4
901 if (reg
->repeat_range_alloc
== 0) {
902 p
= (OnigRepeatRange
* )xmalloc(sizeof(OnigRepeatRange
) * REPEAT_RANGE_ALLOC
);
903 CHECK_NULL_RETURN_MEMERR(p
);
904 reg
->repeat_range
= p
;
905 reg
->repeat_range_alloc
= REPEAT_RANGE_ALLOC
;
907 else if (reg
->repeat_range_alloc
<= id
) {
909 n
= reg
->repeat_range_alloc
+ REPEAT_RANGE_ALLOC
;
910 p
= (OnigRepeatRange
* )xrealloc(reg
->repeat_range
,
911 sizeof(OnigRepeatRange
) * n
,
912 sizeof(OnigRepeatRange
) * reg
->repeat_range_alloc
);
913 CHECK_NULL_RETURN_MEMERR(p
);
914 reg
->repeat_range
= p
;
915 reg
->repeat_range_alloc
= n
;
918 p
= reg
->repeat_range
;
922 p
[id
].upper
= (IS_INFINITE_REPEAT(upper
) ? 0x7fffffff : upper
);
927 compile_range_repeat_node(QuantNode
* qn
, int target_len
, int emptiness
,
928 regex_t
* reg
, ScanEnv
* env
)
931 int num_repeat
= reg
->num_repeat
++;
933 r
= add_op(reg
, qn
->greedy
? OP_REPEAT
: OP_REPEAT_NG
);
934 if (r
!= 0) return r
;
936 COP(reg
)->repeat
.id
= num_repeat
;
937 COP(reg
)->repeat
.addr
= SIZE_INC_OP
+ target_len
+ SIZE_OP_REPEAT_INC
;
939 r
= entry_repeat_range(reg
, num_repeat
, qn
->lower
, qn
->upper
);
940 if (r
!= 0) return r
;
942 r
= compile_tree_empty_check(NODE_QUANT_BODY(qn
), reg
, emptiness
, env
);
943 if (r
!= 0) return r
;
947 NODE_IS_IN_MULTI_ENTRY(qn
) ||
949 NODE_IS_IN_REAL_REPEAT(qn
)) {
950 r
= add_op(reg
, qn
->greedy
? OP_REPEAT_INC_SG
: OP_REPEAT_INC_NG_SG
);
953 r
= add_op(reg
, qn
->greedy
? OP_REPEAT_INC
: OP_REPEAT_INC_NG
);
955 if (r
!= 0) return r
;
957 COP(reg
)->repeat_inc
.id
= num_repeat
;
962 is_anychar_infinite_greedy(QuantNode
* qn
)
964 if (qn
->greedy
&& IS_INFINITE_REPEAT(qn
->upper
) &&
965 NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn
)))
971 #define QUANTIFIER_EXPAND_LIMIT_SIZE 10
972 #define CKN_ON (ckn > 0)
975 compile_length_quantifier_node(QuantNode
* qn
, regex_t
* reg
)
978 int infinite
= IS_INFINITE_REPEAT(qn
->upper
);
979 enum BodyEmptyType emptiness
= qn
->emptiness
;
980 int tlen
= compile_length_tree(NODE_QUANT_BODY(qn
), reg
);
982 if (tlen
< 0) return tlen
;
983 if (tlen
== 0) return 0;
986 if (is_anychar_infinite_greedy(qn
)) {
987 if (qn
->lower
<= 1 ||
988 int_multiply_cmp(tlen
, qn
->lower
, QUANTIFIER_EXPAND_LIMIT_SIZE
) <= 0) {
989 if (IS_NOT_NULL(qn
->next_head_exact
))
990 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT
+ tlen
* qn
->lower
;
992 return SIZE_OP_ANYCHAR_STAR
+ tlen
* qn
->lower
;
997 if (emptiness
!= BODY_IS_NOT_EMPTY
)
998 mod_tlen
+= SIZE_OP_EMPTY_CHECK_START
+ SIZE_OP_EMPTY_CHECK_END
;
1002 int_multiply_cmp(tlen
, qn
->lower
, QUANTIFIER_EXPAND_LIMIT_SIZE
) <= 0)) {
1003 if (qn
->lower
== 1 && tlen
> QUANTIFIER_EXPAND_LIMIT_SIZE
) {
1007 len
= tlen
* qn
->lower
;
1011 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1012 if (IS_NOT_NULL(qn
->head_exact
))
1013 len
+= SIZE_OP_PUSH_OR_JUMP_EXACT1
+ mod_tlen
+ SIZE_OP_JUMP
;
1016 if (IS_NOT_NULL(qn
->next_head_exact
))
1017 len
+= SIZE_OP_PUSH_IF_PEEK_NEXT
+ mod_tlen
+ SIZE_OP_JUMP
;
1019 len
+= SIZE_OP_PUSH
+ mod_tlen
+ SIZE_OP_JUMP
;
1022 len
+= SIZE_OP_JUMP
+ mod_tlen
+ SIZE_OP_PUSH
;
1024 else if (qn
->upper
== 0) {
1025 if (qn
->is_refered
!= 0) { /* /(?<n>..){0}/ */
1026 len
= SIZE_OP_JUMP
+ tlen
;
1031 else if (!infinite
&& qn
->greedy
&&
1033 int_multiply_cmp(tlen
+ SIZE_OP_PUSH
, qn
->upper
,
1034 QUANTIFIER_EXPAND_LIMIT_SIZE
) <= 0)) {
1035 len
= tlen
* qn
->lower
;
1036 len
+= (SIZE_OP_PUSH
+ tlen
) * (qn
->upper
- qn
->lower
);
1038 else if (!qn
->greedy
&& qn
->upper
== 1 && qn
->lower
== 0) { /* '??' */
1039 len
= SIZE_OP_PUSH
+ SIZE_OP_JUMP
+ tlen
;
1042 len
= SIZE_OP_REPEAT_INC
+ mod_tlen
+ SIZE_OP_REPEAT
;
1049 compile_quantifier_node(QuantNode
* qn
, regex_t
* reg
, ScanEnv
* env
)
1052 int infinite
= IS_INFINITE_REPEAT(qn
->upper
);
1053 enum BodyEmptyType emptiness
= qn
->emptiness
;
1054 int tlen
= compile_length_tree(NODE_QUANT_BODY(qn
), reg
);
1056 if (tlen
< 0) return tlen
;
1057 if (tlen
== 0) return 0;
1059 if (is_anychar_infinite_greedy(qn
) &&
1061 int_multiply_cmp(tlen
, qn
->lower
, QUANTIFIER_EXPAND_LIMIT_SIZE
) <= 0)) {
1062 r
= compile_tree_n_times(NODE_QUANT_BODY(qn
), qn
->lower
, reg
, env
);
1063 if (r
!= 0) return r
;
1064 if (IS_NOT_NULL(qn
->next_head_exact
)) {
1066 IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn
), reg
)) ?
1067 OP_ANYCHAR_ML_STAR_PEEK_NEXT
: OP_ANYCHAR_STAR_PEEK_NEXT
);
1068 if (r
!= 0) return r
;
1070 COP(reg
)->anychar_star_peek_next
.c
= STR_(qn
->next_head_exact
)->s
[0];
1075 IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn
), reg
)) ?
1076 OP_ANYCHAR_ML_STAR
: OP_ANYCHAR_STAR
);
1082 if (emptiness
!= BODY_IS_NOT_EMPTY
)
1083 mod_tlen
+= SIZE_OP_EMPTY_CHECK_START
+ SIZE_OP_EMPTY_CHECK_END
;
1087 int_multiply_cmp(tlen
, qn
->lower
, QUANTIFIER_EXPAND_LIMIT_SIZE
) <= 0)) {
1090 if (qn
->lower
== 1 && tlen
> QUANTIFIER_EXPAND_LIMIT_SIZE
) {
1091 r
= add_op(reg
, OP_JUMP
);
1092 if (r
!= 0) return r
;
1094 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1095 if (IS_NOT_NULL(qn
->head_exact
))
1096 COP(reg
)->jump
.addr
= SIZE_OP_PUSH_OR_JUMP_EXACT1
+ SIZE_INC_OP
;
1099 if (IS_NOT_NULL(qn
->next_head_exact
))
1100 COP(reg
)->jump
.addr
= SIZE_OP_PUSH_IF_PEEK_NEXT
+ SIZE_INC_OP
;
1102 COP(reg
)->jump
.addr
= SIZE_OP_PUSH
+ SIZE_INC_OP
;
1105 COP(reg
)->jump
.addr
= SIZE_OP_JUMP
+ SIZE_INC_OP
;
1109 r
= compile_tree_n_times(NODE_QUANT_BODY(qn
), qn
->lower
, reg
, env
);
1110 if (r
!= 0) return r
;
1114 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1115 if (IS_NOT_NULL(qn
->head_exact
)) {
1116 r
= add_op(reg
, OP_PUSH_OR_JUMP_EXACT1
);
1117 if (r
!= 0) return r
;
1118 COP(reg
)->push_or_jump_exact1
.addr
= SIZE_INC_OP
+ mod_tlen
+ SIZE_OP_JUMP
;
1119 COP(reg
)->push_or_jump_exact1
.c
= STR_(qn
->head_exact
)->s
[0];
1121 r
= compile_tree_empty_check(NODE_QUANT_BODY(qn
), reg
, emptiness
, env
);
1122 if (r
!= 0) return r
;
1124 addr
= -(mod_tlen
+ (int )SIZE_OP_PUSH_OR_JUMP_EXACT1
);
1128 if (IS_NOT_NULL(qn
->next_head_exact
)) {
1129 r
= add_op(reg
, OP_PUSH_IF_PEEK_NEXT
);
1130 if (r
!= 0) return r
;
1131 COP(reg
)->push_if_peek_next
.addr
= SIZE_INC_OP
+ mod_tlen
+ SIZE_OP_JUMP
;
1132 COP(reg
)->push_if_peek_next
.c
= STR_(qn
->next_head_exact
)->s
[0];
1134 r
= compile_tree_empty_check(NODE_QUANT_BODY(qn
), reg
, emptiness
, env
);
1135 if (r
!= 0) return r
;
1137 addr
= -(mod_tlen
+ (int )SIZE_OP_PUSH_IF_PEEK_NEXT
);
1140 r
= add_op(reg
, OP_PUSH
);
1141 if (r
!= 0) return r
;
1142 COP(reg
)->push
.addr
= SIZE_INC_OP
+ mod_tlen
+ SIZE_OP_JUMP
;
1144 r
= compile_tree_empty_check(NODE_QUANT_BODY(qn
), reg
, emptiness
, env
);
1145 if (r
!= 0) return r
;
1147 addr
= -(mod_tlen
+ (int )SIZE_OP_PUSH
);
1150 r
= add_op(reg
, OP_JUMP
);
1151 if (r
!= 0) return r
;
1152 COP(reg
)->jump
.addr
= addr
;
1155 r
= add_op(reg
, OP_JUMP
);
1156 if (r
!= 0) return r
;
1157 COP(reg
)->jump
.addr
= mod_tlen
+ SIZE_INC_OP
;
1159 r
= compile_tree_empty_check(NODE_QUANT_BODY(qn
), reg
, emptiness
, env
);
1160 if (r
!= 0) return r
;
1162 r
= add_op(reg
, OP_PUSH
);
1163 if (r
!= 0) return r
;
1164 COP(reg
)->push
.addr
= -mod_tlen
;
1167 else if (qn
->upper
== 0) {
1168 if (qn
->is_refered
!= 0) { /* /(?<n>..){0}/ */
1169 r
= add_op(reg
, OP_JUMP
);
1170 if (r
!= 0) return r
;
1171 COP(reg
)->jump
.addr
= tlen
+ SIZE_INC_OP
;
1173 r
= compile_tree(NODE_QUANT_BODY(qn
), reg
, env
);
1176 /* Nothing output */
1180 else if (! infinite
&& qn
->greedy
&&
1182 int_multiply_cmp(tlen
+ SIZE_OP_PUSH
, qn
->upper
,
1183 QUANTIFIER_EXPAND_LIMIT_SIZE
) <= 0)) {
1184 int n
= qn
->upper
- qn
->lower
;
1186 r
= compile_tree_n_times(NODE_QUANT_BODY(qn
), qn
->lower
, reg
, env
);
1187 if (r
!= 0) return r
;
1189 for (i
= 0; i
< n
; i
++) {
1190 int v
= onig_positive_int_multiply(n
- i
, tlen
+ SIZE_OP_PUSH
);
1191 if (v
< 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE
;
1193 r
= add_op(reg
, OP_PUSH
);
1194 if (r
!= 0) return r
;
1195 COP(reg
)->push
.addr
= v
;
1197 r
= compile_tree(NODE_QUANT_BODY(qn
), reg
, env
);
1198 if (r
!= 0) return r
;
1201 else if (! qn
->greedy
&& qn
->upper
== 1 && qn
->lower
== 0) { /* '??' */
1202 r
= add_op(reg
, OP_PUSH
);
1203 if (r
!= 0) return r
;
1204 COP(reg
)->push
.addr
= SIZE_INC_OP
+ SIZE_OP_JUMP
;
1206 r
= add_op(reg
, OP_JUMP
);
1207 if (r
!= 0) return r
;
1208 COP(reg
)->jump
.addr
= tlen
+ SIZE_INC_OP
;
1210 r
= compile_tree(NODE_QUANT_BODY(qn
), reg
, env
);
1213 r
= compile_range_repeat_node(qn
, mod_tlen
, emptiness
, reg
, env
);
1219 compile_length_option_node(BagNode
* node
, regex_t
* reg
)
1222 OnigOptionType prev
= reg
->options
;
1224 reg
->options
= node
->o
.options
;
1225 tlen
= compile_length_tree(NODE_BAG_BODY(node
), reg
);
1226 reg
->options
= prev
;
1232 compile_option_node(BagNode
* node
, regex_t
* reg
, ScanEnv
* env
)
1235 OnigOptionType prev
= reg
->options
;
1237 reg
->options
= node
->o
.options
;
1238 r
= compile_tree(NODE_BAG_BODY(node
), reg
, env
);
1239 reg
->options
= prev
;
1245 compile_length_bag_node(BagNode
* node
, regex_t
* reg
)
1250 if (node
->type
== BAG_OPTION
)
1251 return compile_length_option_node(node
, reg
);
1253 if (NODE_BAG_BODY(node
)) {
1254 tlen
= compile_length_tree(NODE_BAG_BODY(node
), reg
);
1255 if (tlen
< 0) return tlen
;
1260 switch (node
->type
) {
1264 if (node
->m
.regnum
== 0 && NODE_IS_CALLED(node
)) {
1265 len
= tlen
+ SIZE_OP_CALL
+ SIZE_OP_JUMP
+ SIZE_OP_RETURN
;
1269 if (NODE_IS_CALLED(node
)) {
1270 len
= SIZE_OP_MEMORY_START_PUSH
+ tlen
1271 + SIZE_OP_CALL
+ SIZE_OP_JUMP
+ SIZE_OP_RETURN
;
1272 if (MEM_STATUS_AT0(reg
->bt_mem_end
, node
->m
.regnum
))
1273 len
+= (NODE_IS_RECURSION(node
)
1274 ? SIZE_OP_MEMORY_END_PUSH_REC
: SIZE_OP_MEMORY_END_PUSH
);
1276 len
+= (NODE_IS_RECURSION(node
)
1277 ? SIZE_OP_MEMORY_END_REC
: SIZE_OP_MEMORY_END
);
1279 else if (NODE_IS_RECURSION(node
)) {
1280 len
= SIZE_OP_MEMORY_START_PUSH
;
1281 len
+= tlen
+ (MEM_STATUS_AT0(reg
->bt_mem_end
, node
->m
.regnum
)
1282 ? SIZE_OP_MEMORY_END_PUSH_REC
: SIZE_OP_MEMORY_END_REC
);
1287 if (MEM_STATUS_AT0(reg
->bt_mem_start
, node
->m
.regnum
))
1288 len
= SIZE_OP_MEMORY_START_PUSH
;
1290 len
= SIZE_OP_MEMORY_START
;
1292 len
+= tlen
+ (MEM_STATUS_AT0(reg
->bt_mem_end
, node
->m
.regnum
)
1293 ? SIZE_OP_MEMORY_END_PUSH
: SIZE_OP_MEMORY_END
);
1297 case BAG_STOP_BACKTRACK
:
1298 if (NODE_IS_STRICT_REAL_REPEAT(node
)) {
1302 qn
= QUANT_(NODE_BAG_BODY(node
));
1303 tlen
= compile_length_tree(NODE_QUANT_BODY(qn
), reg
);
1304 if (tlen
< 0) return tlen
;
1306 v
= onig_positive_int_multiply(qn
->lower
, tlen
);
1307 if (v
< 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE
;
1308 len
= v
+ SIZE_OP_PUSH
+ tlen
+ SIZE_OP_POP_OUT
+ SIZE_OP_JUMP
;
1311 len
= SIZE_OP_ATOMIC_START
+ tlen
+ SIZE_OP_ATOMIC_END
;
1317 Node
* cond
= NODE_BAG_BODY(node
);
1318 Node
* Then
= node
->te
.Then
;
1319 Node
* Else
= node
->te
.Else
;
1321 len
= compile_length_tree(cond
, reg
);
1322 if (len
< 0) return len
;
1323 len
+= SIZE_OP_PUSH
;
1324 len
+= SIZE_OP_ATOMIC_START
+ SIZE_OP_ATOMIC_END
;
1326 if (IS_NOT_NULL(Then
)) {
1327 tlen
= compile_length_tree(Then
, reg
);
1328 if (tlen
< 0) return tlen
;
1332 len
+= SIZE_OP_JUMP
+ SIZE_OP_ATOMIC_END
;
1334 if (IS_NOT_NULL(Else
)) {
1335 tlen
= compile_length_tree(Else
, reg
);
1336 if (tlen
< 0) return tlen
;
1343 /* never come here, but set for escape warning */
1351 static int get_char_len_node(Node
* node
, regex_t
* reg
, int* len
);
1354 compile_bag_memory_node(BagNode
* node
, regex_t
* reg
, ScanEnv
* env
)
1360 if (NODE_IS_CALLED(node
)) {
1361 r
= add_op(reg
, OP_CALL
);
1362 if (r
!= 0) return r
;
1364 node
->m
.called_addr
= COP_CURR_OFFSET(reg
) + 1 + SIZE_OP_JUMP
;
1365 NODE_STATUS_ADD(node
, ADDR_FIXED
);
1366 COP(reg
)->call
.addr
= (int )node
->m
.called_addr
;
1368 if (node
->m
.regnum
== 0) {
1369 len
= compile_length_tree(NODE_BAG_BODY(node
), reg
);
1370 len
+= SIZE_OP_RETURN
;
1372 r
= add_op(reg
, OP_JUMP
);
1373 if (r
!= 0) return r
;
1374 COP(reg
)->jump
.addr
= len
+ SIZE_INC_OP
;
1376 r
= compile_tree(NODE_BAG_BODY(node
), reg
, env
);
1377 if (r
!= 0) return r
;
1379 r
= add_op(reg
, OP_RETURN
);
1383 len
= compile_length_tree(NODE_BAG_BODY(node
), reg
);
1384 len
+= (SIZE_OP_MEMORY_START_PUSH
+ SIZE_OP_RETURN
);
1385 if (MEM_STATUS_AT0(reg
->bt_mem_end
, node
->m
.regnum
))
1386 len
+= (NODE_IS_RECURSION(node
)
1387 ? SIZE_OP_MEMORY_END_PUSH_REC
: SIZE_OP_MEMORY_END_PUSH
);
1389 len
+= (NODE_IS_RECURSION(node
)
1390 ? SIZE_OP_MEMORY_END_REC
: SIZE_OP_MEMORY_END
);
1392 r
= add_op(reg
, OP_JUMP
);
1393 if (r
!= 0) return r
;
1394 COP(reg
)->jump
.addr
= len
+ SIZE_INC_OP
;
1399 if (MEM_STATUS_AT0(reg
->bt_mem_start
, node
->m
.regnum
))
1400 r
= add_op(reg
, OP_MEMORY_START_PUSH
);
1402 r
= add_op(reg
, OP_MEMORY_START
);
1403 if (r
!= 0) return r
;
1404 COP(reg
)->memory_start
.num
= node
->m
.regnum
;
1406 r
= compile_tree(NODE_BAG_BODY(node
), reg
, env
);
1407 if (r
!= 0) return r
;
1410 if (MEM_STATUS_AT0(reg
->bt_mem_end
, node
->m
.regnum
))
1411 r
= add_op(reg
, (NODE_IS_RECURSION(node
)
1412 ? OP_MEMORY_END_PUSH_REC
: OP_MEMORY_END_PUSH
));
1414 r
= add_op(reg
, (NODE_IS_RECURSION(node
) ? OP_MEMORY_END_REC
: OP_MEMORY_END
));
1415 if (r
!= 0) return r
;
1416 COP(reg
)->memory_end
.num
= node
->m
.regnum
;
1418 if (NODE_IS_CALLED(node
)) {
1419 if (r
!= 0) return r
;
1420 r
= add_op(reg
, OP_RETURN
);
1423 if (MEM_STATUS_AT0(reg
->bt_mem_end
, node
->m
.regnum
))
1424 r
= add_op(reg
, OP_MEMORY_END_PUSH
);
1426 r
= add_op(reg
, OP_MEMORY_END
);
1427 if (r
!= 0) return r
;
1428 COP(reg
)->memory_end
.num
= node
->m
.regnum
;
1435 compile_bag_node(BagNode
* node
, regex_t
* reg
, ScanEnv
* env
)
1439 switch (node
->type
) {
1441 r
= compile_bag_memory_node(node
, reg
, env
);
1445 r
= compile_option_node(node
, reg
, env
);
1448 case BAG_STOP_BACKTRACK
:
1449 if (NODE_IS_STRICT_REAL_REPEAT(node
)) {
1450 QuantNode
* qn
= QUANT_(NODE_BAG_BODY(node
));
1451 r
= compile_tree_n_times(NODE_QUANT_BODY(qn
), qn
->lower
, reg
, env
);
1452 if (r
!= 0) return r
;
1454 len
= compile_length_tree(NODE_QUANT_BODY(qn
), reg
);
1455 if (len
< 0) return len
;
1457 r
= add_op(reg
, OP_PUSH
);
1458 if (r
!= 0) return r
;
1459 COP(reg
)->push
.addr
= SIZE_INC_OP
+ len
+ SIZE_OP_POP_OUT
+ SIZE_OP_JUMP
;
1461 r
= compile_tree(NODE_QUANT_BODY(qn
), reg
, env
);
1462 if (r
!= 0) return r
;
1463 r
= add_op(reg
, OP_POP_OUT
);
1464 if (r
!= 0) return r
;
1466 r
= add_op(reg
, OP_JUMP
);
1467 if (r
!= 0) return r
;
1468 COP(reg
)->jump
.addr
= -((int )SIZE_OP_PUSH
+ len
+ (int )SIZE_OP_POP_OUT
);
1471 r
= add_op(reg
, OP_ATOMIC_START
);
1472 if (r
!= 0) return r
;
1473 r
= compile_tree(NODE_BAG_BODY(node
), reg
, env
);
1474 if (r
!= 0) return r
;
1475 r
= add_op(reg
, OP_ATOMIC_END
);
1481 int cond_len
, then_len
, else_len
, jump_len
;
1482 Node
* cond
= NODE_BAG_BODY(node
);
1483 Node
* Then
= node
->te
.Then
;
1484 Node
* Else
= node
->te
.Else
;
1486 r
= add_op(reg
, OP_ATOMIC_START
);
1487 if (r
!= 0) return r
;
1489 cond_len
= compile_length_tree(cond
, reg
);
1490 if (cond_len
< 0) return cond_len
;
1491 if (IS_NOT_NULL(Then
)) {
1492 then_len
= compile_length_tree(Then
, reg
);
1493 if (then_len
< 0) return then_len
;
1498 jump_len
= cond_len
+ then_len
+ SIZE_OP_ATOMIC_END
+ SIZE_OP_JUMP
;
1500 r
= add_op(reg
, OP_PUSH
);
1501 if (r
!= 0) return r
;
1502 COP(reg
)->push
.addr
= SIZE_INC_OP
+ jump_len
;
1504 r
= compile_tree(cond
, reg
, env
);
1505 if (r
!= 0) return r
;
1506 r
= add_op(reg
, OP_ATOMIC_END
);
1507 if (r
!= 0) return r
;
1509 if (IS_NOT_NULL(Then
)) {
1510 r
= compile_tree(Then
, reg
, env
);
1511 if (r
!= 0) return r
;
1514 if (IS_NOT_NULL(Else
)) {
1515 else_len
= compile_length_tree(Else
, reg
);
1516 if (else_len
< 0) return else_len
;
1521 r
= add_op(reg
, OP_JUMP
);
1522 if (r
!= 0) return r
;
1523 COP(reg
)->jump
.addr
= SIZE_OP_ATOMIC_END
+ else_len
+ SIZE_INC_OP
;
1525 r
= add_op(reg
, OP_ATOMIC_END
);
1526 if (r
!= 0) return r
;
1528 if (IS_NOT_NULL(Else
)) {
1529 r
= compile_tree(Else
, reg
, env
);
1539 compile_length_anchor_node(AnchorNode
* node
, regex_t
* reg
)
1544 if (IS_NOT_NULL(NODE_ANCHOR_BODY(node
))) {
1545 tlen
= compile_length_tree(NODE_ANCHOR_BODY(node
), reg
);
1546 if (tlen
< 0) return tlen
;
1549 switch (node
->type
) {
1550 case ANCR_PREC_READ
:
1551 len
= SIZE_OP_PREC_READ_START
+ tlen
+ SIZE_OP_PREC_READ_END
;
1553 case ANCR_PREC_READ_NOT
:
1554 len
= SIZE_OP_PREC_READ_NOT_START
+ tlen
+ SIZE_OP_PREC_READ_NOT_END
;
1556 case ANCR_LOOK_BEHIND
:
1557 len
= SIZE_OP_LOOK_BEHIND
+ tlen
;
1559 case ANCR_LOOK_BEHIND_NOT
:
1560 len
= SIZE_OP_LOOK_BEHIND_NOT_START
+ tlen
+ SIZE_OP_LOOK_BEHIND_NOT_END
;
1563 case ANCR_WORD_BOUNDARY
:
1564 case ANCR_NO_WORD_BOUNDARY
:
1565 #ifdef USE_WORD_BEGIN_END
1566 case ANCR_WORD_BEGIN
:
1569 len
= SIZE_OP_WORD_BOUNDARY
;
1572 case ANCR_TEXT_SEGMENT_BOUNDARY
:
1573 case ANCR_NO_TEXT_SEGMENT_BOUNDARY
:
1586 compile_anchor_node(AnchorNode
* node
, regex_t
* reg
, ScanEnv
* env
)
1591 switch (node
->type
) {
1592 case ANCR_BEGIN_BUF
: r
= add_op(reg
, OP_BEGIN_BUF
); break;
1593 case ANCR_END_BUF
: r
= add_op(reg
, OP_END_BUF
); break;
1594 case ANCR_BEGIN_LINE
: r
= add_op(reg
, OP_BEGIN_LINE
); break;
1595 case ANCR_END_LINE
: r
= add_op(reg
, OP_END_LINE
); break;
1596 case ANCR_SEMI_END_BUF
: r
= add_op(reg
, OP_SEMI_END_BUF
); break;
1597 case ANCR_BEGIN_POSITION
: r
= add_op(reg
, OP_BEGIN_POSITION
); break;
1599 case ANCR_WORD_BOUNDARY
:
1600 op
= OP_WORD_BOUNDARY
;
1602 r
= add_op(reg
, op
);
1603 if (r
!= 0) return r
;
1604 COP(reg
)->word_boundary
.mode
= (ModeType
)node
->ascii_mode
;
1607 case ANCR_NO_WORD_BOUNDARY
:
1608 op
= OP_NO_WORD_BOUNDARY
; goto word
;
1610 #ifdef USE_WORD_BEGIN_END
1611 case ANCR_WORD_BEGIN
:
1612 op
= OP_WORD_BEGIN
; goto word
;
1615 op
= OP_WORD_END
; goto word
;
1619 case ANCR_TEXT_SEGMENT_BOUNDARY
:
1620 case ANCR_NO_TEXT_SEGMENT_BOUNDARY
:
1622 enum TextSegmentBoundaryType type
;
1624 r
= add_op(reg
, OP_TEXT_SEGMENT_BOUNDARY
);
1625 if (r
!= 0) return r
;
1627 type
= EXTENDED_GRAPHEME_CLUSTER_BOUNDARY
;
1628 #ifdef USE_UNICODE_WORD_BREAK
1629 if (ONIG_IS_OPTION_ON(reg
->options
, ONIG_OPTION_TEXT_SEGMENT_WORD
))
1630 type
= WORD_BOUNDARY
;
1633 COP(reg
)->text_segment_boundary
.type
= type
;
1634 COP(reg
)->text_segment_boundary
.not =
1635 (node
->type
== ANCR_NO_TEXT_SEGMENT_BOUNDARY
? 1 : 0);
1639 case ANCR_PREC_READ
:
1640 r
= add_op(reg
, OP_PREC_READ_START
);
1641 if (r
!= 0) return r
;
1642 r
= compile_tree(NODE_ANCHOR_BODY(node
), reg
, env
);
1643 if (r
!= 0) return r
;
1644 r
= add_op(reg
, OP_PREC_READ_END
);
1647 case ANCR_PREC_READ_NOT
:
1648 len
= compile_length_tree(NODE_ANCHOR_BODY(node
), reg
);
1649 if (len
< 0) return len
;
1651 r
= add_op(reg
, OP_PREC_READ_NOT_START
);
1652 if (r
!= 0) return r
;
1653 COP(reg
)->prec_read_not_start
.addr
= SIZE_INC_OP
+ len
+ SIZE_OP_PREC_READ_NOT_END
;
1654 r
= compile_tree(NODE_ANCHOR_BODY(node
), reg
, env
);
1655 if (r
!= 0) return r
;
1656 r
= add_op(reg
, OP_PREC_READ_NOT_END
);
1659 case ANCR_LOOK_BEHIND
:
1662 r
= add_op(reg
, OP_LOOK_BEHIND
);
1663 if (r
!= 0) return r
;
1664 if (node
->char_len
< 0) {
1665 r
= get_char_len_node(NODE_ANCHOR_BODY(node
), reg
, &n
);
1666 if (r
!= 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN
;
1671 COP(reg
)->look_behind
.len
= n
;
1672 r
= compile_tree(NODE_ANCHOR_BODY(node
), reg
, env
);
1676 case ANCR_LOOK_BEHIND_NOT
:
1680 len
= compile_length_tree(NODE_ANCHOR_BODY(node
), reg
);
1681 r
= add_op(reg
, OP_LOOK_BEHIND_NOT_START
);
1682 if (r
!= 0) return r
;
1683 COP(reg
)->look_behind_not_start
.addr
= SIZE_INC_OP
+ len
+ SIZE_OP_LOOK_BEHIND_NOT_END
;
1685 if (node
->char_len
< 0) {
1686 r
= get_char_len_node(NODE_ANCHOR_BODY(node
), reg
, &n
);
1687 if (r
!= 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN
;
1692 COP(reg
)->look_behind_not_start
.len
= n
;
1694 r
= compile_tree(NODE_ANCHOR_BODY(node
), reg
, env
);
1695 if (r
!= 0) return r
;
1696 r
= add_op(reg
, OP_LOOK_BEHIND_NOT_END
);
1701 return ONIGERR_TYPE_BUG
;
1709 compile_gimmick_node(GimmickNode
* node
, regex_t
* reg
)
1713 switch (node
->type
) {
1715 r
= add_op(reg
, OP_FAIL
);
1719 r
= add_op(reg
, OP_PUSH_SAVE_VAL
);
1720 if (r
!= 0) return r
;
1721 COP(reg
)->push_save_val
.type
= node
->detail_type
;
1722 COP(reg
)->push_save_val
.id
= node
->id
;
1725 case GIMMICK_UPDATE_VAR
:
1726 r
= add_op(reg
, OP_UPDATE_VAR
);
1727 if (r
!= 0) return r
;
1728 COP(reg
)->update_var
.type
= node
->detail_type
;
1729 COP(reg
)->update_var
.id
= node
->id
;
1733 case GIMMICK_CALLOUT
:
1734 switch (node
->detail_type
) {
1735 case ONIG_CALLOUT_OF_CONTENTS
:
1736 case ONIG_CALLOUT_OF_NAME
:
1738 if (node
->detail_type
== ONIG_CALLOUT_OF_NAME
) {
1739 r
= add_op(reg
, OP_CALLOUT_NAME
);
1740 if (r
!= 0) return r
;
1741 COP(reg
)->callout_name
.id
= node
->id
;
1742 COP(reg
)->callout_name
.num
= node
->num
;
1745 r
= add_op(reg
, OP_CALLOUT_CONTENTS
);
1746 if (r
!= 0) return r
;
1747 COP(reg
)->callout_contents
.num
= node
->num
;
1753 r
= ONIGERR_TYPE_BUG
;
1763 compile_length_gimmick_node(GimmickNode
* node
, regex_t
* reg
)
1767 switch (node
->type
) {
1773 len
= SIZE_OP_PUSH_SAVE_VAL
;
1776 case GIMMICK_UPDATE_VAR
:
1777 len
= SIZE_OP_UPDATE_VAR
;
1781 case GIMMICK_CALLOUT
:
1782 switch (node
->detail_type
) {
1783 case ONIG_CALLOUT_OF_CONTENTS
:
1784 len
= SIZE_OP_CALLOUT_CONTENTS
;
1786 case ONIG_CALLOUT_OF_NAME
:
1787 len
= SIZE_OP_CALLOUT_NAME
;
1791 len
= ONIGERR_TYPE_BUG
;
1802 compile_length_tree(Node
* node
, regex_t
* reg
)
1806 switch (NODE_TYPE(node
)) {
1810 r
= compile_length_tree(NODE_CAR(node
), reg
);
1811 if (r
< 0) return r
;
1813 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
1823 r
+= compile_length_tree(NODE_CAR(node
), reg
);
1825 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
1826 r
+= (SIZE_OP_PUSH
+ SIZE_OP_JUMP
) * (n
- 1);
1831 if (NODE_STRING_IS_RAW(node
))
1832 r
= compile_length_string_raw_node(STR_(node
), reg
);
1834 r
= compile_length_string_node(node
, reg
);
1838 r
= compile_length_cclass_node(CCLASS_(node
), reg
);
1846 r
= SIZE_OP_BACKREF
;
1856 r
= compile_length_quantifier_node(QUANT_(node
), reg
);
1860 r
= compile_length_bag_node(BAG_(node
), reg
);
1864 r
= compile_length_anchor_node(ANCHOR_(node
), reg
);
1868 r
= compile_length_gimmick_node(GIMMICK_(node
), reg
);
1872 return ONIGERR_TYPE_BUG
;
1880 compile_tree(Node
* node
, regex_t
* reg
, ScanEnv
* env
)
1882 int n
, len
, pos
, r
= 0;
1884 switch (NODE_TYPE(node
)) {
1887 r
= compile_tree(NODE_CAR(node
), reg
, env
);
1888 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
1896 len
+= compile_length_tree(NODE_CAR(x
), reg
);
1897 if (IS_NOT_NULL(NODE_CDR(x
))) {
1898 len
+= SIZE_OP_PUSH
+ SIZE_OP_JUMP
;
1900 } while (IS_NOT_NULL(x
= NODE_CDR(x
)));
1901 pos
= COP_CURR_OFFSET(reg
) + 1 + len
; /* goal position */
1904 len
= compile_length_tree(NODE_CAR(node
), reg
);
1905 if (IS_NOT_NULL(NODE_CDR(node
))) {
1906 enum OpCode push
= NODE_IS_SUPER(node
) ? OP_PUSH_SUPER
: OP_PUSH
;
1907 r
= add_op(reg
, push
);
1909 COP(reg
)->push
.addr
= SIZE_INC_OP
+ len
+ SIZE_OP_JUMP
;
1911 r
= compile_tree(NODE_CAR(node
), reg
, env
);
1913 if (IS_NOT_NULL(NODE_CDR(node
))) {
1914 len
= pos
- (COP_CURR_OFFSET(reg
) + 1);
1915 r
= add_op(reg
, OP_JUMP
);
1917 COP(reg
)->jump
.addr
= len
;
1919 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
1924 if (NODE_STRING_IS_RAW(node
))
1925 r
= compile_string_raw_node(STR_(node
), reg
);
1927 r
= compile_string_node(node
, reg
);
1931 r
= compile_cclass_node(CCLASS_(node
), reg
);
1938 switch (CTYPE_(node
)->ctype
) {
1940 r
= add_op(reg
, IS_MULTILINE(CTYPE_OPTION(node
, reg
)) ?
1941 OP_ANYCHAR_ML
: OP_ANYCHAR
);
1944 case ONIGENC_CTYPE_WORD
:
1945 if (CTYPE_(node
)->ascii_mode
== 0) {
1946 op
= CTYPE_(node
)->not != 0 ? OP_NO_WORD
: OP_WORD
;
1949 op
= CTYPE_(node
)->not != 0 ? OP_NO_WORD_ASCII
: OP_WORD_ASCII
;
1951 r
= add_op(reg
, op
);
1955 return ONIGERR_TYPE_BUG
;
1963 BackRefNode
* br
= BACKREF_(node
);
1965 if (NODE_IS_CHECKER(node
)) {
1966 #ifdef USE_BACKREF_WITH_LEVEL
1967 if (NODE_IS_NEST_LEVEL(node
)) {
1968 r
= add_op(reg
, OP_BACKREF_CHECK_WITH_LEVEL
);
1969 if (r
!= 0) return r
;
1970 COP(reg
)->backref_general
.nest_level
= br
->nest_level
;
1975 r
= add_op(reg
, OP_BACKREF_CHECK
);
1976 if (r
!= 0) return r
;
1978 goto add_bacref_mems
;
1981 #ifdef USE_BACKREF_WITH_LEVEL
1982 if (NODE_IS_NEST_LEVEL(node
)) {
1983 if ((reg
->options
& ONIG_OPTION_IGNORECASE
) != 0)
1984 r
= add_op(reg
, OP_BACKREF_WITH_LEVEL_IC
);
1986 r
= add_op(reg
, OP_BACKREF_WITH_LEVEL
);
1988 if (r
!= 0) return r
;
1989 COP(reg
)->backref_general
.nest_level
= br
->nest_level
;
1990 goto add_bacref_mems
;
1994 if (br
->back_num
== 1) {
1995 n
= br
->back_static
[0];
1996 if (IS_IGNORECASE(reg
->options
)) {
1997 r
= add_op(reg
, OP_BACKREF_N_IC
);
1998 if (r
!= 0) return r
;
1999 COP(reg
)->backref_n
.n1
= n
;
2003 case 1: r
= add_op(reg
, OP_BACKREF1
); break;
2004 case 2: r
= add_op(reg
, OP_BACKREF2
); break;
2006 r
= add_op(reg
, OP_BACKREF_N
);
2007 if (r
!= 0) return r
;
2008 COP(reg
)->backref_n
.n1
= n
;
2017 r
= add_op(reg
, IS_IGNORECASE(reg
->options
) ?
2018 OP_BACKREF_MULTI_IC
: OP_BACKREF_MULTI
);
2019 if (r
!= 0) return r
;
2023 COP(reg
)->backref_general
.num
= num
;
2025 COP(reg
)->backref_general
.n1
= br
->back_static
[0];
2031 ns
= xmalloc(sizeof(MemNumType
) * num
);
2032 CHECK_NULL_RETURN_MEMERR(ns
);
2033 COP(reg
)->backref_general
.ns
= ns
;
2035 for (i
= num
- 1, j
= 0; i
>= 0; i
--, j
++) {
2046 r
= compile_call(CALL_(node
), reg
, env
);
2051 r
= compile_quantifier_node(QUANT_(node
), reg
, env
);
2055 r
= compile_bag_node(BAG_(node
), reg
, env
);
2059 r
= compile_anchor_node(ANCHOR_(node
), reg
, env
);
2063 r
= compile_gimmick_node(GIMMICK_(node
), reg
);
2068 fprintf(stderr
, "compile_tree: undefined node type %d\n", NODE_TYPE(node
));
2077 noname_disable_map(Node
** plink
, GroupNumRemap
* map
, int* counter
)
2080 Node
* node
= *plink
;
2082 switch (NODE_TYPE(node
)) {
2086 r
= noname_disable_map(&(NODE_CAR(node
)), map
, counter
);
2087 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
2092 Node
** ptarget
= &(NODE_BODY(node
));
2093 Node
* old
= *ptarget
;
2094 r
= noname_disable_map(ptarget
, map
, counter
);
2095 if (*ptarget
!= old
&& NODE_TYPE(*ptarget
) == NODE_QUANT
) {
2096 onig_reduce_nested_quantifier(node
, *ptarget
);
2103 BagNode
* en
= BAG_(node
);
2104 if (en
->type
== BAG_MEMORY
) {
2105 if (NODE_IS_NAMED_GROUP(node
)) {
2107 map
[en
->m
.regnum
].new_val
= *counter
;
2108 en
->m
.regnum
= *counter
;
2109 r
= noname_disable_map(&(NODE_BODY(node
)), map
, counter
);
2112 *plink
= NODE_BODY(node
);
2113 NODE_BODY(node
) = NULL_NODE
;
2114 onig_node_free(node
);
2115 r
= noname_disable_map(plink
, map
, counter
);
2118 else if (en
->type
== BAG_IF_ELSE
) {
2119 r
= noname_disable_map(&(NODE_BAG_BODY(en
)), map
, counter
);
2120 if (r
!= 0) return r
;
2121 if (IS_NOT_NULL(en
->te
.Then
)) {
2122 r
= noname_disable_map(&(en
->te
.Then
), map
, counter
);
2123 if (r
!= 0) return r
;
2125 if (IS_NOT_NULL(en
->te
.Else
)) {
2126 r
= noname_disable_map(&(en
->te
.Else
), map
, counter
);
2127 if (r
!= 0) return r
;
2131 r
= noname_disable_map(&(NODE_BODY(node
)), map
, counter
);
2136 if (IS_NOT_NULL(NODE_BODY(node
)))
2137 r
= noname_disable_map(&(NODE_BODY(node
)), map
, counter
);
2148 renumber_node_backref(Node
* node
, GroupNumRemap
* map
)
2150 int i
, pos
, n
, old_num
;
2152 BackRefNode
* bn
= BACKREF_(node
);
2154 if (! NODE_IS_BY_NAME(node
))
2155 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED
;
2157 old_num
= bn
->back_num
;
2158 if (IS_NULL(bn
->back_dynamic
))
2159 backs
= bn
->back_static
;
2161 backs
= bn
->back_dynamic
;
2163 for (i
= 0, pos
= 0; i
< old_num
; i
++) {
2164 n
= map
[backs
[i
]].new_val
;
2176 renumber_by_map(Node
* node
, GroupNumRemap
* map
)
2180 switch (NODE_TYPE(node
)) {
2184 r
= renumber_by_map(NODE_CAR(node
), map
);
2185 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
2189 r
= renumber_by_map(NODE_BODY(node
), map
);
2194 BagNode
* en
= BAG_(node
);
2196 r
= renumber_by_map(NODE_BODY(node
), map
);
2197 if (r
!= 0) return r
;
2199 if (en
->type
== BAG_IF_ELSE
) {
2200 if (IS_NOT_NULL(en
->te
.Then
)) {
2201 r
= renumber_by_map(en
->te
.Then
, map
);
2202 if (r
!= 0) return r
;
2204 if (IS_NOT_NULL(en
->te
.Else
)) {
2205 r
= renumber_by_map(en
->te
.Else
, map
);
2206 if (r
!= 0) return r
;
2213 r
= renumber_node_backref(node
, map
);
2217 if (IS_NOT_NULL(NODE_BODY(node
)))
2218 r
= renumber_by_map(NODE_BODY(node
), map
);
2229 numbered_ref_check(Node
* node
)
2233 switch (NODE_TYPE(node
)) {
2237 r
= numbered_ref_check(NODE_CAR(node
));
2238 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
2242 if (IS_NULL(NODE_BODY(node
)))
2246 r
= numbered_ref_check(NODE_BODY(node
));
2251 BagNode
* en
= BAG_(node
);
2253 r
= numbered_ref_check(NODE_BODY(node
));
2254 if (r
!= 0) return r
;
2256 if (en
->type
== BAG_IF_ELSE
) {
2257 if (IS_NOT_NULL(en
->te
.Then
)) {
2258 r
= numbered_ref_check(en
->te
.Then
);
2259 if (r
!= 0) return r
;
2261 if (IS_NOT_NULL(en
->te
.Else
)) {
2262 r
= numbered_ref_check(en
->te
.Else
);
2263 if (r
!= 0) return r
;
2271 if (! NODE_IS_BY_NAME(node
))
2272 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED
;
2283 disable_noname_group_capture(Node
** root
, regex_t
* reg
, ScanEnv
* env
)
2285 int r
, i
, pos
, counter
;
2290 map
= (GroupNumRemap
* )xmalloc(sizeof(GroupNumRemap
) * (env
->num_mem
+ 1));
2291 CHECK_NULL_RETURN_MEMERR(map
);
2292 for (i
= 1; i
<= env
->num_mem
; i
++) {
2296 r
= noname_disable_map(root
, map
, &counter
);
2297 if (r
!= 0) return r
;
2299 r
= renumber_by_map(*root
, map
);
2300 if (r
!= 0) return r
;
2302 for (i
= 1, pos
= 1; i
<= env
->num_mem
; i
++) {
2303 if (map
[i
].new_val
> 0) {
2304 SCANENV_MEMENV(env
)[pos
] = SCANENV_MEMENV(env
)[i
];
2309 loc
= env
->capture_history
;
2310 MEM_STATUS_CLEAR(env
->capture_history
);
2311 for (i
= 1; i
<= ONIG_MAX_CAPTURE_HISTORY_GROUP
; i
++) {
2312 if (MEM_STATUS_AT(loc
, i
)) {
2313 MEM_STATUS_ON_SIMPLE(env
->capture_history
, map
[i
].new_val
);
2317 env
->num_mem
= env
->num_named
;
2318 reg
->num_mem
= env
->num_named
;
2319 result
= onig_renumber_name_table(reg
, map
);
2326 fix_unset_addr_list(UnsetAddrList
* uslist
, regex_t
* reg
)
2333 for (i
= 0; i
< uslist
->num
; i
++) {
2334 if (! NODE_IS_ADDR_FIXED(uslist
->us
[i
].target
))
2335 return ONIGERR_PARSER_BUG
;
2337 en
= BAG_(uslist
->us
[i
].target
);
2338 addr
= en
->m
.called_addr
;
2339 offset
= uslist
->us
[i
].offset
;
2341 paddr
= (AbsAddrType
* )((char* )reg
->ops
+ offset
);
2349 #define GET_CHAR_LEN_VARLEN -1
2350 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2352 /* fixed size pattern node only */
2354 get_char_len_node1(Node
* node
, regex_t
* reg
, int* len
, int level
)
2361 switch (NODE_TYPE(node
)) {
2364 r
= get_char_len_node1(NODE_CAR(node
), reg
, &tlen
, level
);
2366 *len
= distance_add(*len
, tlen
);
2367 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
2375 r
= get_char_len_node1(NODE_CAR(node
), reg
, &tlen
, level
);
2376 while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
))) {
2377 r
= get_char_len_node1(NODE_CAR(node
), reg
, &tlen2
, level
);
2386 r
= GET_CHAR_LEN_TOP_ALT_VARLEN
;
2388 r
= GET_CHAR_LEN_VARLEN
;
2398 StrNode
* sn
= STR_(node
);
2401 while (s
< sn
->end
) {
2402 s
+= enclen(reg
->enc
, s
);
2410 QuantNode
* qn
= QUANT_(node
);
2412 if (qn
->lower
== qn
->upper
) {
2413 if (qn
->upper
== 0) {
2417 r
= get_char_len_node1(NODE_BODY(node
), reg
, &tlen
, level
);
2419 *len
= distance_multiply(tlen
, qn
->lower
);
2423 r
= GET_CHAR_LEN_VARLEN
;
2429 if (! NODE_IS_RECURSION(node
))
2430 r
= get_char_len_node1(NODE_BODY(node
), reg
, len
, level
);
2432 r
= GET_CHAR_LEN_VARLEN
;
2443 BagNode
* en
= BAG_(node
);
2448 if (NODE_IS_CLEN_FIXED(node
))
2449 *len
= en
->char_len
;
2451 r
= get_char_len_node1(NODE_BODY(node
), reg
, len
, level
);
2453 en
->char_len
= *len
;
2454 NODE_STATUS_ADD(node
, CLEN_FIXED
);
2460 case BAG_STOP_BACKTRACK
:
2461 r
= get_char_len_node1(NODE_BODY(node
), reg
, len
, level
);
2467 r
= get_char_len_node1(NODE_BODY(node
), reg
, &clen
, level
);
2469 if (IS_NOT_NULL(en
->te
.Then
)) {
2470 r
= get_char_len_node1(en
->te
.Then
, reg
, &tlen
, level
);
2474 if (IS_NOT_NULL(en
->te
.Else
)) {
2475 r
= get_char_len_node1(en
->te
.Else
, reg
, &elen
, level
);
2480 if (clen
+ tlen
!= elen
) {
2481 r
= GET_CHAR_LEN_VARLEN
;
2498 if (NODE_IS_CHECKER(node
))
2502 r
= GET_CHAR_LEN_VARLEN
;
2510 get_char_len_node(Node
* node
, regex_t
* reg
, int* len
)
2512 return get_char_len_node1(node
, reg
, len
, 0);
2515 /* x is not included y ==> 1 : 0 */
2517 is_exclusive(Node
* x
, Node
* y
, regex_t
* reg
)
2525 ytype
= NODE_TYPE(y
);
2526 switch (NODE_TYPE(x
)) {
2529 if (CTYPE_(x
)->ctype
== CTYPE_ANYCHAR
||
2530 CTYPE_(y
)->ctype
== CTYPE_ANYCHAR
)
2535 if (CTYPE_(y
)->ctype
== CTYPE_(x
)->ctype
&&
2536 CTYPE_(y
)->not != CTYPE_(x
)->not &&
2537 CTYPE_(y
)->ascii_mode
== CTYPE_(x
)->ascii_mode
)
2547 tmp
= x
; x
= y
; y
= tmp
;
2565 CClassNode
* xc
= CCLASS_(x
);
2569 switch (CTYPE_(y
)->ctype
) {
2574 case ONIGENC_CTYPE_WORD
:
2575 if (CTYPE_(y
)->not == 0) {
2576 if (IS_NULL(xc
->mbuf
) && !IS_NCCLASS_NOT(xc
)) {
2577 range
= CTYPE_(y
)->ascii_mode
!= 0 ? 128 : SINGLE_BYTE_SIZE
;
2578 for (i
= 0; i
< range
; i
++) {
2579 if (BITSET_AT(xc
->bs
, i
)) {
2580 if (ONIGENC_IS_CODE_WORD(reg
->enc
, i
)) return 0;
2588 if (IS_NOT_NULL(xc
->mbuf
)) return 0;
2589 if (IS_NCCLASS_NOT(xc
)) return 0;
2591 range
= CTYPE_(y
)->ascii_mode
!= 0 ? 128 : SINGLE_BYTE_SIZE
;
2592 for (i
= 0; i
< range
; i
++) {
2593 if (! ONIGENC_IS_CODE_WORD(reg
->enc
, i
)) {
2594 if (BITSET_AT(xc
->bs
, i
))
2598 for (i
= range
; i
< SINGLE_BYTE_SIZE
; i
++) {
2599 if (BITSET_AT(xc
->bs
, i
)) return 0;
2613 CClassNode
* yc
= CCLASS_(y
);
2615 for (i
= 0; i
< SINGLE_BYTE_SIZE
; i
++) {
2616 v
= BITSET_AT(xc
->bs
, i
);
2617 if ((v
!= 0 && !IS_NCCLASS_NOT(xc
)) || (v
== 0 && IS_NCCLASS_NOT(xc
))) {
2618 v
= BITSET_AT(yc
->bs
, i
);
2619 if ((v
!= 0 && !IS_NCCLASS_NOT(yc
)) ||
2620 (v
== 0 && IS_NCCLASS_NOT(yc
)))
2624 if ((IS_NULL(xc
->mbuf
) && !IS_NCCLASS_NOT(xc
)) ||
2625 (IS_NULL(yc
->mbuf
) && !IS_NCCLASS_NOT(yc
)))
2643 StrNode
* xs
= STR_(x
);
2645 if (NODE_STRING_LEN(x
) == 0)
2650 switch (CTYPE_(y
)->ctype
) {
2654 case ONIGENC_CTYPE_WORD
:
2655 if (CTYPE_(y
)->ascii_mode
== 0) {
2656 if (ONIGENC_IS_MBC_WORD(reg
->enc
, xs
->s
, xs
->end
))
2657 return CTYPE_(y
)->not;
2659 return !(CTYPE_(y
)->not);
2662 if (ONIGENC_IS_MBC_WORD_ASCII(reg
->enc
, xs
->s
, xs
->end
))
2663 return CTYPE_(y
)->not;
2665 return !(CTYPE_(y
)->not);
2675 CClassNode
* cc
= CCLASS_(y
);
2677 code
= ONIGENC_MBC_TO_CODE(reg
->enc
, xs
->s
,
2678 xs
->s
+ ONIGENC_MBC_MAXLEN(reg
->enc
));
2679 return onig_is_code_in_cc(reg
->enc
, code
, cc
) == 0;
2686 StrNode
* ys
= STR_(y
);
2688 len
= NODE_STRING_LEN(x
);
2689 if (len
> NODE_STRING_LEN(y
)) len
= NODE_STRING_LEN(y
);
2690 if (NODE_STRING_IS_AMBIG(x
) || NODE_STRING_IS_AMBIG(y
)) {
2695 for (i
= 0, p
= ys
->s
, q
= xs
->s
; i
< len
; i
++, p
++, q
++) {
2696 if (*p
!= *q
) return 1;
2716 get_head_value_node(Node
* node
, int exact
, regex_t
* reg
)
2718 Node
* n
= NULL_NODE
;
2720 switch (NODE_TYPE(node
)) {
2729 if (CTYPE_(node
)->ctype
== CTYPE_ANYCHAR
)
2739 n
= get_head_value_node(NODE_CAR(node
), exact
, reg
);
2744 StrNode
* sn
= STR_(node
);
2746 if (sn
->end
<= sn
->s
)
2750 ! IS_IGNORECASE(reg
->options
) || NODE_STRING_IS_RAW(node
)) {
2758 QuantNode
* qn
= QUANT_(node
);
2759 if (qn
->lower
> 0) {
2760 if (IS_NOT_NULL(qn
->head_exact
))
2763 n
= get_head_value_node(NODE_BODY(node
), exact
, reg
);
2770 BagNode
* en
= BAG_(node
);
2774 OnigOptionType options
= reg
->options
;
2776 reg
->options
= BAG_(node
)->o
.options
;
2777 n
= get_head_value_node(NODE_BODY(node
), exact
, reg
);
2778 reg
->options
= options
;
2783 case BAG_STOP_BACKTRACK
:
2785 n
= get_head_value_node(NODE_BODY(node
), exact
, reg
);
2792 if (ANCHOR_(node
)->type
== ANCR_PREC_READ
)
2793 n
= get_head_value_node(NODE_BODY(node
), exact
, reg
);
2805 check_type_tree(Node
* node
, int type_mask
, int bag_mask
, int anchor_mask
)
2810 type
= NODE_TYPE(node
);
2811 if ((NODE_TYPE2BIT(type
) & type_mask
) == 0)
2818 r
= check_type_tree(NODE_CAR(node
), type_mask
, bag_mask
, anchor_mask
);
2819 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
2823 r
= check_type_tree(NODE_BODY(node
), type_mask
, bag_mask
, anchor_mask
);
2828 BagNode
* en
= BAG_(node
);
2829 if (((1<<en
->type
) & bag_mask
) == 0)
2832 r
= check_type_tree(NODE_BODY(node
), type_mask
, bag_mask
, anchor_mask
);
2833 if (r
== 0 && en
->type
== BAG_IF_ELSE
) {
2834 if (IS_NOT_NULL(en
->te
.Then
)) {
2835 r
= check_type_tree(en
->te
.Then
, type_mask
, bag_mask
, anchor_mask
);
2838 if (IS_NOT_NULL(en
->te
.Else
)) {
2839 r
= check_type_tree(en
->te
.Else
, type_mask
, bag_mask
, anchor_mask
);
2846 type
= ANCHOR_(node
)->type
;
2847 if ((type
& anchor_mask
) == 0)
2850 if (IS_NOT_NULL(NODE_BODY(node
)))
2851 r
= check_type_tree(NODE_BODY(node
), type_mask
, bag_mask
, anchor_mask
);
2862 tree_min_len(Node
* node
, ScanEnv
* env
)
2868 switch (NODE_TYPE(node
)) {
2870 if (! NODE_IS_CHECKER(node
)) {
2873 MemEnv
* mem_env
= SCANENV_MEMENV(env
);
2874 BackRefNode
* br
= BACKREF_(node
);
2875 if (NODE_IS_RECURSION(node
)) break;
2877 backs
= BACKREFS_P(br
);
2878 len
= tree_min_len(mem_env
[backs
[0]].node
, env
);
2879 for (i
= 1; i
< br
->back_num
; i
++) {
2880 tmin
= tree_min_len(mem_env
[backs
[i
]].node
, env
);
2881 if (len
> tmin
) len
= tmin
;
2889 Node
* t
= NODE_BODY(node
);
2890 if (NODE_IS_RECURSION(node
)) {
2891 if (NODE_IS_MIN_FIXED(t
))
2892 len
= BAG_(t
)->min_len
;
2895 len
= tree_min_len(t
, env
);
2902 tmin
= tree_min_len(NODE_CAR(node
), env
);
2903 len
= distance_add(len
, tmin
);
2904 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
2913 tmin
= tree_min_len(x
, env
);
2914 if (y
== node
) len
= tmin
;
2915 else if (len
> tmin
) len
= tmin
;
2916 } while (IS_NOT_NULL(y
= NODE_CDR(y
)));
2922 StrNode
* sn
= STR_(node
);
2923 len
= (int )(sn
->end
- sn
->s
);
2929 len
= ONIGENC_MBC_MINLEN(env
->enc
);
2934 QuantNode
* qn
= QUANT_(node
);
2936 if (qn
->lower
> 0) {
2937 len
= tree_min_len(NODE_BODY(node
), env
);
2938 len
= distance_multiply(len
, qn
->lower
);
2945 BagNode
* en
= BAG_(node
);
2948 if (NODE_IS_MIN_FIXED(node
))
2951 if (NODE_IS_MARK1(node
))
2952 len
= 0; /* recursive */
2954 NODE_STATUS_ADD(node
, MARK1
);
2955 len
= tree_min_len(NODE_BODY(node
), env
);
2956 NODE_STATUS_REMOVE(node
, MARK1
);
2959 NODE_STATUS_ADD(node
, MIN_FIXED
);
2965 case BAG_STOP_BACKTRACK
:
2966 len
= tree_min_len(NODE_BODY(node
), env
);
2972 len
= tree_min_len(NODE_BODY(node
), env
);
2973 if (IS_NOT_NULL(en
->te
.Then
))
2974 len
+= tree_min_len(en
->te
.Then
, env
);
2975 if (IS_NOT_NULL(en
->te
.Else
))
2976 elen
= tree_min_len(en
->te
.Else
, env
);
2979 if (elen
< len
) len
= elen
;
2988 GimmickNode
* g
= GIMMICK_(node
);
2989 if (g
->type
== GIMMICK_FAIL
) {
3004 tree_max_len(Node
* node
, ScanEnv
* env
)
3010 switch (NODE_TYPE(node
)) {
3013 tmax
= tree_max_len(NODE_CAR(node
), env
);
3014 len
= distance_add(len
, tmax
);
3015 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
3020 tmax
= tree_max_len(NODE_CAR(node
), env
);
3021 if (len
< tmax
) len
= tmax
;
3022 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
3027 StrNode
* sn
= STR_(node
);
3028 len
= (OnigLen
)(sn
->end
- sn
->s
);
3034 len
= ONIGENC_MBC_MAXLEN_DIST(env
->enc
);
3038 if (! NODE_IS_CHECKER(node
)) {
3041 MemEnv
* mem_env
= SCANENV_MEMENV(env
);
3042 BackRefNode
* br
= BACKREF_(node
);
3043 if (NODE_IS_RECURSION(node
)) {
3047 backs
= BACKREFS_P(br
);
3048 for (i
= 0; i
< br
->back_num
; i
++) {
3049 tmax
= tree_max_len(mem_env
[backs
[i
]].node
, env
);
3050 if (len
< tmax
) len
= tmax
;
3057 if (! NODE_IS_RECURSION(node
))
3058 len
= tree_max_len(NODE_BODY(node
), env
);
3066 QuantNode
* qn
= QUANT_(node
);
3068 if (qn
->upper
!= 0) {
3069 len
= tree_max_len(NODE_BODY(node
), env
);
3071 if (! IS_INFINITE_REPEAT(qn
->upper
))
3072 len
= distance_multiply(len
, qn
->upper
);
3082 BagNode
* en
= BAG_(node
);
3085 if (NODE_IS_MAX_FIXED(node
))
3088 if (NODE_IS_MARK1(node
))
3091 NODE_STATUS_ADD(node
, MARK1
);
3092 len
= tree_max_len(NODE_BODY(node
), env
);
3093 NODE_STATUS_REMOVE(node
, MARK1
);
3096 NODE_STATUS_ADD(node
, MAX_FIXED
);
3102 case BAG_STOP_BACKTRACK
:
3103 len
= tree_max_len(NODE_BODY(node
), env
);
3109 len
= tree_max_len(NODE_BODY(node
), env
);
3110 if (IS_NOT_NULL(en
->te
.Then
)) {
3111 tlen
= tree_max_len(en
->te
.Then
, env
);
3112 len
= distance_add(len
, tlen
);
3114 if (IS_NOT_NULL(en
->te
.Else
))
3115 elen
= tree_max_len(en
->te
.Else
, env
);
3118 if (elen
> len
) len
= elen
;
3135 check_backrefs(Node
* node
, ScanEnv
* env
)
3139 switch (NODE_TYPE(node
)) {
3143 r
= check_backrefs(NODE_CAR(node
), env
);
3144 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
3148 if (! ANCHOR_HAS_BODY(ANCHOR_(node
))) {
3154 r
= check_backrefs(NODE_BODY(node
), env
);
3158 r
= check_backrefs(NODE_BODY(node
), env
);
3160 BagNode
* en
= BAG_(node
);
3162 if (en
->type
== BAG_IF_ELSE
) {
3163 if (r
!= 0) return r
;
3164 if (IS_NOT_NULL(en
->te
.Then
)) {
3165 r
= check_backrefs(en
->te
.Then
, env
);
3166 if (r
!= 0) return r
;
3168 if (IS_NOT_NULL(en
->te
.Else
)) {
3169 r
= check_backrefs(en
->te
.Else
, env
);
3178 BackRefNode
* br
= BACKREF_(node
);
3179 int* backs
= BACKREFS_P(br
);
3180 MemEnv
* mem_env
= SCANENV_MEMENV(env
);
3182 for (i
= 0; i
< br
->back_num
; i
++) {
3183 if (backs
[i
] > env
->num_mem
)
3184 return ONIGERR_INVALID_BACKREF
;
3186 NODE_STATUS_ADD(mem_env
[backs
[i
]].node
, BACKREF
);
3203 #define RECURSION_EXIST (1<<0)
3204 #define RECURSION_MUST (1<<1)
3205 #define RECURSION_INFINITE (1<<2)
3208 infinite_recursive_call_check(Node
* node
, ScanEnv
* env
, int head
)
3213 switch (NODE_TYPE(node
)) {
3221 ret
= infinite_recursive_call_check(NODE_CAR(x
), env
, head
);
3222 if (ret
< 0 || (ret
& RECURSION_INFINITE
) != 0) return ret
;
3225 min
= tree_min_len(NODE_CAR(x
), env
);
3226 if (min
!= 0) head
= 0;
3228 } while (IS_NOT_NULL(x
= NODE_CDR(x
)));
3236 must
= RECURSION_MUST
;
3238 ret
= infinite_recursive_call_check(NODE_CAR(node
), env
, head
);
3239 if (ret
< 0 || (ret
& RECURSION_INFINITE
) != 0) return ret
;
3241 r
|= (ret
& RECURSION_EXIST
);
3243 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
3249 r
= infinite_recursive_call_check(NODE_BODY(node
), env
, head
);
3250 if (r
< 0) return r
;
3251 if ((r
& RECURSION_MUST
) != 0) {
3252 if (QUANT_(node
)->lower
== 0)
3253 r
&= ~RECURSION_MUST
;
3258 if (! ANCHOR_HAS_BODY(ANCHOR_(node
)))
3262 r
= infinite_recursive_call_check(NODE_BODY(node
), env
, head
);
3267 BagNode
* en
= BAG_(node
);
3269 if (en
->type
== BAG_MEMORY
) {
3270 if (NODE_IS_MARK2(node
))
3272 else if (NODE_IS_MARK1(node
))
3273 return (head
== 0 ? RECURSION_EXIST
| RECURSION_MUST
3274 : RECURSION_EXIST
| RECURSION_MUST
| RECURSION_INFINITE
);
3276 NODE_STATUS_ADD(node
, MARK2
);
3277 r
= infinite_recursive_call_check(NODE_BODY(node
), env
, head
);
3278 NODE_STATUS_REMOVE(node
, MARK2
);
3281 else if (en
->type
== BAG_IF_ELSE
) {
3284 ret
= infinite_recursive_call_check(NODE_BODY(node
), env
, head
);
3285 if (ret
< 0 || (ret
& RECURSION_INFINITE
) != 0) return ret
;
3287 if (IS_NOT_NULL(en
->te
.Then
)) {
3290 min
= tree_min_len(NODE_BODY(node
), env
);
3294 ret
= infinite_recursive_call_check(en
->te
.Then
, env
, min
!= 0 ? 0:head
);
3295 if (ret
< 0 || (ret
& RECURSION_INFINITE
) != 0) return ret
;
3298 if (IS_NOT_NULL(en
->te
.Else
)) {
3299 eret
= infinite_recursive_call_check(en
->te
.Else
, env
, head
);
3300 if (eret
< 0 || (eret
& RECURSION_INFINITE
) != 0) return eret
;
3301 r
|= (eret
& RECURSION_EXIST
);
3302 if ((eret
& RECURSION_MUST
) == 0)
3303 r
&= ~RECURSION_MUST
;
3307 r
= infinite_recursive_call_check(NODE_BODY(node
), env
, head
);
3320 infinite_recursive_call_check_trav(Node
* node
, ScanEnv
* env
)
3324 switch (NODE_TYPE(node
)) {
3328 r
= infinite_recursive_call_check_trav(NODE_CAR(node
), env
);
3329 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
3333 if (! ANCHOR_HAS_BODY(ANCHOR_(node
))) {
3339 r
= infinite_recursive_call_check_trav(NODE_BODY(node
), env
);
3344 BagNode
* en
= BAG_(node
);
3346 if (en
->type
== BAG_MEMORY
) {
3347 if (NODE_IS_RECURSION(node
) && NODE_IS_CALLED(node
)) {
3350 NODE_STATUS_ADD(node
, MARK1
);
3352 ret
= infinite_recursive_call_check(NODE_BODY(node
), env
, 1);
3353 if (ret
< 0) return ret
;
3354 else if ((ret
& (RECURSION_MUST
| RECURSION_INFINITE
)) != 0)
3355 return ONIGERR_NEVER_ENDING_RECURSION
;
3357 NODE_STATUS_REMOVE(node
, MARK1
);
3360 else if (en
->type
== BAG_IF_ELSE
) {
3361 if (IS_NOT_NULL(en
->te
.Then
)) {
3362 r
= infinite_recursive_call_check_trav(en
->te
.Then
, env
);
3363 if (r
!= 0) return r
;
3365 if (IS_NOT_NULL(en
->te
.Else
)) {
3366 r
= infinite_recursive_call_check_trav(en
->te
.Else
, env
);
3367 if (r
!= 0) return r
;
3372 r
= infinite_recursive_call_check_trav(NODE_BODY(node
), env
);
3384 recursive_call_check(Node
* node
)
3388 switch (NODE_TYPE(node
)) {
3393 r
|= recursive_call_check(NODE_CAR(node
));
3394 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
3398 if (! ANCHOR_HAS_BODY(ANCHOR_(node
))) {
3404 r
= recursive_call_check(NODE_BODY(node
));
3408 r
= recursive_call_check(NODE_BODY(node
));
3410 if (NODE_IS_MARK1(NODE_BODY(node
)))
3411 NODE_STATUS_ADD(node
, RECURSION
);
3417 BagNode
* en
= BAG_(node
);
3419 if (en
->type
== BAG_MEMORY
) {
3420 if (NODE_IS_MARK2(node
))
3422 else if (NODE_IS_MARK1(node
))
3423 return 1; /* recursion */
3425 NODE_STATUS_ADD(node
, MARK2
);
3426 r
= recursive_call_check(NODE_BODY(node
));
3427 NODE_STATUS_REMOVE(node
, MARK2
);
3430 else if (en
->type
== BAG_IF_ELSE
) {
3432 if (IS_NOT_NULL(en
->te
.Then
)) {
3433 r
|= recursive_call_check(en
->te
.Then
);
3435 if (IS_NOT_NULL(en
->te
.Else
)) {
3436 r
|= recursive_call_check(en
->te
.Else
);
3438 r
|= recursive_call_check(NODE_BODY(node
));
3441 r
= recursive_call_check(NODE_BODY(node
));
3454 #define IN_RECURSION (1<<0)
3455 #define FOUND_CALLED_NODE 1
3458 recursive_call_check_trav(Node
* node
, ScanEnv
* env
, int state
)
3462 switch (NODE_TYPE(node
)) {
3468 ret
= recursive_call_check_trav(NODE_CAR(node
), env
, state
);
3469 if (ret
== FOUND_CALLED_NODE
) r
= FOUND_CALLED_NODE
;
3470 else if (ret
< 0) return ret
;
3471 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
3476 r
= recursive_call_check_trav(NODE_BODY(node
), env
, state
);
3477 if (QUANT_(node
)->upper
== 0) {
3478 if (r
== FOUND_CALLED_NODE
)
3479 QUANT_(node
)->is_refered
= 1;
3485 AnchorNode
* an
= ANCHOR_(node
);
3486 if (ANCHOR_HAS_BODY(an
))
3487 r
= recursive_call_check_trav(NODE_ANCHOR_BODY(an
), env
, state
);
3495 BagNode
* en
= BAG_(node
);
3497 if (en
->type
== BAG_MEMORY
) {
3498 if (NODE_IS_CALLED(node
) || (state
& IN_RECURSION
) != 0) {
3499 if (! NODE_IS_RECURSION(node
)) {
3500 NODE_STATUS_ADD(node
, MARK1
);
3501 r
= recursive_call_check(NODE_BODY(node
));
3503 NODE_STATUS_ADD(node
, RECURSION
);
3504 NODE_STATUS_REMOVE(node
, MARK1
);
3507 if (NODE_IS_CALLED(node
))
3508 r
= FOUND_CALLED_NODE
;
3513 if (NODE_IS_RECURSION(node
))
3514 state1
|= IN_RECURSION
;
3516 ret
= recursive_call_check_trav(NODE_BODY(node
), env
, state1
);
3517 if (ret
== FOUND_CALLED_NODE
)
3518 r
= FOUND_CALLED_NODE
;
3520 if (en
->type
== BAG_IF_ELSE
) {
3521 if (IS_NOT_NULL(en
->te
.Then
)) {
3522 ret
= recursive_call_check_trav(en
->te
.Then
, env
, state1
);
3523 if (ret
== FOUND_CALLED_NODE
)
3524 r
= FOUND_CALLED_NODE
;
3526 if (IS_NOT_NULL(en
->te
.Else
)) {
3527 ret
= recursive_call_check_trav(en
->te
.Else
, env
, state1
);
3528 if (ret
== FOUND_CALLED_NODE
)
3529 r
= FOUND_CALLED_NODE
;
3544 #define IN_ALT (1<<0)
3545 #define IN_NOT (1<<1)
3546 #define IN_REAL_REPEAT (1<<2)
3547 #define IN_VAR_REPEAT (1<<3)
3548 #define IN_ZERO_REPEAT (1<<4)
3549 #define IN_MULTI_ENTRY (1<<5)
3550 #define IN_LOOK_BEHIND (1<<6)
3553 /* divide different length alternatives in look-behind.
3554 (?<=A|B) ==> (?<=A)|(?<=B)
3555 (?<!A|B) ==> (?<!A)(?<!B)
3558 divide_look_behind_alternatives(Node
* node
)
3560 Node
*head
, *np
, *insert_node
;
3561 AnchorNode
* an
= ANCHOR_(node
);
3562 int anc_type
= an
->type
;
3564 head
= NODE_ANCHOR_BODY(an
);
3565 np
= NODE_CAR(head
);
3566 swap_node(node
, head
);
3567 NODE_CAR(node
) = head
;
3568 NODE_BODY(head
) = np
;
3571 while (IS_NOT_NULL(np
= NODE_CDR(np
))) {
3572 insert_node
= onig_node_new_anchor(anc_type
, an
->ascii_mode
);
3573 CHECK_NULL_RETURN_MEMERR(insert_node
);
3574 NODE_BODY(insert_node
) = NODE_CAR(np
);
3575 NODE_CAR(np
) = insert_node
;
3578 if (anc_type
== ANCR_LOOK_BEHIND_NOT
) {
3581 NODE_SET_TYPE(np
, NODE_LIST
); /* alt -> list */
3582 } while (IS_NOT_NULL(np
= NODE_CDR(np
)));
3588 setup_look_behind(Node
* node
, regex_t
* reg
, ScanEnv
* env
)
3591 AnchorNode
* an
= ANCHOR_(node
);
3593 r
= get_char_len_node(NODE_ANCHOR_BODY(an
), reg
, &len
);
3596 else if (r
== GET_CHAR_LEN_VARLEN
)
3597 r
= ONIGERR_INVALID_LOOK_BEHIND_PATTERN
;
3598 else if (r
== GET_CHAR_LEN_TOP_ALT_VARLEN
) {
3599 if (IS_SYNTAX_BV(env
->syntax
, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND
))
3600 r
= divide_look_behind_alternatives(node
);
3602 r
= ONIGERR_INVALID_LOOK_BEHIND_PATTERN
;
3609 next_setup(Node
* node
, Node
* next_node
, regex_t
* reg
)
3614 type
= NODE_TYPE(node
);
3615 if (type
== NODE_QUANT
) {
3616 QuantNode
* qn
= QUANT_(node
);
3617 if (qn
->greedy
&& IS_INFINITE_REPEAT(qn
->upper
)) {
3618 #ifdef USE_QUANT_PEEK_NEXT
3619 Node
* n
= get_head_value_node(next_node
, 1, reg
);
3620 /* '\0': for UTF-16BE etc... */
3621 if (IS_NOT_NULL(n
) && STR_(n
)->s
[0] != '\0') {
3622 qn
->next_head_exact
= n
;
3625 /* automatic posseivation a*b ==> (?>a*)b */
3626 if (qn
->lower
<= 1) {
3627 if (is_strict_real_node(NODE_BODY(node
))) {
3629 x
= get_head_value_node(NODE_BODY(node
), 0, reg
);
3630 if (IS_NOT_NULL(x
)) {
3631 y
= get_head_value_node(next_node
, 0, reg
);
3632 if (IS_NOT_NULL(y
) && is_exclusive(x
, y
, reg
)) {
3633 Node
* en
= onig_node_new_bag(BAG_STOP_BACKTRACK
);
3634 CHECK_NULL_RETURN_MEMERR(en
);
3635 NODE_STATUS_ADD(en
, STRICT_REAL_REPEAT
);
3636 swap_node(node
, en
);
3637 NODE_BODY(node
) = en
;
3644 else if (type
== NODE_BAG
) {
3645 BagNode
* en
= BAG_(node
);
3646 if (en
->type
== BAG_MEMORY
) {
3647 node
= NODE_BODY(node
);
3656 update_string_node_case_fold(regex_t
* reg
, Node
*node
)
3658 UChar
*p
, *end
, buf
[ONIGENC_MBC_CASE_FOLD_MAXLEN
];
3659 UChar
*sbuf
, *ebuf
, *sp
;
3660 int r
, i
, len
, sbuf_size
;
3661 StrNode
* sn
= STR_(node
);
3664 sbuf_size
= (int )(end
- sn
->s
) * 2;
3665 sbuf
= (UChar
* )xmalloc(sbuf_size
);
3666 CHECK_NULL_RETURN_MEMERR(sbuf
);
3667 ebuf
= sbuf
+ sbuf_size
;
3672 len
= ONIGENC_MBC_CASE_FOLD(reg
->enc
, reg
->case_fold_flag
, &p
, end
, buf
);
3673 for (i
= 0; i
< len
; i
++) {
3675 sbuf
= (UChar
* )xrealloc(sbuf
, sbuf_size
* 2, sbuf_size
);
3676 CHECK_NULL_RETURN_MEMERR(sbuf
);
3677 sp
= sbuf
+ sbuf_size
;
3679 ebuf
= sbuf
+ sbuf_size
;
3686 r
= onig_node_str_set(node
, sbuf
, sp
);
3697 expand_case_fold_make_rem_string(Node
** rnode
, UChar
*s
, UChar
*end
, regex_t
* reg
)
3702 node
= onig_node_new_str(s
, end
);
3703 if (IS_NULL(node
)) return ONIGERR_MEMORY
;
3705 r
= update_string_node_case_fold(reg
, node
);
3707 onig_node_free(node
);
3711 NODE_STRING_SET_AMBIG(node
);
3712 NODE_STRING_SET_DONT_GET_OPT_INFO(node
);
3718 expand_case_fold_string_alt(int item_num
, OnigCaseFoldCodeItem items
[], UChar
*p
,
3719 int slen
, UChar
*end
, regex_t
* reg
, Node
**rnode
)
3724 Node
*anode
, *var_anode
, *snode
, *xnode
, *an
;
3725 UChar buf
[ONIGENC_CODE_TO_MBC_MAXLEN
];
3727 *rnode
= var_anode
= NULL_NODE
;
3730 for (i
= 0; i
< item_num
; i
++) {
3731 if (items
[i
].byte_len
!= slen
) {
3738 *rnode
= var_anode
= onig_node_new_alt(NULL_NODE
, NULL_NODE
);
3739 if (IS_NULL(var_anode
)) return ONIGERR_MEMORY
;
3741 xnode
= onig_node_new_list(NULL
, NULL
);
3742 if (IS_NULL(xnode
)) goto mem_err
;
3743 NODE_CAR(var_anode
) = xnode
;
3745 anode
= onig_node_new_alt(NULL_NODE
, NULL_NODE
);
3746 if (IS_NULL(anode
)) goto mem_err
;
3747 NODE_CAR(xnode
) = anode
;
3750 *rnode
= anode
= onig_node_new_alt(NULL_NODE
, NULL_NODE
);
3751 if (IS_NULL(anode
)) return ONIGERR_MEMORY
;
3754 snode
= onig_node_new_str(p
, p
+ slen
);
3755 if (IS_NULL(snode
)) goto mem_err
;
3757 NODE_CAR(anode
) = snode
;
3759 for (i
= 0; i
< item_num
; i
++) {
3760 snode
= onig_node_new_str(NULL
, NULL
);
3761 if (IS_NULL(snode
)) goto mem_err
;
3763 for (j
= 0; j
< items
[i
].code_len
; j
++) {
3764 len
= ONIGENC_CODE_TO_MBC(reg
->enc
, items
[i
].code
[j
], buf
);
3770 r
= onig_node_str_cat(snode
, buf
, buf
+ len
);
3771 if (r
!= 0) goto mem_err2
;
3774 an
= onig_node_new_alt(NULL_NODE
, NULL_NODE
);
3778 //The NULL pointer check is not necessary. It is added just for pass static
3779 //analysis. When condition "items[i].byte_len != slen" is true, "varlen = 1"
3780 //in line 3503 will be reached ,so that "if (IS_NULL(var_anode)) return ONIGERR_MEMORY"
3781 //in line 3510 will be executed, so the null pointer has been checked before
3782 //deferenced in line 3584.
3783 if (items
[i
].byte_len
!= slen
&& IS_NOT_NULL(var_anode
)) {
3785 UChar
*q
= p
+ items
[i
].byte_len
;
3788 r
= expand_case_fold_make_rem_string(&rem
, q
, end
, reg
);
3794 xnode
= onig_node_list_add(NULL_NODE
, snode
);
3795 if (IS_NULL(xnode
)) {
3797 onig_node_free(rem
);
3800 if (IS_NULL(onig_node_list_add(xnode
, rem
))) {
3802 onig_node_free(xnode
);
3803 onig_node_free(rem
);
3807 NODE_CAR(an
) = xnode
;
3810 NODE_CAR(an
) = snode
;
3813 NODE_CDR(var_anode
) = an
;
3817 NODE_CAR(an
) = snode
;
3818 NODE_CDR(anode
) = an
;
3826 onig_node_free(snode
);
3829 onig_node_free(*rnode
);
3831 return ONIGERR_MEMORY
;
3835 is_good_case_fold_items_for_search(OnigEncoding enc
, int slen
,
3836 int n
, OnigCaseFoldCodeItem items
[])
3839 UChar buf
[ONIGENC_MBC_CASE_FOLD_MAXLEN
];
3841 for (i
= 0; i
< n
; i
++) {
3842 OnigCaseFoldCodeItem
* item
= items
+ i
;
3844 if (item
->code_len
!= 1) return 0;
3845 if (item
->byte_len
!= slen
) return 0;
3846 len
= ONIGENC_CODE_TO_MBC(enc
, item
->code
[0], buf
);
3847 if (len
!= slen
) return 0;
3853 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
3856 expand_case_fold_string(Node
* node
, regex_t
* reg
, int state
)
3858 int r
, n
, len
, alt_num
;
3860 int prev_is_ambig
, prev_is_good
, is_good
, is_in_look_behind
;
3861 UChar
*start
, *end
, *p
;
3863 Node
*top_root
, *root
, *snode
, *prev_node
;
3864 OnigCaseFoldCodeItem items
[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM
];
3865 UChar buf
[ONIGENC_MBC_CASE_FOLD_MAXLEN
];
3868 if (NODE_STRING_IS_AMBIG(node
)) return 0;
3874 if (start
>= end
) return 0;
3876 is_in_look_behind
= (state
& IN_LOOK_BEHIND
) != 0;
3879 top_root
= root
= prev_node
= snode
= NULL_NODE
;
3883 n
= ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg
->enc
, reg
->case_fold_flag
,
3890 len
= enclen(reg
->enc
, p
);
3891 is_good
= is_good_case_fold_items_for_search(reg
->enc
, len
, n
, items
);
3893 if (is_in_look_behind
||
3894 (IS_NOT_NULL(snode
) ||
3896 /* expand single char case: ex. /(?i:a)/ */
3897 && !(p
== start
&& p
+ len
>= end
)))) {
3898 if (IS_NULL(snode
)) {
3899 if (IS_NULL(root
) && IS_NOT_NULL(prev_node
)) {
3900 top_root
= root
= onig_node_list_add(NULL_NODE
, prev_node
);
3901 if (IS_NULL(root
)) {
3902 onig_node_free(prev_node
);
3907 prev_node
= snode
= onig_node_new_str(NULL
, NULL
);
3908 if (IS_NULL(snode
)) goto mem_err
;
3909 if (IS_NOT_NULL(root
)) {
3910 if (IS_NULL(onig_node_list_add(root
, snode
))) {
3911 onig_node_free(snode
);
3916 prev_is_ambig
= -1; /* -1: new */
3917 prev_is_good
= 0; /* escape compiler warning */
3920 prev_is_ambig
= NODE_STRING_IS_AMBIG(snode
);
3921 prev_is_good
= NODE_STRING_IS_GOOD_AMBIG(snode
);
3926 fold_len
= ONIGENC_MBC_CASE_FOLD(reg
->enc
, reg
->case_fold_flag
,
3931 foldp
= p
; fold_len
= len
;
3934 if ((prev_is_ambig
== 0 && n
!= 0) ||
3935 (prev_is_ambig
> 0 && (n
== 0 || prev_is_good
!= is_good
))) {
3936 if (IS_NULL(root
) /* && IS_NOT_NULL(prev_node) */) {
3937 top_root
= root
= onig_node_list_add(NULL_NODE
, prev_node
);
3938 if (IS_NULL(root
)) {
3939 onig_node_free(prev_node
);
3944 prev_node
= snode
= onig_node_new_str(foldp
, foldp
+ fold_len
);
3945 if (IS_NULL(snode
)) goto mem_err
;
3946 if (IS_NULL(onig_node_list_add(root
, snode
))) {
3947 onig_node_free(snode
);
3952 r
= onig_node_str_cat(snode
, foldp
, foldp
+ fold_len
);
3953 if (r
!= 0) goto err
;
3956 if (n
!= 0) NODE_STRING_SET_AMBIG(snode
);
3957 if (is_good
!= 0) NODE_STRING_SET_GOOD_AMBIG(snode
);
3961 if (alt_num
> THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION
) break;
3963 if (IS_NULL(root
) && IS_NOT_NULL(prev_node
)) {
3964 top_root
= root
= onig_node_list_add(NULL_NODE
, prev_node
);
3965 if (IS_NULL(root
)) {
3966 onig_node_free(prev_node
);
3971 r
= expand_case_fold_string_alt(n
, items
, p
, len
, end
, reg
, &prev_node
);
3972 if (r
< 0) goto mem_err
;
3974 if (IS_NULL(root
)) {
3975 top_root
= prev_node
;
3978 if (IS_NULL(onig_node_list_add(root
, prev_node
))) {
3979 onig_node_free(prev_node
);
3984 root
= NODE_CAR(prev_node
);
3987 if (IS_NOT_NULL(root
)) {
3988 if (IS_NULL(onig_node_list_add(root
, prev_node
))) {
3989 onig_node_free(prev_node
);
4004 r
= expand_case_fold_make_rem_string(&srem
, p
, end
, reg
);
4005 if (r
!= 0) goto mem_err
;
4007 if (IS_NOT_NULL(prev_node
) && IS_NULL(root
)) {
4008 top_root
= root
= onig_node_list_add(NULL_NODE
, prev_node
);
4009 if (IS_NULL(root
)) {
4010 onig_node_free(srem
);
4011 onig_node_free(prev_node
);
4016 if (IS_NULL(root
)) {
4020 if (IS_NULL(onig_node_list_add(root
, srem
))) {
4021 onig_node_free(srem
);
4028 top_root
= (IS_NOT_NULL(top_root
) ? top_root
: prev_node
);
4029 swap_node(node
, top_root
);
4030 onig_node_free(top_root
);
4037 onig_node_free(top_root
);
4041 #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
4042 static enum BodyEmptyType
4043 quantifiers_memory_node_info(Node
* node
)
4045 int r
= BODY_IS_EMPTY_POSSIBILITY
;
4047 switch (NODE_TYPE(node
)) {
4053 v
= quantifiers_memory_node_info(NODE_CAR(node
));
4055 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
4061 if (NODE_IS_RECURSION(node
)) {
4062 return BODY_IS_EMPTY_POSSIBILITY_REC
; /* tiny version */
4065 r
= quantifiers_memory_node_info(NODE_BODY(node
));
4071 QuantNode
* qn
= QUANT_(node
);
4072 if (qn
->upper
!= 0) {
4073 r
= quantifiers_memory_node_info(NODE_BODY(node
));
4080 BagNode
* en
= BAG_(node
);
4083 if (NODE_IS_RECURSION(node
)) {
4084 return BODY_IS_EMPTY_POSSIBILITY_REC
;
4086 return BODY_IS_EMPTY_POSSIBILITY_MEM
;
4090 case BAG_STOP_BACKTRACK
:
4091 r
= quantifiers_memory_node_info(NODE_BODY(node
));
4096 r
= quantifiers_memory_node_info(NODE_BODY(node
));
4097 if (IS_NOT_NULL(en
->te
.Then
)) {
4098 v
= quantifiers_memory_node_info(en
->te
.Then
);
4101 if (IS_NOT_NULL(en
->te
.Else
)) {
4102 v
= quantifiers_memory_node_info(en
->te
.Else
);
4123 #endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */
4132 setup_call_node_call(CallNode
* cn
, ScanEnv
* env
, int state
)
4134 MemEnv
* mem_env
= SCANENV_MEMENV(env
);
4136 if (cn
->by_number
!= 0) {
4137 int gnum
= cn
->group_num
;
4139 if (env
->num_named
> 0 &&
4140 IS_SYNTAX_BV(env
->syntax
, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP
) &&
4141 ! ONIG_IS_OPTION_ON(env
->options
, ONIG_OPTION_CAPTURE_GROUP
)) {
4142 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED
;
4145 if (gnum
> env
->num_mem
) {
4146 onig_scan_env_set_error_string(env
, ONIGERR_UNDEFINED_GROUP_REFERENCE
,
4147 cn
->name
, cn
->name_end
);
4148 return ONIGERR_UNDEFINED_GROUP_REFERENCE
;
4152 NODE_CALL_BODY(cn
) = mem_env
[cn
->group_num
].node
;
4153 if (IS_NULL(NODE_CALL_BODY(cn
))) {
4154 onig_scan_env_set_error_string(env
, ONIGERR_UNDEFINED_NAME_REFERENCE
,
4155 cn
->name
, cn
->name_end
);
4156 return ONIGERR_UNDEFINED_NAME_REFERENCE
;
4162 int n
= onig_name_to_group_numbers(env
->reg
, cn
->name
, cn
->name_end
, &refs
);
4164 onig_scan_env_set_error_string(env
, ONIGERR_UNDEFINED_NAME_REFERENCE
,
4165 cn
->name
, cn
->name_end
);
4166 return ONIGERR_UNDEFINED_NAME_REFERENCE
;
4169 onig_scan_env_set_error_string(env
, ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL
,
4170 cn
->name
, cn
->name_end
);
4171 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL
;
4174 cn
->group_num
= refs
[0];
4183 setup_call2_call(Node
* node
)
4185 switch (NODE_TYPE(node
)) {
4189 setup_call2_call(NODE_CAR(node
));
4190 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
4194 setup_call2_call(NODE_BODY(node
));
4198 if (ANCHOR_HAS_BODY(ANCHOR_(node
)))
4199 setup_call2_call(NODE_BODY(node
));
4204 BagNode
* en
= BAG_(node
);
4206 if (en
->type
== BAG_MEMORY
) {
4207 if (! NODE_IS_MARK1(node
)) {
4208 NODE_STATUS_ADD(node
, MARK1
);
4209 setup_call2_call(NODE_BODY(node
));
4210 NODE_STATUS_REMOVE(node
, MARK1
);
4213 else if (en
->type
== BAG_IF_ELSE
) {
4214 setup_call2_call(NODE_BODY(node
));
4215 if (IS_NOT_NULL(en
->te
.Then
))
4216 setup_call2_call(en
->te
.Then
);
4217 if (IS_NOT_NULL(en
->te
.Else
))
4218 setup_call2_call(en
->te
.Else
);
4221 setup_call2_call(NODE_BODY(node
));
4227 if (! NODE_IS_MARK1(node
)) {
4228 NODE_STATUS_ADD(node
, MARK1
);
4230 CallNode
* cn
= CALL_(node
);
4231 Node
* called
= NODE_CALL_BODY(cn
);
4235 NODE_STATUS_ADD(called
, CALLED
);
4236 BAG_(called
)->m
.entry_count
++;
4237 setup_call2_call(called
);
4239 NODE_STATUS_REMOVE(node
, MARK1
);
4249 setup_call(Node
* node
, ScanEnv
* env
, int state
)
4253 switch (NODE_TYPE(node
)) {
4257 r
= setup_call(NODE_CAR(node
), env
, state
);
4258 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
4262 if (QUANT_(node
)->upper
== 0)
4263 state
|= IN_ZERO_REPEAT
;
4265 r
= setup_call(NODE_BODY(node
), env
, state
);
4269 if (ANCHOR_HAS_BODY(ANCHOR_(node
)))
4270 r
= setup_call(NODE_BODY(node
), env
, state
);
4277 BagNode
* en
= BAG_(node
);
4279 if (en
->type
== BAG_MEMORY
) {
4280 if ((state
& IN_ZERO_REPEAT
) != 0) {
4281 NODE_STATUS_ADD(node
, IN_ZERO_REPEAT
);
4282 BAG_(node
)->m
.entry_count
--;
4284 r
= setup_call(NODE_BODY(node
), env
, state
);
4286 else if (en
->type
== BAG_IF_ELSE
) {
4287 r
= setup_call(NODE_BODY(node
), env
, state
);
4288 if (r
!= 0) return r
;
4289 if (IS_NOT_NULL(en
->te
.Then
)) {
4290 r
= setup_call(en
->te
.Then
, env
, state
);
4291 if (r
!= 0) return r
;
4293 if (IS_NOT_NULL(en
->te
.Else
))
4294 r
= setup_call(en
->te
.Else
, env
, state
);
4297 r
= setup_call(NODE_BODY(node
), env
, state
);
4302 if ((state
& IN_ZERO_REPEAT
) != 0) {
4303 NODE_STATUS_ADD(node
, IN_ZERO_REPEAT
);
4304 CALL_(node
)->entry_count
--;
4307 r
= setup_call_node_call(CALL_(node
), env
, state
);
4319 setup_call2(Node
* node
)
4323 switch (NODE_TYPE(node
)) {
4327 r
= setup_call2(NODE_CAR(node
));
4328 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
4332 if (QUANT_(node
)->upper
!= 0)
4333 r
= setup_call2(NODE_BODY(node
));
4337 if (ANCHOR_HAS_BODY(ANCHOR_(node
)))
4338 r
= setup_call2(NODE_BODY(node
));
4342 if (! NODE_IS_IN_ZERO_REPEAT(node
))
4343 r
= setup_call2(NODE_BODY(node
));
4346 BagNode
* en
= BAG_(node
);
4348 if (r
!= 0) return r
;
4349 if (en
->type
== BAG_IF_ELSE
) {
4350 if (IS_NOT_NULL(en
->te
.Then
)) {
4351 r
= setup_call2(en
->te
.Then
);
4352 if (r
!= 0) return r
;
4354 if (IS_NOT_NULL(en
->te
.Else
))
4355 r
= setup_call2(en
->te
.Else
);
4361 if (! NODE_IS_IN_ZERO_REPEAT(node
)) {
4362 setup_call2_call(node
);
4375 setup_called_state_call(Node
* node
, int state
)
4377 switch (NODE_TYPE(node
)) {
4383 setup_called_state_call(NODE_CAR(node
), state
);
4384 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
4389 QuantNode
* qn
= QUANT_(node
);
4391 if (IS_INFINITE_REPEAT(qn
->upper
) || qn
->upper
>= 2)
4392 state
|= IN_REAL_REPEAT
;
4393 if (qn
->lower
!= qn
->upper
)
4394 state
|= IN_VAR_REPEAT
;
4396 setup_called_state_call(NODE_QUANT_BODY(qn
), state
);
4402 AnchorNode
* an
= ANCHOR_(node
);
4405 case ANCR_PREC_READ_NOT
:
4406 case ANCR_LOOK_BEHIND_NOT
:
4409 case ANCR_PREC_READ
:
4410 case ANCR_LOOK_BEHIND
:
4411 setup_called_state_call(NODE_ANCHOR_BODY(an
), state
);
4421 BagNode
* en
= BAG_(node
);
4423 if (en
->type
== BAG_MEMORY
) {
4424 if (NODE_IS_MARK1(node
)) {
4425 if ((~en
->m
.called_state
& state
) != 0) {
4426 en
->m
.called_state
|= state
;
4427 setup_called_state_call(NODE_BODY(node
), state
);
4431 NODE_STATUS_ADD(node
, MARK1
);
4432 en
->m
.called_state
|= state
;
4433 setup_called_state_call(NODE_BODY(node
), state
);
4434 NODE_STATUS_REMOVE(node
, MARK1
);
4437 else if (en
->type
== BAG_IF_ELSE
) {
4438 if (IS_NOT_NULL(en
->te
.Then
)) {
4439 setup_called_state_call(en
->te
.Then
, state
);
4441 if (IS_NOT_NULL(en
->te
.Else
))
4442 setup_called_state_call(en
->te
.Else
, state
);
4445 setup_called_state_call(NODE_BODY(node
), state
);
4451 setup_called_state_call(NODE_BODY(node
), state
);
4460 setup_called_state(Node
* node
, int state
)
4462 switch (NODE_TYPE(node
)) {
4468 setup_called_state(NODE_CAR(node
), state
);
4469 } while (IS_NOT_NULL(node
= NODE_CDR(node
)));
4474 setup_called_state_call(node
, state
);
4480 BagNode
* en
= BAG_(node
);
4484 if (en
->m
.entry_count
> 1)
4485 state
|= IN_MULTI_ENTRY
;
4487 en
->m
.called_state
|= state
;
4490 case BAG_STOP_BACKTRACK
:
4491 setup_called_state(NODE_BODY(node
), state
);
4494 setup_called_state(NODE_BODY(node
), state
);
4495 if (IS_NOT_NULL(en
->te
.Then
))
4496 setup_called_state(en
->te
.Then
, state
);
4497 if (IS_NOT_NULL(en
->te
.Else
))
4498 setup_called_state(en
->te
.Else
, state
);
4506 QuantNode
* qn
= QUANT_(node
);
4508 if (IS_INFINITE_REPEAT(qn
->upper
) || qn
->upper
>= 2)
4509 state
|= IN_REAL_REPEAT
;
4510 if (qn
->lower
!= qn
->upper
)
4511 state
|= IN_VAR_REPEAT
;
4513 setup_called_state(NODE_QUANT_BODY(qn
), state
);
4519 AnchorNode
* an
= ANCHOR_(node
);
4522 case ANCR_PREC_READ_NOT
:
4523 case ANCR_LOOK_BEHIND_NOT
:
4526 case ANCR_PREC_READ
:
4527 case ANCR_LOOK_BEHIND
:
4528 setup_called_state(NODE_ANCHOR_BODY(an
), state
);
4546 #endif /* USE_CALL */
4549 static int setup_tree(Node
* node
, regex_t
* reg
, int state
, ScanEnv
* env
);
4555 setup_anchor(Node
* node
, regex_t
* reg
, int state
, ScanEnv
* env
)
4557 /* allowed node types in look-behind */
4558 #define ALLOWED_TYPE_IN_LB \
4559 ( NODE_BIT_LIST | NODE_BIT_ALT | NODE_BIT_STRING | NODE_BIT_CCLASS \
4560 | NODE_BIT_CTYPE | NODE_BIT_ANCHOR | NODE_BIT_BAG | NODE_BIT_QUANT \
4561 | NODE_BIT_CALL | NODE_BIT_GIMMICK)
4563 #define ALLOWED_BAG_IN_LB ( 1<<BAG_MEMORY | 1<<BAG_OPTION | 1<<BAG_IF_ELSE )
4564 #define ALLOWED_BAG_IN_LB_NOT ( 1<<BAG_OPTION | 1<<BAG_IF_ELSE )
4566 #define ALLOWED_ANCHOR_IN_LB \
4567 ( ANCR_LOOK_BEHIND | ANCR_BEGIN_LINE | ANCR_END_LINE | ANCR_BEGIN_BUF \
4568 | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY | ANCR_NO_WORD_BOUNDARY \
4569 | ANCR_WORD_BEGIN | ANCR_WORD_END \
4570 | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )
4572 #define ALLOWED_ANCHOR_IN_LB_NOT \
4573 ( ANCR_LOOK_BEHIND | ANCR_LOOK_BEHIND_NOT | ANCR_BEGIN_LINE \
4574 | ANCR_END_LINE | ANCR_BEGIN_BUF | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY \
4575 | ANCR_NO_WORD_BOUNDARY | ANCR_WORD_BEGIN | ANCR_WORD_END \
4576 | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )
4579 AnchorNode
* an
= ANCHOR_(node
);
4582 case ANCR_PREC_READ
:
4583 r
= setup_tree(NODE_ANCHOR_BODY(an
), reg
, state
, env
);
4585 case ANCR_PREC_READ_NOT
:
4586 r
= setup_tree(NODE_ANCHOR_BODY(an
), reg
, (state
| IN_NOT
), env
);
4589 case ANCR_LOOK_BEHIND
:
4591 r
= check_type_tree(NODE_ANCHOR_BODY(an
), ALLOWED_TYPE_IN_LB
,
4592 ALLOWED_BAG_IN_LB
, ALLOWED_ANCHOR_IN_LB
);
4593 if (r
< 0) return r
;
4594 if (r
> 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN
;
4595 r
= setup_tree(NODE_ANCHOR_BODY(an
), reg
, (state
|IN_LOOK_BEHIND
), env
);
4596 if (r
!= 0) return r
;
4597 r
= setup_look_behind(node
, reg
, env
);
4601 case ANCR_LOOK_BEHIND_NOT
:
4603 r
= check_type_tree(NODE_ANCHOR_BODY(an
), ALLOWED_TYPE_IN_LB
,
4604 ALLOWED_BAG_IN_LB_NOT
, ALLOWED_ANCHOR_IN_LB_NOT
);
4605 if (r
< 0) return r
;
4606 if (r
> 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN
;
4607 r
= setup_tree(NODE_ANCHOR_BODY(an
), reg
, (state
|IN_NOT
|IN_LOOK_BEHIND
),
4609 if (r
!= 0) return r
;
4610 r
= setup_look_behind(node
, reg
, env
);
4626 setup_quant(Node
* node
, regex_t
* reg
, int state
, ScanEnv
* env
)
4630 QuantNode
* qn
= QUANT_(node
);
4631 Node
* body
= NODE_BODY(node
);
4633 if ((state
& IN_REAL_REPEAT
) != 0) {
4634 NODE_STATUS_ADD(node
, IN_REAL_REPEAT
);
4636 if ((state
& IN_MULTI_ENTRY
) != 0) {
4637 NODE_STATUS_ADD(node
, IN_MULTI_ENTRY
);
4640 if (IS_INFINITE_REPEAT(qn
->upper
) || qn
->upper
>= 1) {
4641 d
= tree_min_len(body
, env
);
4643 #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
4644 qn
->emptiness
= quantifiers_memory_node_info(body
);
4645 if (qn
->emptiness
== BODY_IS_EMPTY_POSSIBILITY_REC
) {
4646 if (NODE_TYPE(body
) == NODE_BAG
&&
4647 BAG_(body
)->type
== BAG_MEMORY
) {
4648 MEM_STATUS_ON(env
->bt_mem_end
, BAG_(body
)->m
.regnum
);
4652 qn
->emptiness
= BODY_IS_EMPTY_POSSIBILITY
;
4657 if (IS_INFINITE_REPEAT(qn
->upper
) || qn
->upper
>= 2)
4658 state
|= IN_REAL_REPEAT
;
4659 if (qn
->lower
!= qn
->upper
)
4660 state
|= IN_VAR_REPEAT
;
4662 r
= setup_tree(body
, reg
, state
, env
);
4663 if (r
!= 0) return r
;
4666 #define EXPAND_STRING_MAX_LENGTH 100
4667 if (NODE_TYPE(body
) == NODE_STRING
) {
4668 if (!IS_INFINITE_REPEAT(qn
->lower
) && qn
->lower
== qn
->upper
&&
4669 qn
->lower
> 1 && qn
->lower
<= EXPAND_STRING_MAX_LENGTH
) {
4670 int len
= NODE_STRING_LEN(body
);
4671 StrNode
* sn
= STR_(body
);
4673 if (len
* qn
->lower
<= EXPAND_STRING_MAX_LENGTH
) {
4674 int i
, n
= qn
->lower
;
4675 onig_node_conv_to_str_node(node
, STR_(body
)->flag
);
4676 for (i
= 0; i
< n
; i
++) {
4677 r
= onig_node_str_cat(node
, sn
->s
, sn
->end
);
4678 if (r
!= 0) return r
;
4680 onig_node_free(body
);
4686 if (qn
->greedy
&& (qn
->emptiness
== BODY_IS_NOT_EMPTY
)) {
4687 if (NODE_TYPE(body
) == NODE_QUANT
) {
4688 QuantNode
* tqn
= QUANT_(body
);
4689 if (IS_NOT_NULL(tqn
->head_exact
)) {
4690 qn
->head_exact
= tqn
->head_exact
;
4691 tqn
->head_exact
= NULL
;
4695 qn
->head_exact
= get_head_value_node(NODE_BODY(node
), 1, reg
);
4702 /* setup_tree does the following work.
4703 1. check empty loop. (set qn->emptiness)
4704 2. expand ignore-case in char class.
4705 3. set memory status bit flags. (reg->mem_stats)
4706 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
4707 5. find invalid patterns in look-behind.
4708 6. expand repeated string.
4711 setup_tree(Node
* node
, regex_t
* reg
, int state
, ScanEnv
* env
)
4715 switch (NODE_TYPE(node
)) {
4718 Node
* prev
= NULL_NODE
;
4720 r
= setup_tree(NODE_CAR(node
), reg
, state
, env
);
4721 if (IS_NOT_NULL(prev
) && r
== 0) {
4722 r
= next_setup(prev
, NODE_CAR(node
), reg
);
4724 prev
= NODE_CAR(node
);
4725 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
4731 r
= setup_tree(NODE_CAR(node
), reg
, (state
| IN_ALT
), env
);
4732 } while (r
== 0 && IS_NOT_NULL(node
= NODE_CDR(node
)));
4736 if (IS_IGNORECASE(reg
->options
) && !NODE_STRING_IS_RAW(node
)) {
4737 r
= expand_case_fold_string(node
, reg
, state
);
4745 BackRefNode
* br
= BACKREF_(node
);
4747 for (i
= 0; i
< br
->back_num
; i
++) {
4748 if (p
[i
] > env
->num_mem
) return ONIGERR_INVALID_BACKREF
;
4749 MEM_STATUS_ON(env
->backrefed_mem
, p
[i
]);
4750 MEM_STATUS_ON(env
->bt_mem_start
, p
[i
]);
4751 #ifdef USE_BACKREF_WITH_LEVEL
4752 if (NODE_IS_NEST_LEVEL(node
)) {
4753 MEM_STATUS_ON(env
->bt_mem_end
, p
[i
]);
4762 BagNode
* en
= BAG_(node
);
4767 OnigOptionType options
= reg
->options
;
4768 reg
->options
= BAG_(node
)->o
.options
;
4769 r
= setup_tree(NODE_BODY(node
), reg
, state
, env
);
4770 reg
->options
= options
;
4776 state
|= en
->m
.called_state
;
4779 if ((state
& (IN_ALT
| IN_NOT
| IN_VAR_REPEAT
| IN_MULTI_ENTRY
)) != 0
4780 || NODE_IS_RECURSION(node
)) {
4781 MEM_STATUS_ON(env
->bt_mem_start
, en
->m
.regnum
);
4783 r
= setup_tree(NODE_BODY(node
), reg
, state
, env
);
4786 case BAG_STOP_BACKTRACK
:
4788 Node
* target
= NODE_BODY(node
);
4789 r
= setup_tree(target
, reg
, state
, env
);
4790 if (NODE_TYPE(target
) == NODE_QUANT
) {
4791 QuantNode
* tqn
= QUANT_(target
);
4792 if (IS_INFINITE_REPEAT(tqn
->upper
) && tqn
->lower
<= 1 &&
4793 tqn
->greedy
!= 0) { /* (?>a*), a*+ etc... */
4794 if (is_strict_real_node(NODE_BODY(target
)))
4795 NODE_STATUS_ADD(node
, STRICT_REAL_REPEAT
);
4802 r
= setup_tree(NODE_BODY(node
), reg
, (state
| IN_ALT
), env
);
4803 if (r
!= 0) return r
;
4804 if (IS_NOT_NULL(en
->te
.Then
)) {
4805 r
= setup_tree(en
->te
.Then
, reg
, (state
| IN_ALT
), env
);
4806 if (r
!= 0) return r
;
4808 if (IS_NOT_NULL(en
->te
.Else
))
4809 r
= setup_tree(en
->te
.Else
, reg
, (state
| IN_ALT
), env
);
4816 r
= setup_quant(node
, reg
, state
, env
);
4820 r
= setup_anchor(node
, reg
, state
, env
);
4837 set_sunday_quick_search_or_bmh_skip_table(regex_t
* reg
, int case_expand
,
4838 UChar
* s
, UChar
* end
,
4839 UChar skip
[], int* roffset
)
4841 int i
, j
, k
, len
, offset
;
4845 OnigCaseFoldCodeItem items
[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM
];
4846 UChar buf
[ONIGENC_MBC_CASE_FOLD_MAXLEN
];
4849 offset
= ENC_GET_SKIP_OFFSET(enc
);
4850 if (offset
== ENC_SKIP_OFFSET_1_OR_0
) {
4853 len
= enclen(enc
, p
);
4854 if (p
+ len
>= end
) {
4855 if (len
== 1) offset
= 1;
4863 len
= (int )(end
- s
);
4864 if (len
+ offset
>= 255)
4865 return ONIGERR_PARSER_BUG
;
4869 for (i
= 0; i
< CHAR_MAP_SIZE
; i
++) {
4870 skip
[i
] = (UChar
)(len
+ offset
);
4873 for (p
= s
; p
< end
; ) {
4876 clen
= enclen(enc
, p
);
4877 if (p
+ clen
> end
) clen
= (int )(end
- p
);
4879 len
= (int )(end
- p
);
4880 for (j
= 0; j
< clen
; j
++) {
4881 z
= len
- j
+ (offset
- 1);
4886 if (case_expand
!= 0) {
4887 n
= ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc
, reg
->case_fold_flag
,
4889 for (k
= 0; k
< n
; k
++) {
4890 ONIGENC_CODE_TO_MBC(enc
, items
[k
].code
[0], buf
);
4891 for (j
= 0; j
< clen
; j
++) {
4892 z
= len
- j
+ (offset
- 1);
4894 if (skip
[buf
[j
]] > z
)
4907 #define OPT_EXACT_MAXLEN 24
4909 #if OPT_EXACT_MAXLEN >= 255
4910 #error Too big OPT_EXACT_MAXLEN
4914 OnigLen min
; /* min byte length */
4915 OnigLen max
; /* max byte length */
4921 OnigOptionType options
;
4922 OnigCaseFoldType case_fold_flag
;
4932 MinMax mmd
; /* position */
4938 UChar s
[OPT_EXACT_MAXLEN
];
4942 MinMax mmd
; /* position */
4944 int value
; /* weighted value */
4945 UChar map
[CHAR_MAP_SIZE
];
4951 OptStr sb
; /* boundary */
4952 OptStr sm
; /* middle */
4953 OptStr spr
; /* prec read (?=...) */
4954 OptMap map
; /* boundary */
4959 map_position_value(OnigEncoding enc
, int i
)
4961 static const short int Vals
[] = {
4962 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
4963 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4964 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
4965 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
4966 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4967 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
4968 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4969 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
4972 if (i
< (int )(sizeof(Vals
)/sizeof(Vals
[0]))) {
4973 if (i
== 0 && ONIGENC_MBC_MINLEN(enc
) > 1)
4976 return (int )Vals
[i
];
4979 return 4; /* Take it easy. */
4983 distance_value(MinMax
* mm
)
4985 /* 1000 / (min-max-dist + 1) */
4986 static const short int dist_vals
[] = {
4987 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
4988 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
4989 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
4990 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
4991 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
4992 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
4993 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
4994 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
4995 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
4996 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
5001 if (mm
->max
== INFINITE_LEN
) return 0;
5003 d
= mm
->max
- mm
->min
;
5004 if (d
< (OnigLen
)(sizeof(dist_vals
)/sizeof(dist_vals
[0])))
5005 /* return dist_vals[d] * 16 / (mm->min + 12); */
5006 return (int )dist_vals
[d
];
5012 comp_distance_value(MinMax
* d1
, MinMax
* d2
, int v1
, int v2
)
5014 if (v2
<= 0) return -1;
5015 if (v1
<= 0) return 1;
5017 v1
*= distance_value(d1
);
5018 v2
*= distance_value(d2
);
5020 if (v2
> v1
) return 1;
5021 if (v2
< v1
) return -1;
5023 if (d2
->min
< d1
->min
) return 1;
5024 if (d2
->min
> d1
->min
) return -1;
5029 is_equal_mml(MinMax
* a
, MinMax
* b
)
5031 return a
->min
== b
->min
&& a
->max
== b
->max
;
5035 set_mml(MinMax
* l
, OnigLen min
, OnigLen max
)
5042 clear_mml(MinMax
* l
)
5044 l
->min
= l
->max
= 0;
5048 copy_mml(MinMax
* to
, MinMax
* from
)
5050 to
->min
= from
->min
;
5051 to
->max
= from
->max
;
5055 add_mml(MinMax
* to
, MinMax
* from
)
5057 to
->min
= distance_add(to
->min
, from
->min
);
5058 to
->max
= distance_add(to
->max
, from
->max
);
5062 alt_merge_mml(MinMax
* to
, MinMax
* from
)
5064 if (to
->min
> from
->min
) to
->min
= from
->min
;
5065 if (to
->max
< from
->max
) to
->max
= from
->max
;
5069 copy_opt_env(OptEnv
* to
, OptEnv
* from
)
5075 clear_opt_anc_info(OptAnc
* a
)
5082 copy_opt_anc_info(OptAnc
* to
, OptAnc
* from
)
5088 concat_opt_anc_info(OptAnc
* to
, OptAnc
* left
, OptAnc
* right
,
5089 OnigLen left_len
, OnigLen right_len
)
5091 clear_opt_anc_info(to
);
5093 to
->left
= left
->left
;
5094 if (left_len
== 0) {
5095 to
->left
|= right
->left
;
5098 to
->right
= right
->right
;
5099 if (right_len
== 0) {
5100 to
->right
|= left
->right
;
5103 to
->right
|= (left
->right
& ANCR_PREC_READ_NOT
);
5110 if (a
== ANCR_END_BUF
|| a
== ANCR_SEMI_END_BUF
||
5111 a
== ANCR_END_LINE
|| a
== ANCR_PREC_READ
|| a
== ANCR_PREC_READ_NOT
)
5118 is_set_opt_anc_info(OptAnc
* to
, int anc
)
5120 if ((to
->left
& anc
) != 0) return 1;
5122 return ((to
->right
& anc
) != 0 ? 1 : 0);
5126 add_opt_anc_info(OptAnc
* to
, int anc
)
5135 remove_opt_anc_info(OptAnc
* to
, int anc
)
5144 alt_merge_opt_anc_info(OptAnc
* to
, OptAnc
* add
)
5146 to
->left
&= add
->left
;
5147 to
->right
&= add
->right
;
5151 is_full_opt_exact(OptStr
* e
)
5153 return e
->len
>= OPT_EXACT_MAXLEN
;
5157 clear_opt_exact(OptStr
* e
)
5160 clear_opt_anc_info(&e
->anc
);
5163 e
->good_case_fold
= 0;
5169 copy_opt_exact(OptStr
* to
, OptStr
* from
)
5175 concat_opt_exact(OptStr
* to
, OptStr
* add
, OnigEncoding enc
)
5181 if (add
->case_fold
!= 0) {
5182 if (! to
->case_fold
) {
5183 if (to
->len
> 1 || to
->len
>= add
->len
) return 0; /* avoid */
5188 if (to
->good_case_fold
!= 0) {
5189 if (add
->good_case_fold
== 0) return 0;
5197 for (i
= to
->len
; p
< end
; ) {
5198 len
= enclen(enc
, p
);
5199 if (i
+ len
> OPT_EXACT_MAXLEN
) {
5203 for (j
= 0; j
< len
&& p
< end
; j
++)
5208 to
->reach_end
= (p
== end
? add
->reach_end
: 0);
5210 concat_opt_anc_info(&tanc
, &to
->anc
, &add
->anc
, 1, 1);
5211 if (! to
->reach_end
) tanc
.right
= 0;
5212 copy_opt_anc_info(&to
->anc
, &tanc
);
5218 concat_opt_exact_str(OptStr
* to
, UChar
* s
, UChar
* end
, OnigEncoding enc
)
5223 for (i
= to
->len
, p
= s
; p
< end
&& i
< OPT_EXACT_MAXLEN
; ) {
5224 len
= enclen(enc
, p
);
5225 if (i
+ len
> OPT_EXACT_MAXLEN
) break;
5226 for (j
= 0; j
< len
&& p
< end
; j
++)
5232 if (p
>= end
&& to
->len
== (int )(end
- s
))
5237 alt_merge_opt_exact(OptStr
* to
, OptStr
* add
, OptEnv
* env
)
5241 if (add
->len
== 0 || to
->len
== 0) {
5242 clear_opt_exact(to
);
5246 if (! is_equal_mml(&to
->mmd
, &add
->mmd
)) {
5247 clear_opt_exact(to
);
5251 for (i
= 0; i
< to
->len
&& i
< add
->len
; ) {
5252 if (to
->s
[i
] != add
->s
[i
]) break;
5253 len
= enclen(env
->enc
, to
->s
+ i
);
5255 for (j
= 1; j
< len
; j
++) {
5256 if (to
->s
[i
+j
] != add
->s
[i
+j
]) break;
5262 if (! add
->reach_end
|| i
< add
->len
|| i
< to
->len
) {
5266 if (add
->case_fold
!= 0)
5268 if (add
->good_case_fold
== 0)
5269 to
->good_case_fold
= 0;
5271 alt_merge_opt_anc_info(&to
->anc
, &add
->anc
);
5272 if (! to
->reach_end
) to
->anc
.right
= 0;
5276 select_opt_exact(OnigEncoding enc
, OptStr
* now
, OptStr
* alt
)
5287 copy_opt_exact(now
, alt
);
5290 else if (vn
<= 2 && va
<= 2) {
5291 /* ByteValTable[x] is big value --> low price */
5292 va
= map_position_value(enc
, now
->s
[0]);
5293 vn
= map_position_value(enc
, alt
->s
[0]);
5295 if (now
->len
> 1) vn
+= 5;
5296 if (alt
->len
> 1) va
+= 5;
5299 if (now
->case_fold
== 0) vn
*= 2;
5300 if (alt
->case_fold
== 0) va
*= 2;
5302 if (now
->good_case_fold
!= 0) vn
*= 4;
5303 if (alt
->good_case_fold
!= 0) va
*= 4;
5305 if (comp_distance_value(&now
->mmd
, &alt
->mmd
, vn
, va
) > 0)
5306 copy_opt_exact(now
, alt
);
5310 clear_opt_map(OptMap
* map
)
5312 static const OptMap clean_info
= {
5315 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5316 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5317 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5318 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5319 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5320 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5321 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5322 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5323 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5324 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5325 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5326 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5327 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5328 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5329 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5330 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
5334 xmemcpy(map
, &clean_info
, sizeof(OptMap
));
5338 copy_opt_map(OptMap
* to
, OptMap
* from
)
5340 xmemcpy(to
,from
,sizeof(OptMap
));
5344 add_char_opt_map(OptMap
* m
, UChar c
, OnigEncoding enc
)
5346 if (m
->map
[c
] == 0) {
5348 m
->value
+= map_position_value(enc
, c
);
5353 add_char_amb_opt_map(OptMap
* map
, UChar
* p
, UChar
* end
,
5354 OnigEncoding enc
, OnigCaseFoldType fold_flag
)
5356 OnigCaseFoldCodeItem items
[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM
];
5357 UChar buf
[ONIGENC_CODE_TO_MBC_MAXLEN
];
5360 add_char_opt_map(map
, p
[0], enc
);
5362 fold_flag
= DISABLE_CASE_FOLD_MULTI_CHAR(fold_flag
);
5363 n
= ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc
, fold_flag
, p
, end
, items
);
5364 if (n
< 0) return n
;
5366 for (i
= 0; i
< n
; i
++) {
5367 ONIGENC_CODE_TO_MBC(enc
, items
[i
].code
[0], buf
);
5368 add_char_opt_map(map
, buf
[0], enc
);
5375 select_opt_map(OptMap
* now
, OptMap
* alt
)
5377 static int z
= 1<<15; /* 32768: something big value */
5381 if (alt
->value
== 0) return ;
5382 if (now
->value
== 0) {
5383 copy_opt_map(now
, alt
);
5387 vn
= z
/ now
->value
;
5388 va
= z
/ alt
->value
;
5389 if (comp_distance_value(&now
->mmd
, &alt
->mmd
, vn
, va
) > 0)
5390 copy_opt_map(now
, alt
);
5394 comp_opt_exact_or_map(OptStr
* e
, OptMap
* m
)
5396 #define COMP_EM_BASE 20
5400 if (m
->value
<= 0) return -1;
5402 if (e
->case_fold
!= 0) {
5403 if (e
->good_case_fold
!= 0)
5411 ae
= COMP_EM_BASE
* e
->len
* case_value
;
5412 am
= COMP_EM_BASE
* 5 * 2 / m
->value
;
5413 return comp_distance_value(&e
->mmd
, &m
->mmd
, ae
, am
);
5417 alt_merge_opt_map(OnigEncoding enc
, OptMap
* to
, OptMap
* add
)
5421 /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
5422 if (to
->value
== 0) return ;
5423 if (add
->value
== 0 || to
->mmd
.max
< add
->mmd
.min
) {
5428 alt_merge_mml(&to
->mmd
, &add
->mmd
);
5431 for (i
= 0; i
< CHAR_MAP_SIZE
; i
++) {
5436 val
+= map_position_value(enc
, i
);
5440 alt_merge_opt_anc_info(&to
->anc
, &add
->anc
);
5444 set_bound_node_opt_info(OptNode
* opt
, MinMax
* plen
)
5446 copy_mml(&(opt
->sb
.mmd
), plen
);
5447 copy_mml(&(opt
->spr
.mmd
), plen
);
5448 copy_mml(&(opt
->map
.mmd
), plen
);
5452 clear_node_opt_info(OptNode
* opt
)
5454 clear_mml(&opt
->len
);
5455 clear_opt_anc_info(&opt
->anc
);
5456 clear_opt_exact(&opt
->sb
);
5457 clear_opt_exact(&opt
->sm
);
5458 clear_opt_exact(&opt
->spr
);
5459 clear_opt_map(&opt
->map
);
5463 copy_node_opt_info(OptNode
* to
, OptNode
* from
)
5465 xmemcpy(to
,from
,sizeof(OptNode
));
5469 concat_left_node_opt_info(OnigEncoding enc
, OptNode
* to
, OptNode
* add
)
5471 int sb_reach
, sm_reach
;
5474 concat_opt_anc_info(&tanc
, &to
->anc
, &add
->anc
, to
->len
.max
, add
->len
.max
);
5475 copy_opt_anc_info(&to
->anc
, &tanc
);
5477 if (add
->sb
.len
> 0 && to
->len
.max
== 0) {
5478 concat_opt_anc_info(&tanc
, &to
->anc
, &add
->sb
.anc
, to
->len
.max
, add
->len
.max
);
5479 copy_opt_anc_info(&add
->sb
.anc
, &tanc
);
5482 if (add
->map
.value
> 0 && to
->len
.max
== 0) {
5483 if (add
->map
.mmd
.max
== 0)
5484 add
->map
.anc
.left
|= to
->anc
.left
;
5487 sb_reach
= to
->sb
.reach_end
;
5488 sm_reach
= to
->sm
.reach_end
;
5490 if (add
->len
.max
!= 0)
5491 to
->sb
.reach_end
= to
->sm
.reach_end
= 0;
5493 if (add
->sb
.len
> 0) {
5495 concat_opt_exact(&to
->sb
, &add
->sb
, enc
);
5496 clear_opt_exact(&add
->sb
);
5498 else if (sm_reach
) {
5499 concat_opt_exact(&to
->sm
, &add
->sb
, enc
);
5500 clear_opt_exact(&add
->sb
);
5503 select_opt_exact(enc
, &to
->sm
, &add
->sb
);
5504 select_opt_exact(enc
, &to
->sm
, &add
->sm
);
5506 if (to
->spr
.len
> 0) {
5507 if (add
->len
.max
> 0) {
5508 if (to
->spr
.len
> (int )add
->len
.max
)
5509 to
->spr
.len
= add
->len
.max
;
5511 if (to
->spr
.mmd
.max
== 0)
5512 select_opt_exact(enc
, &to
->sb
, &to
->spr
);
5514 select_opt_exact(enc
, &to
->sm
, &to
->spr
);
5517 else if (add
->spr
.len
> 0) {
5518 copy_opt_exact(&to
->spr
, &add
->spr
);
5521 select_opt_map(&to
->map
, &add
->map
);
5522 add_mml(&to
->len
, &add
->len
);
5526 alt_merge_node_opt_info(OptNode
* to
, OptNode
* add
, OptEnv
* env
)
5528 alt_merge_opt_anc_info(&to
->anc
, &add
->anc
);
5529 alt_merge_opt_exact(&to
->sb
, &add
->sb
, env
);
5530 alt_merge_opt_exact(&to
->sm
, &add
->sm
, env
);
5531 alt_merge_opt_exact(&to
->spr
, &add
->spr
, env
);
5532 alt_merge_opt_map(env
->enc
, &to
->map
, &add
->map
);
5534 alt_merge_mml(&to
->len
, &add
->len
);
5538 #define MAX_NODE_OPT_INFO_REF_COUNT 5
5541 optimize_nodes(Node
* node
, OptNode
* opt
, OptEnv
* env
)
5550 clear_node_opt_info(opt
);
5551 set_bound_node_opt_info(opt
, &env
->mmd
);
5553 switch (NODE_TYPE(node
)) {
5559 copy_opt_env(&nenv
, env
);
5561 r
= optimize_nodes(NODE_CAR(nd
), &xo
, &nenv
);
5563 add_mml(&nenv
.mmd
, &xo
.len
);
5564 concat_left_node_opt_info(enc
, opt
, &xo
);
5566 } while (r
== 0 && IS_NOT_NULL(nd
= NODE_CDR(nd
)));
5575 r
= optimize_nodes(NODE_CAR(nd
), &xo
, env
);
5577 if (nd
== node
) copy_node_opt_info(opt
, &xo
);
5578 else alt_merge_node_opt_info(opt
, &xo
, env
);
5580 } while ((r
== 0) && IS_NOT_NULL(nd
= NODE_CDR(nd
)));
5586 StrNode
* sn
= STR_(node
);
5587 int slen
= (int )(sn
->end
- sn
->s
);
5588 /* int is_raw = NODE_STRING_IS_RAW(node); */
5590 if (! NODE_STRING_IS_AMBIG(node
)) {
5591 concat_opt_exact_str(&opt
->sb
, sn
->s
, sn
->end
, enc
);
5593 add_char_opt_map(&opt
->map
, *(sn
->s
), enc
);
5595 set_mml(&opt
->len
, slen
, slen
);
5600 if (NODE_STRING_IS_DONT_GET_OPT_INFO(node
)) {
5601 int n
= onigenc_strlen(enc
, sn
->s
, sn
->end
);
5602 max
= ONIGENC_MBC_MAXLEN_DIST(enc
) * n
;
5605 concat_opt_exact_str(&opt
->sb
, sn
->s
, sn
->end
, enc
);
5606 opt
->sb
.case_fold
= 1;
5607 if (NODE_STRING_IS_GOOD_AMBIG(node
))
5608 opt
->sb
.good_case_fold
= 1;
5611 r
= add_char_amb_opt_map(&opt
->map
, sn
->s
, sn
->end
,
5612 enc
, env
->case_fold_flag
);
5619 set_mml(&opt
->len
, slen
, max
);
5627 CClassNode
* cc
= CCLASS_(node
);
5629 /* no need to check ignore case. (set in setup_tree()) */
5631 if (IS_NOT_NULL(cc
->mbuf
) || IS_NCCLASS_NOT(cc
)) {
5632 OnigLen min
= ONIGENC_MBC_MINLEN(enc
);
5633 OnigLen max
= ONIGENC_MBC_MAXLEN_DIST(enc
);
5635 set_mml(&opt
->len
, min
, max
);
5638 for (i
= 0; i
< SINGLE_BYTE_SIZE
; i
++) {
5639 z
= BITSET_AT(cc
->bs
, i
);
5640 if ((z
&& ! IS_NCCLASS_NOT(cc
)) || (! z
&& IS_NCCLASS_NOT(cc
))) {
5641 add_char_opt_map(&opt
->map
, (UChar
)i
, enc
);
5644 set_mml(&opt
->len
, 1, 1);
5654 max
= ONIGENC_MBC_MAXLEN_DIST(enc
);
5659 switch (CTYPE_(node
)->ctype
) {
5663 case ONIGENC_CTYPE_WORD
:
5664 range
= CTYPE_(node
)->ascii_mode
!= 0 ? 128 : SINGLE_BYTE_SIZE
;
5665 if (CTYPE_(node
)->not != 0) {
5666 for (i
= 0; i
< range
; i
++) {
5667 if (! ONIGENC_IS_CODE_WORD(enc
, i
)) {
5668 add_char_opt_map(&opt
->map
, (UChar
)i
, enc
);
5671 for (i
= range
; i
< SINGLE_BYTE_SIZE
; i
++) {
5672 add_char_opt_map(&opt
->map
, (UChar
)i
, enc
);
5676 for (i
= 0; i
< range
; i
++) {
5677 if (ONIGENC_IS_CODE_WORD(enc
, i
)) {
5678 add_char_opt_map(&opt
->map
, (UChar
)i
, enc
);
5686 min
= ONIGENC_MBC_MINLEN(enc
);
5688 set_mml(&opt
->len
, min
, max
);
5693 switch (ANCHOR_(node
)->type
) {
5694 case ANCR_BEGIN_BUF
:
5695 case ANCR_BEGIN_POSITION
:
5696 case ANCR_BEGIN_LINE
:
5698 case ANCR_SEMI_END_BUF
:
5700 case ANCR_PREC_READ_NOT
:
5701 case ANCR_LOOK_BEHIND
:
5702 add_opt_anc_info(&opt
->anc
, ANCHOR_(node
)->type
);
5705 case ANCR_PREC_READ
:
5707 r
= optimize_nodes(NODE_BODY(node
), &xo
, env
);
5710 copy_opt_exact(&opt
->spr
, &xo
.sb
);
5711 else if (xo
.sm
.len
> 0)
5712 copy_opt_exact(&opt
->spr
, &xo
.sm
);
5714 opt
->spr
.reach_end
= 0;
5716 if (xo
.map
.value
> 0)
5717 copy_opt_map(&opt
->map
, &xo
.map
);
5722 case ANCR_LOOK_BEHIND_NOT
:
5728 if (! NODE_IS_CHECKER(node
)) {
5730 OnigLen min
, max
, tmin
, tmax
;
5731 MemEnv
* mem_env
= SCANENV_MEMENV(env
->scan_env
);
5732 BackRefNode
* br
= BACKREF_(node
);
5734 if (NODE_IS_RECURSION(node
)) {
5735 set_mml(&opt
->len
, 0, INFINITE_LEN
);
5738 backs
= BACKREFS_P(br
);
5739 min
= tree_min_len(mem_env
[backs
[0]].node
, env
->scan_env
);
5740 max
= tree_max_len(mem_env
[backs
[0]].node
, env
->scan_env
);
5741 for (i
= 1; i
< br
->back_num
; i
++) {
5742 tmin
= tree_min_len(mem_env
[backs
[i
]].node
, env
->scan_env
);
5743 tmax
= tree_max_len(mem_env
[backs
[i
]].node
, env
->scan_env
);
5744 if (min
> tmin
) min
= tmin
;
5745 if (max
< tmax
) max
= tmax
;
5747 set_mml(&opt
->len
, min
, max
);
5753 if (NODE_IS_RECURSION(node
))
5754 set_mml(&opt
->len
, 0, INFINITE_LEN
);
5756 OnigOptionType save
= env
->options
;
5757 env
->options
= BAG_(NODE_BODY(node
))->o
.options
;
5758 r
= optimize_nodes(NODE_BODY(node
), opt
, env
);
5759 env
->options
= save
;
5767 QuantNode
* qn
= QUANT_(node
);
5769 r
= optimize_nodes(NODE_BODY(node
), &xo
, env
);
5772 if (qn
->lower
> 0) {
5773 copy_node_opt_info(opt
, &xo
);
5774 if (xo
.sb
.len
> 0) {
5775 if (xo
.sb
.reach_end
) {
5776 for (i
= 2; i
<= qn
->lower
&& ! is_full_opt_exact(&opt
->sb
); i
++) {
5777 int rc
= concat_opt_exact(&opt
->sb
, &xo
.sb
, enc
);
5780 if (i
< qn
->lower
) opt
->sb
.reach_end
= 0;
5784 if (qn
->lower
!= qn
->upper
) {
5785 opt
->sb
.reach_end
= 0;
5786 opt
->sm
.reach_end
= 0;
5789 opt
->sm
.reach_end
= 0;
5792 if (IS_INFINITE_REPEAT(qn
->upper
)) {
5793 if (env
->mmd
.max
== 0 &&
5794 NODE_IS_ANYCHAR(NODE_BODY(node
)) && qn
->greedy
!= 0) {
5795 if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn
), env
)))
5796 add_opt_anc_info(&opt
->anc
, ANCR_ANYCHAR_INF_ML
);
5798 add_opt_anc_info(&opt
->anc
, ANCR_ANYCHAR_INF
);
5801 max
= (xo
.len
.max
> 0 ? INFINITE_LEN
: 0);
5804 max
= distance_multiply(xo
.len
.max
, qn
->upper
);
5807 min
= distance_multiply(xo
.len
.min
, qn
->lower
);
5808 set_mml(&opt
->len
, min
, max
);
5814 BagNode
* en
= BAG_(node
);
5819 OnigOptionType save
= env
->options
;
5821 env
->options
= en
->o
.options
;
5822 r
= optimize_nodes(NODE_BODY(node
), opt
, env
);
5823 env
->options
= save
;
5830 if (en
->opt_count
> MAX_NODE_OPT_INFO_REF_COUNT
) {
5835 if (NODE_IS_MIN_FIXED(node
)) min
= en
->min_len
;
5836 if (NODE_IS_MAX_FIXED(node
)) max
= en
->max_len
;
5837 set_mml(&opt
->len
, min
, max
);
5842 r
= optimize_nodes(NODE_BODY(node
), opt
, env
);
5843 if (is_set_opt_anc_info(&opt
->anc
, ANCR_ANYCHAR_INF_MASK
)) {
5844 if (MEM_STATUS_AT0(env
->scan_env
->backrefed_mem
, en
->m
.regnum
))
5845 remove_opt_anc_info(&opt
->anc
, ANCR_ANYCHAR_INF_MASK
);
5850 case BAG_STOP_BACKTRACK
:
5851 r
= optimize_nodes(NODE_BODY(node
), opt
, env
);
5858 copy_opt_env(&nenv
, env
);
5859 r
= optimize_nodes(NODE_BAG_BODY(en
), &xo
, &nenv
);
5861 add_mml(&nenv
.mmd
, &xo
.len
);
5862 concat_left_node_opt_info(enc
, opt
, &xo
);
5863 if (IS_NOT_NULL(en
->te
.Then
)) {
5864 r
= optimize_nodes(en
->te
.Then
, &xo
, &nenv
);
5866 concat_left_node_opt_info(enc
, opt
, &xo
);
5870 if (IS_NOT_NULL(en
->te
.Else
)) {
5871 r
= optimize_nodes(en
->te
.Else
, &xo
, env
);
5873 alt_merge_node_opt_info(opt
, &xo
, env
);
5887 fprintf(stderr
, "optimize_nodes: undefined node type %d\n", NODE_TYPE(node
));
5889 r
= ONIGERR_TYPE_BUG
;
5897 set_optimize_exact(regex_t
* reg
, OptStr
* e
)
5901 if (e
->len
== 0) return 0;
5903 reg
->exact
= (UChar
* )xmalloc(e
->len
);
5904 CHECK_NULL_RETURN_MEMERR(reg
->exact
);
5905 xmemcpy(reg
->exact
, e
->s
, e
->len
);
5906 reg
->exact_end
= reg
->exact
+ e
->len
;
5909 reg
->optimize
= OPTIMIZE_STR_CASE_FOLD
;
5910 if (e
->good_case_fold
!= 0) {
5912 r
= set_sunday_quick_search_or_bmh_skip_table(reg
, 1,
5913 reg
->exact
, reg
->exact_end
,
5914 reg
->map
, &(reg
->map_offset
));
5915 if (r
!= 0) return r
;
5916 reg
->optimize
= OPTIMIZE_STR_CASE_FOLD_FAST
;
5924 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg
->enc
, reg
->exact
, reg
->exact_end
);
5926 if (e
->len
>= 2 || (e
->len
>= 1 && allow_reverse
)) {
5927 r
= set_sunday_quick_search_or_bmh_skip_table(reg
, 0,
5928 reg
->exact
, reg
->exact_end
,
5929 reg
->map
, &(reg
->map_offset
));
5930 if (r
!= 0) return r
;
5932 reg
->optimize
= (allow_reverse
!= 0
5934 : OPTIMIZE_STR_FAST_STEP_FORWARD
);
5937 reg
->optimize
= OPTIMIZE_STR
;
5941 reg
->dmin
= e
->mmd
.min
;
5942 reg
->dmax
= e
->mmd
.max
;
5944 if (reg
->dmin
!= INFINITE_LEN
) {
5945 reg
->threshold_len
= reg
->dmin
+ (int )(reg
->exact_end
- reg
->exact
);
5952 set_optimize_map(regex_t
* reg
, OptMap
* m
)
5956 for (i
= 0; i
< CHAR_MAP_SIZE
; i
++)
5957 reg
->map
[i
] = m
->map
[i
];
5959 reg
->optimize
= OPTIMIZE_MAP
;
5960 reg
->dmin
= m
->mmd
.min
;
5961 reg
->dmax
= m
->mmd
.max
;
5963 if (reg
->dmin
!= INFINITE_LEN
) {
5964 reg
->threshold_len
= reg
->dmin
+ 1;
5969 set_sub_anchor(regex_t
* reg
, OptAnc
* anc
)
5971 reg
->sub_anchor
|= anc
->left
& ANCR_BEGIN_LINE
;
5972 reg
->sub_anchor
|= anc
->right
& ANCR_END_LINE
;
5975 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5976 static void print_optimize_info(FILE* f
, regex_t
* reg
);
5980 set_optimize_info_from_tree(Node
* node
, regex_t
* reg
, ScanEnv
* scan_env
)
5987 env
.options
= reg
->options
;
5988 env
.case_fold_flag
= reg
->case_fold_flag
;
5989 env
.scan_env
= scan_env
;
5990 clear_mml(&env
.mmd
);
5992 r
= optimize_nodes(node
, &opt
, &env
);
5993 if (r
!= 0) return r
;
5995 reg
->anchor
= opt
.anc
.left
& (ANCR_BEGIN_BUF
|
5996 ANCR_BEGIN_POSITION
| ANCR_ANYCHAR_INF
| ANCR_ANYCHAR_INF_ML
|
5999 if ((opt
.anc
.left
& (ANCR_LOOK_BEHIND
| ANCR_PREC_READ_NOT
)) != 0)
6000 reg
->anchor
&= ~ANCR_ANYCHAR_INF_ML
;
6002 reg
->anchor
|= opt
.anc
.right
& (ANCR_END_BUF
| ANCR_SEMI_END_BUF
|
6003 ANCR_PREC_READ_NOT
);
6005 if (reg
->anchor
& (ANCR_END_BUF
| ANCR_SEMI_END_BUF
)) {
6006 reg
->anchor_dmin
= opt
.len
.min
;
6007 reg
->anchor_dmax
= opt
.len
.max
;
6010 if (opt
.sb
.len
> 0 || opt
.sm
.len
> 0) {
6011 select_opt_exact(reg
->enc
, &opt
.sb
, &opt
.sm
);
6012 if (opt
.map
.value
> 0 && comp_opt_exact_or_map(&opt
.sb
, &opt
.map
) > 0) {
6016 r
= set_optimize_exact(reg
, &opt
.sb
);
6017 set_sub_anchor(reg
, &opt
.sb
.anc
);
6020 else if (opt
.map
.value
> 0) {
6022 set_optimize_map(reg
, &opt
.map
);
6023 set_sub_anchor(reg
, &opt
.map
.anc
);
6026 reg
->sub_anchor
|= opt
.anc
.left
& ANCR_BEGIN_LINE
;
6027 if (opt
.len
.max
== 0)
6028 reg
->sub_anchor
|= opt
.anc
.right
& ANCR_END_LINE
;
6031 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6032 print_optimize_info(stderr
, reg
);
6038 clear_optimize_info(regex_t
* reg
)
6040 reg
->optimize
= OPTIMIZE_NONE
;
6042 reg
->anchor_dmin
= 0;
6043 reg
->anchor_dmax
= 0;
6044 reg
->sub_anchor
= 0;
6045 reg
->exact_end
= (UChar
* )NULL
;
6046 reg
->map_offset
= 0;
6047 reg
->threshold_len
= 0;
6048 if (IS_NOT_NULL(reg
->exact
)) {
6050 reg
->exact
= (UChar
* )NULL
;
6056 static void print_enc_string(FILE* fp
, OnigEncoding enc
,
6057 const UChar
*s
, const UChar
*end
)
6059 fprintf(fp
, "\nPATTERN: /");
6061 if (ONIGENC_MBC_MINLEN(enc
) > 1) {
6067 code
= ONIGENC_MBC_TO_CODE(enc
, p
, end
);
6069 fprintf(fp
, " 0x%04x ", (int )code
);
6072 fputc((int )code
, fp
);
6075 p
+= enclen(enc
, p
);
6080 fputc((int )*s
, fp
);
6088 #endif /* ONIG_DEBUG */
6090 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6093 print_distance_range(FILE* f
, OnigLen a
, OnigLen b
)
6095 if (a
== INFINITE_LEN
)
6098 fprintf(f
, "(%u)", a
);
6102 if (b
== INFINITE_LEN
)
6105 fprintf(f
, "(%u)", b
);
6109 print_anchor(FILE* f
, int anchor
)
6115 if (anchor
& ANCR_BEGIN_BUF
) {
6116 fprintf(f
, "begin-buf");
6119 if (anchor
& ANCR_BEGIN_LINE
) {
6120 if (q
) fprintf(f
, ", ");
6122 fprintf(f
, "begin-line");
6124 if (anchor
& ANCR_BEGIN_POSITION
) {
6125 if (q
) fprintf(f
, ", ");
6127 fprintf(f
, "begin-pos");
6129 if (anchor
& ANCR_END_BUF
) {
6130 if (q
) fprintf(f
, ", ");
6132 fprintf(f
, "end-buf");
6134 if (anchor
& ANCR_SEMI_END_BUF
) {
6135 if (q
) fprintf(f
, ", ");
6137 fprintf(f
, "semi-end-buf");
6139 if (anchor
& ANCR_END_LINE
) {
6140 if (q
) fprintf(f
, ", ");
6142 fprintf(f
, "end-line");
6144 if (anchor
& ANCR_ANYCHAR_INF
) {
6145 if (q
) fprintf(f
, ", ");
6147 fprintf(f
, "anychar-inf");
6149 if (anchor
& ANCR_ANYCHAR_INF_ML
) {
6150 if (q
) fprintf(f
, ", ");
6151 fprintf(f
, "anychar-inf-ml");
6158 print_optimize_info(FILE* f
, regex_t
* reg
)
6160 static const char* on
[] = { "NONE", "STR",
6161 "STR_FAST", "STR_FAST_STEP_FORWARD",
6162 "STR_CASE_FOLD_FAST", "STR_CASE_FOLD", "MAP" };
6164 fprintf(f
, "optimize: %s\n", on
[reg
->optimize
]);
6165 fprintf(f
, " anchor: "); print_anchor(f
, reg
->anchor
);
6166 if ((reg
->anchor
& ANCR_END_BUF_MASK
) != 0)
6167 print_distance_range(f
, reg
->anchor_dmin
, reg
->anchor_dmax
);
6170 if (reg
->optimize
) {
6171 fprintf(f
, " sub anchor: "); print_anchor(f
, reg
->sub_anchor
);
6178 fprintf(f
, "exact: [");
6179 for (p
= reg
->exact
; p
< reg
->exact_end
; p
++) {
6182 fprintf(f
, "]: length: %ld\n", (reg
->exact_end
- reg
->exact
));
6184 else if (reg
->optimize
& OPTIMIZE_MAP
) {
6187 for (i
= 0; i
< CHAR_MAP_SIZE
; i
++)
6188 if (reg
->map
[i
]) n
++;
6190 fprintf(f
, "map: n=%d\n", n
);
6194 for (i
= 0; i
< CHAR_MAP_SIZE
; i
++) {
6195 if (reg
->map
[i
] != 0) {
6196 if (c
> 0) fputs(", ", f
);
6198 if (ONIGENC_MBC_MAXLEN(reg
->enc
) == 1 &&
6199 ONIGENC_IS_CODE_PRINT(reg
->enc
, (OnigCodePoint
)i
))
6202 fprintf(f
, "%d", i
);
6213 onig_get_regex_ext(regex_t
* reg
)
6215 if (IS_NULL(reg
->extp
)) {
6216 RegexExt
* ext
= (RegexExt
* )xmalloc(sizeof(*ext
));
6217 if (IS_NULL(ext
)) return 0;
6220 ext
->pattern_end
= 0;
6223 ext
->callout_num
= 0;
6224 ext
->callout_list_alloc
= 0;
6225 ext
->callout_list
= 0;
6235 free_regex_ext(RegexExt
* ext
)
6237 if (IS_NOT_NULL(ext
)) {
6238 if (IS_NOT_NULL(ext
->pattern
))
6239 xfree((void* )ext
->pattern
);
6242 if (IS_NOT_NULL(ext
->tag_table
))
6243 onig_callout_tag_table_free(ext
->tag_table
);
6245 if (IS_NOT_NULL(ext
->callout_list
))
6246 onig_free_reg_callout_list(ext
->callout_num
, ext
->callout_list
);
6254 onig_ext_set_pattern(regex_t
* reg
, const UChar
* pattern
, const UChar
* pattern_end
)
6259 ext
= onig_get_regex_ext(reg
);
6260 CHECK_NULL_RETURN_MEMERR(ext
);
6262 s
= onigenc_strdup(reg
->enc
, pattern
, pattern_end
);
6263 CHECK_NULL_RETURN_MEMERR(s
);
6266 ext
->pattern_end
= s
+ (pattern_end
- pattern
);
6272 onig_free_body(regex_t
* reg
)
6274 if (IS_NOT_NULL(reg
)) {
6276 if (IS_NOT_NULL(reg
->string_pool
)) {
6277 xfree(reg
->string_pool
);
6278 reg
->string_pool_end
= reg
->string_pool
= 0;
6280 if (IS_NOT_NULL(reg
->exact
)) xfree(reg
->exact
);
6281 if (IS_NOT_NULL(reg
->repeat_range
)) xfree(reg
->repeat_range
);
6282 if (IS_NOT_NULL(reg
->extp
)) {
6283 free_regex_ext(reg
->extp
);
6287 onig_names_free(reg
);
6292 onig_free(regex_t
* reg
)
6294 if (IS_NOT_NULL(reg
)) {
6295 onig_free_body(reg
);
6301 #ifdef ONIG_DEBUG_PARSE
6302 static void print_tree
P_((FILE* f
, Node
* node
));
6305 extern int onig_init_for_match_at(regex_t
* reg
);
6308 onig_compile(regex_t
* reg
, const UChar
* pattern
, const UChar
* pattern_end
,
6309 OnigErrorInfo
* einfo
)
6315 UnsetAddrList uslist
;
6319 if (IS_NOT_NULL(einfo
)) {
6320 einfo
->enc
= reg
->enc
;
6321 einfo
->par
= (UChar
* )NULL
;
6325 print_enc_string(stderr
, reg
->enc
, pattern
, pattern_end
);
6328 if (reg
->ops_alloc
== 0) {
6329 r
= ops_init(reg
, OPS_INIT_SIZE
);
6330 if (r
!= 0) goto end
;
6335 reg
->string_pool
= 0;
6336 reg
->string_pool_end
= 0;
6338 reg
->num_repeat
= 0;
6339 reg
->num_null_check
= 0;
6340 reg
->repeat_range_alloc
= 0;
6341 reg
->repeat_range
= (OnigRepeatRange
* )NULL
;
6343 r
= onig_parse_tree(&root
, pattern
, pattern_end
, reg
, &scan_env
);
6344 if (r
!= 0) goto err
;
6346 /* mixed use named group and no-named group */
6347 if (scan_env
.num_named
> 0 &&
6348 IS_SYNTAX_BV(scan_env
.syntax
, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP
) &&
6349 ! ONIG_IS_OPTION_ON(reg
->options
, ONIG_OPTION_CAPTURE_GROUP
)) {
6350 if (scan_env
.num_named
!= scan_env
.num_mem
)
6351 r
= disable_noname_group_capture(&root
, reg
, &scan_env
);
6353 r
= numbered_ref_check(root
);
6355 if (r
!= 0) goto err
;
6358 r
= check_backrefs(root
, &scan_env
);
6359 if (r
!= 0) goto err
;
6362 if (scan_env
.num_call
> 0) {
6363 r
= unset_addr_list_init(&uslist
, scan_env
.num_call
);
6364 if (r
!= 0) goto err
;
6365 scan_env
.unset_addr_list
= &uslist
;
6366 r
= setup_call(root
, &scan_env
, 0);
6367 if (r
!= 0) goto err_unset
;
6368 r
= setup_call2(root
);
6369 if (r
!= 0) goto err_unset
;
6370 r
= recursive_call_check_trav(root
, &scan_env
, 0);
6371 if (r
< 0) goto err_unset
;
6372 r
= infinite_recursive_call_check_trav(root
, &scan_env
);
6373 if (r
!= 0) goto err_unset
;
6375 setup_called_state(root
, 0);
6378 reg
->num_call
= scan_env
.num_call
;
6381 r
= setup_tree(root
, reg
, 0, &scan_env
);
6382 if (r
!= 0) goto err_unset
;
6384 #ifdef ONIG_DEBUG_PARSE
6385 print_tree(stderr
, root
);
6388 reg
->capture_history
= scan_env
.capture_history
;
6389 reg
->bt_mem_start
= scan_env
.bt_mem_start
;
6390 reg
->bt_mem_start
|= reg
->capture_history
;
6391 if (IS_FIND_CONDITION(reg
->options
))
6392 MEM_STATUS_ON_ALL(reg
->bt_mem_end
);
6394 reg
->bt_mem_end
= scan_env
.bt_mem_end
;
6395 reg
->bt_mem_end
|= reg
->capture_history
;
6397 reg
->bt_mem_start
|= reg
->bt_mem_end
;
6399 clear_optimize_info(reg
);
6400 #ifndef ONIG_DONT_OPTIMIZE
6401 r
= set_optimize_info_from_tree(root
, reg
, &scan_env
);
6402 if (r
!= 0) goto err_unset
;
6405 if (IS_NOT_NULL(scan_env
.mem_env_dynamic
)) {
6406 xfree(scan_env
.mem_env_dynamic
);
6407 scan_env
.mem_env_dynamic
= (MemEnv
* )NULL
;
6410 r
= compile_tree(root
, reg
, &scan_env
);
6412 if (scan_env
.keep_num
> 0) {
6413 r
= add_op(reg
, OP_UPDATE_VAR
);
6414 if (r
!= 0) goto err
;
6416 COP(reg
)->update_var
.type
= UPDATE_VAR_KEEP_FROM_STACK_LAST
;
6417 COP(reg
)->update_var
.id
= 0; /* not used */
6420 r
= add_op(reg
, OP_END
);
6421 if (r
!= 0) goto err
;
6424 if (scan_env
.num_call
> 0) {
6425 r
= fix_unset_addr_list(&uslist
, reg
);
6426 unset_addr_list_end(&uslist
);
6427 if (r
!= 0) goto err
;
6431 if ((reg
->num_repeat
!= 0) || (reg
->bt_mem_end
!= 0)
6433 || (IS_NOT_NULL(reg
->extp
) && reg
->extp
->callout_num
!= 0)
6436 reg
->stack_pop_level
= STACK_POP_LEVEL_ALL
;
6438 if (reg
->bt_mem_start
!= 0)
6439 reg
->stack_pop_level
= STACK_POP_LEVEL_MEM_START
;
6441 reg
->stack_pop_level
= STACK_POP_LEVEL_FREE
;
6444 r
= ops_make_string_pool(reg
);
6445 if (r
!= 0) goto err
;
6448 else if (scan_env
.num_call
> 0) {
6449 unset_addr_list_end(&uslist
);
6452 onig_node_free(root
);
6454 #ifdef ONIG_DEBUG_COMPILE
6455 onig_print_names(stderr
, reg
);
6456 onig_print_compiled_byte_code_list(stderr
, reg
);
6459 #ifdef USE_DIRECT_THREADED_CODE
6460 /* opcode -> opaddr */
6461 onig_init_for_match_at(reg
);
6469 if (scan_env
.num_call
> 0) {
6470 unset_addr_list_end(&uslist
);
6474 if (IS_NOT_NULL(scan_env
.error
)) {
6475 if (IS_NOT_NULL(einfo
)) {
6476 einfo
->par
= scan_env
.error
;
6477 einfo
->par_end
= scan_env
.error_end
;
6481 onig_node_free(root
);
6482 if (IS_NOT_NULL(scan_env
.mem_env_dynamic
))
6483 xfree(scan_env
.mem_env_dynamic
);
6488 static int onig_inited
= 0;
6491 onig_reg_init(regex_t
* reg
, OnigOptionType option
, OnigCaseFoldType case_fold_flag
,
6492 OnigEncoding enc
, OnigSyntaxType
* syntax
)
6496 xmemset(reg
, 0, sizeof(*reg
));
6498 if (onig_inited
== 0) {
6500 return ONIGERR_LIBRARY_IS_NOT_INITIALIZED
;
6502 r
= onig_initialize(&enc
, 1);
6504 return ONIGERR_FAIL_TO_INITIALIZE
;
6506 onig_warning("You didn't call onig_initialize() explicitly");
6511 return ONIGERR_INVALID_ARGUMENT
;
6513 if (ONIGENC_IS_UNDEF(enc
))
6514 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED
;
6516 if ((option
& (ONIG_OPTION_DONT_CAPTURE_GROUP
|ONIG_OPTION_CAPTURE_GROUP
))
6517 == (ONIG_OPTION_DONT_CAPTURE_GROUP
|ONIG_OPTION_CAPTURE_GROUP
)) {
6518 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS
;
6521 if ((option
& ONIG_OPTION_NEGATE_SINGLELINE
) != 0) {
6522 option
|= syntax
->options
;
6523 option
&= ~ONIG_OPTION_SINGLELINE
;
6526 option
|= syntax
->options
;
6529 (reg
)->options
= option
;
6530 (reg
)->syntax
= syntax
;
6531 (reg
)->optimize
= 0;
6532 (reg
)->exact
= (UChar
* )NULL
;
6533 (reg
)->extp
= (RegexExt
* )NULL
;
6535 (reg
)->ops
= (Operation
* )NULL
;
6536 (reg
)->ops_curr
= (Operation
* )NULL
;
6537 (reg
)->ops_used
= 0;
6538 (reg
)->ops_alloc
= 0;
6539 (reg
)->name_table
= (void* )NULL
;
6541 (reg
)->case_fold_flag
= case_fold_flag
;
6546 onig_new_without_alloc(regex_t
* reg
,
6547 const UChar
* pattern
, const UChar
* pattern_end
,
6548 OnigOptionType option
, OnigEncoding enc
,
6549 OnigSyntaxType
* syntax
, OnigErrorInfo
* einfo
)
6553 r
= onig_reg_init(reg
, option
, ONIGENC_CASE_FOLD_DEFAULT
, enc
, syntax
);
6554 if (r
!= 0) return r
;
6556 r
= onig_compile(reg
, pattern
, pattern_end
, einfo
);
6561 onig_new(regex_t
** reg
, const UChar
* pattern
, const UChar
* pattern_end
,
6562 OnigOptionType option
, OnigEncoding enc
, OnigSyntaxType
* syntax
,
6563 OnigErrorInfo
* einfo
)
6567 *reg
= (regex_t
* )xmalloc(sizeof(regex_t
));
6568 if (IS_NULL(*reg
)) return ONIGERR_MEMORY
;
6570 r
= onig_reg_init(*reg
, option
, ONIGENC_CASE_FOLD_DEFAULT
, enc
, syntax
);
6571 if (r
!= 0) goto err
;
6573 r
= onig_compile(*reg
, pattern
, pattern_end
, einfo
);
6583 onig_initialize(OnigEncoding encodings
[], int n
)
6588 if (onig_inited
!= 0)
6595 for (i
= 0; i
< n
; i
++) {
6596 OnigEncoding enc
= encodings
[i
];
6597 r
= onig_initialize_encoding(enc
);
6605 typedef struct EndCallListItem
{
6606 struct EndCallListItem
* next
;
6608 } EndCallListItemType
;
6610 static EndCallListItemType
* EndCallTop
;
6612 extern void onig_add_end_call(void (*func
)(void))
6614 EndCallListItemType
* item
;
6616 item
= (EndCallListItemType
* )xmalloc(sizeof(*item
));
6617 if (item
== 0) return ;
6619 item
->next
= EndCallTop
;
6626 exec_end_call_list(void)
6628 EndCallListItemType
* prev
;
6631 while (EndCallTop
!= 0) {
6632 func
= EndCallTop
->func
;
6636 EndCallTop
= EndCallTop
->next
;
6644 exec_end_call_list();
6647 onig_global_callout_names_free();
6658 onig_is_in_code_range(const UChar
* p
, OnigCodePoint code
)
6660 OnigCodePoint n
, *data
;
6661 OnigCodePoint low
, high
, x
;
6663 GET_CODE_POINT(n
, p
);
6664 data
= (OnigCodePoint
* )p
;
6667 for (low
= 0, high
= n
; low
< high
; ) {
6668 x
= (low
+ high
) >> 1;
6669 if (code
> data
[x
* 2 + 1])
6675 return ((low
< n
&& code
>= data
[low
* 2]) ? 1 : 0);
6679 onig_is_code_in_cc_len(int elen
, OnigCodePoint code
, /* CClassNode* */ void* cc_arg
)
6682 CClassNode
* cc
= (CClassNode
* )cc_arg
;
6684 if (elen
> 1 || (code
>= SINGLE_BYTE_SIZE
)) {
6685 if (IS_NULL(cc
->mbuf
)) {
6689 found
= onig_is_in_code_range(cc
->mbuf
->p
, code
) != 0;
6693 found
= BITSET_AT(cc
->bs
, code
) != 0;
6696 if (IS_NCCLASS_NOT(cc
))
6703 onig_is_code_in_cc(OnigEncoding enc
, OnigCodePoint code
, CClassNode
* cc
)
6707 if (ONIGENC_MBC_MINLEN(enc
) > 1) {
6711 len
= ONIGENC_CODE_TO_MBCLEN(enc
, code
);
6712 if (len
< 0) return 0;
6714 return onig_is_code_in_cc_len(len
, code
, cc
);
6718 #ifdef ONIG_DEBUG_PARSE
6721 p_string(FILE* f
, int len
, UChar
* s
)
6724 while (len
-- > 0) { fputc(*s
++, f
); }
6728 Indent(FILE* f
, int indent
)
6731 for (i
= 0; i
< indent
; i
++) putc(' ', f
);
6735 print_indent_tree(FILE* f
, Node
* node
, int indent
)
6743 if (IS_NULL(node
)) {
6744 fprintf(f
, "ERROR: null node!!!\n");
6748 type
= NODE_TYPE(node
);
6752 if (type
== NODE_LIST
)
6753 fprintf(f
, "<list:%p>\n", node
);
6755 fprintf(f
, "<alt:%p>\n", node
);
6757 print_indent_tree(f
, NODE_CAR(node
), indent
+ add
);
6758 while (IS_NOT_NULL(node
= NODE_CDR(node
))) {
6759 if (NODE_TYPE(node
) != type
) {
6760 fprintf(f
, "ERROR: list/alt right is not a cons. %d\n", NODE_TYPE(node
));
6763 print_indent_tree(f
, NODE_CAR(node
), indent
+ add
);
6773 if (NODE_STRING_IS_RAW(node
))
6775 else if (NODE_STRING_IS_AMBIG(node
))
6780 if (NODE_STRING_IS_GOOD_AMBIG(node
))
6785 if (NODE_STRING_IS_DONT_GET_OPT_INFO(node
))
6786 dont
= " (dont-opt)";
6790 fprintf(f
, "<string%s%s%s:%p>", mode
, good
, dont
, node
);
6791 for (p
= STR_(node
)->s
; p
< STR_(node
)->end
; p
++) {
6792 if (*p
>= 0x20 && *p
< 0x7f)
6795 fprintf(f
, " 0x%02x", *p
);
6802 fprintf(f
, "<cclass:%p>", node
);
6803 if (IS_NCCLASS_NOT(CCLASS_(node
))) fputs(" not", f
);
6804 if (CCLASS_(node
)->mbuf
) {
6805 BBuf
* bbuf
= CCLASS_(node
)->mbuf
;
6806 for (i
= 0; i
< bbuf
->used
; i
++) {
6807 if (i
> 0) fprintf(f
, ",");
6808 fprintf(f
, "%0x", bbuf
->p
[i
]);
6814 fprintf(f
, "<ctype:%p> ", node
);
6815 switch (CTYPE_(node
)->ctype
) {
6817 fprintf(f
, "<anychar:%p>", node
);
6820 case ONIGENC_CTYPE_WORD
:
6821 if (CTYPE_(node
)->not != 0)
6822 fputs("not word", f
);
6826 if (CTYPE_(node
)->ascii_mode
!= 0)
6827 fputs(" (ascii)", f
);
6832 fprintf(f
, "ERROR: undefined ctype.\n");
6838 fprintf(f
, "<anchor:%p> ", node
);
6839 switch (ANCHOR_(node
)->type
) {
6840 case ANCR_BEGIN_BUF
: fputs("begin buf", f
); break;
6841 case ANCR_END_BUF
: fputs("end buf", f
); break;
6842 case ANCR_BEGIN_LINE
: fputs("begin line", f
); break;
6843 case ANCR_END_LINE
: fputs("end line", f
); break;
6844 case ANCR_SEMI_END_BUF
: fputs("semi end buf", f
); break;
6845 case ANCR_BEGIN_POSITION
: fputs("begin position", f
); break;
6847 case ANCR_WORD_BOUNDARY
: fputs("word boundary", f
); break;
6848 case ANCR_NO_WORD_BOUNDARY
: fputs("not word boundary", f
); break;
6849 #ifdef USE_WORD_BEGIN_END
6850 case ANCR_WORD_BEGIN
: fputs("word begin", f
); break;
6851 case ANCR_WORD_END
: fputs("word end", f
); break;
6853 case ANCR_TEXT_SEGMENT_BOUNDARY
:
6854 fputs("text-segment boundary", f
); break;
6855 case ANCR_NO_TEXT_SEGMENT_BOUNDARY
:
6856 fputs("no text-segment boundary", f
); break;
6857 case ANCR_PREC_READ
:
6858 fprintf(f
, "prec read\n");
6859 print_indent_tree(f
, NODE_BODY(node
), indent
+ add
);
6861 case ANCR_PREC_READ_NOT
:
6862 fprintf(f
, "prec read not\n");
6863 print_indent_tree(f
, NODE_BODY(node
), indent
+ add
);
6865 case ANCR_LOOK_BEHIND
:
6866 fprintf(f
, "look behind\n");
6867 print_indent_tree(f
, NODE_BODY(node
), indent
+ add
);
6869 case ANCR_LOOK_BEHIND_NOT
:
6870 fprintf(f
, "look behind not\n");
6871 print_indent_tree(f
, NODE_BODY(node
), indent
+ add
);
6875 fprintf(f
, "ERROR: undefined anchor type.\n");
6883 BackRefNode
* br
= BACKREF_(node
);
6885 fprintf(f
, "<backref%s:%p>", NODE_IS_CHECKER(node
) ? "-checker" : "", node
);
6886 for (i
= 0; i
< br
->back_num
; i
++) {
6887 if (i
> 0) fputs(", ", f
);
6888 fprintf(f
, "%d", p
[i
]);
6896 CallNode
* cn
= CALL_(node
);
6897 fprintf(f
, "<call:%p>", node
);
6898 p_string(f
, cn
->name_end
- cn
->name
, cn
->name
);
6904 fprintf(f
, "<quantifier:%p>{%d,%d}%s\n", node
,
6905 QUANT_(node
)->lower
, QUANT_(node
)->upper
,
6906 (QUANT_(node
)->greedy
? "" : "?"));
6907 print_indent_tree(f
, NODE_BODY(node
), indent
+ add
);
6911 fprintf(f
, "<bag:%p> ", node
);
6912 switch (BAG_(node
)->type
) {
6914 fprintf(f
, "option:%d", BAG_(node
)->o
.options
);
6917 fprintf(f
, "memory:%d", BAG_(node
)->m
.regnum
);
6919 case BAG_STOP_BACKTRACK
:
6920 fprintf(f
, "stop-bt");
6923 fprintf(f
, "if-else");
6927 print_indent_tree(f
, NODE_BODY(node
), indent
+ add
);
6931 fprintf(f
, "<gimmick:%p> ", node
);
6932 switch (GIMMICK_(node
)->type
) {
6937 fprintf(f
, "save:%d:%d", GIMMICK_(node
)->detail_type
, GIMMICK_(node
)->id
);
6939 case GIMMICK_UPDATE_VAR
:
6940 fprintf(f
, "update_var:%d:%d", GIMMICK_(node
)->detail_type
, GIMMICK_(node
)->id
);
6943 case GIMMICK_CALLOUT
:
6944 switch (GIMMICK_(node
)->detail_type
) {
6945 case ONIG_CALLOUT_OF_CONTENTS
:
6946 fprintf(f
, "callout:contents:%d", GIMMICK_(node
)->num
);
6948 case ONIG_CALLOUT_OF_NAME
:
6949 fprintf(f
, "callout:name:%d:%d", GIMMICK_(node
)->id
, GIMMICK_(node
)->num
);
6957 fprintf(f
, "print_indent_tree: undefined node type %d\n", NODE_TYPE(node
));
6961 if (type
!= NODE_LIST
&& type
!= NODE_ALT
&& type
!= NODE_QUANT
&&
6968 print_tree(FILE* f
, Node
* node
)
6970 print_indent_tree(f
, node
, 0);