]> git.proxmox.com Git - mirror_edk2.git/blob - MdeModulePkg/Universal/RegularExpressionDxe/Oniguruma/regcomp.c
d847ca4233e2b82144db0357540bce441ff938ef
[mirror_edk2.git] / MdeModulePkg / Universal / RegularExpressionDxe / Oniguruma / regcomp.c
1 /**********************************************************************
2 regcomp.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regparse.h"
31
32 #define OPS_INIT_SIZE 8
33
34 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
35
36 #if 0
37 typedef struct {
38 int n;
39 int alloc;
40 int* v;
41 } int_stack;
42
43 static int
44 make_int_stack(int_stack** rs, int init_size)
45 {
46 int_stack* s;
47 int* v;
48
49 *rs = 0;
50
51 s = xmalloc(sizeof(*s));
52 if (IS_NULL(s)) return ONIGERR_MEMORY;
53
54 v = (int* )xmalloc(sizeof(int) * init_size);
55 if (IS_NULL(v)) {
56 xfree(s);
57 return ONIGERR_MEMORY;
58 }
59
60 s->n = 0;
61 s->alloc = init_size;
62 s->v = v;
63
64 *rs = s;
65 return ONIG_NORMAL;
66 }
67
68 static void
69 free_int_stack(int_stack* s)
70 {
71 if (IS_NOT_NULL(s)) {
72 if (IS_NOT_NULL(s->v))
73 xfree(s->v);
74 xfree(s);
75 }
76 }
77
78 static int
79 int_stack_push(int_stack* s, int v)
80 {
81 if (s->n >= s->alloc) {
82 int new_size = s->alloc * 2;
83 int* nv = (int* )xrealloc(s->v, sizeof(int) * new_size, sizeof(int) * s->alloc);
84 if (IS_NULL(nv)) return ONIGERR_MEMORY;
85
86 s->alloc = new_size;
87 s->v = nv;
88 }
89
90 s->v[s->n] = v;
91 s->n++;
92 return ONIG_NORMAL;
93 }
94
95 static int
96 int_stack_pop(int_stack* s)
97 {
98 int v;
99
100 #ifdef ONIG_DEBUG
101 if (s->n <= 0) {
102 fprintf(stderr, "int_stack_pop: fail empty. %p\n", s);
103 return 0;
104 }
105 #endif
106
107 v = s->v[s->n];
108 s->n--;
109 return v;
110 }
111 #endif
112
113 static int
114 ops_init(regex_t* reg, int init_alloc_size)
115 {
116 Operation* p;
117 size_t size;
118
119 if (init_alloc_size > 0) {
120 size = sizeof(Operation) * init_alloc_size;
121 p = (Operation* )xmalloc(size);
122 CHECK_NULL_RETURN_MEMERR(p);
123 #ifdef USE_DIRECT_THREADED_CODE
124 {
125 enum OpCode* cp;
126 size = sizeof(enum OpCode) * init_alloc_size;
127 cp = (enum OpCode* )xmalloc(size);
128 CHECK_NULL_RETURN_MEMERR(cp);
129 reg->ocs = cp;
130 }
131 #endif
132 }
133 else {
134 p = (Operation* )0;
135 #ifdef USE_DIRECT_THREADED_CODE
136 reg->ocs = (enum OpCode* )0;
137 #endif
138 }
139
140 reg->ops = p;
141 reg->ops_curr = 0; /* !!! not yet done ops_new() */
142 reg->ops_alloc = init_alloc_size;
143 reg->ops_used = 0;
144
145 return ONIG_NORMAL;
146 }
147
148 static int
149 ops_expand(regex_t* reg, int n)
150 {
151 #define MIN_OPS_EXPAND_SIZE 4
152
153 #ifdef USE_DIRECT_THREADED_CODE
154 enum OpCode* cp;
155 #endif
156 Operation* p;
157 size_t size;
158
159 if (n <= 0) n = MIN_OPS_EXPAND_SIZE;
160
161 n += reg->ops_alloc;
162
163 size = sizeof(Operation) * n;
164 p = (Operation* )xrealloc(reg->ops, size, sizeof(Operation) * reg->ops_alloc);
165 CHECK_NULL_RETURN_MEMERR(p);
166
167 #ifdef USE_DIRECT_THREADED_CODE
168 size = sizeof(enum OpCode) * n;
169 cp = (enum OpCode* )xrealloc(reg->ocs, size, sizeof(enum OpCode) * reg->ops_alloc);
170 CHECK_NULL_RETURN_MEMERR(cp);
171 reg->ocs = cp;
172 #endif
173
174 reg->ops = p;
175 reg->ops_alloc = n;
176 if (reg->ops_used == 0)
177 reg->ops_curr = 0;
178 else
179 reg->ops_curr = reg->ops + (reg->ops_used - 1);
180
181 return ONIG_NORMAL;
182 }
183
184 static int
185 ops_new(regex_t* reg)
186 {
187 int r;
188
189 if (reg->ops_used >= reg->ops_alloc) {
190 r = ops_expand(reg, reg->ops_alloc);
191 if (r != ONIG_NORMAL) return r;
192 }
193
194 reg->ops_curr = reg->ops + reg->ops_used;
195 reg->ops_used++;
196
197 xmemset(reg->ops_curr, 0, sizeof(Operation));
198 return ONIG_NORMAL;
199 }
200
201 static int
202 is_in_string_pool(regex_t* reg, UChar* s)
203 {
204 return (s >= reg->string_pool && s < reg->string_pool_end);
205 }
206
207 static void
208 ops_free(regex_t* reg)
209 {
210 int i;
211
212 if (IS_NULL(reg->ops)) return ;
213
214 for (i = 0; i < (int )reg->ops_used; i++) {
215 enum OpCode opcode;
216 Operation* op;
217
218 op = reg->ops + i;
219
220 #ifdef USE_DIRECT_THREADED_CODE
221 opcode = *(reg->ocs + i);
222 #else
223 opcode = op->opcode;
224 #endif
225
226 switch (opcode) {
227 case OP_EXACTMBN:
228 if (! is_in_string_pool(reg, op->exact_len_n.s))
229 xfree(op->exact_len_n.s);
230 break;
231 case OP_EXACTN: case OP_EXACTMB2N: case OP_EXACTMB3N: case OP_EXACTN_IC:
232 if (! is_in_string_pool(reg, op->exact_n.s))
233 xfree(op->exact_n.s);
234 break;
235 case OP_EXACT1: case OP_EXACT2: case OP_EXACT3: case OP_EXACT4:
236 case OP_EXACT5: case OP_EXACTMB2N1: case OP_EXACTMB2N2:
237 case OP_EXACTMB2N3: case OP_EXACT1_IC:
238 break;
239
240 case OP_CCLASS_NOT: case OP_CCLASS:
241 xfree(op->cclass.bsp);
242 break;
243
244 case OP_CCLASS_MB_NOT: case OP_CCLASS_MB:
245 xfree(op->cclass_mb.mb);
246 break;
247 case OP_CCLASS_MIX_NOT: case OP_CCLASS_MIX:
248 xfree(op->cclass_mix.mb);
249 xfree(op->cclass_mix.bsp);
250 break;
251
252 case OP_BACKREF1: case OP_BACKREF2: case OP_BACKREF_N: case OP_BACKREF_N_IC:
253 break;
254 case OP_BACKREF_MULTI: case OP_BACKREF_MULTI_IC:
255 case OP_BACKREF_WITH_LEVEL:
256 case OP_BACKREF_WITH_LEVEL_IC:
257 case OP_BACKREF_CHECK:
258 case OP_BACKREF_CHECK_WITH_LEVEL:
259 if (op->backref_general.num != 1)
260 xfree(op->backref_general.ns);
261 break;
262
263 default:
264 break;
265 }
266 }
267
268 xfree(reg->ops);
269 #ifdef USE_DIRECT_THREADED_CODE
270 xfree(reg->ocs);
271 reg->ocs = 0;
272 #endif
273
274 reg->ops = 0;
275 reg->ops_curr = 0;
276 reg->ops_alloc = 0;
277 reg->ops_used = 0;
278 }
279
280 static int
281 ops_calc_size_of_string_pool(regex_t* reg)
282 {
283 int i;
284 int total;
285
286 if (IS_NULL(reg->ops)) return 0;
287
288 total = 0;
289 for (i = 0; i < (int )reg->ops_used; i++) {
290 enum OpCode opcode;
291 Operation* op;
292
293 op = reg->ops + i;
294 #ifdef USE_DIRECT_THREADED_CODE
295 opcode = *(reg->ocs + i);
296 #else
297 opcode = op->opcode;
298 #endif
299
300 switch (opcode) {
301 case OP_EXACTMBN:
302 total += op->exact_len_n.len * op->exact_len_n.n;
303 break;
304 case OP_EXACTN:
305 case OP_EXACTN_IC:
306 total += op->exact_n.n;
307 break;
308 case OP_EXACTMB2N:
309 total += op->exact_n.n * 2;
310 break;
311 case OP_EXACTMB3N:
312 total += op->exact_n.n * 3;
313 break;
314
315 default:
316 break;
317 }
318 }
319
320 return total;
321 }
322
323 static int
324 ops_make_string_pool(regex_t* reg)
325 {
326 int i;
327 int len;
328 int size;
329 UChar* pool;
330 UChar* curr;
331
332 size = ops_calc_size_of_string_pool(reg);
333 if (size <= 0) {
334 return 0;
335 }
336
337 curr = pool = (UChar* )xmalloc((size_t )size);
338 CHECK_NULL_RETURN_MEMERR(pool);
339
340 for (i = 0; i < (int )reg->ops_used; i++) {
341 enum OpCode opcode;
342 Operation* op;
343
344 op = reg->ops + i;
345 #ifdef USE_DIRECT_THREADED_CODE
346 opcode = *(reg->ocs + i);
347 #else
348 opcode = op->opcode;
349 #endif
350
351 switch (opcode) {
352 case OP_EXACTMBN:
353 len = op->exact_len_n.len * op->exact_len_n.n;
354 xmemcpy(curr, op->exact_len_n.s, len);
355 xfree(op->exact_len_n.s);
356 op->exact_len_n.s = curr;
357 curr += len;
358 break;
359 case OP_EXACTN:
360 case OP_EXACTN_IC:
361 len = op->exact_n.n;
362 copy:
363 xmemcpy(curr, op->exact_n.s, len);
364 xfree(op->exact_n.s);
365 op->exact_n.s = curr;
366 curr += len;
367 break;
368 case OP_EXACTMB2N:
369 len = op->exact_n.n * 2;
370 goto copy;
371 break;
372 case OP_EXACTMB3N:
373 len = op->exact_n.n * 3;
374 goto copy;
375 break;
376
377 default:
378 break;
379 }
380 }
381
382 reg->string_pool = pool;
383 reg->string_pool_end = pool + size;
384 return 0;
385 }
386
387 extern OnigCaseFoldType
388 onig_get_default_case_fold_flag(void)
389 {
390 return OnigDefaultCaseFoldFlag;
391 }
392
393 extern int
394 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
395 {
396 OnigDefaultCaseFoldFlag = case_fold_flag;
397 return 0;
398 }
399
400 static int
401 int_multiply_cmp(int x, int y, int v)
402 {
403 if (x == 0 || y == 0) return -1;
404
405 if (x < INT_MAX / y) {
406 int xy = x * y;
407 if (xy > v) return 1;
408 else {
409 if (xy == v) return 0;
410 else return -1;
411 }
412 }
413 else
414 return 1;
415 }
416
417 extern int
418 onig_positive_int_multiply(int x, int y)
419 {
420 if (x == 0 || y == 0) return 0;
421
422 if (x < INT_MAX / y)
423 return x * y;
424 else
425 return -1;
426 }
427
428
429 static void
430 swap_node(Node* a, Node* b)
431 {
432 Node c;
433
434 c = *a; *a = *b; *b = c;
435
436 if (NODE_TYPE(a) == NODE_STRING) {
437 StrNode* sn = STR_(a);
438 if (sn->capacity == 0) {
439 int len = (int )(sn->end - sn->s);
440 sn->s = sn->buf;
441 sn->end = sn->s + len;
442 }
443 }
444
445 if (NODE_TYPE(b) == NODE_STRING) {
446 StrNode* sn = STR_(b);
447 if (sn->capacity == 0) {
448 int len = (int )(sn->end - sn->s);
449 sn->s = sn->buf;
450 sn->end = sn->s + len;
451 }
452 }
453 }
454
455 static OnigLen
456 distance_add(OnigLen d1, OnigLen d2)
457 {
458 if (d1 == INFINITE_LEN || d2 == INFINITE_LEN)
459 return INFINITE_LEN;
460 else {
461 if (d1 <= INFINITE_LEN - d2) return d1 + d2;
462 else return INFINITE_LEN;
463 }
464 }
465
466 static OnigLen
467 distance_multiply(OnigLen d, int m)
468 {
469 if (m == 0) return 0;
470
471 if (d < INFINITE_LEN / m)
472 return d * m;
473 else
474 return INFINITE_LEN;
475 }
476
477 static int
478 bitset_is_empty(BitSetRef bs)
479 {
480 int i;
481
482 for (i = 0; i < (int )BITSET_SIZE; i++) {
483 if (bs[i] != 0) return 0;
484 }
485 return 1;
486 }
487
488 #ifdef USE_CALL
489
490 static int
491 unset_addr_list_init(UnsetAddrList* list, int size)
492 {
493 UnsetAddr* p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
494 CHECK_NULL_RETURN_MEMERR(p);
495
496 list->num = 0;
497 list->alloc = size;
498 list->us = p;
499 return 0;
500 }
501
502 static void
503 unset_addr_list_end(UnsetAddrList* list)
504 {
505 if (IS_NOT_NULL(list->us))
506 xfree(list->us);
507 }
508
509 static int
510 unset_addr_list_add(UnsetAddrList* list, int offset, struct _Node* node)
511 {
512 UnsetAddr* p;
513 int size;
514
515 if (list->num >= list->alloc) {
516 size = list->alloc * 2;
517 p = (UnsetAddr* )xrealloc(list->us, sizeof(UnsetAddr) * size, sizeof(UnsetAddr)* list->alloc);
518 CHECK_NULL_RETURN_MEMERR(p);
519 list->alloc = size;
520 list->us = p;
521 }
522
523 list->us[list->num].offset = offset;
524 list->us[list->num].target = node;
525 list->num++;
526 return 0;
527 }
528 #endif /* USE_CALL */
529
530
531 static int
532 add_op(regex_t* reg, int opcode)
533 {
534 int r;
535
536 r = ops_new(reg);
537 if (r != ONIG_NORMAL) return r;
538
539 #ifdef USE_DIRECT_THREADED_CODE
540 *(reg->ocs + (reg->ops_curr - reg->ops)) = opcode;
541 #else
542 reg->ops_curr->opcode = opcode;
543 #endif
544
545 return 0;
546 }
547
548 static int compile_length_tree(Node* node, regex_t* reg);
549 static int compile_tree(Node* node, regex_t* reg, ScanEnv* env);
550
551
552 #define IS_NEED_STR_LEN_OP_EXACT(op) \
553 ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
554 (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
555
556 static int
557 select_str_opcode(int mb_len, int str_len, int ignore_case)
558 {
559 int op;
560
561 if (ignore_case) {
562 switch (str_len) {
563 case 1: op = OP_EXACT1_IC; break;
564 default: op = OP_EXACTN_IC; break;
565 }
566 }
567 else {
568 switch (mb_len) {
569 case 1:
570 switch (str_len) {
571 case 1: op = OP_EXACT1; break;
572 case 2: op = OP_EXACT2; break;
573 case 3: op = OP_EXACT3; break;
574 case 4: op = OP_EXACT4; break;
575 case 5: op = OP_EXACT5; break;
576 default: op = OP_EXACTN; break;
577 }
578 break;
579
580 case 2:
581 switch (str_len) {
582 case 1: op = OP_EXACTMB2N1; break;
583 case 2: op = OP_EXACTMB2N2; break;
584 case 3: op = OP_EXACTMB2N3; break;
585 default: op = OP_EXACTMB2N; break;
586 }
587 break;
588
589 case 3:
590 op = OP_EXACTMB3N;
591 break;
592
593 default:
594 op = OP_EXACTMBN;
595 break;
596 }
597 }
598 return op;
599 }
600
601 static int
602 is_strict_real_node(Node* node)
603 {
604 switch (NODE_TYPE(node)) {
605 case NODE_STRING:
606 {
607 StrNode* sn = STR_(node);
608 return (sn->end != sn->s);
609 }
610 break;
611
612 case NODE_CCLASS:
613 case NODE_CTYPE:
614 return 1;
615 break;
616
617 default:
618 return 0;
619 break;
620 }
621 }
622
623 static int
624 compile_tree_empty_check(Node* node, regex_t* reg, int emptiness, ScanEnv* env)
625 {
626 int r;
627 int saved_num_null_check = reg->num_null_check;
628
629 if (emptiness != BODY_IS_NOT_EMPTY) {
630 r = add_op(reg, OP_EMPTY_CHECK_START);
631 if (r != 0) return r;
632 COP(reg)->empty_check_start.mem = reg->num_null_check; /* NULL CHECK ID */
633 reg->num_null_check++;
634 }
635
636 r = compile_tree(node, reg, env);
637 if (r != 0) return r;
638
639 if (emptiness != BODY_IS_NOT_EMPTY) {
640 if (emptiness == BODY_IS_EMPTY_POSSIBILITY)
641 r = add_op(reg, OP_EMPTY_CHECK_END);
642 else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM)
643 r = add_op(reg, OP_EMPTY_CHECK_END_MEMST);
644 else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_REC)
645 r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH);
646
647 if (r != 0) return r;
648 COP(reg)->empty_check_end.mem = saved_num_null_check; /* NULL CHECK ID */
649 }
650 return r;
651 }
652
653 #ifdef USE_CALL
654 static int
655 compile_call(CallNode* node, regex_t* reg, ScanEnv* env)
656 {
657 int r;
658 int offset;
659
660 r = add_op(reg, OP_CALL);
661 if (r != 0) return r;
662
663 COP(reg)->call.addr = 0; /* dummy addr. */
664
665 offset = COP_CURR_OFFSET_BYTES(reg, call.addr);
666 r = unset_addr_list_add(env->unset_addr_list, offset, NODE_CALL_BODY(node));
667 return r;
668 }
669 #endif
670
671 static int
672 compile_tree_n_times(Node* node, int n, regex_t* reg, ScanEnv* env)
673 {
674 int i, r;
675
676 for (i = 0; i < n; i++) {
677 r = compile_tree(node, reg, env);
678 if (r != 0) return r;
679 }
680 return 0;
681 }
682
683 static int
684 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
685 regex_t* reg ARG_UNUSED, int ignore_case)
686 {
687 return 1;
688 }
689
690 static int
691 add_compile_string(UChar* s, int mb_len, int str_len,
692 regex_t* reg, int ignore_case)
693 {
694 int op;
695 int r;
696 int byte_len;
697 UChar* p;
698 UChar* end;
699
700 op = select_str_opcode(mb_len, str_len, ignore_case);
701 r = add_op(reg, op);
702 if (r != 0) return r;
703
704 byte_len = mb_len * str_len;
705 end = s + byte_len;
706
707 if (op == OP_EXACTMBN) {
708 p = onigenc_strdup(reg->enc, s, end);
709 CHECK_NULL_RETURN_MEMERR(p);
710
711 COP(reg)->exact_len_n.len = mb_len;
712 COP(reg)->exact_len_n.n = str_len;
713 COP(reg)->exact_len_n.s = p;
714 }
715 else if (IS_NEED_STR_LEN_OP_EXACT(op)) {
716 p = onigenc_strdup(reg->enc, s, end);
717 CHECK_NULL_RETURN_MEMERR(p);
718
719 if (op == OP_EXACTN_IC)
720 COP(reg)->exact_n.n = byte_len;
721 else
722 COP(reg)->exact_n.n = str_len;
723
724 COP(reg)->exact_n.s = p;
725 }
726 else {
727 xmemcpy(COP(reg)->exact.s, s, (size_t )byte_len);
728 COP(reg)->exact.s[byte_len] = '\0';
729 }
730
731 return 0;
732 }
733
734 static int
735 compile_length_string_node(Node* node, regex_t* reg)
736 {
737 int rlen, r, len, prev_len, slen, ambig;
738 UChar *p, *prev;
739 StrNode* sn;
740 OnigEncoding enc = reg->enc;
741
742 sn = STR_(node);
743 if (sn->end <= sn->s)
744 return 0;
745
746 ambig = NODE_STRING_IS_AMBIG(node);
747
748 p = prev = sn->s;
749 prev_len = enclen(enc, p);
750 p += prev_len;
751 slen = 1;
752 rlen = 0;
753
754 for (; p < sn->end; ) {
755 len = enclen(enc, p);
756 if (len == prev_len) {
757 slen++;
758 }
759 else {
760 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
761 rlen += r;
762 prev = p;
763 slen = 1;
764 prev_len = len;
765 }
766 p += len;
767 }
768
769 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
770 rlen += r;
771 return rlen;
772 }
773
774 static int
775 compile_length_string_raw_node(StrNode* sn, regex_t* reg)
776 {
777 if (sn->end <= sn->s)
778 return 0;
779
780 return add_compile_string_length(sn->s, 1 /* sb */, (int )(sn->end - sn->s),
781 reg, 0);
782 }
783
784 static int
785 compile_string_node(Node* node, regex_t* reg)
786 {
787 int r, len, prev_len, slen, ambig;
788 UChar *p, *prev, *end;
789 StrNode* sn;
790 OnigEncoding enc = reg->enc;
791
792 sn = STR_(node);
793 if (sn->end <= sn->s)
794 return 0;
795
796 end = sn->end;
797 ambig = NODE_STRING_IS_AMBIG(node);
798
799 p = prev = sn->s;
800 prev_len = enclen(enc, p);
801 p += prev_len;
802 slen = 1;
803
804 for (; p < end; ) {
805 len = enclen(enc, p);
806 if (len == prev_len) {
807 slen++;
808 }
809 else {
810 r = add_compile_string(prev, prev_len, slen, reg, ambig);
811 if (r != 0) return r;
812
813 prev = p;
814 slen = 1;
815 prev_len = len;
816 }
817
818 p += len;
819 }
820
821 return add_compile_string(prev, prev_len, slen, reg, ambig);
822 }
823
824 static int
825 compile_string_raw_node(StrNode* sn, regex_t* reg)
826 {
827 if (sn->end <= sn->s)
828 return 0;
829
830 return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg, 0);
831 }
832
833 static void*
834 set_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
835 {
836 size_t len;
837 void* p;
838
839 len = (size_t )mbuf->used;
840 p = xmalloc(len);
841 if (IS_NULL(p)) return NULL;
842
843 xmemcpy(p, mbuf->p, len);
844 return p;
845 }
846
847 static int
848 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
849 {
850 return 1;
851 }
852
853 static int
854 compile_cclass_node(CClassNode* cc, regex_t* reg)
855 {
856 int r;
857
858 if (IS_NULL(cc->mbuf)) {
859 r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_NOT : OP_CCLASS);
860 if (r != 0) return r;
861
862 COP(reg)->cclass.bsp = xmalloc(SIZE_BITSET);
863 CHECK_NULL_RETURN_MEMERR(COP(reg)->cclass.bsp);
864 xmemcpy(COP(reg)->cclass.bsp, cc->bs, SIZE_BITSET);
865 }
866 else {
867 void* p;
868
869 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
870 r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_MB_NOT : OP_CCLASS_MB);
871 if (r != 0) return r;
872
873 p = set_multi_byte_cclass(cc->mbuf, reg);
874 CHECK_NULL_RETURN_MEMERR(p);
875 COP(reg)->cclass_mb.mb = p;
876 }
877 else {
878 r = add_op(reg, IS_NCCLASS_NOT(cc) ? OP_CCLASS_MIX_NOT : OP_CCLASS_MIX);
879 if (r != 0) return r;
880
881 COP(reg)->cclass_mix.bsp = xmalloc(SIZE_BITSET);
882 CHECK_NULL_RETURN_MEMERR(COP(reg)->cclass_mix.bsp);
883 xmemcpy(COP(reg)->cclass_mix.bsp, cc->bs, SIZE_BITSET);
884
885 p = set_multi_byte_cclass(cc->mbuf, reg);
886 CHECK_NULL_RETURN_MEMERR(p);
887 COP(reg)->cclass_mix.mb = p;
888 }
889 }
890
891 return 0;
892 }
893
894 static int
895 entry_repeat_range(regex_t* reg, int id, int lower, int upper)
896 {
897 #define REPEAT_RANGE_ALLOC 4
898
899 OnigRepeatRange* p;
900
901 if (reg->repeat_range_alloc == 0) {
902 p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
903 CHECK_NULL_RETURN_MEMERR(p);
904 reg->repeat_range = p;
905 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
906 }
907 else if (reg->repeat_range_alloc <= id) {
908 int n;
909 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
910 p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
911 sizeof(OnigRepeatRange) * n,
912 sizeof(OnigRepeatRange) * reg->repeat_range_alloc);
913 CHECK_NULL_RETURN_MEMERR(p);
914 reg->repeat_range = p;
915 reg->repeat_range_alloc = n;
916 }
917 else {
918 p = reg->repeat_range;
919 }
920
921 p[id].lower = lower;
922 p[id].upper = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper);
923 return 0;
924 }
925
926 static int
927 compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness,
928 regex_t* reg, ScanEnv* env)
929 {
930 int r;
931 int num_repeat = reg->num_repeat++;
932
933 r = add_op(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
934 if (r != 0) return r;
935
936 COP(reg)->repeat.id = num_repeat;
937 COP(reg)->repeat.addr = SIZE_INC_OP + target_len + SIZE_OP_REPEAT_INC;
938
939 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
940 if (r != 0) return r;
941
942 r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env);
943 if (r != 0) return r;
944
945 if (
946 #ifdef USE_CALL
947 NODE_IS_IN_MULTI_ENTRY(qn) ||
948 #endif
949 NODE_IS_IN_REAL_REPEAT(qn)) {
950 r = add_op(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
951 }
952 else {
953 r = add_op(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
954 }
955 if (r != 0) return r;
956
957 COP(reg)->repeat_inc.id = num_repeat;
958 return r;
959 }
960
961 static int
962 is_anychar_infinite_greedy(QuantNode* qn)
963 {
964 if (qn->greedy && IS_INFINITE_REPEAT(qn->upper) &&
965 NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn)))
966 return 1;
967 else
968 return 0;
969 }
970
971 #define QUANTIFIER_EXPAND_LIMIT_SIZE 10
972 #define CKN_ON (ckn > 0)
973
974 static int
975 compile_length_quantifier_node(QuantNode* qn, regex_t* reg)
976 {
977 int len, mod_tlen;
978 int infinite = IS_INFINITE_REPEAT(qn->upper);
979 enum BodyEmptyType emptiness = qn->emptiness;
980 int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
981
982 if (tlen < 0) return tlen;
983 if (tlen == 0) return 0;
984
985 /* anychar repeat */
986 if (is_anychar_infinite_greedy(qn)) {
987 if (qn->lower <= 1 ||
988 int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0) {
989 if (IS_NOT_NULL(qn->next_head_exact))
990 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
991 else
992 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
993 }
994 }
995
996 mod_tlen = tlen;
997 if (emptiness != BODY_IS_NOT_EMPTY)
998 mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END;
999
1000 if (infinite &&
1001 (qn->lower <= 1 ||
1002 int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1003 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1004 len = SIZE_OP_JUMP;
1005 }
1006 else {
1007 len = tlen * qn->lower;
1008 }
1009
1010 if (qn->greedy) {
1011 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1012 if (IS_NOT_NULL(qn->head_exact))
1013 len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
1014 else
1015 #endif
1016 if (IS_NOT_NULL(qn->next_head_exact))
1017 len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
1018 else
1019 len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
1020 }
1021 else
1022 len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
1023 }
1024 else if (qn->upper == 0) {
1025 if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
1026 len = SIZE_OP_JUMP + tlen;
1027 }
1028 else
1029 len = 0;
1030 }
1031 else if (!infinite && qn->greedy &&
1032 (qn->upper == 1 ||
1033 int_multiply_cmp(tlen + SIZE_OP_PUSH, qn->upper,
1034 QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1035 len = tlen * qn->lower;
1036 len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
1037 }
1038 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1039 len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1040 }
1041 else {
1042 len = SIZE_OP_REPEAT_INC + mod_tlen + SIZE_OP_REPEAT;
1043 }
1044
1045 return len;
1046 }
1047
1048 static int
1049 compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)
1050 {
1051 int i, r, mod_tlen;
1052 int infinite = IS_INFINITE_REPEAT(qn->upper);
1053 enum BodyEmptyType emptiness = qn->emptiness;
1054 int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1055
1056 if (tlen < 0) return tlen;
1057 if (tlen == 0) return 0;
1058
1059 if (is_anychar_infinite_greedy(qn) &&
1060 (qn->lower <= 1 ||
1061 int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1062 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1063 if (r != 0) return r;
1064 if (IS_NOT_NULL(qn->next_head_exact)) {
1065 r = add_op(reg,
1066 IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), reg)) ?
1067 OP_ANYCHAR_ML_STAR_PEEK_NEXT : OP_ANYCHAR_STAR_PEEK_NEXT);
1068 if (r != 0) return r;
1069
1070 COP(reg)->anychar_star_peek_next.c = STR_(qn->next_head_exact)->s[0];
1071 return 0;
1072 }
1073 else {
1074 r = add_op(reg,
1075 IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), reg)) ?
1076 OP_ANYCHAR_ML_STAR : OP_ANYCHAR_STAR);
1077 return r;
1078 }
1079 }
1080
1081 mod_tlen = tlen;
1082 if (emptiness != BODY_IS_NOT_EMPTY)
1083 mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END;
1084
1085 if (infinite &&
1086 (qn->lower <= 1 ||
1087 int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1088 int addr;
1089
1090 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1091 r = add_op(reg, OP_JUMP);
1092 if (r != 0) return r;
1093 if (qn->greedy) {
1094 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1095 if (IS_NOT_NULL(qn->head_exact))
1096 COP(reg)->jump.addr = SIZE_OP_PUSH_OR_JUMP_EXACT1 + SIZE_INC_OP;
1097 else
1098 #endif
1099 if (IS_NOT_NULL(qn->next_head_exact))
1100 COP(reg)->jump.addr = SIZE_OP_PUSH_IF_PEEK_NEXT + SIZE_INC_OP;
1101 else
1102 COP(reg)->jump.addr = SIZE_OP_PUSH + SIZE_INC_OP;
1103 }
1104 else {
1105 COP(reg)->jump.addr = SIZE_OP_JUMP + SIZE_INC_OP;
1106 }
1107 }
1108 else {
1109 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1110 if (r != 0) return r;
1111 }
1112
1113 if (qn->greedy) {
1114 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1115 if (IS_NOT_NULL(qn->head_exact)) {
1116 r = add_op(reg, OP_PUSH_OR_JUMP_EXACT1);
1117 if (r != 0) return r;
1118 COP(reg)->push_or_jump_exact1.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP;
1119 COP(reg)->push_or_jump_exact1.c = STR_(qn->head_exact)->s[0];
1120
1121 r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env);
1122 if (r != 0) return r;
1123
1124 addr = -(mod_tlen + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1);
1125 }
1126 else
1127 #endif
1128 if (IS_NOT_NULL(qn->next_head_exact)) {
1129 r = add_op(reg, OP_PUSH_IF_PEEK_NEXT);
1130 if (r != 0) return r;
1131 COP(reg)->push_if_peek_next.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP;
1132 COP(reg)->push_if_peek_next.c = STR_(qn->next_head_exact)->s[0];
1133
1134 r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env);
1135 if (r != 0) return r;
1136
1137 addr = -(mod_tlen + (int )SIZE_OP_PUSH_IF_PEEK_NEXT);
1138 }
1139 else {
1140 r = add_op(reg, OP_PUSH);
1141 if (r != 0) return r;
1142 COP(reg)->push.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP;
1143
1144 r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env);
1145 if (r != 0) return r;
1146
1147 addr = -(mod_tlen + (int )SIZE_OP_PUSH);
1148 }
1149
1150 r = add_op(reg, OP_JUMP);
1151 if (r != 0) return r;
1152 COP(reg)->jump.addr = addr;
1153 }
1154 else {
1155 r = add_op(reg, OP_JUMP);
1156 if (r != 0) return r;
1157 COP(reg)->jump.addr = mod_tlen + SIZE_INC_OP;
1158
1159 r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env);
1160 if (r != 0) return r;
1161
1162 r = add_op(reg, OP_PUSH);
1163 if (r != 0) return r;
1164 COP(reg)->push.addr = -mod_tlen;
1165 }
1166 }
1167 else if (qn->upper == 0) {
1168 if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
1169 r = add_op(reg, OP_JUMP);
1170 if (r != 0) return r;
1171 COP(reg)->jump.addr = tlen + SIZE_INC_OP;
1172
1173 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1174 }
1175 else {
1176 /* Nothing output */
1177 r = 0;
1178 }
1179 }
1180 else if (! infinite && qn->greedy &&
1181 (qn->upper == 1 ||
1182 int_multiply_cmp(tlen + SIZE_OP_PUSH, qn->upper,
1183 QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) {
1184 int n = qn->upper - qn->lower;
1185
1186 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1187 if (r != 0) return r;
1188
1189 for (i = 0; i < n; i++) {
1190 int v = onig_positive_int_multiply(n - i, tlen + SIZE_OP_PUSH);
1191 if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
1192
1193 r = add_op(reg, OP_PUSH);
1194 if (r != 0) return r;
1195 COP(reg)->push.addr = v;
1196
1197 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1198 if (r != 0) return r;
1199 }
1200 }
1201 else if (! qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1202 r = add_op(reg, OP_PUSH);
1203 if (r != 0) return r;
1204 COP(reg)->push.addr = SIZE_INC_OP + SIZE_OP_JUMP;
1205
1206 r = add_op(reg, OP_JUMP);
1207 if (r != 0) return r;
1208 COP(reg)->jump.addr = tlen + SIZE_INC_OP;
1209
1210 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1211 }
1212 else {
1213 r = compile_range_repeat_node(qn, mod_tlen, emptiness, reg, env);
1214 }
1215 return r;
1216 }
1217
1218 static int
1219 compile_length_option_node(BagNode* node, regex_t* reg)
1220 {
1221 int tlen;
1222 OnigOptionType prev = reg->options;
1223
1224 reg->options = node->o.options;
1225 tlen = compile_length_tree(NODE_BAG_BODY(node), reg);
1226 reg->options = prev;
1227
1228 return tlen;
1229 }
1230
1231 static int
1232 compile_option_node(BagNode* node, regex_t* reg, ScanEnv* env)
1233 {
1234 int r;
1235 OnigOptionType prev = reg->options;
1236
1237 reg->options = node->o.options;
1238 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1239 reg->options = prev;
1240
1241 return r;
1242 }
1243
1244 static int
1245 compile_length_bag_node(BagNode* node, regex_t* reg)
1246 {
1247 int len;
1248 int tlen;
1249
1250 if (node->type == BAG_OPTION)
1251 return compile_length_option_node(node, reg);
1252
1253 if (NODE_BAG_BODY(node)) {
1254 tlen = compile_length_tree(NODE_BAG_BODY(node), reg);
1255 if (tlen < 0) return tlen;
1256 }
1257 else
1258 tlen = 0;
1259
1260 switch (node->type) {
1261 case BAG_MEMORY:
1262 #ifdef USE_CALL
1263
1264 if (node->m.regnum == 0 && NODE_IS_CALLED(node)) {
1265 len = tlen + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1266 return len;
1267 }
1268
1269 if (NODE_IS_CALLED(node)) {
1270 len = SIZE_OP_MEMORY_START_PUSH + tlen
1271 + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1272 if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum))
1273 len += (NODE_IS_RECURSION(node)
1274 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1275 else
1276 len += (NODE_IS_RECURSION(node)
1277 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1278 }
1279 else if (NODE_IS_RECURSION(node)) {
1280 len = SIZE_OP_MEMORY_START_PUSH;
1281 len += tlen + (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)
1282 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_REC);
1283 }
1284 else
1285 #endif
1286 {
1287 if (MEM_STATUS_AT0(reg->bt_mem_start, node->m.regnum))
1288 len = SIZE_OP_MEMORY_START_PUSH;
1289 else
1290 len = SIZE_OP_MEMORY_START;
1291
1292 len += tlen + (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)
1293 ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1294 }
1295 break;
1296
1297 case BAG_STOP_BACKTRACK:
1298 if (NODE_IS_STRICT_REAL_REPEAT(node)) {
1299 int v;
1300 QuantNode* qn;
1301
1302 qn = QUANT_(NODE_BAG_BODY(node));
1303 tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1304 if (tlen < 0) return tlen;
1305
1306 v = onig_positive_int_multiply(qn->lower, tlen);
1307 if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
1308 len = v + SIZE_OP_PUSH + tlen + SIZE_OP_POP_OUT + SIZE_OP_JUMP;
1309 }
1310 else {
1311 len = SIZE_OP_ATOMIC_START + tlen + SIZE_OP_ATOMIC_END;
1312 }
1313 break;
1314
1315 case BAG_IF_ELSE:
1316 {
1317 Node* cond = NODE_BAG_BODY(node);
1318 Node* Then = node->te.Then;
1319 Node* Else = node->te.Else;
1320
1321 len = compile_length_tree(cond, reg);
1322 if (len < 0) return len;
1323 len += SIZE_OP_PUSH;
1324 len += SIZE_OP_ATOMIC_START + SIZE_OP_ATOMIC_END;
1325
1326 if (IS_NOT_NULL(Then)) {
1327 tlen = compile_length_tree(Then, reg);
1328 if (tlen < 0) return tlen;
1329 len += tlen;
1330 }
1331
1332 len += SIZE_OP_JUMP + SIZE_OP_ATOMIC_END;
1333
1334 if (IS_NOT_NULL(Else)) {
1335 tlen = compile_length_tree(Else, reg);
1336 if (tlen < 0) return tlen;
1337 len += tlen;
1338 }
1339 }
1340 break;
1341
1342 case BAG_OPTION:
1343 /* never come here, but set for escape warning */
1344 len = 0;
1345 break;
1346 }
1347
1348 return len;
1349 }
1350
1351 static int get_char_len_node(Node* node, regex_t* reg, int* len);
1352
1353 static int
1354 compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env)
1355 {
1356 int r;
1357 int len;
1358
1359 #ifdef USE_CALL
1360 if (NODE_IS_CALLED(node)) {
1361 r = add_op(reg, OP_CALL);
1362 if (r != 0) return r;
1363
1364 node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + SIZE_OP_JUMP;
1365 NODE_STATUS_ADD(node, ADDR_FIXED);
1366 COP(reg)->call.addr = (int )node->m.called_addr;
1367
1368 if (node->m.regnum == 0) {
1369 len = compile_length_tree(NODE_BAG_BODY(node), reg);
1370 len += SIZE_OP_RETURN;
1371
1372 r = add_op(reg, OP_JUMP);
1373 if (r != 0) return r;
1374 COP(reg)->jump.addr = len + SIZE_INC_OP;
1375
1376 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1377 if (r != 0) return r;
1378
1379 r = add_op(reg, OP_RETURN);
1380 return r;
1381 }
1382 else {
1383 len = compile_length_tree(NODE_BAG_BODY(node), reg);
1384 len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1385 if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum))
1386 len += (NODE_IS_RECURSION(node)
1387 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1388 else
1389 len += (NODE_IS_RECURSION(node)
1390 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1391
1392 r = add_op(reg, OP_JUMP);
1393 if (r != 0) return r;
1394 COP(reg)->jump.addr = len + SIZE_INC_OP;
1395 }
1396 }
1397 #endif
1398
1399 if (MEM_STATUS_AT0(reg->bt_mem_start, node->m.regnum))
1400 r = add_op(reg, OP_MEMORY_START_PUSH);
1401 else
1402 r = add_op(reg, OP_MEMORY_START);
1403 if (r != 0) return r;
1404 COP(reg)->memory_start.num = node->m.regnum;
1405
1406 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1407 if (r != 0) return r;
1408
1409 #ifdef USE_CALL
1410 if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum))
1411 r = add_op(reg, (NODE_IS_RECURSION(node)
1412 ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1413 else
1414 r = add_op(reg, (NODE_IS_RECURSION(node) ? OP_MEMORY_END_REC : OP_MEMORY_END));
1415 if (r != 0) return r;
1416 COP(reg)->memory_end.num = node->m.regnum;
1417
1418 if (NODE_IS_CALLED(node)) {
1419 if (r != 0) return r;
1420 r = add_op(reg, OP_RETURN);
1421 }
1422 #else
1423 if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum))
1424 r = add_op(reg, OP_MEMORY_END_PUSH);
1425 else
1426 r = add_op(reg, OP_MEMORY_END);
1427 if (r != 0) return r;
1428 COP(reg)->memory_end.num = node->m.regnum;
1429 #endif
1430
1431 return r;
1432 }
1433
1434 static int
1435 compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env)
1436 {
1437 int r, len;
1438
1439 switch (node->type) {
1440 case BAG_MEMORY:
1441 r = compile_bag_memory_node(node, reg, env);
1442 break;
1443
1444 case BAG_OPTION:
1445 r = compile_option_node(node, reg, env);
1446 break;
1447
1448 case BAG_STOP_BACKTRACK:
1449 if (NODE_IS_STRICT_REAL_REPEAT(node)) {
1450 QuantNode* qn = QUANT_(NODE_BAG_BODY(node));
1451 r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);
1452 if (r != 0) return r;
1453
1454 len = compile_length_tree(NODE_QUANT_BODY(qn), reg);
1455 if (len < 0) return len;
1456
1457 r = add_op(reg, OP_PUSH);
1458 if (r != 0) return r;
1459 COP(reg)->push.addr = SIZE_INC_OP + len + SIZE_OP_POP_OUT + SIZE_OP_JUMP;
1460
1461 r = compile_tree(NODE_QUANT_BODY(qn), reg, env);
1462 if (r != 0) return r;
1463 r = add_op(reg, OP_POP_OUT);
1464 if (r != 0) return r;
1465
1466 r = add_op(reg, OP_JUMP);
1467 if (r != 0) return r;
1468 COP(reg)->jump.addr = -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP_OUT);
1469 }
1470 else {
1471 r = add_op(reg, OP_ATOMIC_START);
1472 if (r != 0) return r;
1473 r = compile_tree(NODE_BAG_BODY(node), reg, env);
1474 if (r != 0) return r;
1475 r = add_op(reg, OP_ATOMIC_END);
1476 }
1477 break;
1478
1479 case BAG_IF_ELSE:
1480 {
1481 int cond_len, then_len, else_len, jump_len;
1482 Node* cond = NODE_BAG_BODY(node);
1483 Node* Then = node->te.Then;
1484 Node* Else = node->te.Else;
1485
1486 r = add_op(reg, OP_ATOMIC_START);
1487 if (r != 0) return r;
1488
1489 cond_len = compile_length_tree(cond, reg);
1490 if (cond_len < 0) return cond_len;
1491 if (IS_NOT_NULL(Then)) {
1492 then_len = compile_length_tree(Then, reg);
1493 if (then_len < 0) return then_len;
1494 }
1495 else
1496 then_len = 0;
1497
1498 jump_len = cond_len + then_len + SIZE_OP_ATOMIC_END + SIZE_OP_JUMP;
1499
1500 r = add_op(reg, OP_PUSH);
1501 if (r != 0) return r;
1502 COP(reg)->push.addr = SIZE_INC_OP + jump_len;
1503
1504 r = compile_tree(cond, reg, env);
1505 if (r != 0) return r;
1506 r = add_op(reg, OP_ATOMIC_END);
1507 if (r != 0) return r;
1508
1509 if (IS_NOT_NULL(Then)) {
1510 r = compile_tree(Then, reg, env);
1511 if (r != 0) return r;
1512 }
1513
1514 if (IS_NOT_NULL(Else)) {
1515 else_len = compile_length_tree(Else, reg);
1516 if (else_len < 0) return else_len;
1517 }
1518 else
1519 else_len = 0;
1520
1521 r = add_op(reg, OP_JUMP);
1522 if (r != 0) return r;
1523 COP(reg)->jump.addr = SIZE_OP_ATOMIC_END + else_len + SIZE_INC_OP;
1524
1525 r = add_op(reg, OP_ATOMIC_END);
1526 if (r != 0) return r;
1527
1528 if (IS_NOT_NULL(Else)) {
1529 r = compile_tree(Else, reg, env);
1530 }
1531 }
1532 break;
1533 }
1534
1535 return r;
1536 }
1537
1538 static int
1539 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1540 {
1541 int len;
1542 int tlen = 0;
1543
1544 if (IS_NOT_NULL(NODE_ANCHOR_BODY(node))) {
1545 tlen = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
1546 if (tlen < 0) return tlen;
1547 }
1548
1549 switch (node->type) {
1550 case ANCR_PREC_READ:
1551 len = SIZE_OP_PREC_READ_START + tlen + SIZE_OP_PREC_READ_END;
1552 break;
1553 case ANCR_PREC_READ_NOT:
1554 len = SIZE_OP_PREC_READ_NOT_START + tlen + SIZE_OP_PREC_READ_NOT_END;
1555 break;
1556 case ANCR_LOOK_BEHIND:
1557 len = SIZE_OP_LOOK_BEHIND + tlen;
1558 break;
1559 case ANCR_LOOK_BEHIND_NOT:
1560 len = SIZE_OP_LOOK_BEHIND_NOT_START + tlen + SIZE_OP_LOOK_BEHIND_NOT_END;
1561 break;
1562
1563 case ANCR_WORD_BOUNDARY:
1564 case ANCR_NO_WORD_BOUNDARY:
1565 #ifdef USE_WORD_BEGIN_END
1566 case ANCR_WORD_BEGIN:
1567 case ANCR_WORD_END:
1568 #endif
1569 len = SIZE_OP_WORD_BOUNDARY;
1570 break;
1571
1572 case ANCR_TEXT_SEGMENT_BOUNDARY:
1573 case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
1574 len = SIZE_OPCODE;
1575 break;
1576
1577 default:
1578 len = SIZE_OPCODE;
1579 break;
1580 }
1581
1582 return len;
1583 }
1584
1585 static int
1586 compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env)
1587 {
1588 int r, len;
1589 enum OpCode op;
1590
1591 switch (node->type) {
1592 case ANCR_BEGIN_BUF: r = add_op(reg, OP_BEGIN_BUF); break;
1593 case ANCR_END_BUF: r = add_op(reg, OP_END_BUF); break;
1594 case ANCR_BEGIN_LINE: r = add_op(reg, OP_BEGIN_LINE); break;
1595 case ANCR_END_LINE: r = add_op(reg, OP_END_LINE); break;
1596 case ANCR_SEMI_END_BUF: r = add_op(reg, OP_SEMI_END_BUF); break;
1597 case ANCR_BEGIN_POSITION: r = add_op(reg, OP_BEGIN_POSITION); break;
1598
1599 case ANCR_WORD_BOUNDARY:
1600 op = OP_WORD_BOUNDARY;
1601 word:
1602 r = add_op(reg, op);
1603 if (r != 0) return r;
1604 COP(reg)->word_boundary.mode = (ModeType )node->ascii_mode;
1605 break;
1606
1607 case ANCR_NO_WORD_BOUNDARY:
1608 op = OP_NO_WORD_BOUNDARY; goto word;
1609 break;
1610 #ifdef USE_WORD_BEGIN_END
1611 case ANCR_WORD_BEGIN:
1612 op = OP_WORD_BEGIN; goto word;
1613 break;
1614 case ANCR_WORD_END:
1615 op = OP_WORD_END; goto word;
1616 break;
1617 #endif
1618
1619 case ANCR_TEXT_SEGMENT_BOUNDARY:
1620 case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
1621 {
1622 enum TextSegmentBoundaryType type;
1623
1624 r = add_op(reg, OP_TEXT_SEGMENT_BOUNDARY);
1625 if (r != 0) return r;
1626
1627 type = EXTENDED_GRAPHEME_CLUSTER_BOUNDARY;
1628 #ifdef USE_UNICODE_WORD_BREAK
1629 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_TEXT_SEGMENT_WORD))
1630 type = WORD_BOUNDARY;
1631 #endif
1632
1633 COP(reg)->text_segment_boundary.type = type;
1634 COP(reg)->text_segment_boundary.not =
1635 (node->type == ANCR_NO_TEXT_SEGMENT_BOUNDARY ? 1 : 0);
1636 }
1637 break;
1638
1639 case ANCR_PREC_READ:
1640 r = add_op(reg, OP_PREC_READ_START);
1641 if (r != 0) return r;
1642 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
1643 if (r != 0) return r;
1644 r = add_op(reg, OP_PREC_READ_END);
1645 break;
1646
1647 case ANCR_PREC_READ_NOT:
1648 len = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
1649 if (len < 0) return len;
1650
1651 r = add_op(reg, OP_PREC_READ_NOT_START);
1652 if (r != 0) return r;
1653 COP(reg)->prec_read_not_start.addr = SIZE_INC_OP + len + SIZE_OP_PREC_READ_NOT_END;
1654 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
1655 if (r != 0) return r;
1656 r = add_op(reg, OP_PREC_READ_NOT_END);
1657 break;
1658
1659 case ANCR_LOOK_BEHIND:
1660 {
1661 int n;
1662 r = add_op(reg, OP_LOOK_BEHIND);
1663 if (r != 0) return r;
1664 if (node->char_len < 0) {
1665 r = get_char_len_node(NODE_ANCHOR_BODY(node), reg, &n);
1666 if (r != 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1667 }
1668 else
1669 n = node->char_len;
1670
1671 COP(reg)->look_behind.len = n;
1672 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
1673 }
1674 break;
1675
1676 case ANCR_LOOK_BEHIND_NOT:
1677 {
1678 int n;
1679
1680 len = compile_length_tree(NODE_ANCHOR_BODY(node), reg);
1681 r = add_op(reg, OP_LOOK_BEHIND_NOT_START);
1682 if (r != 0) return r;
1683 COP(reg)->look_behind_not_start.addr = SIZE_INC_OP + len + SIZE_OP_LOOK_BEHIND_NOT_END;
1684
1685 if (node->char_len < 0) {
1686 r = get_char_len_node(NODE_ANCHOR_BODY(node), reg, &n);
1687 if (r != 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1688 }
1689 else
1690 n = node->char_len;
1691
1692 COP(reg)->look_behind_not_start.len = n;
1693
1694 r = compile_tree(NODE_ANCHOR_BODY(node), reg, env);
1695 if (r != 0) return r;
1696 r = add_op(reg, OP_LOOK_BEHIND_NOT_END);
1697 }
1698 break;
1699
1700 default:
1701 return ONIGERR_TYPE_BUG;
1702 break;
1703 }
1704
1705 return r;
1706 }
1707
1708 static int
1709 compile_gimmick_node(GimmickNode* node, regex_t* reg)
1710 {
1711 int r;
1712
1713 switch (node->type) {
1714 case GIMMICK_FAIL:
1715 r = add_op(reg, OP_FAIL);
1716 break;
1717
1718 case GIMMICK_SAVE:
1719 r = add_op(reg, OP_PUSH_SAVE_VAL);
1720 if (r != 0) return r;
1721 COP(reg)->push_save_val.type = node->detail_type;
1722 COP(reg)->push_save_val.id = node->id;
1723 break;
1724
1725 case GIMMICK_UPDATE_VAR:
1726 r = add_op(reg, OP_UPDATE_VAR);
1727 if (r != 0) return r;
1728 COP(reg)->update_var.type = node->detail_type;
1729 COP(reg)->update_var.id = node->id;
1730 break;
1731
1732 #ifdef USE_CALLOUT
1733 case GIMMICK_CALLOUT:
1734 switch (node->detail_type) {
1735 case ONIG_CALLOUT_OF_CONTENTS:
1736 case ONIG_CALLOUT_OF_NAME:
1737 {
1738 if (node->detail_type == ONIG_CALLOUT_OF_NAME) {
1739 r = add_op(reg, OP_CALLOUT_NAME);
1740 if (r != 0) return r;
1741 COP(reg)->callout_name.id = node->id;
1742 COP(reg)->callout_name.num = node->num;
1743 }
1744 else {
1745 r = add_op(reg, OP_CALLOUT_CONTENTS);
1746 if (r != 0) return r;
1747 COP(reg)->callout_contents.num = node->num;
1748 }
1749 }
1750 break;
1751
1752 default:
1753 r = ONIGERR_TYPE_BUG;
1754 break;
1755 }
1756 #endif
1757 }
1758
1759 return r;
1760 }
1761
1762 static int
1763 compile_length_gimmick_node(GimmickNode* node, regex_t* reg)
1764 {
1765 int len;
1766
1767 switch (node->type) {
1768 case GIMMICK_FAIL:
1769 len = SIZE_OP_FAIL;
1770 break;
1771
1772 case GIMMICK_SAVE:
1773 len = SIZE_OP_PUSH_SAVE_VAL;
1774 break;
1775
1776 case GIMMICK_UPDATE_VAR:
1777 len = SIZE_OP_UPDATE_VAR;
1778 break;
1779
1780 #ifdef USE_CALLOUT
1781 case GIMMICK_CALLOUT:
1782 switch (node->detail_type) {
1783 case ONIG_CALLOUT_OF_CONTENTS:
1784 len = SIZE_OP_CALLOUT_CONTENTS;
1785 break;
1786 case ONIG_CALLOUT_OF_NAME:
1787 len = SIZE_OP_CALLOUT_NAME;
1788 break;
1789
1790 default:
1791 len = ONIGERR_TYPE_BUG;
1792 break;
1793 }
1794 break;
1795 #endif
1796 }
1797
1798 return len;
1799 }
1800
1801 static int
1802 compile_length_tree(Node* node, regex_t* reg)
1803 {
1804 int len, r;
1805
1806 switch (NODE_TYPE(node)) {
1807 case NODE_LIST:
1808 len = 0;
1809 do {
1810 r = compile_length_tree(NODE_CAR(node), reg);
1811 if (r < 0) return r;
1812 len += r;
1813 } while (IS_NOT_NULL(node = NODE_CDR(node)));
1814 r = len;
1815 break;
1816
1817 case NODE_ALT:
1818 {
1819 int n;
1820
1821 n = r = 0;
1822 do {
1823 r += compile_length_tree(NODE_CAR(node), reg);
1824 n++;
1825 } while (IS_NOT_NULL(node = NODE_CDR(node)));
1826 r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1827 }
1828 break;
1829
1830 case NODE_STRING:
1831 if (NODE_STRING_IS_RAW(node))
1832 r = compile_length_string_raw_node(STR_(node), reg);
1833 else
1834 r = compile_length_string_node(node, reg);
1835 break;
1836
1837 case NODE_CCLASS:
1838 r = compile_length_cclass_node(CCLASS_(node), reg);
1839 break;
1840
1841 case NODE_CTYPE:
1842 r = SIZE_OPCODE;
1843 break;
1844
1845 case NODE_BACKREF:
1846 r = SIZE_OP_BACKREF;
1847 break;
1848
1849 #ifdef USE_CALL
1850 case NODE_CALL:
1851 r = SIZE_OP_CALL;
1852 break;
1853 #endif
1854
1855 case NODE_QUANT:
1856 r = compile_length_quantifier_node(QUANT_(node), reg);
1857 break;
1858
1859 case NODE_BAG:
1860 r = compile_length_bag_node(BAG_(node), reg);
1861 break;
1862
1863 case NODE_ANCHOR:
1864 r = compile_length_anchor_node(ANCHOR_(node), reg);
1865 break;
1866
1867 case NODE_GIMMICK:
1868 r = compile_length_gimmick_node(GIMMICK_(node), reg);
1869 break;
1870
1871 default:
1872 return ONIGERR_TYPE_BUG;
1873 break;
1874 }
1875
1876 return r;
1877 }
1878
1879 static int
1880 compile_tree(Node* node, regex_t* reg, ScanEnv* env)
1881 {
1882 int n, len, pos, r = 0;
1883
1884 switch (NODE_TYPE(node)) {
1885 case NODE_LIST:
1886 do {
1887 r = compile_tree(NODE_CAR(node), reg, env);
1888 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
1889 break;
1890
1891 case NODE_ALT:
1892 {
1893 Node* x = node;
1894 len = 0;
1895 do {
1896 len += compile_length_tree(NODE_CAR(x), reg);
1897 if (IS_NOT_NULL(NODE_CDR(x))) {
1898 len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1899 }
1900 } while (IS_NOT_NULL(x = NODE_CDR(x)));
1901 pos = COP_CURR_OFFSET(reg) + 1 + len; /* goal position */
1902
1903 do {
1904 len = compile_length_tree(NODE_CAR(node), reg);
1905 if (IS_NOT_NULL(NODE_CDR(node))) {
1906 enum OpCode push = NODE_IS_SUPER(node) ? OP_PUSH_SUPER : OP_PUSH;
1907 r = add_op(reg, push);
1908 if (r != 0) break;
1909 COP(reg)->push.addr = SIZE_INC_OP + len + SIZE_OP_JUMP;
1910 }
1911 r = compile_tree(NODE_CAR(node), reg, env);
1912 if (r != 0) break;
1913 if (IS_NOT_NULL(NODE_CDR(node))) {
1914 len = pos - (COP_CURR_OFFSET(reg) + 1);
1915 r = add_op(reg, OP_JUMP);
1916 if (r != 0) break;
1917 COP(reg)->jump.addr = len;
1918 }
1919 } while (IS_NOT_NULL(node = NODE_CDR(node)));
1920 }
1921 break;
1922
1923 case NODE_STRING:
1924 if (NODE_STRING_IS_RAW(node))
1925 r = compile_string_raw_node(STR_(node), reg);
1926 else
1927 r = compile_string_node(node, reg);
1928 break;
1929
1930 case NODE_CCLASS:
1931 r = compile_cclass_node(CCLASS_(node), reg);
1932 break;
1933
1934 case NODE_CTYPE:
1935 {
1936 int op;
1937
1938 switch (CTYPE_(node)->ctype) {
1939 case CTYPE_ANYCHAR:
1940 r = add_op(reg, IS_MULTILINE(CTYPE_OPTION(node, reg)) ?
1941 OP_ANYCHAR_ML : OP_ANYCHAR);
1942 break;
1943
1944 case ONIGENC_CTYPE_WORD:
1945 if (CTYPE_(node)->ascii_mode == 0) {
1946 op = CTYPE_(node)->not != 0 ? OP_NO_WORD : OP_WORD;
1947 }
1948 else {
1949 op = CTYPE_(node)->not != 0 ? OP_NO_WORD_ASCII : OP_WORD_ASCII;
1950 }
1951 r = add_op(reg, op);
1952 break;
1953
1954 default:
1955 return ONIGERR_TYPE_BUG;
1956 break;
1957 }
1958 }
1959 break;
1960
1961 case NODE_BACKREF:
1962 {
1963 BackRefNode* br = BACKREF_(node);
1964
1965 if (NODE_IS_CHECKER(node)) {
1966 #ifdef USE_BACKREF_WITH_LEVEL
1967 if (NODE_IS_NEST_LEVEL(node)) {
1968 r = add_op(reg, OP_BACKREF_CHECK_WITH_LEVEL);
1969 if (r != 0) return r;
1970 COP(reg)->backref_general.nest_level = br->nest_level;
1971 }
1972 else
1973 #endif
1974 {
1975 r = add_op(reg, OP_BACKREF_CHECK);
1976 if (r != 0) return r;
1977 }
1978 goto add_bacref_mems;
1979 }
1980 else {
1981 #ifdef USE_BACKREF_WITH_LEVEL
1982 if (NODE_IS_NEST_LEVEL(node)) {
1983 if ((reg->options & ONIG_OPTION_IGNORECASE) != 0)
1984 r = add_op(reg, OP_BACKREF_WITH_LEVEL_IC);
1985 else
1986 r = add_op(reg, OP_BACKREF_WITH_LEVEL);
1987
1988 if (r != 0) return r;
1989 COP(reg)->backref_general.nest_level = br->nest_level;
1990 goto add_bacref_mems;
1991 }
1992 else
1993 #endif
1994 if (br->back_num == 1) {
1995 n = br->back_static[0];
1996 if (IS_IGNORECASE(reg->options)) {
1997 r = add_op(reg, OP_BACKREF_N_IC);
1998 if (r != 0) return r;
1999 COP(reg)->backref_n.n1 = n;
2000 }
2001 else {
2002 switch (n) {
2003 case 1: r = add_op(reg, OP_BACKREF1); break;
2004 case 2: r = add_op(reg, OP_BACKREF2); break;
2005 default:
2006 r = add_op(reg, OP_BACKREF_N);
2007 if (r != 0) return r;
2008 COP(reg)->backref_n.n1 = n;
2009 break;
2010 }
2011 }
2012 }
2013 else {
2014 int num;
2015 int* p;
2016
2017 r = add_op(reg, IS_IGNORECASE(reg->options) ?
2018 OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI);
2019 if (r != 0) return r;
2020
2021 add_bacref_mems:
2022 num = br->back_num;
2023 COP(reg)->backref_general.num = num;
2024 if (num == 1) {
2025 COP(reg)->backref_general.n1 = br->back_static[0];
2026 }
2027 else {
2028 int i, j;
2029 MemNumType* ns;
2030
2031 ns = xmalloc(sizeof(MemNumType) * num);
2032 CHECK_NULL_RETURN_MEMERR(ns);
2033 COP(reg)->backref_general.ns = ns;
2034 p = BACKREFS_P(br);
2035 for (i = num - 1, j = 0; i >= 0; i--, j++) {
2036 ns[j] = p[i];
2037 }
2038 }
2039 }
2040 }
2041 }
2042 break;
2043
2044 #ifdef USE_CALL
2045 case NODE_CALL:
2046 r = compile_call(CALL_(node), reg, env);
2047 break;
2048 #endif
2049
2050 case NODE_QUANT:
2051 r = compile_quantifier_node(QUANT_(node), reg, env);
2052 break;
2053
2054 case NODE_BAG:
2055 r = compile_bag_node(BAG_(node), reg, env);
2056 break;
2057
2058 case NODE_ANCHOR:
2059 r = compile_anchor_node(ANCHOR_(node), reg, env);
2060 break;
2061
2062 case NODE_GIMMICK:
2063 r = compile_gimmick_node(GIMMICK_(node), reg);
2064 break;
2065
2066 default:
2067 #ifdef ONIG_DEBUG
2068 fprintf(stderr, "compile_tree: undefined node type %d\n", NODE_TYPE(node));
2069 #endif
2070 break;
2071 }
2072
2073 return r;
2074 }
2075
2076 static int
2077 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
2078 {
2079 int r = 0;
2080 Node* node = *plink;
2081
2082 switch (NODE_TYPE(node)) {
2083 case NODE_LIST:
2084 case NODE_ALT:
2085 do {
2086 r = noname_disable_map(&(NODE_CAR(node)), map, counter);
2087 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2088 break;
2089
2090 case NODE_QUANT:
2091 {
2092 Node** ptarget = &(NODE_BODY(node));
2093 Node* old = *ptarget;
2094 r = noname_disable_map(ptarget, map, counter);
2095 if (*ptarget != old && NODE_TYPE(*ptarget) == NODE_QUANT) {
2096 onig_reduce_nested_quantifier(node, *ptarget);
2097 }
2098 }
2099 break;
2100
2101 case NODE_BAG:
2102 {
2103 BagNode* en = BAG_(node);
2104 if (en->type == BAG_MEMORY) {
2105 if (NODE_IS_NAMED_GROUP(node)) {
2106 (*counter)++;
2107 map[en->m.regnum].new_val = *counter;
2108 en->m.regnum = *counter;
2109 r = noname_disable_map(&(NODE_BODY(node)), map, counter);
2110 }
2111 else {
2112 *plink = NODE_BODY(node);
2113 NODE_BODY(node) = NULL_NODE;
2114 onig_node_free(node);
2115 r = noname_disable_map(plink, map, counter);
2116 }
2117 }
2118 else if (en->type == BAG_IF_ELSE) {
2119 r = noname_disable_map(&(NODE_BAG_BODY(en)), map, counter);
2120 if (r != 0) return r;
2121 if (IS_NOT_NULL(en->te.Then)) {
2122 r = noname_disable_map(&(en->te.Then), map, counter);
2123 if (r != 0) return r;
2124 }
2125 if (IS_NOT_NULL(en->te.Else)) {
2126 r = noname_disable_map(&(en->te.Else), map, counter);
2127 if (r != 0) return r;
2128 }
2129 }
2130 else
2131 r = noname_disable_map(&(NODE_BODY(node)), map, counter);
2132 }
2133 break;
2134
2135 case NODE_ANCHOR:
2136 if (IS_NOT_NULL(NODE_BODY(node)))
2137 r = noname_disable_map(&(NODE_BODY(node)), map, counter);
2138 break;
2139
2140 default:
2141 break;
2142 }
2143
2144 return r;
2145 }
2146
2147 static int
2148 renumber_node_backref(Node* node, GroupNumRemap* map)
2149 {
2150 int i, pos, n, old_num;
2151 int *backs;
2152 BackRefNode* bn = BACKREF_(node);
2153
2154 if (! NODE_IS_BY_NAME(node))
2155 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2156
2157 old_num = bn->back_num;
2158 if (IS_NULL(bn->back_dynamic))
2159 backs = bn->back_static;
2160 else
2161 backs = bn->back_dynamic;
2162
2163 for (i = 0, pos = 0; i < old_num; i++) {
2164 n = map[backs[i]].new_val;
2165 if (n > 0) {
2166 backs[pos] = n;
2167 pos++;
2168 }
2169 }
2170
2171 bn->back_num = pos;
2172 return 0;
2173 }
2174
2175 static int
2176 renumber_by_map(Node* node, GroupNumRemap* map)
2177 {
2178 int r = 0;
2179
2180 switch (NODE_TYPE(node)) {
2181 case NODE_LIST:
2182 case NODE_ALT:
2183 do {
2184 r = renumber_by_map(NODE_CAR(node), map);
2185 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2186 break;
2187
2188 case NODE_QUANT:
2189 r = renumber_by_map(NODE_BODY(node), map);
2190 break;
2191
2192 case NODE_BAG:
2193 {
2194 BagNode* en = BAG_(node);
2195
2196 r = renumber_by_map(NODE_BODY(node), map);
2197 if (r != 0) return r;
2198
2199 if (en->type == BAG_IF_ELSE) {
2200 if (IS_NOT_NULL(en->te.Then)) {
2201 r = renumber_by_map(en->te.Then, map);
2202 if (r != 0) return r;
2203 }
2204 if (IS_NOT_NULL(en->te.Else)) {
2205 r = renumber_by_map(en->te.Else, map);
2206 if (r != 0) return r;
2207 }
2208 }
2209 }
2210 break;
2211
2212 case NODE_BACKREF:
2213 r = renumber_node_backref(node, map);
2214 break;
2215
2216 case NODE_ANCHOR:
2217 if (IS_NOT_NULL(NODE_BODY(node)))
2218 r = renumber_by_map(NODE_BODY(node), map);
2219 break;
2220
2221 default:
2222 break;
2223 }
2224
2225 return r;
2226 }
2227
2228 static int
2229 numbered_ref_check(Node* node)
2230 {
2231 int r = 0;
2232
2233 switch (NODE_TYPE(node)) {
2234 case NODE_LIST:
2235 case NODE_ALT:
2236 do {
2237 r = numbered_ref_check(NODE_CAR(node));
2238 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2239 break;
2240
2241 case NODE_ANCHOR:
2242 if (IS_NULL(NODE_BODY(node)))
2243 break;
2244 /* fall */
2245 case NODE_QUANT:
2246 r = numbered_ref_check(NODE_BODY(node));
2247 break;
2248
2249 case NODE_BAG:
2250 {
2251 BagNode* en = BAG_(node);
2252
2253 r = numbered_ref_check(NODE_BODY(node));
2254 if (r != 0) return r;
2255
2256 if (en->type == BAG_IF_ELSE) {
2257 if (IS_NOT_NULL(en->te.Then)) {
2258 r = numbered_ref_check(en->te.Then);
2259 if (r != 0) return r;
2260 }
2261 if (IS_NOT_NULL(en->te.Else)) {
2262 r = numbered_ref_check(en->te.Else);
2263 if (r != 0) return r;
2264 }
2265 }
2266 }
2267
2268 break;
2269
2270 case NODE_BACKREF:
2271 if (! NODE_IS_BY_NAME(node))
2272 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2273 break;
2274
2275 default:
2276 break;
2277 }
2278
2279 return r;
2280 }
2281
2282 static int
2283 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
2284 {
2285 int r, i, pos, counter;
2286 int result;
2287 MemStatusType loc;
2288 GroupNumRemap* map;
2289
2290 map = (GroupNumRemap* )xmalloc(sizeof(GroupNumRemap) * (env->num_mem + 1));
2291 CHECK_NULL_RETURN_MEMERR(map);
2292 for (i = 1; i <= env->num_mem; i++) {
2293 map[i].new_val = 0;
2294 }
2295 counter = 0;
2296 r = noname_disable_map(root, map, &counter);
2297 if (r != 0) return r;
2298
2299 r = renumber_by_map(*root, map);
2300 if (r != 0) return r;
2301
2302 for (i = 1, pos = 1; i <= env->num_mem; i++) {
2303 if (map[i].new_val > 0) {
2304 SCANENV_MEMENV(env)[pos] = SCANENV_MEMENV(env)[i];
2305 pos++;
2306 }
2307 }
2308
2309 loc = env->capture_history;
2310 MEM_STATUS_CLEAR(env->capture_history);
2311 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
2312 if (MEM_STATUS_AT(loc, i)) {
2313 MEM_STATUS_ON_SIMPLE(env->capture_history, map[i].new_val);
2314 }
2315 }
2316
2317 env->num_mem = env->num_named;
2318 reg->num_mem = env->num_named;
2319 result = onig_renumber_name_table(reg, map);
2320 xfree(map);
2321 return result;
2322 }
2323
2324 #ifdef USE_CALL
2325 static int
2326 fix_unset_addr_list(UnsetAddrList* uslist, regex_t* reg)
2327 {
2328 int i, offset;
2329 BagNode* en;
2330 AbsAddrType addr;
2331 AbsAddrType* paddr;
2332
2333 for (i = 0; i < uslist->num; i++) {
2334 if (! NODE_IS_ADDR_FIXED(uslist->us[i].target))
2335 return ONIGERR_PARSER_BUG;
2336
2337 en = BAG_(uslist->us[i].target);
2338 addr = en->m.called_addr;
2339 offset = uslist->us[i].offset;
2340
2341 paddr = (AbsAddrType* )((char* )reg->ops + offset);
2342 *paddr = addr;
2343 }
2344 return 0;
2345 }
2346 #endif
2347
2348
2349 #define GET_CHAR_LEN_VARLEN -1
2350 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2351
2352 /* fixed size pattern node only */
2353 static int
2354 get_char_len_node1(Node* node, regex_t* reg, int* len, int level)
2355 {
2356 int tlen;
2357 int r = 0;
2358
2359 level++;
2360 *len = 0;
2361 switch (NODE_TYPE(node)) {
2362 case NODE_LIST:
2363 do {
2364 r = get_char_len_node1(NODE_CAR(node), reg, &tlen, level);
2365 if (r == 0)
2366 *len = distance_add(*len, tlen);
2367 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2368 break;
2369
2370 case NODE_ALT:
2371 {
2372 int tlen2;
2373 int varlen = 0;
2374
2375 r = get_char_len_node1(NODE_CAR(node), reg, &tlen, level);
2376 while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))) {
2377 r = get_char_len_node1(NODE_CAR(node), reg, &tlen2, level);
2378 if (r == 0) {
2379 if (tlen != tlen2)
2380 varlen = 1;
2381 }
2382 }
2383 if (r == 0) {
2384 if (varlen != 0) {
2385 if (level == 1)
2386 r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2387 else
2388 r = GET_CHAR_LEN_VARLEN;
2389 }
2390 else
2391 *len = tlen;
2392 }
2393 }
2394 break;
2395
2396 case NODE_STRING:
2397 {
2398 StrNode* sn = STR_(node);
2399 UChar *s = sn->s;
2400
2401 while (s < sn->end) {
2402 s += enclen(reg->enc, s);
2403 (*len)++;
2404 }
2405 }
2406 break;
2407
2408 case NODE_QUANT:
2409 {
2410 QuantNode* qn = QUANT_(node);
2411
2412 if (qn->lower == qn->upper) {
2413 if (qn->upper == 0) {
2414 *len = 0;
2415 }
2416 else {
2417 r = get_char_len_node1(NODE_BODY(node), reg, &tlen, level);
2418 if (r == 0)
2419 *len = distance_multiply(tlen, qn->lower);
2420 }
2421 }
2422 else
2423 r = GET_CHAR_LEN_VARLEN;
2424 }
2425 break;
2426
2427 #ifdef USE_CALL
2428 case NODE_CALL:
2429 if (! NODE_IS_RECURSION(node))
2430 r = get_char_len_node1(NODE_BODY(node), reg, len, level);
2431 else
2432 r = GET_CHAR_LEN_VARLEN;
2433 break;
2434 #endif
2435
2436 case NODE_CTYPE:
2437 case NODE_CCLASS:
2438 *len = 1;
2439 break;
2440
2441 case NODE_BAG:
2442 {
2443 BagNode* en = BAG_(node);
2444
2445 switch (en->type) {
2446 case BAG_MEMORY:
2447 #ifdef USE_CALL
2448 if (NODE_IS_CLEN_FIXED(node))
2449 *len = en->char_len;
2450 else {
2451 r = get_char_len_node1(NODE_BODY(node), reg, len, level);
2452 if (r == 0) {
2453 en->char_len = *len;
2454 NODE_STATUS_ADD(node, CLEN_FIXED);
2455 }
2456 }
2457 break;
2458 #endif
2459 case BAG_OPTION:
2460 case BAG_STOP_BACKTRACK:
2461 r = get_char_len_node1(NODE_BODY(node), reg, len, level);
2462 break;
2463 case BAG_IF_ELSE:
2464 {
2465 int clen, elen;
2466
2467 r = get_char_len_node1(NODE_BODY(node), reg, &clen, level);
2468 if (r == 0) {
2469 if (IS_NOT_NULL(en->te.Then)) {
2470 r = get_char_len_node1(en->te.Then, reg, &tlen, level);
2471 if (r != 0) break;
2472 }
2473 else tlen = 0;
2474 if (IS_NOT_NULL(en->te.Else)) {
2475 r = get_char_len_node1(en->te.Else, reg, &elen, level);
2476 if (r != 0) break;
2477 }
2478 else elen = 0;
2479
2480 if (clen + tlen != elen) {
2481 r = GET_CHAR_LEN_VARLEN;
2482 }
2483 else {
2484 *len = elen;
2485 }
2486 }
2487 }
2488 break;
2489 }
2490 }
2491 break;
2492
2493 case NODE_ANCHOR:
2494 case NODE_GIMMICK:
2495 break;
2496
2497 case NODE_BACKREF:
2498 if (NODE_IS_CHECKER(node))
2499 break;
2500 /* fall */
2501 default:
2502 r = GET_CHAR_LEN_VARLEN;
2503 break;
2504 }
2505
2506 return r;
2507 }
2508
2509 static int
2510 get_char_len_node(Node* node, regex_t* reg, int* len)
2511 {
2512 return get_char_len_node1(node, reg, len, 0);
2513 }
2514
2515 /* x is not included y ==> 1 : 0 */
2516 static int
2517 is_exclusive(Node* x, Node* y, regex_t* reg)
2518 {
2519 int i, len;
2520 OnigCodePoint code;
2521 UChar *p;
2522 NodeType ytype;
2523
2524 retry:
2525 ytype = NODE_TYPE(y);
2526 switch (NODE_TYPE(x)) {
2527 case NODE_CTYPE:
2528 {
2529 if (CTYPE_(x)->ctype == CTYPE_ANYCHAR ||
2530 CTYPE_(y)->ctype == CTYPE_ANYCHAR)
2531 break;
2532
2533 switch (ytype) {
2534 case NODE_CTYPE:
2535 if (CTYPE_(y)->ctype == CTYPE_(x)->ctype &&
2536 CTYPE_(y)->not != CTYPE_(x)->not &&
2537 CTYPE_(y)->ascii_mode == CTYPE_(x)->ascii_mode)
2538 return 1;
2539 else
2540 return 0;
2541 break;
2542
2543 case NODE_CCLASS:
2544 swap:
2545 {
2546 Node* tmp;
2547 tmp = x; x = y; y = tmp;
2548 goto retry;
2549 }
2550 break;
2551
2552 case NODE_STRING:
2553 goto swap;
2554 break;
2555
2556 default:
2557 break;
2558 }
2559 }
2560 break;
2561
2562 case NODE_CCLASS:
2563 {
2564 int range;
2565 CClassNode* xc = CCLASS_(x);
2566
2567 switch (ytype) {
2568 case NODE_CTYPE:
2569 switch (CTYPE_(y)->ctype) {
2570 case CTYPE_ANYCHAR:
2571 return 0;
2572 break;
2573
2574 case ONIGENC_CTYPE_WORD:
2575 if (CTYPE_(y)->not == 0) {
2576 if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2577 range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
2578 for (i = 0; i < range; i++) {
2579 if (BITSET_AT(xc->bs, i)) {
2580 if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0;
2581 }
2582 }
2583 return 1;
2584 }
2585 return 0;
2586 }
2587 else {
2588 if (IS_NOT_NULL(xc->mbuf)) return 0;
2589 if (IS_NCCLASS_NOT(xc)) return 0;
2590
2591 range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
2592 for (i = 0; i < range; i++) {
2593 if (! ONIGENC_IS_CODE_WORD(reg->enc, i)) {
2594 if (BITSET_AT(xc->bs, i))
2595 return 0;
2596 }
2597 }
2598 for (i = range; i < SINGLE_BYTE_SIZE; i++) {
2599 if (BITSET_AT(xc->bs, i)) return 0;
2600 }
2601 return 1;
2602 }
2603 break;
2604
2605 default:
2606 break;
2607 }
2608 break;
2609
2610 case NODE_CCLASS:
2611 {
2612 int v;
2613 CClassNode* yc = CCLASS_(y);
2614
2615 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2616 v = BITSET_AT(xc->bs, i);
2617 if ((v != 0 && !IS_NCCLASS_NOT(xc)) || (v == 0 && IS_NCCLASS_NOT(xc))) {
2618 v = BITSET_AT(yc->bs, i);
2619 if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2620 (v == 0 && IS_NCCLASS_NOT(yc)))
2621 return 0;
2622 }
2623 }
2624 if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2625 (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2626 return 1;
2627 return 0;
2628 }
2629 break;
2630
2631 case NODE_STRING:
2632 goto swap;
2633 break;
2634
2635 default:
2636 break;
2637 }
2638 }
2639 break;
2640
2641 case NODE_STRING:
2642 {
2643 StrNode* xs = STR_(x);
2644
2645 if (NODE_STRING_LEN(x) == 0)
2646 break;
2647
2648 switch (ytype) {
2649 case NODE_CTYPE:
2650 switch (CTYPE_(y)->ctype) {
2651 case CTYPE_ANYCHAR:
2652 break;
2653
2654 case ONIGENC_CTYPE_WORD:
2655 if (CTYPE_(y)->ascii_mode == 0) {
2656 if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2657 return CTYPE_(y)->not;
2658 else
2659 return !(CTYPE_(y)->not);
2660 }
2661 else {
2662 if (ONIGENC_IS_MBC_WORD_ASCII(reg->enc, xs->s, xs->end))
2663 return CTYPE_(y)->not;
2664 else
2665 return !(CTYPE_(y)->not);
2666 }
2667 break;
2668 default:
2669 break;
2670 }
2671 break;
2672
2673 case NODE_CCLASS:
2674 {
2675 CClassNode* cc = CCLASS_(y);
2676
2677 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2678 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2679 return onig_is_code_in_cc(reg->enc, code, cc) == 0;
2680 }
2681 break;
2682
2683 case NODE_STRING:
2684 {
2685 UChar *q;
2686 StrNode* ys = STR_(y);
2687
2688 len = NODE_STRING_LEN(x);
2689 if (len > NODE_STRING_LEN(y)) len = NODE_STRING_LEN(y);
2690 if (NODE_STRING_IS_AMBIG(x) || NODE_STRING_IS_AMBIG(y)) {
2691 /* tiny version */
2692 return 0;
2693 }
2694 else {
2695 for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
2696 if (*p != *q) return 1;
2697 }
2698 }
2699 }
2700 break;
2701
2702 default:
2703 break;
2704 }
2705 }
2706 break;
2707
2708 default:
2709 break;
2710 }
2711
2712 return 0;
2713 }
2714
2715 static Node*
2716 get_head_value_node(Node* node, int exact, regex_t* reg)
2717 {
2718 Node* n = NULL_NODE;
2719
2720 switch (NODE_TYPE(node)) {
2721 case NODE_BACKREF:
2722 case NODE_ALT:
2723 #ifdef USE_CALL
2724 case NODE_CALL:
2725 #endif
2726 break;
2727
2728 case NODE_CTYPE:
2729 if (CTYPE_(node)->ctype == CTYPE_ANYCHAR)
2730 break;
2731 /* fall */
2732 case NODE_CCLASS:
2733 if (exact == 0) {
2734 n = node;
2735 }
2736 break;
2737
2738 case NODE_LIST:
2739 n = get_head_value_node(NODE_CAR(node), exact, reg);
2740 break;
2741
2742 case NODE_STRING:
2743 {
2744 StrNode* sn = STR_(node);
2745
2746 if (sn->end <= sn->s)
2747 break;
2748
2749 if (exact == 0 ||
2750 ! IS_IGNORECASE(reg->options) || NODE_STRING_IS_RAW(node)) {
2751 n = node;
2752 }
2753 }
2754 break;
2755
2756 case NODE_QUANT:
2757 {
2758 QuantNode* qn = QUANT_(node);
2759 if (qn->lower > 0) {
2760 if (IS_NOT_NULL(qn->head_exact))
2761 n = qn->head_exact;
2762 else
2763 n = get_head_value_node(NODE_BODY(node), exact, reg);
2764 }
2765 }
2766 break;
2767
2768 case NODE_BAG:
2769 {
2770 BagNode* en = BAG_(node);
2771 switch (en->type) {
2772 case BAG_OPTION:
2773 {
2774 OnigOptionType options = reg->options;
2775
2776 reg->options = BAG_(node)->o.options;
2777 n = get_head_value_node(NODE_BODY(node), exact, reg);
2778 reg->options = options;
2779 }
2780 break;
2781
2782 case BAG_MEMORY:
2783 case BAG_STOP_BACKTRACK:
2784 case BAG_IF_ELSE:
2785 n = get_head_value_node(NODE_BODY(node), exact, reg);
2786 break;
2787 }
2788 }
2789 break;
2790
2791 case NODE_ANCHOR:
2792 if (ANCHOR_(node)->type == ANCR_PREC_READ)
2793 n = get_head_value_node(NODE_BODY(node), exact, reg);
2794 break;
2795
2796 case NODE_GIMMICK:
2797 default:
2798 break;
2799 }
2800
2801 return n;
2802 }
2803
2804 static int
2805 check_type_tree(Node* node, int type_mask, int bag_mask, int anchor_mask)
2806 {
2807 NodeType type;
2808 int r = 0;
2809
2810 type = NODE_TYPE(node);
2811 if ((NODE_TYPE2BIT(type) & type_mask) == 0)
2812 return 1;
2813
2814 switch (type) {
2815 case NODE_LIST:
2816 case NODE_ALT:
2817 do {
2818 r = check_type_tree(NODE_CAR(node), type_mask, bag_mask, anchor_mask);
2819 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
2820 break;
2821
2822 case NODE_QUANT:
2823 r = check_type_tree(NODE_BODY(node), type_mask, bag_mask, anchor_mask);
2824 break;
2825
2826 case NODE_BAG:
2827 {
2828 BagNode* en = BAG_(node);
2829 if (((1<<en->type) & bag_mask) == 0)
2830 return 1;
2831
2832 r = check_type_tree(NODE_BODY(node), type_mask, bag_mask, anchor_mask);
2833 if (r == 0 && en->type == BAG_IF_ELSE) {
2834 if (IS_NOT_NULL(en->te.Then)) {
2835 r = check_type_tree(en->te.Then, type_mask, bag_mask, anchor_mask);
2836 if (r != 0) break;
2837 }
2838 if (IS_NOT_NULL(en->te.Else)) {
2839 r = check_type_tree(en->te.Else, type_mask, bag_mask, anchor_mask);
2840 }
2841 }
2842 }
2843 break;
2844
2845 case NODE_ANCHOR:
2846 type = ANCHOR_(node)->type;
2847 if ((type & anchor_mask) == 0)
2848 return 1;
2849
2850 if (IS_NOT_NULL(NODE_BODY(node)))
2851 r = check_type_tree(NODE_BODY(node), type_mask, bag_mask, anchor_mask);
2852 break;
2853
2854 case NODE_GIMMICK:
2855 default:
2856 break;
2857 }
2858 return r;
2859 }
2860
2861 static OnigLen
2862 tree_min_len(Node* node, ScanEnv* env)
2863 {
2864 OnigLen len;
2865 OnigLen tmin;
2866
2867 len = 0;
2868 switch (NODE_TYPE(node)) {
2869 case NODE_BACKREF:
2870 if (! NODE_IS_CHECKER(node)) {
2871 int i;
2872 int* backs;
2873 MemEnv* mem_env = SCANENV_MEMENV(env);
2874 BackRefNode* br = BACKREF_(node);
2875 if (NODE_IS_RECURSION(node)) break;
2876
2877 backs = BACKREFS_P(br);
2878 len = tree_min_len(mem_env[backs[0]].node, env);
2879 for (i = 1; i < br->back_num; i++) {
2880 tmin = tree_min_len(mem_env[backs[i]].node, env);
2881 if (len > tmin) len = tmin;
2882 }
2883 }
2884 break;
2885
2886 #ifdef USE_CALL
2887 case NODE_CALL:
2888 {
2889 Node* t = NODE_BODY(node);
2890 if (NODE_IS_RECURSION(node)) {
2891 if (NODE_IS_MIN_FIXED(t))
2892 len = BAG_(t)->min_len;
2893 }
2894 else
2895 len = tree_min_len(t, env);
2896 }
2897 break;
2898 #endif
2899
2900 case NODE_LIST:
2901 do {
2902 tmin = tree_min_len(NODE_CAR(node), env);
2903 len = distance_add(len, tmin);
2904 } while (IS_NOT_NULL(node = NODE_CDR(node)));
2905 break;
2906
2907 case NODE_ALT:
2908 {
2909 Node *x, *y;
2910 y = node;
2911 do {
2912 x = NODE_CAR(y);
2913 tmin = tree_min_len(x, env);
2914 if (y == node) len = tmin;
2915 else if (len > tmin) len = tmin;
2916 } while (IS_NOT_NULL(y = NODE_CDR(y)));
2917 }
2918 break;
2919
2920 case NODE_STRING:
2921 {
2922 StrNode* sn = STR_(node);
2923 len = (int )(sn->end - sn->s);
2924 }
2925 break;
2926
2927 case NODE_CTYPE:
2928 case NODE_CCLASS:
2929 len = ONIGENC_MBC_MINLEN(env->enc);
2930 break;
2931
2932 case NODE_QUANT:
2933 {
2934 QuantNode* qn = QUANT_(node);
2935
2936 if (qn->lower > 0) {
2937 len = tree_min_len(NODE_BODY(node), env);
2938 len = distance_multiply(len, qn->lower);
2939 }
2940 }
2941 break;
2942
2943 case NODE_BAG:
2944 {
2945 BagNode* en = BAG_(node);
2946 switch (en->type) {
2947 case BAG_MEMORY:
2948 if (NODE_IS_MIN_FIXED(node))
2949 len = en->min_len;
2950 else {
2951 if (NODE_IS_MARK1(node))
2952 len = 0; /* recursive */
2953 else {
2954 NODE_STATUS_ADD(node, MARK1);
2955 len = tree_min_len(NODE_BODY(node), env);
2956 NODE_STATUS_REMOVE(node, MARK1);
2957
2958 en->min_len = len;
2959 NODE_STATUS_ADD(node, MIN_FIXED);
2960 }
2961 }
2962 break;
2963
2964 case BAG_OPTION:
2965 case BAG_STOP_BACKTRACK:
2966 len = tree_min_len(NODE_BODY(node), env);
2967 break;
2968 case BAG_IF_ELSE:
2969 {
2970 OnigLen elen;
2971
2972 len = tree_min_len(NODE_BODY(node), env);
2973 if (IS_NOT_NULL(en->te.Then))
2974 len += tree_min_len(en->te.Then, env);
2975 if (IS_NOT_NULL(en->te.Else))
2976 elen = tree_min_len(en->te.Else, env);
2977 else elen = 0;
2978
2979 if (elen < len) len = elen;
2980 }
2981 break;
2982 }
2983 }
2984 break;
2985
2986 case NODE_GIMMICK:
2987 {
2988 GimmickNode* g = GIMMICK_(node);
2989 if (g->type == GIMMICK_FAIL) {
2990 len = INFINITE_LEN;
2991 break;
2992 }
2993 }
2994 /* fall */
2995 case NODE_ANCHOR:
2996 default:
2997 break;
2998 }
2999
3000 return len;
3001 }
3002
3003 static OnigLen
3004 tree_max_len(Node* node, ScanEnv* env)
3005 {
3006 OnigLen len;
3007 OnigLen tmax;
3008
3009 len = 0;
3010 switch (NODE_TYPE(node)) {
3011 case NODE_LIST:
3012 do {
3013 tmax = tree_max_len(NODE_CAR(node), env);
3014 len = distance_add(len, tmax);
3015 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3016 break;
3017
3018 case NODE_ALT:
3019 do {
3020 tmax = tree_max_len(NODE_CAR(node), env);
3021 if (len < tmax) len = tmax;
3022 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3023 break;
3024
3025 case NODE_STRING:
3026 {
3027 StrNode* sn = STR_(node);
3028 len = (OnigLen )(sn->end - sn->s);
3029 }
3030 break;
3031
3032 case NODE_CTYPE:
3033 case NODE_CCLASS:
3034 len = ONIGENC_MBC_MAXLEN_DIST(env->enc);
3035 break;
3036
3037 case NODE_BACKREF:
3038 if (! NODE_IS_CHECKER(node)) {
3039 int i;
3040 int* backs;
3041 MemEnv* mem_env = SCANENV_MEMENV(env);
3042 BackRefNode* br = BACKREF_(node);
3043 if (NODE_IS_RECURSION(node)) {
3044 len = INFINITE_LEN;
3045 break;
3046 }
3047 backs = BACKREFS_P(br);
3048 for (i = 0; i < br->back_num; i++) {
3049 tmax = tree_max_len(mem_env[backs[i]].node, env);
3050 if (len < tmax) len = tmax;
3051 }
3052 }
3053 break;
3054
3055 #ifdef USE_CALL
3056 case NODE_CALL:
3057 if (! NODE_IS_RECURSION(node))
3058 len = tree_max_len(NODE_BODY(node), env);
3059 else
3060 len = INFINITE_LEN;
3061 break;
3062 #endif
3063
3064 case NODE_QUANT:
3065 {
3066 QuantNode* qn = QUANT_(node);
3067
3068 if (qn->upper != 0) {
3069 len = tree_max_len(NODE_BODY(node), env);
3070 if (len != 0) {
3071 if (! IS_INFINITE_REPEAT(qn->upper))
3072 len = distance_multiply(len, qn->upper);
3073 else
3074 len = INFINITE_LEN;
3075 }
3076 }
3077 }
3078 break;
3079
3080 case NODE_BAG:
3081 {
3082 BagNode* en = BAG_(node);
3083 switch (en->type) {
3084 case BAG_MEMORY:
3085 if (NODE_IS_MAX_FIXED(node))
3086 len = en->max_len;
3087 else {
3088 if (NODE_IS_MARK1(node))
3089 len = INFINITE_LEN;
3090 else {
3091 NODE_STATUS_ADD(node, MARK1);
3092 len = tree_max_len(NODE_BODY(node), env);
3093 NODE_STATUS_REMOVE(node, MARK1);
3094
3095 en->max_len = len;
3096 NODE_STATUS_ADD(node, MAX_FIXED);
3097 }
3098 }
3099 break;
3100
3101 case BAG_OPTION:
3102 case BAG_STOP_BACKTRACK:
3103 len = tree_max_len(NODE_BODY(node), env);
3104 break;
3105 case BAG_IF_ELSE:
3106 {
3107 OnigLen tlen, elen;
3108
3109 len = tree_max_len(NODE_BODY(node), env);
3110 if (IS_NOT_NULL(en->te.Then)) {
3111 tlen = tree_max_len(en->te.Then, env);
3112 len = distance_add(len, tlen);
3113 }
3114 if (IS_NOT_NULL(en->te.Else))
3115 elen = tree_max_len(en->te.Else, env);
3116 else elen = 0;
3117
3118 if (elen > len) len = elen;
3119 }
3120 break;
3121 }
3122 }
3123 break;
3124
3125 case NODE_ANCHOR:
3126 case NODE_GIMMICK:
3127 default:
3128 break;
3129 }
3130
3131 return len;
3132 }
3133
3134 static int
3135 check_backrefs(Node* node, ScanEnv* env)
3136 {
3137 int r;
3138
3139 switch (NODE_TYPE(node)) {
3140 case NODE_LIST:
3141 case NODE_ALT:
3142 do {
3143 r = check_backrefs(NODE_CAR(node), env);
3144 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3145 break;
3146
3147 case NODE_ANCHOR:
3148 if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
3149 r = 0;
3150 break;
3151 }
3152 /* fall */
3153 case NODE_QUANT:
3154 r = check_backrefs(NODE_BODY(node), env);
3155 break;
3156
3157 case NODE_BAG:
3158 r = check_backrefs(NODE_BODY(node), env);
3159 {
3160 BagNode* en = BAG_(node);
3161
3162 if (en->type == BAG_IF_ELSE) {
3163 if (r != 0) return r;
3164 if (IS_NOT_NULL(en->te.Then)) {
3165 r = check_backrefs(en->te.Then, env);
3166 if (r != 0) return r;
3167 }
3168 if (IS_NOT_NULL(en->te.Else)) {
3169 r = check_backrefs(en->te.Else, env);
3170 }
3171 }
3172 }
3173 break;
3174
3175 case NODE_BACKREF:
3176 {
3177 int i;
3178 BackRefNode* br = BACKREF_(node);
3179 int* backs = BACKREFS_P(br);
3180 MemEnv* mem_env = SCANENV_MEMENV(env);
3181
3182 for (i = 0; i < br->back_num; i++) {
3183 if (backs[i] > env->num_mem)
3184 return ONIGERR_INVALID_BACKREF;
3185
3186 NODE_STATUS_ADD(mem_env[backs[i]].node, BACKREF);
3187 }
3188 r = 0;
3189 }
3190 break;
3191
3192 default:
3193 r = 0;
3194 break;
3195 }
3196
3197 return r;
3198 }
3199
3200
3201 #ifdef USE_CALL
3202
3203 #define RECURSION_EXIST (1<<0)
3204 #define RECURSION_MUST (1<<1)
3205 #define RECURSION_INFINITE (1<<2)
3206
3207 static int
3208 infinite_recursive_call_check(Node* node, ScanEnv* env, int head)
3209 {
3210 int ret;
3211 int r = 0;
3212
3213 switch (NODE_TYPE(node)) {
3214 case NODE_LIST:
3215 {
3216 Node *x;
3217 OnigLen min;
3218
3219 x = node;
3220 do {
3221 ret = infinite_recursive_call_check(NODE_CAR(x), env, head);
3222 if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
3223 r |= ret;
3224 if (head != 0) {
3225 min = tree_min_len(NODE_CAR(x), env);
3226 if (min != 0) head = 0;
3227 }
3228 } while (IS_NOT_NULL(x = NODE_CDR(x)));
3229 }
3230 break;
3231
3232 case NODE_ALT:
3233 {
3234 int must;
3235
3236 must = RECURSION_MUST;
3237 do {
3238 ret = infinite_recursive_call_check(NODE_CAR(node), env, head);
3239 if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
3240
3241 r |= (ret & RECURSION_EXIST);
3242 must &= ret;
3243 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3244 r |= must;
3245 }
3246 break;
3247
3248 case NODE_QUANT:
3249 r = infinite_recursive_call_check(NODE_BODY(node), env, head);
3250 if (r < 0) return r;
3251 if ((r & RECURSION_MUST) != 0) {
3252 if (QUANT_(node)->lower == 0)
3253 r &= ~RECURSION_MUST;
3254 }
3255 break;
3256
3257 case NODE_ANCHOR:
3258 if (! ANCHOR_HAS_BODY(ANCHOR_(node)))
3259 break;
3260 /* fall */
3261 case NODE_CALL:
3262 r = infinite_recursive_call_check(NODE_BODY(node), env, head);
3263 break;
3264
3265 case NODE_BAG:
3266 {
3267 BagNode* en = BAG_(node);
3268
3269 if (en->type == BAG_MEMORY) {
3270 if (NODE_IS_MARK2(node))
3271 return 0;
3272 else if (NODE_IS_MARK1(node))
3273 return (head == 0 ? RECURSION_EXIST | RECURSION_MUST
3274 : RECURSION_EXIST | RECURSION_MUST | RECURSION_INFINITE);
3275 else {
3276 NODE_STATUS_ADD(node, MARK2);
3277 r = infinite_recursive_call_check(NODE_BODY(node), env, head);
3278 NODE_STATUS_REMOVE(node, MARK2);
3279 }
3280 }
3281 else if (en->type == BAG_IF_ELSE) {
3282 int eret;
3283
3284 ret = infinite_recursive_call_check(NODE_BODY(node), env, head);
3285 if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
3286 r |= ret;
3287 if (IS_NOT_NULL(en->te.Then)) {
3288 OnigLen min;
3289 if (head != 0) {
3290 min = tree_min_len(NODE_BODY(node), env);
3291 }
3292 else min = 0;
3293
3294 ret = infinite_recursive_call_check(en->te.Then, env, min != 0 ? 0:head);
3295 if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret;
3296 r |= ret;
3297 }
3298 if (IS_NOT_NULL(en->te.Else)) {
3299 eret = infinite_recursive_call_check(en->te.Else, env, head);
3300 if (eret < 0 || (eret & RECURSION_INFINITE) != 0) return eret;
3301 r |= (eret & RECURSION_EXIST);
3302 if ((eret & RECURSION_MUST) == 0)
3303 r &= ~RECURSION_MUST;
3304 }
3305 }
3306 else {
3307 r = infinite_recursive_call_check(NODE_BODY(node), env, head);
3308 }
3309 }
3310 break;
3311
3312 default:
3313 break;
3314 }
3315
3316 return r;
3317 }
3318
3319 static int
3320 infinite_recursive_call_check_trav(Node* node, ScanEnv* env)
3321 {
3322 int r;
3323
3324 switch (NODE_TYPE(node)) {
3325 case NODE_LIST:
3326 case NODE_ALT:
3327 do {
3328 r = infinite_recursive_call_check_trav(NODE_CAR(node), env);
3329 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
3330 break;
3331
3332 case NODE_ANCHOR:
3333 if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
3334 r = 0;
3335 break;
3336 }
3337 /* fall */
3338 case NODE_QUANT:
3339 r = infinite_recursive_call_check_trav(NODE_BODY(node), env);
3340 break;
3341
3342 case NODE_BAG:
3343 {
3344 BagNode* en = BAG_(node);
3345
3346 if (en->type == BAG_MEMORY) {
3347 if (NODE_IS_RECURSION(node) && NODE_IS_CALLED(node)) {
3348 int ret;
3349
3350 NODE_STATUS_ADD(node, MARK1);
3351
3352 ret = infinite_recursive_call_check(NODE_BODY(node), env, 1);
3353 if (ret < 0) return ret;
3354 else if ((ret & (RECURSION_MUST | RECURSION_INFINITE)) != 0)
3355 return ONIGERR_NEVER_ENDING_RECURSION;
3356
3357 NODE_STATUS_REMOVE(node, MARK1);
3358 }
3359 }
3360 else if (en->type == BAG_IF_ELSE) {
3361 if (IS_NOT_NULL(en->te.Then)) {
3362 r = infinite_recursive_call_check_trav(en->te.Then, env);
3363 if (r != 0) return r;
3364 }
3365 if (IS_NOT_NULL(en->te.Else)) {
3366 r = infinite_recursive_call_check_trav(en->te.Else, env);
3367 if (r != 0) return r;
3368 }
3369 }
3370 }
3371
3372 r = infinite_recursive_call_check_trav(NODE_BODY(node), env);
3373 break;
3374
3375 default:
3376 r = 0;
3377 break;
3378 }
3379
3380 return r;
3381 }
3382
3383 static int
3384 recursive_call_check(Node* node)
3385 {
3386 int r;
3387
3388 switch (NODE_TYPE(node)) {
3389 case NODE_LIST:
3390 case NODE_ALT:
3391 r = 0;
3392 do {
3393 r |= recursive_call_check(NODE_CAR(node));
3394 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3395 break;
3396
3397 case NODE_ANCHOR:
3398 if (! ANCHOR_HAS_BODY(ANCHOR_(node))) {
3399 r = 0;
3400 break;
3401 }
3402 /* fall */
3403 case NODE_QUANT:
3404 r = recursive_call_check(NODE_BODY(node));
3405 break;
3406
3407 case NODE_CALL:
3408 r = recursive_call_check(NODE_BODY(node));
3409 if (r != 0) {
3410 if (NODE_IS_MARK1(NODE_BODY(node)))
3411 NODE_STATUS_ADD(node, RECURSION);
3412 }
3413 break;
3414
3415 case NODE_BAG:
3416 {
3417 BagNode* en = BAG_(node);
3418
3419 if (en->type == BAG_MEMORY) {
3420 if (NODE_IS_MARK2(node))
3421 return 0;
3422 else if (NODE_IS_MARK1(node))
3423 return 1; /* recursion */
3424 else {
3425 NODE_STATUS_ADD(node, MARK2);
3426 r = recursive_call_check(NODE_BODY(node));
3427 NODE_STATUS_REMOVE(node, MARK2);
3428 }
3429 }
3430 else if (en->type == BAG_IF_ELSE) {
3431 r = 0;
3432 if (IS_NOT_NULL(en->te.Then)) {
3433 r |= recursive_call_check(en->te.Then);
3434 }
3435 if (IS_NOT_NULL(en->te.Else)) {
3436 r |= recursive_call_check(en->te.Else);
3437 }
3438 r |= recursive_call_check(NODE_BODY(node));
3439 }
3440 else {
3441 r = recursive_call_check(NODE_BODY(node));
3442 }
3443 }
3444 break;
3445
3446 default:
3447 r = 0;
3448 break;
3449 }
3450
3451 return r;
3452 }
3453
3454 #define IN_RECURSION (1<<0)
3455 #define FOUND_CALLED_NODE 1
3456
3457 static int
3458 recursive_call_check_trav(Node* node, ScanEnv* env, int state)
3459 {
3460 int r = 0;
3461
3462 switch (NODE_TYPE(node)) {
3463 case NODE_LIST:
3464 case NODE_ALT:
3465 {
3466 int ret;
3467 do {
3468 ret = recursive_call_check_trav(NODE_CAR(node), env, state);
3469 if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
3470 else if (ret < 0) return ret;
3471 } while (IS_NOT_NULL(node = NODE_CDR(node)));
3472 }
3473 break;
3474
3475 case NODE_QUANT:
3476 r = recursive_call_check_trav(NODE_BODY(node), env, state);
3477 if (QUANT_(node)->upper == 0) {
3478 if (r == FOUND_CALLED_NODE)
3479 QUANT_(node)->is_refered = 1;
3480 }
3481 break;
3482
3483 case NODE_ANCHOR:
3484 {
3485 AnchorNode* an = ANCHOR_(node);
3486 if (ANCHOR_HAS_BODY(an))
3487 r = recursive_call_check_trav(NODE_ANCHOR_BODY(an), env, state);
3488 }
3489 break;
3490
3491 case NODE_BAG:
3492 {
3493 int ret;
3494 int state1;
3495 BagNode* en = BAG_(node);
3496
3497 if (en->type == BAG_MEMORY) {
3498 if (NODE_IS_CALLED(node) || (state & IN_RECURSION) != 0) {
3499 if (! NODE_IS_RECURSION(node)) {
3500 NODE_STATUS_ADD(node, MARK1);
3501 r = recursive_call_check(NODE_BODY(node));
3502 if (r != 0)
3503 NODE_STATUS_ADD(node, RECURSION);
3504 NODE_STATUS_REMOVE(node, MARK1);
3505 }
3506
3507 if (NODE_IS_CALLED(node))
3508 r = FOUND_CALLED_NODE;
3509 }
3510 }
3511
3512 state1 = state;
3513 if (NODE_IS_RECURSION(node))
3514 state1 |= IN_RECURSION;
3515
3516 ret = recursive_call_check_trav(NODE_BODY(node), env, state1);
3517 if (ret == FOUND_CALLED_NODE)
3518 r = FOUND_CALLED_NODE;
3519
3520 if (en->type == BAG_IF_ELSE) {
3521 if (IS_NOT_NULL(en->te.Then)) {
3522 ret = recursive_call_check_trav(en->te.Then, env, state1);
3523 if (ret == FOUND_CALLED_NODE)
3524 r = FOUND_CALLED_NODE;
3525 }
3526 if (IS_NOT_NULL(en->te.Else)) {
3527 ret = recursive_call_check_trav(en->te.Else, env, state1);
3528 if (ret == FOUND_CALLED_NODE)
3529 r = FOUND_CALLED_NODE;
3530 }
3531 }
3532 }
3533 break;
3534
3535 default:
3536 break;
3537 }
3538
3539 return r;
3540 }
3541
3542 #endif
3543
3544 #define IN_ALT (1<<0)
3545 #define IN_NOT (1<<1)
3546 #define IN_REAL_REPEAT (1<<2)
3547 #define IN_VAR_REPEAT (1<<3)
3548 #define IN_ZERO_REPEAT (1<<4)
3549 #define IN_MULTI_ENTRY (1<<5)
3550 #define IN_LOOK_BEHIND (1<<6)
3551
3552
3553 /* divide different length alternatives in look-behind.
3554 (?<=A|B) ==> (?<=A)|(?<=B)
3555 (?<!A|B) ==> (?<!A)(?<!B)
3556 */
3557 static int
3558 divide_look_behind_alternatives(Node* node)
3559 {
3560 Node *head, *np, *insert_node;
3561 AnchorNode* an = ANCHOR_(node);
3562 int anc_type = an->type;
3563
3564 head = NODE_ANCHOR_BODY(an);
3565 np = NODE_CAR(head);
3566 swap_node(node, head);
3567 NODE_CAR(node) = head;
3568 NODE_BODY(head) = np;
3569
3570 np = node;
3571 while (IS_NOT_NULL(np = NODE_CDR(np))) {
3572 insert_node = onig_node_new_anchor(anc_type, an->ascii_mode);
3573 CHECK_NULL_RETURN_MEMERR(insert_node);
3574 NODE_BODY(insert_node) = NODE_CAR(np);
3575 NODE_CAR(np) = insert_node;
3576 }
3577
3578 if (anc_type == ANCR_LOOK_BEHIND_NOT) {
3579 np = node;
3580 do {
3581 NODE_SET_TYPE(np, NODE_LIST); /* alt -> list */
3582 } while (IS_NOT_NULL(np = NODE_CDR(np)));
3583 }
3584 return 0;
3585 }
3586
3587 static int
3588 setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3589 {
3590 int r, len;
3591 AnchorNode* an = ANCHOR_(node);
3592
3593 r = get_char_len_node(NODE_ANCHOR_BODY(an), reg, &len);
3594 if (r == 0)
3595 an->char_len = len;
3596 else if (r == GET_CHAR_LEN_VARLEN)
3597 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3598 else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3599 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3600 r = divide_look_behind_alternatives(node);
3601 else
3602 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3603 }
3604
3605 return r;
3606 }
3607
3608 static int
3609 next_setup(Node* node, Node* next_node, regex_t* reg)
3610 {
3611 NodeType type;
3612
3613 retry:
3614 type = NODE_TYPE(node);
3615 if (type == NODE_QUANT) {
3616 QuantNode* qn = QUANT_(node);
3617 if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) {
3618 #ifdef USE_QUANT_PEEK_NEXT
3619 Node* n = get_head_value_node(next_node, 1, reg);
3620 /* '\0': for UTF-16BE etc... */
3621 if (IS_NOT_NULL(n) && STR_(n)->s[0] != '\0') {
3622 qn->next_head_exact = n;
3623 }
3624 #endif
3625 /* automatic posseivation a*b ==> (?>a*)b */
3626 if (qn->lower <= 1) {
3627 if (is_strict_real_node(NODE_BODY(node))) {
3628 Node *x, *y;
3629 x = get_head_value_node(NODE_BODY(node), 0, reg);
3630 if (IS_NOT_NULL(x)) {
3631 y = get_head_value_node(next_node, 0, reg);
3632 if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) {
3633 Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK);
3634 CHECK_NULL_RETURN_MEMERR(en);
3635 NODE_STATUS_ADD(en, STRICT_REAL_REPEAT);
3636 swap_node(node, en);
3637 NODE_BODY(node) = en;
3638 }
3639 }
3640 }
3641 }
3642 }
3643 }
3644 else if (type == NODE_BAG) {
3645 BagNode* en = BAG_(node);
3646 if (en->type == BAG_MEMORY) {
3647 node = NODE_BODY(node);
3648 goto retry;
3649 }
3650 }
3651 return 0;
3652 }
3653
3654
3655 static int
3656 update_string_node_case_fold(regex_t* reg, Node *node)
3657 {
3658 UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3659 UChar *sbuf, *ebuf, *sp;
3660 int r, i, len, sbuf_size;
3661 StrNode* sn = STR_(node);
3662
3663 end = sn->end;
3664 sbuf_size = (int )(end - sn->s) * 2;
3665 sbuf = (UChar* )xmalloc(sbuf_size);
3666 CHECK_NULL_RETURN_MEMERR(sbuf);
3667 ebuf = sbuf + sbuf_size;
3668
3669 sp = sbuf;
3670 p = sn->s;
3671 while (p < end) {
3672 len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
3673 for (i = 0; i < len; i++) {
3674 if (sp >= ebuf) {
3675 sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2, sbuf_size);
3676 CHECK_NULL_RETURN_MEMERR(sbuf);
3677 sp = sbuf + sbuf_size;
3678 sbuf_size *= 2;
3679 ebuf = sbuf + sbuf_size;
3680 }
3681
3682 *sp++ = buf[i];
3683 }
3684 }
3685
3686 r = onig_node_str_set(node, sbuf, sp);
3687 if (r != 0) {
3688 xfree(sbuf);
3689 return r;
3690 }
3691
3692 xfree(sbuf);
3693 return 0;
3694 }
3695
3696 static int
3697 expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end, regex_t* reg)
3698 {
3699 int r;
3700 Node *node;
3701
3702 node = onig_node_new_str(s, end);
3703 if (IS_NULL(node)) return ONIGERR_MEMORY;
3704
3705 r = update_string_node_case_fold(reg, node);
3706 if (r != 0) {
3707 onig_node_free(node);
3708 return r;
3709 }
3710
3711 NODE_STRING_SET_AMBIG(node);
3712 NODE_STRING_SET_DONT_GET_OPT_INFO(node);
3713 *rnode = node;
3714 return 0;
3715 }
3716
3717 static int
3718 expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[], UChar *p,
3719 int slen, UChar *end, regex_t* reg, Node **rnode)
3720 {
3721 int r, i, j;
3722 int len;
3723 int varlen;
3724 Node *anode, *var_anode, *snode, *xnode, *an;
3725 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
3726
3727 *rnode = var_anode = NULL_NODE;
3728
3729 varlen = 0;
3730 for (i = 0; i < item_num; i++) {
3731 if (items[i].byte_len != slen) {
3732 varlen = 1;
3733 break;
3734 }
3735 }
3736
3737 if (varlen != 0) {
3738 *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3739 if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
3740
3741 xnode = onig_node_new_list(NULL, NULL);
3742 if (IS_NULL(xnode)) goto mem_err;
3743 NODE_CAR(var_anode) = xnode;
3744
3745 anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3746 if (IS_NULL(anode)) goto mem_err;
3747 NODE_CAR(xnode) = anode;
3748 }
3749 else {
3750 *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3751 if (IS_NULL(anode)) return ONIGERR_MEMORY;
3752 }
3753
3754 snode = onig_node_new_str(p, p + slen);
3755 if (IS_NULL(snode)) goto mem_err;
3756
3757 NODE_CAR(anode) = snode;
3758
3759 for (i = 0; i < item_num; i++) {
3760 snode = onig_node_new_str(NULL, NULL);
3761 if (IS_NULL(snode)) goto mem_err;
3762
3763 for (j = 0; j < items[i].code_len; j++) {
3764 len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
3765 if (len < 0) {
3766 r = len;
3767 goto mem_err2;
3768 }
3769
3770 r = onig_node_str_cat(snode, buf, buf + len);
3771 if (r != 0) goto mem_err2;
3772 }
3773
3774 an = onig_node_new_alt(NULL_NODE, NULL_NODE);
3775 if (IS_NULL(an)) {
3776 goto mem_err2;
3777 }
3778 //The NULL pointer check is not necessary. It is added just for pass static
3779 //analysis. When condition "items[i].byte_len != slen" is true, "varlen = 1"
3780 //in line 3503 will be reached ,so that "if (IS_NULL(var_anode)) return ONIGERR_MEMORY"
3781 //in line 3510 will be executed, so the null pointer has been checked before
3782 //deferenced in line 3584.
3783 if (items[i].byte_len != slen && IS_NOT_NULL(var_anode)) {
3784 Node *rem;
3785 UChar *q = p + items[i].byte_len;
3786
3787 if (q < end) {
3788 r = expand_case_fold_make_rem_string(&rem, q, end, reg);
3789 if (r != 0) {
3790 onig_node_free(an);
3791 goto mem_err2;
3792 }
3793
3794 xnode = onig_node_list_add(NULL_NODE, snode);
3795 if (IS_NULL(xnode)) {
3796 onig_node_free(an);
3797 onig_node_free(rem);
3798 goto mem_err2;
3799 }
3800 if (IS_NULL(onig_node_list_add(xnode, rem))) {
3801 onig_node_free(an);
3802 onig_node_free(xnode);
3803 onig_node_free(rem);
3804 goto mem_err;
3805 }
3806
3807 NODE_CAR(an) = xnode;
3808 }
3809 else {
3810 NODE_CAR(an) = snode;
3811 }
3812
3813 NODE_CDR(var_anode) = an;
3814 var_anode = an;
3815 }
3816 else {
3817 NODE_CAR(an) = snode;
3818 NODE_CDR(anode) = an;
3819 anode = an;
3820 }
3821 }
3822
3823 return varlen;
3824
3825 mem_err2:
3826 onig_node_free(snode);
3827
3828 mem_err:
3829 onig_node_free(*rnode);
3830
3831 return ONIGERR_MEMORY;
3832 }
3833
3834 static int
3835 is_good_case_fold_items_for_search(OnigEncoding enc, int slen,
3836 int n, OnigCaseFoldCodeItem items[])
3837 {
3838 int i, len;
3839 UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3840
3841 for (i = 0; i < n; i++) {
3842 OnigCaseFoldCodeItem* item = items + i;
3843
3844 if (item->code_len != 1) return 0;
3845 if (item->byte_len != slen) return 0;
3846 len = ONIGENC_CODE_TO_MBC(enc, item->code[0], buf);
3847 if (len != slen) return 0;
3848 }
3849
3850 return 1;
3851 }
3852
3853 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
3854
3855 static int
3856 expand_case_fold_string(Node* node, regex_t* reg, int state)
3857 {
3858 int r, n, len, alt_num;
3859 int fold_len;
3860 int prev_is_ambig, prev_is_good, is_good, is_in_look_behind;
3861 UChar *start, *end, *p;
3862 UChar* foldp;
3863 Node *top_root, *root, *snode, *prev_node;
3864 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3865 UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3866 StrNode* sn;
3867
3868 if (NODE_STRING_IS_AMBIG(node)) return 0;
3869
3870 sn = STR_(node);
3871
3872 start = sn->s;
3873 end = sn->end;
3874 if (start >= end) return 0;
3875
3876 is_in_look_behind = (state & IN_LOOK_BEHIND) != 0;
3877
3878 r = 0;
3879 top_root = root = prev_node = snode = NULL_NODE;
3880 alt_num = 1;
3881 p = start;
3882 while (p < end) {
3883 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
3884 p, end, items);
3885 if (n < 0) {
3886 r = n;
3887 goto err;
3888 }
3889
3890 len = enclen(reg->enc, p);
3891 is_good = is_good_case_fold_items_for_search(reg->enc, len, n, items);
3892
3893 if (is_in_look_behind ||
3894 (IS_NOT_NULL(snode) ||
3895 (is_good
3896 /* expand single char case: ex. /(?i:a)/ */
3897 && !(p == start && p + len >= end)))) {
3898 if (IS_NULL(snode)) {
3899 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3900 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3901 if (IS_NULL(root)) {
3902 onig_node_free(prev_node);
3903 goto mem_err;
3904 }
3905 }
3906
3907 prev_node = snode = onig_node_new_str(NULL, NULL);
3908 if (IS_NULL(snode)) goto mem_err;
3909 if (IS_NOT_NULL(root)) {
3910 if (IS_NULL(onig_node_list_add(root, snode))) {
3911 onig_node_free(snode);
3912 goto mem_err;
3913 }
3914 }
3915
3916 prev_is_ambig = -1; /* -1: new */
3917 prev_is_good = 0; /* escape compiler warning */
3918 }
3919 else {
3920 prev_is_ambig = NODE_STRING_IS_AMBIG(snode);
3921 prev_is_good = NODE_STRING_IS_GOOD_AMBIG(snode);
3922 }
3923
3924 if (n != 0) {
3925 foldp = p;
3926 fold_len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag,
3927 &foldp, end, buf);
3928 foldp = buf;
3929 }
3930 else {
3931 foldp = p; fold_len = len;
3932 }
3933
3934 if ((prev_is_ambig == 0 && n != 0) ||
3935 (prev_is_ambig > 0 && (n == 0 || prev_is_good != is_good))) {
3936 if (IS_NULL(root) /* && IS_NOT_NULL(prev_node) */) {
3937 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3938 if (IS_NULL(root)) {
3939 onig_node_free(prev_node);
3940 goto mem_err;
3941 }
3942 }
3943
3944 prev_node = snode = onig_node_new_str(foldp, foldp + fold_len);
3945 if (IS_NULL(snode)) goto mem_err;
3946 if (IS_NULL(onig_node_list_add(root, snode))) {
3947 onig_node_free(snode);
3948 goto mem_err;
3949 }
3950 }
3951 else {
3952 r = onig_node_str_cat(snode, foldp, foldp + fold_len);
3953 if (r != 0) goto err;
3954 }
3955
3956 if (n != 0) NODE_STRING_SET_AMBIG(snode);
3957 if (is_good != 0) NODE_STRING_SET_GOOD_AMBIG(snode);
3958 }
3959 else {
3960 alt_num *= (n + 1);
3961 if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
3962
3963 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3964 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3965 if (IS_NULL(root)) {
3966 onig_node_free(prev_node);
3967 goto mem_err;
3968 }
3969 }
3970
3971 r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
3972 if (r < 0) goto mem_err;
3973 if (r == 1) {
3974 if (IS_NULL(root)) {
3975 top_root = prev_node;
3976 }
3977 else {
3978 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3979 onig_node_free(prev_node);
3980 goto mem_err;
3981 }
3982 }
3983
3984 root = NODE_CAR(prev_node);
3985 }
3986 else { /* r == 0 */
3987 if (IS_NOT_NULL(root)) {
3988 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3989 onig_node_free(prev_node);
3990 goto mem_err;
3991 }
3992 }
3993 }
3994
3995 snode = NULL_NODE;
3996 }
3997
3998 p += len;
3999 }
4000
4001 if (p < end) {
4002 Node *srem;
4003
4004 r = expand_case_fold_make_rem_string(&srem, p, end, reg);
4005 if (r != 0) goto mem_err;
4006
4007 if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
4008 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
4009 if (IS_NULL(root)) {
4010 onig_node_free(srem);
4011 onig_node_free(prev_node);
4012 goto mem_err;
4013 }
4014 }
4015
4016 if (IS_NULL(root)) {
4017 prev_node = srem;
4018 }
4019 else {
4020 if (IS_NULL(onig_node_list_add(root, srem))) {
4021 onig_node_free(srem);
4022 goto mem_err;
4023 }
4024 }
4025 }
4026
4027 /* ending */
4028 top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
4029 swap_node(node, top_root);
4030 onig_node_free(top_root);
4031 return 0;
4032
4033 mem_err:
4034 r = ONIGERR_MEMORY;
4035
4036 err:
4037 onig_node_free(top_root);
4038 return r;
4039 }
4040
4041 #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
4042 static enum BodyEmptyType
4043 quantifiers_memory_node_info(Node* node)
4044 {
4045 int r = BODY_IS_EMPTY_POSSIBILITY;
4046
4047 switch (NODE_TYPE(node)) {
4048 case NODE_LIST:
4049 case NODE_ALT:
4050 {
4051 int v;
4052 do {
4053 v = quantifiers_memory_node_info(NODE_CAR(node));
4054 if (v > r) r = v;
4055 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4056 }
4057 break;
4058
4059 #ifdef USE_CALL
4060 case NODE_CALL:
4061 if (NODE_IS_RECURSION(node)) {
4062 return BODY_IS_EMPTY_POSSIBILITY_REC; /* tiny version */
4063 }
4064 else
4065 r = quantifiers_memory_node_info(NODE_BODY(node));
4066 break;
4067 #endif
4068
4069 case NODE_QUANT:
4070 {
4071 QuantNode* qn = QUANT_(node);
4072 if (qn->upper != 0) {
4073 r = quantifiers_memory_node_info(NODE_BODY(node));
4074 }
4075 }
4076 break;
4077
4078 case NODE_BAG:
4079 {
4080 BagNode* en = BAG_(node);
4081 switch (en->type) {
4082 case BAG_MEMORY:
4083 if (NODE_IS_RECURSION(node)) {
4084 return BODY_IS_EMPTY_POSSIBILITY_REC;
4085 }
4086 return BODY_IS_EMPTY_POSSIBILITY_MEM;
4087 break;
4088
4089 case BAG_OPTION:
4090 case BAG_STOP_BACKTRACK:
4091 r = quantifiers_memory_node_info(NODE_BODY(node));
4092 break;
4093 case BAG_IF_ELSE:
4094 {
4095 int v;
4096 r = quantifiers_memory_node_info(NODE_BODY(node));
4097 if (IS_NOT_NULL(en->te.Then)) {
4098 v = quantifiers_memory_node_info(en->te.Then);
4099 if (v > r) r = v;
4100 }
4101 if (IS_NOT_NULL(en->te.Else)) {
4102 v = quantifiers_memory_node_info(en->te.Else);
4103 if (v > r) r = v;
4104 }
4105 }
4106 break;
4107 }
4108 }
4109 break;
4110
4111 case NODE_BACKREF:
4112 case NODE_STRING:
4113 case NODE_CTYPE:
4114 case NODE_CCLASS:
4115 case NODE_ANCHOR:
4116 case NODE_GIMMICK:
4117 default:
4118 break;
4119 }
4120
4121 return r;
4122 }
4123 #endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */
4124
4125
4126 #ifdef USE_CALL
4127
4128 #ifdef __GNUC__
4129 __inline
4130 #endif
4131 static int
4132 setup_call_node_call(CallNode* cn, ScanEnv* env, int state)
4133 {
4134 MemEnv* mem_env = SCANENV_MEMENV(env);
4135
4136 if (cn->by_number != 0) {
4137 int gnum = cn->group_num;
4138
4139 if (env->num_named > 0 &&
4140 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
4141 ! ONIG_IS_OPTION_ON(env->options, ONIG_OPTION_CAPTURE_GROUP)) {
4142 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
4143 }
4144
4145 if (gnum > env->num_mem) {
4146 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_GROUP_REFERENCE,
4147 cn->name, cn->name_end);
4148 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
4149 }
4150
4151 set_call_attr:
4152 NODE_CALL_BODY(cn) = mem_env[cn->group_num].node;
4153 if (IS_NULL(NODE_CALL_BODY(cn))) {
4154 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
4155 cn->name, cn->name_end);
4156 return ONIGERR_UNDEFINED_NAME_REFERENCE;
4157 }
4158 }
4159 else {
4160 int *refs;
4161
4162 int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end, &refs);
4163 if (n <= 0) {
4164 onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,
4165 cn->name, cn->name_end);
4166 return ONIGERR_UNDEFINED_NAME_REFERENCE;
4167 }
4168 else if (n > 1) {
4169 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL,
4170 cn->name, cn->name_end);
4171 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
4172 }
4173 else {
4174 cn->group_num = refs[0];
4175 goto set_call_attr;
4176 }
4177 }
4178
4179 return 0;
4180 }
4181
4182 static void
4183 setup_call2_call(Node* node)
4184 {
4185 switch (NODE_TYPE(node)) {
4186 case NODE_LIST:
4187 case NODE_ALT:
4188 do {
4189 setup_call2_call(NODE_CAR(node));
4190 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4191 break;
4192
4193 case NODE_QUANT:
4194 setup_call2_call(NODE_BODY(node));
4195 break;
4196
4197 case NODE_ANCHOR:
4198 if (ANCHOR_HAS_BODY(ANCHOR_(node)))
4199 setup_call2_call(NODE_BODY(node));
4200 break;
4201
4202 case NODE_BAG:
4203 {
4204 BagNode* en = BAG_(node);
4205
4206 if (en->type == BAG_MEMORY) {
4207 if (! NODE_IS_MARK1(node)) {
4208 NODE_STATUS_ADD(node, MARK1);
4209 setup_call2_call(NODE_BODY(node));
4210 NODE_STATUS_REMOVE(node, MARK1);
4211 }
4212 }
4213 else if (en->type == BAG_IF_ELSE) {
4214 setup_call2_call(NODE_BODY(node));
4215 if (IS_NOT_NULL(en->te.Then))
4216 setup_call2_call(en->te.Then);
4217 if (IS_NOT_NULL(en->te.Else))
4218 setup_call2_call(en->te.Else);
4219 }
4220 else {
4221 setup_call2_call(NODE_BODY(node));
4222 }
4223 }
4224 break;
4225
4226 case NODE_CALL:
4227 if (! NODE_IS_MARK1(node)) {
4228 NODE_STATUS_ADD(node, MARK1);
4229 {
4230 CallNode* cn = CALL_(node);
4231 Node* called = NODE_CALL_BODY(cn);
4232
4233 cn->entry_count++;
4234
4235 NODE_STATUS_ADD(called, CALLED);
4236 BAG_(called)->m.entry_count++;
4237 setup_call2_call(called);
4238 }
4239 NODE_STATUS_REMOVE(node, MARK1);
4240 }
4241 break;
4242
4243 default:
4244 break;
4245 }
4246 }
4247
4248 static int
4249 setup_call(Node* node, ScanEnv* env, int state)
4250 {
4251 int r;
4252
4253 switch (NODE_TYPE(node)) {
4254 case NODE_LIST:
4255 case NODE_ALT:
4256 do {
4257 r = setup_call(NODE_CAR(node), env, state);
4258 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4259 break;
4260
4261 case NODE_QUANT:
4262 if (QUANT_(node)->upper == 0)
4263 state |= IN_ZERO_REPEAT;
4264
4265 r = setup_call(NODE_BODY(node), env, state);
4266 break;
4267
4268 case NODE_ANCHOR:
4269 if (ANCHOR_HAS_BODY(ANCHOR_(node)))
4270 r = setup_call(NODE_BODY(node), env, state);
4271 else
4272 r = 0;
4273 break;
4274
4275 case NODE_BAG:
4276 {
4277 BagNode* en = BAG_(node);
4278
4279 if (en->type == BAG_MEMORY) {
4280 if ((state & IN_ZERO_REPEAT) != 0) {
4281 NODE_STATUS_ADD(node, IN_ZERO_REPEAT);
4282 BAG_(node)->m.entry_count--;
4283 }
4284 r = setup_call(NODE_BODY(node), env, state);
4285 }
4286 else if (en->type == BAG_IF_ELSE) {
4287 r = setup_call(NODE_BODY(node), env, state);
4288 if (r != 0) return r;
4289 if (IS_NOT_NULL(en->te.Then)) {
4290 r = setup_call(en->te.Then, env, state);
4291 if (r != 0) return r;
4292 }
4293 if (IS_NOT_NULL(en->te.Else))
4294 r = setup_call(en->te.Else, env, state);
4295 }
4296 else
4297 r = setup_call(NODE_BODY(node), env, state);
4298 }
4299 break;
4300
4301 case NODE_CALL:
4302 if ((state & IN_ZERO_REPEAT) != 0) {
4303 NODE_STATUS_ADD(node, IN_ZERO_REPEAT);
4304 CALL_(node)->entry_count--;
4305 }
4306
4307 r = setup_call_node_call(CALL_(node), env, state);
4308 break;
4309
4310 default:
4311 r = 0;
4312 break;
4313 }
4314
4315 return r;
4316 }
4317
4318 static int
4319 setup_call2(Node* node)
4320 {
4321 int r = 0;
4322
4323 switch (NODE_TYPE(node)) {
4324 case NODE_LIST:
4325 case NODE_ALT:
4326 do {
4327 r = setup_call2(NODE_CAR(node));
4328 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4329 break;
4330
4331 case NODE_QUANT:
4332 if (QUANT_(node)->upper != 0)
4333 r = setup_call2(NODE_BODY(node));
4334 break;
4335
4336 case NODE_ANCHOR:
4337 if (ANCHOR_HAS_BODY(ANCHOR_(node)))
4338 r = setup_call2(NODE_BODY(node));
4339 break;
4340
4341 case NODE_BAG:
4342 if (! NODE_IS_IN_ZERO_REPEAT(node))
4343 r = setup_call2(NODE_BODY(node));
4344
4345 {
4346 BagNode* en = BAG_(node);
4347
4348 if (r != 0) return r;
4349 if (en->type == BAG_IF_ELSE) {
4350 if (IS_NOT_NULL(en->te.Then)) {
4351 r = setup_call2(en->te.Then);
4352 if (r != 0) return r;
4353 }
4354 if (IS_NOT_NULL(en->te.Else))
4355 r = setup_call2(en->te.Else);
4356 }
4357 }
4358 break;
4359
4360 case NODE_CALL:
4361 if (! NODE_IS_IN_ZERO_REPEAT(node)) {
4362 setup_call2_call(node);
4363 }
4364 break;
4365
4366 default:
4367 break;
4368 }
4369
4370 return r;
4371 }
4372
4373
4374 static void
4375 setup_called_state_call(Node* node, int state)
4376 {
4377 switch (NODE_TYPE(node)) {
4378 case NODE_ALT:
4379 state |= IN_ALT;
4380 /* fall */
4381 case NODE_LIST:
4382 do {
4383 setup_called_state_call(NODE_CAR(node), state);
4384 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4385 break;
4386
4387 case NODE_QUANT:
4388 {
4389 QuantNode* qn = QUANT_(node);
4390
4391 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
4392 state |= IN_REAL_REPEAT;
4393 if (qn->lower != qn->upper)
4394 state |= IN_VAR_REPEAT;
4395
4396 setup_called_state_call(NODE_QUANT_BODY(qn), state);
4397 }
4398 break;
4399
4400 case NODE_ANCHOR:
4401 {
4402 AnchorNode* an = ANCHOR_(node);
4403
4404 switch (an->type) {
4405 case ANCR_PREC_READ_NOT:
4406 case ANCR_LOOK_BEHIND_NOT:
4407 state |= IN_NOT;
4408 /* fall */
4409 case ANCR_PREC_READ:
4410 case ANCR_LOOK_BEHIND:
4411 setup_called_state_call(NODE_ANCHOR_BODY(an), state);
4412 break;
4413 default:
4414 break;
4415 }
4416 }
4417 break;
4418
4419 case NODE_BAG:
4420 {
4421 BagNode* en = BAG_(node);
4422
4423 if (en->type == BAG_MEMORY) {
4424 if (NODE_IS_MARK1(node)) {
4425 if ((~en->m.called_state & state) != 0) {
4426 en->m.called_state |= state;
4427 setup_called_state_call(NODE_BODY(node), state);
4428 }
4429 }
4430 else {
4431 NODE_STATUS_ADD(node, MARK1);
4432 en->m.called_state |= state;
4433 setup_called_state_call(NODE_BODY(node), state);
4434 NODE_STATUS_REMOVE(node, MARK1);
4435 }
4436 }
4437 else if (en->type == BAG_IF_ELSE) {
4438 if (IS_NOT_NULL(en->te.Then)) {
4439 setup_called_state_call(en->te.Then, state);
4440 }
4441 if (IS_NOT_NULL(en->te.Else))
4442 setup_called_state_call(en->te.Else, state);
4443 }
4444 else {
4445 setup_called_state_call(NODE_BODY(node), state);
4446 }
4447 }
4448 break;
4449
4450 case NODE_CALL:
4451 setup_called_state_call(NODE_BODY(node), state);
4452 break;
4453
4454 default:
4455 break;
4456 }
4457 }
4458
4459 static void
4460 setup_called_state(Node* node, int state)
4461 {
4462 switch (NODE_TYPE(node)) {
4463 case NODE_ALT:
4464 state |= IN_ALT;
4465 /* fall */
4466 case NODE_LIST:
4467 do {
4468 setup_called_state(NODE_CAR(node), state);
4469 } while (IS_NOT_NULL(node = NODE_CDR(node)));
4470 break;
4471
4472 #ifdef USE_CALL
4473 case NODE_CALL:
4474 setup_called_state_call(node, state);
4475 break;
4476 #endif
4477
4478 case NODE_BAG:
4479 {
4480 BagNode* en = BAG_(node);
4481
4482 switch (en->type) {
4483 case BAG_MEMORY:
4484 if (en->m.entry_count > 1)
4485 state |= IN_MULTI_ENTRY;
4486
4487 en->m.called_state |= state;
4488 /* fall */
4489 case BAG_OPTION:
4490 case BAG_STOP_BACKTRACK:
4491 setup_called_state(NODE_BODY(node), state);
4492 break;
4493 case BAG_IF_ELSE:
4494 setup_called_state(NODE_BODY(node), state);
4495 if (IS_NOT_NULL(en->te.Then))
4496 setup_called_state(en->te.Then, state);
4497 if (IS_NOT_NULL(en->te.Else))
4498 setup_called_state(en->te.Else, state);
4499 break;
4500 }
4501 }
4502 break;
4503
4504 case NODE_QUANT:
4505 {
4506 QuantNode* qn = QUANT_(node);
4507
4508 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
4509 state |= IN_REAL_REPEAT;
4510 if (qn->lower != qn->upper)
4511 state |= IN_VAR_REPEAT;
4512
4513 setup_called_state(NODE_QUANT_BODY(qn), state);
4514 }
4515 break;
4516
4517 case NODE_ANCHOR:
4518 {
4519 AnchorNode* an = ANCHOR_(node);
4520
4521 switch (an->type) {
4522 case ANCR_PREC_READ_NOT:
4523 case ANCR_LOOK_BEHIND_NOT:
4524 state |= IN_NOT;
4525 /* fall */
4526 case ANCR_PREC_READ:
4527 case ANCR_LOOK_BEHIND:
4528 setup_called_state(NODE_ANCHOR_BODY(an), state);
4529 break;
4530 default:
4531 break;
4532 }
4533 }
4534 break;
4535
4536 case NODE_BACKREF:
4537 case NODE_STRING:
4538 case NODE_CTYPE:
4539 case NODE_CCLASS:
4540 case NODE_GIMMICK:
4541 default:
4542 break;
4543 }
4544 }
4545
4546 #endif /* USE_CALL */
4547
4548
4549 static int setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env);
4550
4551 #ifdef __GNUC__
4552 __inline
4553 #endif
4554 static int
4555 setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env)
4556 {
4557 /* allowed node types in look-behind */
4558 #define ALLOWED_TYPE_IN_LB \
4559 ( NODE_BIT_LIST | NODE_BIT_ALT | NODE_BIT_STRING | NODE_BIT_CCLASS \
4560 | NODE_BIT_CTYPE | NODE_BIT_ANCHOR | NODE_BIT_BAG | NODE_BIT_QUANT \
4561 | NODE_BIT_CALL | NODE_BIT_GIMMICK)
4562
4563 #define ALLOWED_BAG_IN_LB ( 1<<BAG_MEMORY | 1<<BAG_OPTION | 1<<BAG_IF_ELSE )
4564 #define ALLOWED_BAG_IN_LB_NOT ( 1<<BAG_OPTION | 1<<BAG_IF_ELSE )
4565
4566 #define ALLOWED_ANCHOR_IN_LB \
4567 ( ANCR_LOOK_BEHIND | ANCR_BEGIN_LINE | ANCR_END_LINE | ANCR_BEGIN_BUF \
4568 | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY | ANCR_NO_WORD_BOUNDARY \
4569 | ANCR_WORD_BEGIN | ANCR_WORD_END \
4570 | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )
4571
4572 #define ALLOWED_ANCHOR_IN_LB_NOT \
4573 ( ANCR_LOOK_BEHIND | ANCR_LOOK_BEHIND_NOT | ANCR_BEGIN_LINE \
4574 | ANCR_END_LINE | ANCR_BEGIN_BUF | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY \
4575 | ANCR_NO_WORD_BOUNDARY | ANCR_WORD_BEGIN | ANCR_WORD_END \
4576 | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY )
4577
4578 int r;
4579 AnchorNode* an = ANCHOR_(node);
4580
4581 switch (an->type) {
4582 case ANCR_PREC_READ:
4583 r = setup_tree(NODE_ANCHOR_BODY(an), reg, state, env);
4584 break;
4585 case ANCR_PREC_READ_NOT:
4586 r = setup_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env);
4587 break;
4588
4589 case ANCR_LOOK_BEHIND:
4590 {
4591 r = check_type_tree(NODE_ANCHOR_BODY(an), ALLOWED_TYPE_IN_LB,
4592 ALLOWED_BAG_IN_LB, ALLOWED_ANCHOR_IN_LB);
4593 if (r < 0) return r;
4594 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4595 r = setup_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_LOOK_BEHIND), env);
4596 if (r != 0) return r;
4597 r = setup_look_behind(node, reg, env);
4598 }
4599 break;
4600
4601 case ANCR_LOOK_BEHIND_NOT:
4602 {
4603 r = check_type_tree(NODE_ANCHOR_BODY(an), ALLOWED_TYPE_IN_LB,
4604 ALLOWED_BAG_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
4605 if (r < 0) return r;
4606 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4607 r = setup_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_NOT|IN_LOOK_BEHIND),
4608 env);
4609 if (r != 0) return r;
4610 r = setup_look_behind(node, reg, env);
4611 }
4612 break;
4613
4614 default:
4615 r = 0;
4616 break;
4617 }
4618
4619 return r;
4620 }
4621
4622 #ifdef __GNUC__
4623 __inline
4624 #endif
4625 static int
4626 setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env)
4627 {
4628 int r;
4629 OnigLen d;
4630 QuantNode* qn = QUANT_(node);
4631 Node* body = NODE_BODY(node);
4632
4633 if ((state & IN_REAL_REPEAT) != 0) {
4634 NODE_STATUS_ADD(node, IN_REAL_REPEAT);
4635 }
4636 if ((state & IN_MULTI_ENTRY) != 0) {
4637 NODE_STATUS_ADD(node, IN_MULTI_ENTRY);
4638 }
4639
4640 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) {
4641 d = tree_min_len(body, env);
4642 if (d == 0) {
4643 #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
4644 qn->emptiness = quantifiers_memory_node_info(body);
4645 if (qn->emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) {
4646 if (NODE_TYPE(body) == NODE_BAG &&
4647 BAG_(body)->type == BAG_MEMORY) {
4648 MEM_STATUS_ON(env->bt_mem_end, BAG_(body)->m.regnum);
4649 }
4650 }
4651 #else
4652 qn->emptiness = BODY_IS_EMPTY_POSSIBILITY;
4653 #endif
4654 }
4655 }
4656
4657 if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)
4658 state |= IN_REAL_REPEAT;
4659 if (qn->lower != qn->upper)
4660 state |= IN_VAR_REPEAT;
4661
4662 r = setup_tree(body, reg, state, env);
4663 if (r != 0) return r;
4664
4665 /* expand string */
4666 #define EXPAND_STRING_MAX_LENGTH 100
4667 if (NODE_TYPE(body) == NODE_STRING) {
4668 if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper &&
4669 qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
4670 int len = NODE_STRING_LEN(body);
4671 StrNode* sn = STR_(body);
4672
4673 if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
4674 int i, n = qn->lower;
4675 onig_node_conv_to_str_node(node, STR_(body)->flag);
4676 for (i = 0; i < n; i++) {
4677 r = onig_node_str_cat(node, sn->s, sn->end);
4678 if (r != 0) return r;
4679 }
4680 onig_node_free(body);
4681 return r;
4682 }
4683 }
4684 }
4685
4686 if (qn->greedy && (qn->emptiness == BODY_IS_NOT_EMPTY)) {
4687 if (NODE_TYPE(body) == NODE_QUANT) {
4688 QuantNode* tqn = QUANT_(body);
4689 if (IS_NOT_NULL(tqn->head_exact)) {
4690 qn->head_exact = tqn->head_exact;
4691 tqn->head_exact = NULL;
4692 }
4693 }
4694 else {
4695 qn->head_exact = get_head_value_node(NODE_BODY(node), 1, reg);
4696 }
4697 }
4698
4699 return r;
4700 }
4701
4702 /* setup_tree does the following work.
4703 1. check empty loop. (set qn->emptiness)
4704 2. expand ignore-case in char class.
4705 3. set memory status bit flags. (reg->mem_stats)
4706 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
4707 5. find invalid patterns in look-behind.
4708 6. expand repeated string.
4709 */
4710 static int
4711 setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
4712 {
4713 int r = 0;
4714
4715 switch (NODE_TYPE(node)) {
4716 case NODE_LIST:
4717 {
4718 Node* prev = NULL_NODE;
4719 do {
4720 r = setup_tree(NODE_CAR(node), reg, state, env);
4721 if (IS_NOT_NULL(prev) && r == 0) {
4722 r = next_setup(prev, NODE_CAR(node), reg);
4723 }
4724 prev = NODE_CAR(node);
4725 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4726 }
4727 break;
4728
4729 case NODE_ALT:
4730 do {
4731 r = setup_tree(NODE_CAR(node), reg, (state | IN_ALT), env);
4732 } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));
4733 break;
4734
4735 case NODE_STRING:
4736 if (IS_IGNORECASE(reg->options) && !NODE_STRING_IS_RAW(node)) {
4737 r = expand_case_fold_string(node, reg, state);
4738 }
4739 break;
4740
4741 case NODE_BACKREF:
4742 {
4743 int i;
4744 int* p;
4745 BackRefNode* br = BACKREF_(node);
4746 p = BACKREFS_P(br);
4747 for (i = 0; i < br->back_num; i++) {
4748 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
4749 MEM_STATUS_ON(env->backrefed_mem, p[i]);
4750 MEM_STATUS_ON(env->bt_mem_start, p[i]);
4751 #ifdef USE_BACKREF_WITH_LEVEL
4752 if (NODE_IS_NEST_LEVEL(node)) {
4753 MEM_STATUS_ON(env->bt_mem_end, p[i]);
4754 }
4755 #endif
4756 }
4757 }
4758 break;
4759
4760 case NODE_BAG:
4761 {
4762 BagNode* en = BAG_(node);
4763
4764 switch (en->type) {
4765 case BAG_OPTION:
4766 {
4767 OnigOptionType options = reg->options;
4768 reg->options = BAG_(node)->o.options;
4769 r = setup_tree(NODE_BODY(node), reg, state, env);
4770 reg->options = options;
4771 }
4772 break;
4773
4774 case BAG_MEMORY:
4775 #ifdef USE_CALL
4776 state |= en->m.called_state;
4777 #endif
4778
4779 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_MULTI_ENTRY)) != 0
4780 || NODE_IS_RECURSION(node)) {
4781 MEM_STATUS_ON(env->bt_mem_start, en->m.regnum);
4782 }
4783 r = setup_tree(NODE_BODY(node), reg, state, env);
4784 break;
4785
4786 case BAG_STOP_BACKTRACK:
4787 {
4788 Node* target = NODE_BODY(node);
4789 r = setup_tree(target, reg, state, env);
4790 if (NODE_TYPE(target) == NODE_QUANT) {
4791 QuantNode* tqn = QUANT_(target);
4792 if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 &&
4793 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
4794 if (is_strict_real_node(NODE_BODY(target)))
4795 NODE_STATUS_ADD(node, STRICT_REAL_REPEAT);
4796 }
4797 }
4798 }
4799 break;
4800
4801 case BAG_IF_ELSE:
4802 r = setup_tree(NODE_BODY(node), reg, (state | IN_ALT), env);
4803 if (r != 0) return r;
4804 if (IS_NOT_NULL(en->te.Then)) {
4805 r = setup_tree(en->te.Then, reg, (state | IN_ALT), env);
4806 if (r != 0) return r;
4807 }
4808 if (IS_NOT_NULL(en->te.Else))
4809 r = setup_tree(en->te.Else, reg, (state | IN_ALT), env);
4810 break;
4811 }
4812 }
4813 break;
4814
4815 case NODE_QUANT:
4816 r = setup_quant(node, reg, state, env);
4817 break;
4818
4819 case NODE_ANCHOR:
4820 r = setup_anchor(node, reg, state, env);
4821 break;
4822
4823 #ifdef USE_CALL
4824 case NODE_CALL:
4825 #endif
4826 case NODE_CTYPE:
4827 case NODE_CCLASS:
4828 case NODE_GIMMICK:
4829 default:
4830 break;
4831 }
4832
4833 return r;
4834 }
4835
4836 static int
4837 set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand,
4838 UChar* s, UChar* end,
4839 UChar skip[], int* roffset)
4840 {
4841 int i, j, k, len, offset;
4842 int n, clen;
4843 UChar* p;
4844 OnigEncoding enc;
4845 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4846 UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
4847
4848 enc = reg->enc;
4849 offset = ENC_GET_SKIP_OFFSET(enc);
4850 if (offset == ENC_SKIP_OFFSET_1_OR_0) {
4851 UChar* p = s;
4852 while (1) {
4853 len = enclen(enc, p);
4854 if (p + len >= end) {
4855 if (len == 1) offset = 1;
4856 else offset = 0;
4857 break;
4858 }
4859 p += len;
4860 }
4861 }
4862
4863 len = (int )(end - s);
4864 if (len + offset >= 255)
4865 return ONIGERR_PARSER_BUG;
4866
4867 *roffset = offset;
4868
4869 for (i = 0; i < CHAR_MAP_SIZE; i++) {
4870 skip[i] = (UChar )(len + offset);
4871 }
4872
4873 for (p = s; p < end; ) {
4874 int z;
4875
4876 clen = enclen(enc, p);
4877 if (p + clen > end) clen = (int )(end - p);
4878
4879 len = (int )(end - p);
4880 for (j = 0; j < clen; j++) {
4881 z = len - j + (offset - 1);
4882 if (z <= 0) break;
4883 skip[p[j]] = z;
4884 }
4885
4886 if (case_expand != 0) {
4887 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
4888 p, end, items);
4889 for (k = 0; k < n; k++) {
4890 ONIGENC_CODE_TO_MBC(enc, items[k].code[0], buf);
4891 for (j = 0; j < clen; j++) {
4892 z = len - j + (offset - 1);
4893 if (z <= 0) break;
4894 if (skip[buf[j]] > z)
4895 skip[buf[j]] = z;
4896 }
4897 }
4898 }
4899
4900 p += clen;
4901 }
4902
4903 return 0;
4904 }
4905
4906
4907 #define OPT_EXACT_MAXLEN 24
4908
4909 #if OPT_EXACT_MAXLEN >= 255
4910 #error Too big OPT_EXACT_MAXLEN
4911 #endif
4912
4913 typedef struct {
4914 OnigLen min; /* min byte length */
4915 OnigLen max; /* max byte length */
4916 } MinMax;
4917
4918 typedef struct {
4919 MinMax mmd;
4920 OnigEncoding enc;
4921 OnigOptionType options;
4922 OnigCaseFoldType case_fold_flag;
4923 ScanEnv* scan_env;
4924 } OptEnv;
4925
4926 typedef struct {
4927 int left;
4928 int right;
4929 } OptAnc;
4930
4931 typedef struct {
4932 MinMax mmd; /* position */
4933 OptAnc anc;
4934 int reach_end;
4935 int case_fold;
4936 int good_case_fold;
4937 int len;
4938 UChar s[OPT_EXACT_MAXLEN];
4939 } OptStr;
4940
4941 typedef struct {
4942 MinMax mmd; /* position */
4943 OptAnc anc;
4944 int value; /* weighted value */
4945 UChar map[CHAR_MAP_SIZE];
4946 } OptMap;
4947
4948 typedef struct {
4949 MinMax len;
4950 OptAnc anc;
4951 OptStr sb; /* boundary */
4952 OptStr sm; /* middle */
4953 OptStr spr; /* prec read (?=...) */
4954 OptMap map; /* boundary */
4955 } OptNode;
4956
4957
4958 static int
4959 map_position_value(OnigEncoding enc, int i)
4960 {
4961 static const short int Vals[] = {
4962 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
4963 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4964 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
4965 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
4966 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4967 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
4968 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4969 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
4970 };
4971
4972 if (i < (int )(sizeof(Vals)/sizeof(Vals[0]))) {
4973 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
4974 return 20;
4975 else
4976 return (int )Vals[i];
4977 }
4978 else
4979 return 4; /* Take it easy. */
4980 }
4981
4982 static int
4983 distance_value(MinMax* mm)
4984 {
4985 /* 1000 / (min-max-dist + 1) */
4986 static const short int dist_vals[] = {
4987 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
4988 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
4989 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
4990 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
4991 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
4992 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
4993 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
4994 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
4995 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
4996 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
4997 };
4998
4999 OnigLen d;
5000
5001 if (mm->max == INFINITE_LEN) return 0;
5002
5003 d = mm->max - mm->min;
5004 if (d < (OnigLen )(sizeof(dist_vals)/sizeof(dist_vals[0])))
5005 /* return dist_vals[d] * 16 / (mm->min + 12); */
5006 return (int )dist_vals[d];
5007 else
5008 return 1;
5009 }
5010
5011 static int
5012 comp_distance_value(MinMax* d1, MinMax* d2, int v1, int v2)
5013 {
5014 if (v2 <= 0) return -1;
5015 if (v1 <= 0) return 1;
5016
5017 v1 *= distance_value(d1);
5018 v2 *= distance_value(d2);
5019
5020 if (v2 > v1) return 1;
5021 if (v2 < v1) return -1;
5022
5023 if (d2->min < d1->min) return 1;
5024 if (d2->min > d1->min) return -1;
5025 return 0;
5026 }
5027
5028 static int
5029 is_equal_mml(MinMax* a, MinMax* b)
5030 {
5031 return a->min == b->min && a->max == b->max;
5032 }
5033
5034 static void
5035 set_mml(MinMax* l, OnigLen min, OnigLen max)
5036 {
5037 l->min = min;
5038 l->max = max;
5039 }
5040
5041 static void
5042 clear_mml(MinMax* l)
5043 {
5044 l->min = l->max = 0;
5045 }
5046
5047 static void
5048 copy_mml(MinMax* to, MinMax* from)
5049 {
5050 to->min = from->min;
5051 to->max = from->max;
5052 }
5053
5054 static void
5055 add_mml(MinMax* to, MinMax* from)
5056 {
5057 to->min = distance_add(to->min, from->min);
5058 to->max = distance_add(to->max, from->max);
5059 }
5060
5061 static void
5062 alt_merge_mml(MinMax* to, MinMax* from)
5063 {
5064 if (to->min > from->min) to->min = from->min;
5065 if (to->max < from->max) to->max = from->max;
5066 }
5067
5068 static void
5069 copy_opt_env(OptEnv* to, OptEnv* from)
5070 {
5071 *to = *from;
5072 }
5073
5074 static void
5075 clear_opt_anc_info(OptAnc* a)
5076 {
5077 a->left = 0;
5078 a->right = 0;
5079 }
5080
5081 static void
5082 copy_opt_anc_info(OptAnc* to, OptAnc* from)
5083 {
5084 *to = *from;
5085 }
5086
5087 static void
5088 concat_opt_anc_info(OptAnc* to, OptAnc* left, OptAnc* right,
5089 OnigLen left_len, OnigLen right_len)
5090 {
5091 clear_opt_anc_info(to);
5092
5093 to->left = left->left;
5094 if (left_len == 0) {
5095 to->left |= right->left;
5096 }
5097
5098 to->right = right->right;
5099 if (right_len == 0) {
5100 to->right |= left->right;
5101 }
5102 else {
5103 to->right |= (left->right & ANCR_PREC_READ_NOT);
5104 }
5105 }
5106
5107 static int
5108 is_left(int a)
5109 {
5110 if (a == ANCR_END_BUF || a == ANCR_SEMI_END_BUF ||
5111 a == ANCR_END_LINE || a == ANCR_PREC_READ || a == ANCR_PREC_READ_NOT)
5112 return 0;
5113
5114 return 1;
5115 }
5116
5117 static int
5118 is_set_opt_anc_info(OptAnc* to, int anc)
5119 {
5120 if ((to->left & anc) != 0) return 1;
5121
5122 return ((to->right & anc) != 0 ? 1 : 0);
5123 }
5124
5125 static void
5126 add_opt_anc_info(OptAnc* to, int anc)
5127 {
5128 if (is_left(anc))
5129 to->left |= anc;
5130 else
5131 to->right |= anc;
5132 }
5133
5134 static void
5135 remove_opt_anc_info(OptAnc* to, int anc)
5136 {
5137 if (is_left(anc))
5138 to->left &= ~anc;
5139 else
5140 to->right &= ~anc;
5141 }
5142
5143 static void
5144 alt_merge_opt_anc_info(OptAnc* to, OptAnc* add)
5145 {
5146 to->left &= add->left;
5147 to->right &= add->right;
5148 }
5149
5150 static int
5151 is_full_opt_exact(OptStr* e)
5152 {
5153 return e->len >= OPT_EXACT_MAXLEN;
5154 }
5155
5156 static void
5157 clear_opt_exact(OptStr* e)
5158 {
5159 clear_mml(&e->mmd);
5160 clear_opt_anc_info(&e->anc);
5161 e->reach_end = 0;
5162 e->case_fold = 0;
5163 e->good_case_fold = 0;
5164 e->len = 0;
5165 e->s[0] = '\0';
5166 }
5167
5168 static void
5169 copy_opt_exact(OptStr* to, OptStr* from)
5170 {
5171 *to = *from;
5172 }
5173
5174 static int
5175 concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc)
5176 {
5177 int i, j, len, r;
5178 UChar *p, *end;
5179 OptAnc tanc;
5180
5181 if (add->case_fold != 0) {
5182 if (! to->case_fold) {
5183 if (to->len > 1 || to->len >= add->len) return 0; /* avoid */
5184
5185 to->case_fold = 1;
5186 }
5187 else {
5188 if (to->good_case_fold != 0) {
5189 if (add->good_case_fold == 0) return 0;
5190 }
5191 }
5192 }
5193
5194 r = 0;
5195 p = add->s;
5196 end = p + add->len;
5197 for (i = to->len; p < end; ) {
5198 len = enclen(enc, p);
5199 if (i + len > OPT_EXACT_MAXLEN) {
5200 r = 1; /* 1:full */
5201 break;
5202 }
5203 for (j = 0; j < len && p < end; j++)
5204 to->s[i++] = *p++;
5205 }
5206
5207 to->len = i;
5208 to->reach_end = (p == end ? add->reach_end : 0);
5209
5210 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
5211 if (! to->reach_end) tanc.right = 0;
5212 copy_opt_anc_info(&to->anc, &tanc);
5213
5214 return r;
5215 }
5216
5217 static void
5218 concat_opt_exact_str(OptStr* to, UChar* s, UChar* end, OnigEncoding enc)
5219 {
5220 int i, j, len;
5221 UChar *p;
5222
5223 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
5224 len = enclen(enc, p);
5225 if (i + len > OPT_EXACT_MAXLEN) break;
5226 for (j = 0; j < len && p < end; j++)
5227 to->s[i++] = *p++;
5228 }
5229
5230 to->len = i;
5231
5232 if (p >= end && to->len == (int )(end - s))
5233 to->reach_end = 1;
5234 }
5235
5236 static void
5237 alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env)
5238 {
5239 int i, j, len;
5240
5241 if (add->len == 0 || to->len == 0) {
5242 clear_opt_exact(to);
5243 return ;
5244 }
5245
5246 if (! is_equal_mml(&to->mmd, &add->mmd)) {
5247 clear_opt_exact(to);
5248 return ;
5249 }
5250
5251 for (i = 0; i < to->len && i < add->len; ) {
5252 if (to->s[i] != add->s[i]) break;
5253 len = enclen(env->enc, to->s + i);
5254
5255 for (j = 1; j < len; j++) {
5256 if (to->s[i+j] != add->s[i+j]) break;
5257 }
5258 if (j < len) break;
5259 i += len;
5260 }
5261
5262 if (! add->reach_end || i < add->len || i < to->len) {
5263 to->reach_end = 0;
5264 }
5265 to->len = i;
5266 if (add->case_fold != 0)
5267 to->case_fold = 1;
5268 if (add->good_case_fold == 0)
5269 to->good_case_fold = 0;
5270
5271 alt_merge_opt_anc_info(&to->anc, &add->anc);
5272 if (! to->reach_end) to->anc.right = 0;
5273 }
5274
5275 static void
5276 select_opt_exact(OnigEncoding enc, OptStr* now, OptStr* alt)
5277 {
5278 int vn, va;
5279
5280 vn = now->len;
5281 va = alt->len;
5282
5283 if (va == 0) {
5284 return ;
5285 }
5286 else if (vn == 0) {
5287 copy_opt_exact(now, alt);
5288 return ;
5289 }
5290 else if (vn <= 2 && va <= 2) {
5291 /* ByteValTable[x] is big value --> low price */
5292 va = map_position_value(enc, now->s[0]);
5293 vn = map_position_value(enc, alt->s[0]);
5294
5295 if (now->len > 1) vn += 5;
5296 if (alt->len > 1) va += 5;
5297 }
5298
5299 if (now->case_fold == 0) vn *= 2;
5300 if (alt->case_fold == 0) va *= 2;
5301
5302 if (now->good_case_fold != 0) vn *= 4;
5303 if (alt->good_case_fold != 0) va *= 4;
5304
5305 if (comp_distance_value(&now->mmd, &alt->mmd, vn, va) > 0)
5306 copy_opt_exact(now, alt);
5307 }
5308
5309 static void
5310 clear_opt_map(OptMap* map)
5311 {
5312 static const OptMap clean_info = {
5313 {0, 0}, {0, 0}, 0,
5314 {
5315 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5316 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5317 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5318 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5319 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5320 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5321 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5322 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5323 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5324 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5325 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5326 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5327 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5328 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5329 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
5330 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
5331 }
5332 };
5333
5334 xmemcpy(map, &clean_info, sizeof(OptMap));
5335 }
5336
5337 static void
5338 copy_opt_map(OptMap* to, OptMap* from)
5339 {
5340 xmemcpy(to,from,sizeof(OptMap));
5341 }
5342
5343 static void
5344 add_char_opt_map(OptMap* m, UChar c, OnigEncoding enc)
5345 {
5346 if (m->map[c] == 0) {
5347 m->map[c] = 1;
5348 m->value += map_position_value(enc, c);
5349 }
5350 }
5351
5352 static int
5353 add_char_amb_opt_map(OptMap* map, UChar* p, UChar* end,
5354 OnigEncoding enc, OnigCaseFoldType fold_flag)
5355 {
5356 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
5357 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5358 int i, n;
5359
5360 add_char_opt_map(map, p[0], enc);
5361
5362 fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(fold_flag);
5363 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, fold_flag, p, end, items);
5364 if (n < 0) return n;
5365
5366 for (i = 0; i < n; i++) {
5367 ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
5368 add_char_opt_map(map, buf[0], enc);
5369 }
5370
5371 return 0;
5372 }
5373
5374 static void
5375 select_opt_map(OptMap* now, OptMap* alt)
5376 {
5377 static int z = 1<<15; /* 32768: something big value */
5378
5379 int vn, va;
5380
5381 if (alt->value == 0) return ;
5382 if (now->value == 0) {
5383 copy_opt_map(now, alt);
5384 return ;
5385 }
5386
5387 vn = z / now->value;
5388 va = z / alt->value;
5389 if (comp_distance_value(&now->mmd, &alt->mmd, vn, va) > 0)
5390 copy_opt_map(now, alt);
5391 }
5392
5393 static int
5394 comp_opt_exact_or_map(OptStr* e, OptMap* m)
5395 {
5396 #define COMP_EM_BASE 20
5397 int ae, am;
5398 int case_value;
5399
5400 if (m->value <= 0) return -1;
5401
5402 if (e->case_fold != 0) {
5403 if (e->good_case_fold != 0)
5404 case_value = 2;
5405 else
5406 case_value = 1;
5407 }
5408 else
5409 case_value = 3;
5410
5411 ae = COMP_EM_BASE * e->len * case_value;
5412 am = COMP_EM_BASE * 5 * 2 / m->value;
5413 return comp_distance_value(&e->mmd, &m->mmd, ae, am);
5414 }
5415
5416 static void
5417 alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add)
5418 {
5419 int i, val;
5420
5421 /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
5422 if (to->value == 0) return ;
5423 if (add->value == 0 || to->mmd.max < add->mmd.min) {
5424 clear_opt_map(to);
5425 return ;
5426 }
5427
5428 alt_merge_mml(&to->mmd, &add->mmd);
5429
5430 val = 0;
5431 for (i = 0; i < CHAR_MAP_SIZE; i++) {
5432 if (add->map[i])
5433 to->map[i] = 1;
5434
5435 if (to->map[i])
5436 val += map_position_value(enc, i);
5437 }
5438 to->value = val;
5439
5440 alt_merge_opt_anc_info(&to->anc, &add->anc);
5441 }
5442
5443 static void
5444 set_bound_node_opt_info(OptNode* opt, MinMax* plen)
5445 {
5446 copy_mml(&(opt->sb.mmd), plen);
5447 copy_mml(&(opt->spr.mmd), plen);
5448 copy_mml(&(opt->map.mmd), plen);
5449 }
5450
5451 static void
5452 clear_node_opt_info(OptNode* opt)
5453 {
5454 clear_mml(&opt->len);
5455 clear_opt_anc_info(&opt->anc);
5456 clear_opt_exact(&opt->sb);
5457 clear_opt_exact(&opt->sm);
5458 clear_opt_exact(&opt->spr);
5459 clear_opt_map(&opt->map);
5460 }
5461
5462 static void
5463 copy_node_opt_info(OptNode* to, OptNode* from)
5464 {
5465 xmemcpy(to,from,sizeof(OptNode));
5466 }
5467
5468 static void
5469 concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add)
5470 {
5471 int sb_reach, sm_reach;
5472 OptAnc tanc;
5473
5474 concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
5475 copy_opt_anc_info(&to->anc, &tanc);
5476
5477 if (add->sb.len > 0 && to->len.max == 0) {
5478 concat_opt_anc_info(&tanc, &to->anc, &add->sb.anc, to->len.max, add->len.max);
5479 copy_opt_anc_info(&add->sb.anc, &tanc);
5480 }
5481
5482 if (add->map.value > 0 && to->len.max == 0) {
5483 if (add->map.mmd.max == 0)
5484 add->map.anc.left |= to->anc.left;
5485 }
5486
5487 sb_reach = to->sb.reach_end;
5488 sm_reach = to->sm.reach_end;
5489
5490 if (add->len.max != 0)
5491 to->sb.reach_end = to->sm.reach_end = 0;
5492
5493 if (add->sb.len > 0) {
5494 if (sb_reach) {
5495 concat_opt_exact(&to->sb, &add->sb, enc);
5496 clear_opt_exact(&add->sb);
5497 }
5498 else if (sm_reach) {
5499 concat_opt_exact(&to->sm, &add->sb, enc);
5500 clear_opt_exact(&add->sb);
5501 }
5502 }
5503 select_opt_exact(enc, &to->sm, &add->sb);
5504 select_opt_exact(enc, &to->sm, &add->sm);
5505
5506 if (to->spr.len > 0) {
5507 if (add->len.max > 0) {
5508 if (to->spr.len > (int )add->len.max)
5509 to->spr.len = add->len.max;
5510
5511 if (to->spr.mmd.max == 0)
5512 select_opt_exact(enc, &to->sb, &to->spr);
5513 else
5514 select_opt_exact(enc, &to->sm, &to->spr);
5515 }
5516 }
5517 else if (add->spr.len > 0) {
5518 copy_opt_exact(&to->spr, &add->spr);
5519 }
5520
5521 select_opt_map(&to->map, &add->map);
5522 add_mml(&to->len, &add->len);
5523 }
5524
5525 static void
5526 alt_merge_node_opt_info(OptNode* to, OptNode* add, OptEnv* env)
5527 {
5528 alt_merge_opt_anc_info(&to->anc, &add->anc);
5529 alt_merge_opt_exact(&to->sb, &add->sb, env);
5530 alt_merge_opt_exact(&to->sm, &add->sm, env);
5531 alt_merge_opt_exact(&to->spr, &add->spr, env);
5532 alt_merge_opt_map(env->enc, &to->map, &add->map);
5533
5534 alt_merge_mml(&to->len, &add->len);
5535 }
5536
5537
5538 #define MAX_NODE_OPT_INFO_REF_COUNT 5
5539
5540 static int
5541 optimize_nodes(Node* node, OptNode* opt, OptEnv* env)
5542 {
5543 int i;
5544 int r;
5545 OptNode xo;
5546 OnigEncoding enc;
5547
5548 r = 0;
5549 enc = env->enc;
5550 clear_node_opt_info(opt);
5551 set_bound_node_opt_info(opt, &env->mmd);
5552
5553 switch (NODE_TYPE(node)) {
5554 case NODE_LIST:
5555 {
5556 OptEnv nenv;
5557 Node* nd = node;
5558
5559 copy_opt_env(&nenv, env);
5560 do {
5561 r = optimize_nodes(NODE_CAR(nd), &xo, &nenv);
5562 if (r == 0) {
5563 add_mml(&nenv.mmd, &xo.len);
5564 concat_left_node_opt_info(enc, opt, &xo);
5565 }
5566 } while (r == 0 && IS_NOT_NULL(nd = NODE_CDR(nd)));
5567 }
5568 break;
5569
5570 case NODE_ALT:
5571 {
5572 Node* nd = node;
5573
5574 do {
5575 r = optimize_nodes(NODE_CAR(nd), &xo, env);
5576 if (r == 0) {
5577 if (nd == node) copy_node_opt_info(opt, &xo);
5578 else alt_merge_node_opt_info(opt, &xo, env);
5579 }
5580 } while ((r == 0) && IS_NOT_NULL(nd = NODE_CDR(nd)));
5581 }
5582 break;
5583
5584 case NODE_STRING:
5585 {
5586 StrNode* sn = STR_(node);
5587 int slen = (int )(sn->end - sn->s);
5588 /* int is_raw = NODE_STRING_IS_RAW(node); */
5589
5590 if (! NODE_STRING_IS_AMBIG(node)) {
5591 concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc);
5592 if (slen > 0) {
5593 add_char_opt_map(&opt->map, *(sn->s), enc);
5594 }
5595 set_mml(&opt->len, slen, slen);
5596 }
5597 else {
5598 int max;
5599
5600 if (NODE_STRING_IS_DONT_GET_OPT_INFO(node)) {
5601 int n = onigenc_strlen(enc, sn->s, sn->end);
5602 max = ONIGENC_MBC_MAXLEN_DIST(enc) * n;
5603 }
5604 else {
5605 concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc);
5606 opt->sb.case_fold = 1;
5607 if (NODE_STRING_IS_GOOD_AMBIG(node))
5608 opt->sb.good_case_fold = 1;
5609
5610 if (slen > 0) {
5611 r = add_char_amb_opt_map(&opt->map, sn->s, sn->end,
5612 enc, env->case_fold_flag);
5613 if (r != 0) break;
5614 }
5615
5616 max = slen;
5617 }
5618
5619 set_mml(&opt->len, slen, max);
5620 }
5621 }
5622 break;
5623
5624 case NODE_CCLASS:
5625 {
5626 int z;
5627 CClassNode* cc = CCLASS_(node);
5628
5629 /* no need to check ignore case. (set in setup_tree()) */
5630
5631 if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
5632 OnigLen min = ONIGENC_MBC_MINLEN(enc);
5633 OnigLen max = ONIGENC_MBC_MAXLEN_DIST(enc);
5634
5635 set_mml(&opt->len, min, max);
5636 }
5637 else {
5638 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
5639 z = BITSET_AT(cc->bs, i);
5640 if ((z && ! IS_NCCLASS_NOT(cc)) || (! z && IS_NCCLASS_NOT(cc))) {
5641 add_char_opt_map(&opt->map, (UChar )i, enc);
5642 }
5643 }
5644 set_mml(&opt->len, 1, 1);
5645 }
5646 }
5647 break;
5648
5649 case NODE_CTYPE:
5650 {
5651 int min, max;
5652 int range;
5653
5654 max = ONIGENC_MBC_MAXLEN_DIST(enc);
5655
5656 if (max == 1) {
5657 min = 1;
5658
5659 switch (CTYPE_(node)->ctype) {
5660 case CTYPE_ANYCHAR:
5661 break;
5662
5663 case ONIGENC_CTYPE_WORD:
5664 range = CTYPE_(node)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE;
5665 if (CTYPE_(node)->not != 0) {
5666 for (i = 0; i < range; i++) {
5667 if (! ONIGENC_IS_CODE_WORD(enc, i)) {
5668 add_char_opt_map(&opt->map, (UChar )i, enc);
5669 }
5670 }
5671 for (i = range; i < SINGLE_BYTE_SIZE; i++) {
5672 add_char_opt_map(&opt->map, (UChar )i, enc);
5673 }
5674 }
5675 else {
5676 for (i = 0; i < range; i++) {
5677 if (ONIGENC_IS_CODE_WORD(enc, i)) {
5678 add_char_opt_map(&opt->map, (UChar )i, enc);
5679 }
5680 }
5681 }
5682 break;
5683 }
5684 }
5685 else {
5686 min = ONIGENC_MBC_MINLEN(enc);
5687 }
5688 set_mml(&opt->len, min, max);
5689 }
5690 break;
5691
5692 case NODE_ANCHOR:
5693 switch (ANCHOR_(node)->type) {
5694 case ANCR_BEGIN_BUF:
5695 case ANCR_BEGIN_POSITION:
5696 case ANCR_BEGIN_LINE:
5697 case ANCR_END_BUF:
5698 case ANCR_SEMI_END_BUF:
5699 case ANCR_END_LINE:
5700 case ANCR_PREC_READ_NOT:
5701 case ANCR_LOOK_BEHIND:
5702 add_opt_anc_info(&opt->anc, ANCHOR_(node)->type);
5703 break;
5704
5705 case ANCR_PREC_READ:
5706 {
5707 r = optimize_nodes(NODE_BODY(node), &xo, env);
5708 if (r == 0) {
5709 if (xo.sb.len > 0)
5710 copy_opt_exact(&opt->spr, &xo.sb);
5711 else if (xo.sm.len > 0)
5712 copy_opt_exact(&opt->spr, &xo.sm);
5713
5714 opt->spr.reach_end = 0;
5715
5716 if (xo.map.value > 0)
5717 copy_opt_map(&opt->map, &xo.map);
5718 }
5719 }
5720 break;
5721
5722 case ANCR_LOOK_BEHIND_NOT:
5723 break;
5724 }
5725 break;
5726
5727 case NODE_BACKREF:
5728 if (! NODE_IS_CHECKER(node)) {
5729 int* backs;
5730 OnigLen min, max, tmin, tmax;
5731 MemEnv* mem_env = SCANENV_MEMENV(env->scan_env);
5732 BackRefNode* br = BACKREF_(node);
5733
5734 if (NODE_IS_RECURSION(node)) {
5735 set_mml(&opt->len, 0, INFINITE_LEN);
5736 break;
5737 }
5738 backs = BACKREFS_P(br);
5739 min = tree_min_len(mem_env[backs[0]].node, env->scan_env);
5740 max = tree_max_len(mem_env[backs[0]].node, env->scan_env);
5741 for (i = 1; i < br->back_num; i++) {
5742 tmin = tree_min_len(mem_env[backs[i]].node, env->scan_env);
5743 tmax = tree_max_len(mem_env[backs[i]].node, env->scan_env);
5744 if (min > tmin) min = tmin;
5745 if (max < tmax) max = tmax;
5746 }
5747 set_mml(&opt->len, min, max);
5748 }
5749 break;
5750
5751 #ifdef USE_CALL
5752 case NODE_CALL:
5753 if (NODE_IS_RECURSION(node))
5754 set_mml(&opt->len, 0, INFINITE_LEN);
5755 else {
5756 OnigOptionType save = env->options;
5757 env->options = BAG_(NODE_BODY(node))->o.options;
5758 r = optimize_nodes(NODE_BODY(node), opt, env);
5759 env->options = save;
5760 }
5761 break;
5762 #endif
5763
5764 case NODE_QUANT:
5765 {
5766 OnigLen min, max;
5767 QuantNode* qn = QUANT_(node);
5768
5769 r = optimize_nodes(NODE_BODY(node), &xo, env);
5770 if (r != 0) break;
5771
5772 if (qn->lower > 0) {
5773 copy_node_opt_info(opt, &xo);
5774 if (xo.sb.len > 0) {
5775 if (xo.sb.reach_end) {
5776 for (i = 2; i <= qn->lower && ! is_full_opt_exact(&opt->sb); i++) {
5777 int rc = concat_opt_exact(&opt->sb, &xo.sb, enc);
5778 if (rc > 0) break;
5779 }
5780 if (i < qn->lower) opt->sb.reach_end = 0;
5781 }
5782 }
5783
5784 if (qn->lower != qn->upper) {
5785 opt->sb.reach_end = 0;
5786 opt->sm.reach_end = 0;
5787 }
5788 if (qn->lower > 1)
5789 opt->sm.reach_end = 0;
5790 }
5791
5792 if (IS_INFINITE_REPEAT(qn->upper)) {
5793 if (env->mmd.max == 0 &&
5794 NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) {
5795 if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), env)))
5796 add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_ML);
5797 else
5798 add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF);
5799 }
5800
5801 max = (xo.len.max > 0 ? INFINITE_LEN : 0);
5802 }
5803 else {
5804 max = distance_multiply(xo.len.max, qn->upper);
5805 }
5806
5807 min = distance_multiply(xo.len.min, qn->lower);
5808 set_mml(&opt->len, min, max);
5809 }
5810 break;
5811
5812 case NODE_BAG:
5813 {
5814 BagNode* en = BAG_(node);
5815
5816 switch (en->type) {
5817 case BAG_OPTION:
5818 {
5819 OnigOptionType save = env->options;
5820
5821 env->options = en->o.options;
5822 r = optimize_nodes(NODE_BODY(node), opt, env);
5823 env->options = save;
5824 }
5825 break;
5826
5827 case BAG_MEMORY:
5828 #ifdef USE_CALL
5829 en->opt_count++;
5830 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
5831 OnigLen min, max;
5832
5833 min = 0;
5834 max = INFINITE_LEN;
5835 if (NODE_IS_MIN_FIXED(node)) min = en->min_len;
5836 if (NODE_IS_MAX_FIXED(node)) max = en->max_len;
5837 set_mml(&opt->len, min, max);
5838 }
5839 else
5840 #endif
5841 {
5842 r = optimize_nodes(NODE_BODY(node), opt, env);
5843 if (is_set_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_MASK)) {
5844 if (MEM_STATUS_AT0(env->scan_env->backrefed_mem, en->m.regnum))
5845 remove_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_MASK);
5846 }
5847 }
5848 break;
5849
5850 case BAG_STOP_BACKTRACK:
5851 r = optimize_nodes(NODE_BODY(node), opt, env);
5852 break;
5853
5854 case BAG_IF_ELSE:
5855 {
5856 OptEnv nenv;
5857
5858 copy_opt_env(&nenv, env);
5859 r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv);
5860 if (r == 0) {
5861 add_mml(&nenv.mmd, &xo.len);
5862 concat_left_node_opt_info(enc, opt, &xo);
5863 if (IS_NOT_NULL(en->te.Then)) {
5864 r = optimize_nodes(en->te.Then, &xo, &nenv);
5865 if (r == 0) {
5866 concat_left_node_opt_info(enc, opt, &xo);
5867 }
5868 }
5869
5870 if (IS_NOT_NULL(en->te.Else)) {
5871 r = optimize_nodes(en->te.Else, &xo, env);
5872 if (r == 0)
5873 alt_merge_node_opt_info(opt, &xo, env);
5874 }
5875 }
5876 }
5877 break;
5878 }
5879 }
5880 break;
5881
5882 case NODE_GIMMICK:
5883 break;
5884
5885 default:
5886 #ifdef ONIG_DEBUG
5887 fprintf(stderr, "optimize_nodes: undefined node type %d\n", NODE_TYPE(node));
5888 #endif
5889 r = ONIGERR_TYPE_BUG;
5890 break;
5891 }
5892
5893 return r;
5894 }
5895
5896 static int
5897 set_optimize_exact(regex_t* reg, OptStr* e)
5898 {
5899 int r;
5900
5901 if (e->len == 0) return 0;
5902
5903 reg->exact = (UChar* )xmalloc(e->len);
5904 CHECK_NULL_RETURN_MEMERR(reg->exact);
5905 xmemcpy(reg->exact, e->s, e->len);
5906 reg->exact_end = reg->exact + e->len;
5907
5908 if (e->case_fold) {
5909 reg->optimize = OPTIMIZE_STR_CASE_FOLD;
5910 if (e->good_case_fold != 0) {
5911 if (e->len >= 2) {
5912 r = set_sunday_quick_search_or_bmh_skip_table(reg, 1,
5913 reg->exact, reg->exact_end,
5914 reg->map, &(reg->map_offset));
5915 if (r != 0) return r;
5916 reg->optimize = OPTIMIZE_STR_CASE_FOLD_FAST;
5917 }
5918 }
5919 }
5920 else {
5921 int allow_reverse;
5922
5923 allow_reverse =
5924 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
5925
5926 if (e->len >= 2 || (e->len >= 1 && allow_reverse)) {
5927 r = set_sunday_quick_search_or_bmh_skip_table(reg, 0,
5928 reg->exact, reg->exact_end,
5929 reg->map, &(reg->map_offset));
5930 if (r != 0) return r;
5931
5932 reg->optimize = (allow_reverse != 0
5933 ? OPTIMIZE_STR_FAST
5934 : OPTIMIZE_STR_FAST_STEP_FORWARD);
5935 }
5936 else {
5937 reg->optimize = OPTIMIZE_STR;
5938 }
5939 }
5940
5941 reg->dmin = e->mmd.min;
5942 reg->dmax = e->mmd.max;
5943
5944 if (reg->dmin != INFINITE_LEN) {
5945 reg->threshold_len = reg->dmin + (int )(reg->exact_end - reg->exact);
5946 }
5947
5948 return 0;
5949 }
5950
5951 static void
5952 set_optimize_map(regex_t* reg, OptMap* m)
5953 {
5954 int i;
5955
5956 for (i = 0; i < CHAR_MAP_SIZE; i++)
5957 reg->map[i] = m->map[i];
5958
5959 reg->optimize = OPTIMIZE_MAP;
5960 reg->dmin = m->mmd.min;
5961 reg->dmax = m->mmd.max;
5962
5963 if (reg->dmin != INFINITE_LEN) {
5964 reg->threshold_len = reg->dmin + 1;
5965 }
5966 }
5967
5968 static void
5969 set_sub_anchor(regex_t* reg, OptAnc* anc)
5970 {
5971 reg->sub_anchor |= anc->left & ANCR_BEGIN_LINE;
5972 reg->sub_anchor |= anc->right & ANCR_END_LINE;
5973 }
5974
5975 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5976 static void print_optimize_info(FILE* f, regex_t* reg);
5977 #endif
5978
5979 static int
5980 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
5981 {
5982 int r;
5983 OptNode opt;
5984 OptEnv env;
5985
5986 env.enc = reg->enc;
5987 env.options = reg->options;
5988 env.case_fold_flag = reg->case_fold_flag;
5989 env.scan_env = scan_env;
5990 clear_mml(&env.mmd);
5991
5992 r = optimize_nodes(node, &opt, &env);
5993 if (r != 0) return r;
5994
5995 reg->anchor = opt.anc.left & (ANCR_BEGIN_BUF |
5996 ANCR_BEGIN_POSITION | ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML |
5997 ANCR_LOOK_BEHIND);
5998
5999 if ((opt.anc.left & (ANCR_LOOK_BEHIND | ANCR_PREC_READ_NOT)) != 0)
6000 reg->anchor &= ~ANCR_ANYCHAR_INF_ML;
6001
6002 reg->anchor |= opt.anc.right & (ANCR_END_BUF | ANCR_SEMI_END_BUF |
6003 ANCR_PREC_READ_NOT);
6004
6005 if (reg->anchor & (ANCR_END_BUF | ANCR_SEMI_END_BUF)) {
6006 reg->anchor_dmin = opt.len.min;
6007 reg->anchor_dmax = opt.len.max;
6008 }
6009
6010 if (opt.sb.len > 0 || opt.sm.len > 0) {
6011 select_opt_exact(reg->enc, &opt.sb, &opt.sm);
6012 if (opt.map.value > 0 && comp_opt_exact_or_map(&opt.sb, &opt.map) > 0) {
6013 goto set_map;
6014 }
6015 else {
6016 r = set_optimize_exact(reg, &opt.sb);
6017 set_sub_anchor(reg, &opt.sb.anc);
6018 }
6019 }
6020 else if (opt.map.value > 0) {
6021 set_map:
6022 set_optimize_map(reg, &opt.map);
6023 set_sub_anchor(reg, &opt.map.anc);
6024 }
6025 else {
6026 reg->sub_anchor |= opt.anc.left & ANCR_BEGIN_LINE;
6027 if (opt.len.max == 0)
6028 reg->sub_anchor |= opt.anc.right & ANCR_END_LINE;
6029 }
6030
6031 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6032 print_optimize_info(stderr, reg);
6033 #endif
6034 return r;
6035 }
6036
6037 static void
6038 clear_optimize_info(regex_t* reg)
6039 {
6040 reg->optimize = OPTIMIZE_NONE;
6041 reg->anchor = 0;
6042 reg->anchor_dmin = 0;
6043 reg->anchor_dmax = 0;
6044 reg->sub_anchor = 0;
6045 reg->exact_end = (UChar* )NULL;
6046 reg->map_offset = 0;
6047 reg->threshold_len = 0;
6048 if (IS_NOT_NULL(reg->exact)) {
6049 xfree(reg->exact);
6050 reg->exact = (UChar* )NULL;
6051 }
6052 }
6053
6054 #ifdef ONIG_DEBUG
6055
6056 static void print_enc_string(FILE* fp, OnigEncoding enc,
6057 const UChar *s, const UChar *end)
6058 {
6059 fprintf(fp, "\nPATTERN: /");
6060
6061 if (ONIGENC_MBC_MINLEN(enc) > 1) {
6062 const UChar *p;
6063 OnigCodePoint code;
6064
6065 p = s;
6066 while (p < end) {
6067 code = ONIGENC_MBC_TO_CODE(enc, p, end);
6068 if (code >= 0x80) {
6069 fprintf(fp, " 0x%04x ", (int )code);
6070 }
6071 else {
6072 fputc((int )code, fp);
6073 }
6074
6075 p += enclen(enc, p);
6076 }
6077 }
6078 else {
6079 while (s < end) {
6080 fputc((int )*s, fp);
6081 s++;
6082 }
6083 }
6084
6085 fprintf(fp, "/\n");
6086 }
6087
6088 #endif /* ONIG_DEBUG */
6089
6090 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
6091
6092 static void
6093 print_distance_range(FILE* f, OnigLen a, OnigLen b)
6094 {
6095 if (a == INFINITE_LEN)
6096 fputs("inf", f);
6097 else
6098 fprintf(f, "(%u)", a);
6099
6100 fputs("-", f);
6101
6102 if (b == INFINITE_LEN)
6103 fputs("inf", f);
6104 else
6105 fprintf(f, "(%u)", b);
6106 }
6107
6108 static void
6109 print_anchor(FILE* f, int anchor)
6110 {
6111 int q = 0;
6112
6113 fprintf(f, "[");
6114
6115 if (anchor & ANCR_BEGIN_BUF) {
6116 fprintf(f, "begin-buf");
6117 q = 1;
6118 }
6119 if (anchor & ANCR_BEGIN_LINE) {
6120 if (q) fprintf(f, ", ");
6121 q = 1;
6122 fprintf(f, "begin-line");
6123 }
6124 if (anchor & ANCR_BEGIN_POSITION) {
6125 if (q) fprintf(f, ", ");
6126 q = 1;
6127 fprintf(f, "begin-pos");
6128 }
6129 if (anchor & ANCR_END_BUF) {
6130 if (q) fprintf(f, ", ");
6131 q = 1;
6132 fprintf(f, "end-buf");
6133 }
6134 if (anchor & ANCR_SEMI_END_BUF) {
6135 if (q) fprintf(f, ", ");
6136 q = 1;
6137 fprintf(f, "semi-end-buf");
6138 }
6139 if (anchor & ANCR_END_LINE) {
6140 if (q) fprintf(f, ", ");
6141 q = 1;
6142 fprintf(f, "end-line");
6143 }
6144 if (anchor & ANCR_ANYCHAR_INF) {
6145 if (q) fprintf(f, ", ");
6146 q = 1;
6147 fprintf(f, "anychar-inf");
6148 }
6149 if (anchor & ANCR_ANYCHAR_INF_ML) {
6150 if (q) fprintf(f, ", ");
6151 fprintf(f, "anychar-inf-ml");
6152 }
6153
6154 fprintf(f, "]");
6155 }
6156
6157 static void
6158 print_optimize_info(FILE* f, regex_t* reg)
6159 {
6160 static const char* on[] = { "NONE", "STR",
6161 "STR_FAST", "STR_FAST_STEP_FORWARD",
6162 "STR_CASE_FOLD_FAST", "STR_CASE_FOLD", "MAP" };
6163
6164 fprintf(f, "optimize: %s\n", on[reg->optimize]);
6165 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
6166 if ((reg->anchor & ANCR_END_BUF_MASK) != 0)
6167 print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
6168 fprintf(f, "\n");
6169
6170 if (reg->optimize) {
6171 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
6172 fprintf(f, "\n");
6173 }
6174 fprintf(f, "\n");
6175
6176 if (reg->exact) {
6177 UChar *p;
6178 fprintf(f, "exact: [");
6179 for (p = reg->exact; p < reg->exact_end; p++) {
6180 fputc(*p, f);
6181 }
6182 fprintf(f, "]: length: %ld\n", (reg->exact_end - reg->exact));
6183 }
6184 else if (reg->optimize & OPTIMIZE_MAP) {
6185 int c, i, n = 0;
6186
6187 for (i = 0; i < CHAR_MAP_SIZE; i++)
6188 if (reg->map[i]) n++;
6189
6190 fprintf(f, "map: n=%d\n", n);
6191 if (n > 0) {
6192 c = 0;
6193 fputc('[', f);
6194 for (i = 0; i < CHAR_MAP_SIZE; i++) {
6195 if (reg->map[i] != 0) {
6196 if (c > 0) fputs(", ", f);
6197 c++;
6198 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
6199 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
6200 fputc(i, f);
6201 else
6202 fprintf(f, "%d", i);
6203 }
6204 }
6205 fprintf(f, "]\n");
6206 }
6207 }
6208 }
6209 #endif
6210
6211
6212 extern RegexExt*
6213 onig_get_regex_ext(regex_t* reg)
6214 {
6215 if (IS_NULL(reg->extp)) {
6216 RegexExt* ext = (RegexExt* )xmalloc(sizeof(*ext));
6217 if (IS_NULL(ext)) return 0;
6218
6219 ext->pattern = 0;
6220 ext->pattern_end = 0;
6221 #ifdef USE_CALLOUT
6222 ext->tag_table = 0;
6223 ext->callout_num = 0;
6224 ext->callout_list_alloc = 0;
6225 ext->callout_list = 0;
6226 #endif
6227
6228 reg->extp = ext;
6229 }
6230
6231 return reg->extp;
6232 }
6233
6234 static void
6235 free_regex_ext(RegexExt* ext)
6236 {
6237 if (IS_NOT_NULL(ext)) {
6238 if (IS_NOT_NULL(ext->pattern))
6239 xfree((void* )ext->pattern);
6240
6241 #ifdef USE_CALLOUT
6242 if (IS_NOT_NULL(ext->tag_table))
6243 onig_callout_tag_table_free(ext->tag_table);
6244
6245 if (IS_NOT_NULL(ext->callout_list))
6246 onig_free_reg_callout_list(ext->callout_num, ext->callout_list);
6247 #endif
6248
6249 xfree(ext);
6250 }
6251 }
6252
6253 extern int
6254 onig_ext_set_pattern(regex_t* reg, const UChar* pattern, const UChar* pattern_end)
6255 {
6256 RegexExt* ext;
6257 UChar* s;
6258
6259 ext = onig_get_regex_ext(reg);
6260 CHECK_NULL_RETURN_MEMERR(ext);
6261
6262 s = onigenc_strdup(reg->enc, pattern, pattern_end);
6263 CHECK_NULL_RETURN_MEMERR(s);
6264
6265 ext->pattern = s;
6266 ext->pattern_end = s + (pattern_end - pattern);
6267
6268 return ONIG_NORMAL;
6269 }
6270
6271 extern void
6272 onig_free_body(regex_t* reg)
6273 {
6274 if (IS_NOT_NULL(reg)) {
6275 ops_free(reg);
6276 if (IS_NOT_NULL(reg->string_pool)) {
6277 xfree(reg->string_pool);
6278 reg->string_pool_end = reg->string_pool = 0;
6279 }
6280 if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
6281 if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
6282 if (IS_NOT_NULL(reg->extp)) {
6283 free_regex_ext(reg->extp);
6284 reg->extp = 0;
6285 }
6286
6287 onig_names_free(reg);
6288 }
6289 }
6290
6291 extern void
6292 onig_free(regex_t* reg)
6293 {
6294 if (IS_NOT_NULL(reg)) {
6295 onig_free_body(reg);
6296 xfree(reg);
6297 }
6298 }
6299
6300
6301 #ifdef ONIG_DEBUG_PARSE
6302 static void print_tree P_((FILE* f, Node* node));
6303 #endif
6304
6305 extern int onig_init_for_match_at(regex_t* reg);
6306
6307 extern int
6308 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
6309 OnigErrorInfo* einfo)
6310 {
6311 int r;
6312 Node* root;
6313 ScanEnv scan_env;
6314 #ifdef USE_CALL
6315 UnsetAddrList uslist;
6316 #endif
6317
6318 root = 0;
6319 if (IS_NOT_NULL(einfo)) {
6320 einfo->enc = reg->enc;
6321 einfo->par = (UChar* )NULL;
6322 }
6323
6324 #ifdef ONIG_DEBUG
6325 print_enc_string(stderr, reg->enc, pattern, pattern_end);
6326 #endif
6327
6328 if (reg->ops_alloc == 0) {
6329 r = ops_init(reg, OPS_INIT_SIZE);
6330 if (r != 0) goto end;
6331 }
6332 else
6333 reg->ops_used = 0;
6334
6335 reg->string_pool = 0;
6336 reg->string_pool_end = 0;
6337 reg->num_mem = 0;
6338 reg->num_repeat = 0;
6339 reg->num_null_check = 0;
6340 reg->repeat_range_alloc = 0;
6341 reg->repeat_range = (OnigRepeatRange* )NULL;
6342
6343 r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env);
6344 if (r != 0) goto err;
6345
6346 /* mixed use named group and no-named group */
6347 if (scan_env.num_named > 0 &&
6348 IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
6349 ! ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
6350 if (scan_env.num_named != scan_env.num_mem)
6351 r = disable_noname_group_capture(&root, reg, &scan_env);
6352 else
6353 r = numbered_ref_check(root);
6354
6355 if (r != 0) goto err;
6356 }
6357
6358 r = check_backrefs(root, &scan_env);
6359 if (r != 0) goto err;
6360
6361 #ifdef USE_CALL
6362 if (scan_env.num_call > 0) {
6363 r = unset_addr_list_init(&uslist, scan_env.num_call);
6364 if (r != 0) goto err;
6365 scan_env.unset_addr_list = &uslist;
6366 r = setup_call(root, &scan_env, 0);
6367 if (r != 0) goto err_unset;
6368 r = setup_call2(root);
6369 if (r != 0) goto err_unset;
6370 r = recursive_call_check_trav(root, &scan_env, 0);
6371 if (r < 0) goto err_unset;
6372 r = infinite_recursive_call_check_trav(root, &scan_env);
6373 if (r != 0) goto err_unset;
6374
6375 setup_called_state(root, 0);
6376 }
6377
6378 reg->num_call = scan_env.num_call;
6379 #endif
6380
6381 r = setup_tree(root, reg, 0, &scan_env);
6382 if (r != 0) goto err_unset;
6383
6384 #ifdef ONIG_DEBUG_PARSE
6385 print_tree(stderr, root);
6386 #endif
6387
6388 reg->capture_history = scan_env.capture_history;
6389 reg->bt_mem_start = scan_env.bt_mem_start;
6390 reg->bt_mem_start |= reg->capture_history;
6391 if (IS_FIND_CONDITION(reg->options))
6392 MEM_STATUS_ON_ALL(reg->bt_mem_end);
6393 else {
6394 reg->bt_mem_end = scan_env.bt_mem_end;
6395 reg->bt_mem_end |= reg->capture_history;
6396 }
6397 reg->bt_mem_start |= reg->bt_mem_end;
6398
6399 clear_optimize_info(reg);
6400 #ifndef ONIG_DONT_OPTIMIZE
6401 r = set_optimize_info_from_tree(root, reg, &scan_env);
6402 if (r != 0) goto err_unset;
6403 #endif
6404
6405 if (IS_NOT_NULL(scan_env.mem_env_dynamic)) {
6406 xfree(scan_env.mem_env_dynamic);
6407 scan_env.mem_env_dynamic = (MemEnv* )NULL;
6408 }
6409
6410 r = compile_tree(root, reg, &scan_env);
6411 if (r == 0) {
6412 if (scan_env.keep_num > 0) {
6413 r = add_op(reg, OP_UPDATE_VAR);
6414 if (r != 0) goto err;
6415
6416 COP(reg)->update_var.type = UPDATE_VAR_KEEP_FROM_STACK_LAST;
6417 COP(reg)->update_var.id = 0; /* not used */
6418 }
6419
6420 r = add_op(reg, OP_END);
6421 if (r != 0) goto err;
6422
6423 #ifdef USE_CALL
6424 if (scan_env.num_call > 0) {
6425 r = fix_unset_addr_list(&uslist, reg);
6426 unset_addr_list_end(&uslist);
6427 if (r != 0) goto err;
6428 }
6429 #endif
6430
6431 if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0)
6432 #ifdef USE_CALLOUT
6433 || (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0)
6434 #endif
6435 )
6436 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
6437 else {
6438 if (reg->bt_mem_start != 0)
6439 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
6440 else
6441 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
6442 }
6443
6444 r = ops_make_string_pool(reg);
6445 if (r != 0) goto err;
6446 }
6447 #ifdef USE_CALL
6448 else if (scan_env.num_call > 0) {
6449 unset_addr_list_end(&uslist);
6450 }
6451 #endif
6452 onig_node_free(root);
6453
6454 #ifdef ONIG_DEBUG_COMPILE
6455 onig_print_names(stderr, reg);
6456 onig_print_compiled_byte_code_list(stderr, reg);
6457 #endif
6458
6459 #ifdef USE_DIRECT_THREADED_CODE
6460 /* opcode -> opaddr */
6461 onig_init_for_match_at(reg);
6462 #endif
6463
6464 end:
6465 return r;
6466
6467 err_unset:
6468 #ifdef USE_CALL
6469 if (scan_env.num_call > 0) {
6470 unset_addr_list_end(&uslist);
6471 }
6472 #endif
6473 err:
6474 if (IS_NOT_NULL(scan_env.error)) {
6475 if (IS_NOT_NULL(einfo)) {
6476 einfo->par = scan_env.error;
6477 einfo->par_end = scan_env.error_end;
6478 }
6479 }
6480
6481 onig_node_free(root);
6482 if (IS_NOT_NULL(scan_env.mem_env_dynamic))
6483 xfree(scan_env.mem_env_dynamic);
6484 return r;
6485 }
6486
6487
6488 static int onig_inited = 0;
6489
6490 extern int
6491 onig_reg_init(regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_flag,
6492 OnigEncoding enc, OnigSyntaxType* syntax)
6493 {
6494 int r;
6495
6496 xmemset(reg, 0, sizeof(*reg));
6497
6498 if (onig_inited == 0) {
6499 #if 0
6500 return ONIGERR_LIBRARY_IS_NOT_INITIALIZED;
6501 #else
6502 r = onig_initialize(&enc, 1);
6503 if (r != 0)
6504 return ONIGERR_FAIL_TO_INITIALIZE;
6505
6506 onig_warning("You didn't call onig_initialize() explicitly");
6507 #endif
6508 }
6509
6510 if (IS_NULL(reg))
6511 return ONIGERR_INVALID_ARGUMENT;
6512
6513 if (ONIGENC_IS_UNDEF(enc))
6514 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
6515
6516 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
6517 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
6518 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
6519 }
6520
6521 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
6522 option |= syntax->options;
6523 option &= ~ONIG_OPTION_SINGLELINE;
6524 }
6525 else
6526 option |= syntax->options;
6527
6528 (reg)->enc = enc;
6529 (reg)->options = option;
6530 (reg)->syntax = syntax;
6531 (reg)->optimize = 0;
6532 (reg)->exact = (UChar* )NULL;
6533 (reg)->extp = (RegexExt* )NULL;
6534
6535 (reg)->ops = (Operation* )NULL;
6536 (reg)->ops_curr = (Operation* )NULL;
6537 (reg)->ops_used = 0;
6538 (reg)->ops_alloc = 0;
6539 (reg)->name_table = (void* )NULL;
6540
6541 (reg)->case_fold_flag = case_fold_flag;
6542 return 0;
6543 }
6544
6545 extern int
6546 onig_new_without_alloc(regex_t* reg,
6547 const UChar* pattern, const UChar* pattern_end,
6548 OnigOptionType option, OnigEncoding enc,
6549 OnigSyntaxType* syntax, OnigErrorInfo* einfo)
6550 {
6551 int r;
6552
6553 r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
6554 if (r != 0) return r;
6555
6556 r = onig_compile(reg, pattern, pattern_end, einfo);
6557 return r;
6558 }
6559
6560 extern int
6561 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
6562 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
6563 OnigErrorInfo* einfo)
6564 {
6565 int r;
6566
6567 *reg = (regex_t* )xmalloc(sizeof(regex_t));
6568 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
6569
6570 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
6571 if (r != 0) goto err;
6572
6573 r = onig_compile(*reg, pattern, pattern_end, einfo);
6574 if (r != 0) {
6575 err:
6576 onig_free(*reg);
6577 *reg = NULL;
6578 }
6579 return r;
6580 }
6581
6582 extern int
6583 onig_initialize(OnigEncoding encodings[], int n)
6584 {
6585 int i;
6586 int r;
6587
6588 if (onig_inited != 0)
6589 return 0;
6590
6591 onigenc_init();
6592
6593 onig_inited = 1;
6594
6595 for (i = 0; i < n; i++) {
6596 OnigEncoding enc = encodings[i];
6597 r = onig_initialize_encoding(enc);
6598 if (r != 0)
6599 return r;
6600 }
6601
6602 return ONIG_NORMAL;
6603 }
6604
6605 typedef struct EndCallListItem {
6606 struct EndCallListItem* next;
6607 void (*func)(void);
6608 } EndCallListItemType;
6609
6610 static EndCallListItemType* EndCallTop;
6611
6612 extern void onig_add_end_call(void (*func)(void))
6613 {
6614 EndCallListItemType* item;
6615
6616 item = (EndCallListItemType* )xmalloc(sizeof(*item));
6617 if (item == 0) return ;
6618
6619 item->next = EndCallTop;
6620 item->func = func;
6621
6622 EndCallTop = item;
6623 }
6624
6625 static void
6626 exec_end_call_list(void)
6627 {
6628 EndCallListItemType* prev;
6629 void (*func)(void);
6630
6631 while (EndCallTop != 0) {
6632 func = EndCallTop->func;
6633 (*func)();
6634
6635 prev = EndCallTop;
6636 EndCallTop = EndCallTop->next;
6637 xfree(prev);
6638 }
6639 }
6640
6641 extern int
6642 onig_end(void)
6643 {
6644 exec_end_call_list();
6645
6646 #ifdef USE_CALLOUT
6647 onig_global_callout_names_free();
6648 #endif
6649
6650 onigenc_end();
6651
6652 onig_inited = 0;
6653
6654 return 0;
6655 }
6656
6657 extern int
6658 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
6659 {
6660 OnigCodePoint n, *data;
6661 OnigCodePoint low, high, x;
6662
6663 GET_CODE_POINT(n, p);
6664 data = (OnigCodePoint* )p;
6665 data++;
6666
6667 for (low = 0, high = n; low < high; ) {
6668 x = (low + high) >> 1;
6669 if (code > data[x * 2 + 1])
6670 low = x + 1;
6671 else
6672 high = x;
6673 }
6674
6675 return ((low < n && code >= data[low * 2]) ? 1 : 0);
6676 }
6677
6678 extern int
6679 onig_is_code_in_cc_len(int elen, OnigCodePoint code, /* CClassNode* */ void* cc_arg)
6680 {
6681 int found;
6682 CClassNode* cc = (CClassNode* )cc_arg;
6683
6684 if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
6685 if (IS_NULL(cc->mbuf)) {
6686 found = 0;
6687 }
6688 else {
6689 found = onig_is_in_code_range(cc->mbuf->p, code) != 0;
6690 }
6691 }
6692 else {
6693 found = BITSET_AT(cc->bs, code) != 0;
6694 }
6695
6696 if (IS_NCCLASS_NOT(cc))
6697 return !found;
6698 else
6699 return found;
6700 }
6701
6702 extern int
6703 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
6704 {
6705 int len;
6706
6707 if (ONIGENC_MBC_MINLEN(enc) > 1) {
6708 len = 2;
6709 }
6710 else {
6711 len = ONIGENC_CODE_TO_MBCLEN(enc, code);
6712 if (len < 0) return 0;
6713 }
6714 return onig_is_code_in_cc_len(len, code, cc);
6715 }
6716
6717
6718 #ifdef ONIG_DEBUG_PARSE
6719
6720 static void
6721 p_string(FILE* f, int len, UChar* s)
6722 {
6723 fputs(":", f);
6724 while (len-- > 0) { fputc(*s++, f); }
6725 }
6726
6727 static void
6728 Indent(FILE* f, int indent)
6729 {
6730 int i;
6731 for (i = 0; i < indent; i++) putc(' ', f);
6732 }
6733
6734 static void
6735 print_indent_tree(FILE* f, Node* node, int indent)
6736 {
6737 int i;
6738 NodeType type;
6739 UChar* p;
6740 int add = 3;
6741
6742 Indent(f, indent);
6743 if (IS_NULL(node)) {
6744 fprintf(f, "ERROR: null node!!!\n");
6745 exit (0);
6746 }
6747
6748 type = NODE_TYPE(node);
6749 switch (type) {
6750 case NODE_LIST:
6751 case NODE_ALT:
6752 if (type == NODE_LIST)
6753 fprintf(f, "<list:%p>\n", node);
6754 else
6755 fprintf(f, "<alt:%p>\n", node);
6756
6757 print_indent_tree(f, NODE_CAR(node), indent + add);
6758 while (IS_NOT_NULL(node = NODE_CDR(node))) {
6759 if (NODE_TYPE(node) != type) {
6760 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NODE_TYPE(node));
6761 exit(0);
6762 }
6763 print_indent_tree(f, NODE_CAR(node), indent + add);
6764 }
6765 break;
6766
6767 case NODE_STRING:
6768 {
6769 char* mode;
6770 char* dont;
6771 char* good;
6772
6773 if (NODE_STRING_IS_RAW(node))
6774 mode = "-raw";
6775 else if (NODE_STRING_IS_AMBIG(node))
6776 mode = "-ambig";
6777 else
6778 mode = "";
6779
6780 if (NODE_STRING_IS_GOOD_AMBIG(node))
6781 good = "-good";
6782 else
6783 good = "";
6784
6785 if (NODE_STRING_IS_DONT_GET_OPT_INFO(node))
6786 dont = " (dont-opt)";
6787 else
6788 dont = "";
6789
6790 fprintf(f, "<string%s%s%s:%p>", mode, good, dont, node);
6791 for (p = STR_(node)->s; p < STR_(node)->end; p++) {
6792 if (*p >= 0x20 && *p < 0x7f)
6793 fputc(*p, f);
6794 else {
6795 fprintf(f, " 0x%02x", *p);
6796 }
6797 }
6798 }
6799 break;
6800
6801 case NODE_CCLASS:
6802 fprintf(f, "<cclass:%p>", node);
6803 if (IS_NCCLASS_NOT(CCLASS_(node))) fputs(" not", f);
6804 if (CCLASS_(node)->mbuf) {
6805 BBuf* bbuf = CCLASS_(node)->mbuf;
6806 for (i = 0; i < bbuf->used; i++) {
6807 if (i > 0) fprintf(f, ",");
6808 fprintf(f, "%0x", bbuf->p[i]);
6809 }
6810 }
6811 break;
6812
6813 case NODE_CTYPE:
6814 fprintf(f, "<ctype:%p> ", node);
6815 switch (CTYPE_(node)->ctype) {
6816 case CTYPE_ANYCHAR:
6817 fprintf(f, "<anychar:%p>", node);
6818 break;
6819
6820 case ONIGENC_CTYPE_WORD:
6821 if (CTYPE_(node)->not != 0)
6822 fputs("not word", f);
6823 else
6824 fputs("word", f);
6825
6826 if (CTYPE_(node)->ascii_mode != 0)
6827 fputs(" (ascii)", f);
6828
6829 break;
6830
6831 default:
6832 fprintf(f, "ERROR: undefined ctype.\n");
6833 exit(0);
6834 }
6835 break;
6836
6837 case NODE_ANCHOR:
6838 fprintf(f, "<anchor:%p> ", node);
6839 switch (ANCHOR_(node)->type) {
6840 case ANCR_BEGIN_BUF: fputs("begin buf", f); break;
6841 case ANCR_END_BUF: fputs("end buf", f); break;
6842 case ANCR_BEGIN_LINE: fputs("begin line", f); break;
6843 case ANCR_END_LINE: fputs("end line", f); break;
6844 case ANCR_SEMI_END_BUF: fputs("semi end buf", f); break;
6845 case ANCR_BEGIN_POSITION: fputs("begin position", f); break;
6846
6847 case ANCR_WORD_BOUNDARY: fputs("word boundary", f); break;
6848 case ANCR_NO_WORD_BOUNDARY: fputs("not word boundary", f); break;
6849 #ifdef USE_WORD_BEGIN_END
6850 case ANCR_WORD_BEGIN: fputs("word begin", f); break;
6851 case ANCR_WORD_END: fputs("word end", f); break;
6852 #endif
6853 case ANCR_TEXT_SEGMENT_BOUNDARY:
6854 fputs("text-segment boundary", f); break;
6855 case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
6856 fputs("no text-segment boundary", f); break;
6857 case ANCR_PREC_READ:
6858 fprintf(f, "prec read\n");
6859 print_indent_tree(f, NODE_BODY(node), indent + add);
6860 break;
6861 case ANCR_PREC_READ_NOT:
6862 fprintf(f, "prec read not\n");
6863 print_indent_tree(f, NODE_BODY(node), indent + add);
6864 break;
6865 case ANCR_LOOK_BEHIND:
6866 fprintf(f, "look behind\n");
6867 print_indent_tree(f, NODE_BODY(node), indent + add);
6868 break;
6869 case ANCR_LOOK_BEHIND_NOT:
6870 fprintf(f, "look behind not\n");
6871 print_indent_tree(f, NODE_BODY(node), indent + add);
6872 break;
6873
6874 default:
6875 fprintf(f, "ERROR: undefined anchor type.\n");
6876 break;
6877 }
6878 break;
6879
6880 case NODE_BACKREF:
6881 {
6882 int* p;
6883 BackRefNode* br = BACKREF_(node);
6884 p = BACKREFS_P(br);
6885 fprintf(f, "<backref%s:%p>", NODE_IS_CHECKER(node) ? "-checker" : "", node);
6886 for (i = 0; i < br->back_num; i++) {
6887 if (i > 0) fputs(", ", f);
6888 fprintf(f, "%d", p[i]);
6889 }
6890 }
6891 break;
6892
6893 #ifdef USE_CALL
6894 case NODE_CALL:
6895 {
6896 CallNode* cn = CALL_(node);
6897 fprintf(f, "<call:%p>", node);
6898 p_string(f, cn->name_end - cn->name, cn->name);
6899 }
6900 break;
6901 #endif
6902
6903 case NODE_QUANT:
6904 fprintf(f, "<quantifier:%p>{%d,%d}%s\n", node,
6905 QUANT_(node)->lower, QUANT_(node)->upper,
6906 (QUANT_(node)->greedy ? "" : "?"));
6907 print_indent_tree(f, NODE_BODY(node), indent + add);
6908 break;
6909
6910 case NODE_BAG:
6911 fprintf(f, "<bag:%p> ", node);
6912 switch (BAG_(node)->type) {
6913 case BAG_OPTION:
6914 fprintf(f, "option:%d", BAG_(node)->o.options);
6915 break;
6916 case BAG_MEMORY:
6917 fprintf(f, "memory:%d", BAG_(node)->m.regnum);
6918 break;
6919 case BAG_STOP_BACKTRACK:
6920 fprintf(f, "stop-bt");
6921 break;
6922 case BAG_IF_ELSE:
6923 fprintf(f, "if-else");
6924 break;
6925 }
6926 fprintf(f, "\n");
6927 print_indent_tree(f, NODE_BODY(node), indent + add);
6928 break;
6929
6930 case NODE_GIMMICK:
6931 fprintf(f, "<gimmick:%p> ", node);
6932 switch (GIMMICK_(node)->type) {
6933 case GIMMICK_FAIL:
6934 fprintf(f, "fail");
6935 break;
6936 case GIMMICK_SAVE:
6937 fprintf(f, "save:%d:%d", GIMMICK_(node)->detail_type, GIMMICK_(node)->id);
6938 break;
6939 case GIMMICK_UPDATE_VAR:
6940 fprintf(f, "update_var:%d:%d", GIMMICK_(node)->detail_type, GIMMICK_(node)->id);
6941 break;
6942 #ifdef USE_CALLOUT
6943 case GIMMICK_CALLOUT:
6944 switch (GIMMICK_(node)->detail_type) {
6945 case ONIG_CALLOUT_OF_CONTENTS:
6946 fprintf(f, "callout:contents:%d", GIMMICK_(node)->num);
6947 break;
6948 case ONIG_CALLOUT_OF_NAME:
6949 fprintf(f, "callout:name:%d:%d", GIMMICK_(node)->id, GIMMICK_(node)->num);
6950 break;
6951 }
6952 #endif
6953 }
6954 break;
6955
6956 default:
6957 fprintf(f, "print_indent_tree: undefined node type %d\n", NODE_TYPE(node));
6958 break;
6959 }
6960
6961 if (type != NODE_LIST && type != NODE_ALT && type != NODE_QUANT &&
6962 type != NODE_BAG)
6963 fprintf(f, "\n");
6964 fflush(f);
6965 }
6966
6967 static void
6968 print_tree(FILE* f, Node* node)
6969 {
6970 print_indent_tree(f, node, 0);
6971 }
6972 #endif