]> git.proxmox.com Git - mirror_smartmontools-debian.git/blame - regex/regex_internal.c
Imported Upstream version 6.1+svn3812
[mirror_smartmontools-debian.git] / regex / regex_internal.c
CommitLineData
832b75ed
GG
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
ee38a438
GI
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 MA 02110-1301 USA. */
832b75ed
GG
20
21static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase);
24#ifdef RE_ENABLE_I18N
25static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
26 wint_t *last_wc);
27#endif /* RE_ENABLE_I18N */
28static re_dfastate_t *create_newstate_common (re_dfa_t *dfa,
29 const re_node_set *nodes,
30 unsigned int hash);
31static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
32 unsigned int hash);
33static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
34 const re_node_set *nodes,
35 unsigned int hash);
36static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
37 const re_node_set *nodes,
38 unsigned int context,
39 unsigned int hash);
40static unsigned int inline calc_state_hash (const re_node_set *nodes,
41 unsigned int context);
42\f
43/* Functions for string operation. */
44
45/* This function allocate the buffers. It is necessary to call
46 re_string_reconstruct before using the object. */
47
48static reg_errcode_t
49re_string_allocate (pstr, str, len, init_len, trans, icase)
50 re_string_t *pstr;
51 const char *str;
52 int len, init_len, icase;
53 RE_TRANSLATE_TYPE trans;
54{
55 reg_errcode_t ret;
56 int init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
57 re_string_construct_common (str, len, pstr, trans, icase);
58 pstr->stop = pstr->len;
59
60 ret = re_string_realloc_buffers (pstr, init_buf_len);
61 if (BE (ret != REG_NOERROR, 0))
62 return ret;
63
64 pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
65 : (unsigned char *) str);
66 pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
67 pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr)
68 || MB_CUR_MAX > 1) ? pstr->valid_len : len;
69 return REG_NOERROR;
70}
71
72/* This function allocate the buffers, and initialize them. */
73
74static reg_errcode_t
75re_string_construct (pstr, str, len, trans, icase)
76 re_string_t *pstr;
77 const char *str;
78 int len, icase;
79 RE_TRANSLATE_TYPE trans;
80{
81 reg_errcode_t ret;
82 re_string_construct_common (str, len, pstr, trans, icase);
83 pstr->stop = pstr->len;
84 /* Set 0 so that this function can initialize whole buffers. */
85 pstr->valid_len = 0;
86
87 if (len > 0)
88 {
89 ret = re_string_realloc_buffers (pstr, len + 1);
90 if (BE (ret != REG_NOERROR, 0))
91 return ret;
92 }
93 pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
94 : (unsigned char *) str);
95 pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
96
97 if (icase)
98 {
99#ifdef RE_ENABLE_I18N
100 if (MB_CUR_MAX > 1)
101 build_wcs_upper_buffer (pstr);
102 else
103#endif /* RE_ENABLE_I18N */
104 build_upper_buffer (pstr);
105 }
106 else
107 {
108#ifdef RE_ENABLE_I18N
109 if (MB_CUR_MAX > 1)
110 build_wcs_buffer (pstr);
111 else
112#endif /* RE_ENABLE_I18N */
113 {
114 if (trans != NULL)
115 re_string_translate_buffer (pstr);
116 else
117 pstr->valid_len = len;
118 }
119 }
120
121 /* Initialized whole buffers, then valid_len == bufs_len. */
122 pstr->valid_len = pstr->bufs_len;
123 return REG_NOERROR;
124}
125
126/* Helper functions for re_string_allocate, and re_string_construct. */
127
128static reg_errcode_t
129re_string_realloc_buffers (pstr, new_buf_len)
130 re_string_t *pstr;
131 int new_buf_len;
132{
133#ifdef RE_ENABLE_I18N
134 if (MB_CUR_MAX > 1)
135 {
136 wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
137 if (BE (new_array == NULL, 0))
138 return REG_ESPACE;
139 pstr->wcs = new_array;
140 }
141#endif /* RE_ENABLE_I18N */
142 if (MBS_ALLOCATED (pstr))
143 {
144 unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
145 new_buf_len);
146 if (BE (new_array == NULL, 0))
147 return REG_ESPACE;
148 pstr->mbs = new_array;
149 }
150 if (MBS_CASE_ALLOCATED (pstr))
151 {
152 unsigned char *new_array = re_realloc (pstr->mbs_case, unsigned char,
153 new_buf_len);
154 if (BE (new_array == NULL, 0))
155 return REG_ESPACE;
156 pstr->mbs_case = new_array;
157 if (!MBS_ALLOCATED (pstr))
158 pstr->mbs = pstr->mbs_case;
159 }
160 pstr->bufs_len = new_buf_len;
161 return REG_NOERROR;
162}
163
164
165static void
166re_string_construct_common (str, len, pstr, trans, icase)
167 const char *str;
168 int len;
169 re_string_t *pstr;
170 RE_TRANSLATE_TYPE trans;
171 int icase;
172{
173 memset (pstr, '\0', sizeof (re_string_t));
174 pstr->raw_mbs = (const unsigned char *) str;
175 pstr->len = len;
176 pstr->trans = trans;
177 pstr->icase = icase ? 1 : 0;
178}
179
180#ifdef RE_ENABLE_I18N
181
182/* Build wide character buffer PSTR->WCS.
183 If the byte sequence of the string are:
184 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
185 Then wide character buffer will be:
186 <wc1> , WEOF , <wc2> , WEOF , <wc3>
187 We use WEOF for padding, they indicate that the position isn't
188 a first byte of a multibyte character.
189
190 Note that this function assumes PSTR->VALID_LEN elements are already
191 built and starts from PSTR->VALID_LEN. */
192
193static void
194build_wcs_buffer (pstr)
195 re_string_t *pstr;
196{
197 mbstate_t prev_st;
198 int byte_idx, end_idx, mbclen, remain_len;
199 /* Build the buffers from pstr->valid_len to either pstr->len or
200 pstr->bufs_len. */
201 end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
202 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
203 {
204 wchar_t wc;
205 remain_len = end_idx - byte_idx;
206 prev_st = pstr->cur_state;
207 mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
208 + byte_idx), remain_len, &pstr->cur_state);
209 if (BE (mbclen == (size_t) -2, 0))
210 {
211 /* The buffer doesn't have enough space, finish to build. */
212 pstr->cur_state = prev_st;
213 break;
214 }
215 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
216 {
217 /* We treat these cases as a singlebyte character. */
218 mbclen = 1;
219 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
220 pstr->cur_state = prev_st;
221 }
222
223 /* Apply the translateion if we need. */
224 if (pstr->trans != NULL && mbclen == 1)
225 {
226 int ch = pstr->trans[pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]];
227 pstr->mbs_case[byte_idx] = ch;
228 }
229 /* Write wide character and padding. */
230 pstr->wcs[byte_idx++] = wc;
231 /* Write paddings. */
232 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
233 pstr->wcs[byte_idx++] = WEOF;
234 }
235 pstr->valid_len = byte_idx;
236}
237
238/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
239 but for REG_ICASE. */
240
241static void
242build_wcs_upper_buffer (pstr)
243 re_string_t *pstr;
244{
245 mbstate_t prev_st;
246 int byte_idx, end_idx, mbclen, remain_len;
247 /* Build the buffers from pstr->valid_len to either pstr->len or
248 pstr->bufs_len. */
249 end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
250 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
251 {
252 wchar_t wc;
253 remain_len = end_idx - byte_idx;
254 prev_st = pstr->cur_state;
255 mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
256 + byte_idx), remain_len, &pstr->cur_state);
257 if (BE (mbclen == (size_t) -2, 0))
258 {
259 /* The buffer doesn't have enough space, finish to build. */
260 pstr->cur_state = prev_st;
261 break;
262 }
263 else if (mbclen == 1 || mbclen == (size_t) -1 || mbclen == 0)
264 {
265 /* In case of a singlebyte character. */
266 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
267 /* Apply the translateion if we need. */
268 if (pstr->trans != NULL && mbclen == 1)
269 {
270 ch = pstr->trans[ch];
271 pstr->mbs_case[byte_idx] = ch;
272 }
273 pstr->wcs[byte_idx] = iswlower (wc) ? toupper (wc) : wc;
274 pstr->mbs[byte_idx++] = islower (ch) ? toupper (ch) : ch;
275 if (BE (mbclen == (size_t) -1, 0))
276 pstr->cur_state = prev_st;
277 }
278 else /* mbclen > 1 */
279 {
280 if (iswlower (wc))
281 wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc), &prev_st);
282 else
283 memcpy (pstr->mbs + byte_idx,
284 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
285 pstr->wcs[byte_idx++] = iswlower (wc) ? toupper (wc) : wc;
286 /* Write paddings. */
287 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
288 pstr->wcs[byte_idx++] = WEOF;
289 }
290 }
291 pstr->valid_len = byte_idx;
292}
293
294/* Skip characters until the index becomes greater than NEW_RAW_IDX.
295 Return the index. */
296
297static int
298re_string_skip_chars (pstr, new_raw_idx, last_wc)
299 re_string_t *pstr;
300 int new_raw_idx;
301 wint_t *last_wc;
302{
303 mbstate_t prev_st;
304 int rawbuf_idx, mbclen;
305 wchar_t wc = 0;
306
307 /* Skip the characters which are not necessary to check. */
308 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_len;
309 rawbuf_idx < new_raw_idx;)
310 {
311 int remain_len;
312 remain_len = pstr->len - rawbuf_idx;
313 prev_st = pstr->cur_state;
314 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
315 remain_len, &pstr->cur_state);
316 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
317 {
318 /* We treat these cases as a singlebyte character. */
319 mbclen = 1;
320 pstr->cur_state = prev_st;
321 }
322 /* Then proceed the next character. */
323 rawbuf_idx += mbclen;
324 }
325 *last_wc = (wint_t) wc;
326 return rawbuf_idx;
327}
328#endif /* RE_ENABLE_I18N */
329
330/* Build the buffer PSTR->MBS, and apply the translation if we need.
331 This function is used in case of REG_ICASE. */
332
333static void
334build_upper_buffer (pstr)
335 re_string_t *pstr;
336{
337 int char_idx, end_idx;
338 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
339
340 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
341 {
342 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
343 if (pstr->trans != NULL)
344 {
345 ch = pstr->trans[ch];
346 pstr->mbs_case[char_idx] = ch;
347 }
348 if (islower (ch))
349 pstr->mbs[char_idx] = toupper (ch);
350 else
351 pstr->mbs[char_idx] = ch;
352 }
353 pstr->valid_len = char_idx;
354}
355
356/* Apply TRANS to the buffer in PSTR. */
357
358static void
359re_string_translate_buffer (pstr)
360 re_string_t *pstr;
361{
362 int buf_idx, end_idx;
363 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
364
365 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
366 {
367 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
368 pstr->mbs_case[buf_idx] = pstr->trans[ch];
369 }
370
371 pstr->valid_len = buf_idx;
372}
373
374/* This function re-construct the buffers.
375 Concretely, convert to wide character in case of MB_CUR_MAX > 1,
376 convert to upper case in case of REG_ICASE, apply translation. */
377
378static reg_errcode_t
379re_string_reconstruct (pstr, idx, eflags, newline)
380 re_string_t *pstr;
381 int idx, eflags, newline;
382{
383 int offset = idx - pstr->raw_mbs_idx;
384 if (offset < 0)
385 {
386 /* Reset buffer. */
387#ifdef RE_ENABLE_I18N
388 if (MB_CUR_MAX > 1)
389 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
390#endif /* RE_ENABLE_I18N */
391 pstr->len += pstr->raw_mbs_idx;
392 pstr->stop += pstr->raw_mbs_idx;
393 pstr->valid_len = pstr->raw_mbs_idx = 0;
394 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
395 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
396 if (!MBS_CASE_ALLOCATED (pstr))
397 pstr->mbs_case = (unsigned char *) pstr->raw_mbs;
398 if (!MBS_ALLOCATED (pstr) && !MBS_CASE_ALLOCATED (pstr))
399 pstr->mbs = (unsigned char *) pstr->raw_mbs;
400 offset = idx;
401 }
402
403 if (offset != 0)
404 {
405 /* Are the characters which are already checked remain? */
406 if (offset < pstr->valid_len)
407 {
408 /* Yes, move them to the front of the buffer. */
409 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags,
410 newline);
411#ifdef RE_ENABLE_I18N
412 if (MB_CUR_MAX > 1)
413 memmove (pstr->wcs, pstr->wcs + offset,
414 (pstr->valid_len - offset) * sizeof (wint_t));
415#endif /* RE_ENABLE_I18N */
416 if (MBS_ALLOCATED (pstr))
417 memmove (pstr->mbs, pstr->mbs + offset,
418 pstr->valid_len - offset);
419 if (MBS_CASE_ALLOCATED (pstr))
420 memmove (pstr->mbs_case, pstr->mbs_case + offset,
421 pstr->valid_len - offset);
422 pstr->valid_len -= offset;
423#if DEBUG
424 assert (pstr->valid_len > 0);
425#endif
426 }
427 else
428 {
429 /* No, skip all characters until IDX. */
430 pstr->valid_len = 0;
431#ifdef RE_ENABLE_I18N
432 if (MB_CUR_MAX > 1)
433 {
434 int wcs_idx;
435 wint_t wc;
436 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
437 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
438 pstr->wcs[wcs_idx] = WEOF;
439 if (pstr->trans && wc <= 0xff)
440 wc = pstr->trans[wc];
441 pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD
442 : ((newline && IS_WIDE_NEWLINE (wc))
443 ? CONTEXT_NEWLINE : 0));
444 }
445 else
446#endif /* RE_ENABLE_I18N */
447 {
448 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
449 if (pstr->trans)
450 c = pstr->trans[c];
451 pstr->tip_context = (IS_WORD_CHAR (c) ? CONTEXT_WORD
452 : ((newline && IS_NEWLINE (c))
453 ? CONTEXT_NEWLINE : 0));
454 }
455 }
456 if (!MBS_CASE_ALLOCATED (pstr))
457 {
458 pstr->mbs_case += offset;
459 /* In case of !MBS_ALLOCATED && !MBS_CASE_ALLOCATED. */
460 if (!MBS_ALLOCATED (pstr))
461 pstr->mbs += offset;
462 }
463 }
464 pstr->raw_mbs_idx = idx;
465 pstr->len -= offset;
466 pstr->stop -= offset;
467
468 /* Then build the buffers. */
469#ifdef RE_ENABLE_I18N
470 if (MB_CUR_MAX > 1)
471 {
472 if (pstr->icase)
473 build_wcs_upper_buffer (pstr);
474 else
475 build_wcs_buffer (pstr);
476 }
477 else
478#endif /* RE_ENABLE_I18N */
479 {
480 if (pstr->icase)
481 build_upper_buffer (pstr);
482 else if (pstr->trans != NULL)
483 re_string_translate_buffer (pstr);
484 }
485 pstr->cur_idx = 0;
486
487 return REG_NOERROR;
488}
489
490static void
491re_string_destruct (pstr)
492 re_string_t *pstr;
493{
494#ifdef RE_ENABLE_I18N
495 re_free (pstr->wcs);
496#endif /* RE_ENABLE_I18N */
497 if (MBS_ALLOCATED (pstr))
498 re_free (pstr->mbs);
499 if (MBS_CASE_ALLOCATED (pstr))
500 re_free (pstr->mbs_case);
501}
502
503/* Return the context at IDX in INPUT. */
504
505static unsigned int
506re_string_context_at (input, idx, eflags, newline_anchor)
507 const re_string_t *input;
508 int idx, eflags, newline_anchor;
509{
510 int c;
511 if (idx < 0 || idx == input->len)
512 {
513 if (idx < 0)
514 /* In this case, we use the value stored in input->tip_context,
515 since we can't know the character in input->mbs[-1] here. */
516 return input->tip_context;
517 else /* (idx == input->len) */
518 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
519 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
520 }
521#ifdef RE_ENABLE_I18N
522 if (MB_CUR_MAX > 1)
523 {
524 wint_t wc;
525 int wc_idx = idx;
526 while(input->wcs[wc_idx] == WEOF)
527 {
528#ifdef DEBUG
529 /* It must not happen. */
530 assert (wc_idx >= 0);
531#endif
532 --wc_idx;
533 if (wc_idx < 0)
534 return input->tip_context;
535 }
536 wc = input->wcs[wc_idx];
537 if (IS_WIDE_WORD_CHAR (wc))
538 return CONTEXT_WORD;
539 return (newline_anchor && IS_WIDE_NEWLINE (wc)) ? CONTEXT_NEWLINE : 0;
540 }
541 else
542#endif
543 {
544 c = re_string_byte_at (input, idx);
545 if (IS_WORD_CHAR (c))
546 return CONTEXT_WORD;
547 return (newline_anchor && IS_NEWLINE (c)) ? CONTEXT_NEWLINE : 0;
548 }
549}
550\f
551/* Functions for set operation. */
552
553static reg_errcode_t
554re_node_set_alloc (set, size)
555 re_node_set *set;
556 int size;
557{
558 set->alloc = size;
559 set->nelem = 0;
560 set->elems = re_malloc (int, size);
561 if (BE (set->elems == NULL, 0))
562 return REG_ESPACE;
563 return REG_NOERROR;
564}
565
566static reg_errcode_t
567re_node_set_init_1 (set, elem)
568 re_node_set *set;
569 int elem;
570{
571 set->alloc = 1;
572 set->nelem = 1;
573 set->elems = re_malloc (int, 1);
574 if (BE (set->elems == NULL, 0))
575 {
576 set->alloc = set->nelem = 0;
577 return REG_ESPACE;
578 }
579 set->elems[0] = elem;
580 return REG_NOERROR;
581}
582
583static reg_errcode_t
584re_node_set_init_2 (set, elem1, elem2)
585 re_node_set *set;
586 int elem1, elem2;
587{
588 set->alloc = 2;
589 set->elems = re_malloc (int, 2);
590 if (BE (set->elems == NULL, 0))
591 return REG_ESPACE;
592 if (elem1 == elem2)
593 {
594 set->nelem = 1;
595 set->elems[0] = elem1;
596 }
597 else
598 {
599 set->nelem = 2;
600 if (elem1 < elem2)
601 {
602 set->elems[0] = elem1;
603 set->elems[1] = elem2;
604 }
605 else
606 {
607 set->elems[0] = elem2;
608 set->elems[1] = elem1;
609 }
610 }
611 return REG_NOERROR;
612}
613
614static reg_errcode_t
615re_node_set_init_copy (dest, src)
616 re_node_set *dest;
617 const re_node_set *src;
618{
619 dest->nelem = src->nelem;
620 if (src->nelem > 0)
621 {
622 dest->alloc = dest->nelem;
623 dest->elems = re_malloc (int, dest->alloc);
624 if (BE (dest->elems == NULL, 0))
625 {
626 dest->alloc = dest->nelem = 0;
627 return REG_ESPACE;
628 }
629 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
630 }
631 else
632 re_node_set_init_empty (dest);
633 return REG_NOERROR;
634}
635
636/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
637 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
638 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
639
640static reg_errcode_t
641re_node_set_add_intersect (dest, src1, src2)
642 re_node_set *dest;
643 const re_node_set *src1, *src2;
644{
645 int i1, i2, id;
646 if (src1->nelem > 0 && src2->nelem > 0)
647 {
648 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
649 {
650 dest->alloc = src1->nelem + src2->nelem + dest->nelem;
651 dest->elems = re_realloc (dest->elems, int, dest->alloc);
652 if (BE (dest->elems == NULL, 0))
653 return REG_ESPACE;
654 }
655 }
656 else
657 return REG_NOERROR;
658
659 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
660 {
661 if (src1->elems[i1] > src2->elems[i2])
662 {
663 ++i2;
664 continue;
665 }
666 if (src1->elems[i1] == src2->elems[i2])
667 {
668 while (id < dest->nelem && dest->elems[id] < src2->elems[i2])
669 ++id;
670 if (id < dest->nelem && dest->elems[id] == src2->elems[i2])
671 ++id;
672 else
673 {
674 memmove (dest->elems + id + 1, dest->elems + id,
675 sizeof (int) * (dest->nelem - id));
676 dest->elems[id++] = src2->elems[i2++];
677 ++dest->nelem;
678 }
679 }
680 ++i1;
681 }
682 return REG_NOERROR;
683}
684
685/* Calculate the union set of the sets SRC1 and SRC2. And store it to
686 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
687
688static reg_errcode_t
689re_node_set_init_union (dest, src1, src2)
690 re_node_set *dest;
691 const re_node_set *src1, *src2;
692{
693 int i1, i2, id;
694 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
695 {
696 dest->alloc = src1->nelem + src2->nelem;
697 dest->elems = re_malloc (int, dest->alloc);
698 if (BE (dest->elems == NULL, 0))
699 return REG_ESPACE;
700 }
701 else
702 {
703 if (src1 != NULL && src1->nelem > 0)
704 return re_node_set_init_copy (dest, src1);
705 else if (src2 != NULL && src2->nelem > 0)
706 return re_node_set_init_copy (dest, src2);
707 else
708 re_node_set_init_empty (dest);
709 return REG_NOERROR;
710 }
711 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
712 {
713 if (src1->elems[i1] > src2->elems[i2])
714 {
715 dest->elems[id++] = src2->elems[i2++];
716 continue;
717 }
718 if (src1->elems[i1] == src2->elems[i2])
719 ++i2;
720 dest->elems[id++] = src1->elems[i1++];
721 }
722 if (i1 < src1->nelem)
723 {
724 memcpy (dest->elems + id, src1->elems + i1,
725 (src1->nelem - i1) * sizeof (int));
726 id += src1->nelem - i1;
727 }
728 else if (i2 < src2->nelem)
729 {
730 memcpy (dest->elems + id, src2->elems + i2,
731 (src2->nelem - i2) * sizeof (int));
732 id += src2->nelem - i2;
733 }
734 dest->nelem = id;
735 return REG_NOERROR;
736}
737
738/* Calculate the union set of the sets DEST and SRC. And store it to
739 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
740
741static reg_errcode_t
742re_node_set_merge (dest, src)
743 re_node_set *dest;
744 const re_node_set *src;
745{
746 int si, di;
747 if (src == NULL || src->nelem == 0)
748 return REG_NOERROR;
749 if (dest->alloc < src->nelem + dest->nelem)
750 {
751 int *new_buffer;
752 dest->alloc = 2 * (src->nelem + dest->alloc);
753 new_buffer = re_realloc (dest->elems, int, dest->alloc);
754 if (BE (new_buffer == NULL, 0))
755 return REG_ESPACE;
756 dest->elems = new_buffer;
757 }
758
759 for (si = 0, di = 0 ; si < src->nelem && di < dest->nelem ;)
760 {
761 int cp_from, ncp, mid, right, src_elem = src->elems[si];
762 /* Binary search the spot we will add the new element. */
763 right = dest->nelem;
764 while (di < right)
765 {
766 mid = (di + right) / 2;
767 if (dest->elems[mid] < src_elem)
768 di = mid + 1;
769 else
770 right = mid;
771 }
772 if (di >= dest->nelem)
773 break;
774
775 if (dest->elems[di] == src_elem)
776 {
777 /* Skip since, DEST already has the element. */
778 ++di;
779 ++si;
780 continue;
781 }
782
783 /* Skip the src elements which are less than dest->elems[di]. */
784 cp_from = si;
785 while (si < src->nelem && src->elems[si] < dest->elems[di])
786 ++si;
787 /* Copy these src elements. */
788 ncp = si - cp_from;
789 memmove (dest->elems + di + ncp, dest->elems + di,
790 sizeof (int) * (dest->nelem - di));
791 memcpy (dest->elems + di, src->elems + cp_from,
792 sizeof (int) * ncp);
793 /* Update counters. */
794 di += ncp;
795 dest->nelem += ncp;
796 }
797
798 /* Copy remaining src elements. */
799 if (si < src->nelem)
800 {
801 memcpy (dest->elems + di, src->elems + si,
802 sizeof (int) * (src->nelem - si));
803 dest->nelem += src->nelem - si;
804 }
805 return REG_NOERROR;
806}
807
808/* Insert the new element ELEM to the re_node_set* SET.
809 return 0 if SET already has ELEM,
810 return -1 if an error is occured, return 1 otherwise. */
811
812static int
813re_node_set_insert (set, elem)
814 re_node_set *set;
815 int elem;
816{
817 int idx, right, mid;
818 /* In case of the set is empty. */
819 if (set->elems == NULL || set->alloc == 0)
820 {
821 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
822 return 1;
823 else
824 return -1;
825 }
826
827 /* Binary search the spot we will add the new element. */
828 idx = 0;
829 right = set->nelem;
830 while (idx < right)
831 {
832 mid = (idx + right) / 2;
833 if (set->elems[mid] < elem)
834 idx = mid + 1;
835 else
836 right = mid;
837 }
838
839 /* Realloc if we need. */
840 if (set->alloc < set->nelem + 1)
841 {
842 int *new_array;
843 set->alloc = set->alloc * 2;
844 new_array = re_malloc (int, set->alloc);
845 if (BE (new_array == NULL, 0))
846 return -1;
847 /* Copy the elements they are followed by the new element. */
848 if (idx > 0)
849 memcpy (new_array, set->elems, sizeof (int) * (idx));
850 /* Copy the elements which follows the new element. */
851 if (set->nelem - idx > 0)
852 memcpy (new_array + idx + 1, set->elems + idx,
853 sizeof (int) * (set->nelem - idx));
854 re_free (set->elems);
855 set->elems = new_array;
856 }
857 else
858 {
859 /* Move the elements which follows the new element. */
860 if (set->nelem - idx > 0)
861 memmove (set->elems + idx + 1, set->elems + idx,
862 sizeof (int) * (set->nelem - idx));
863 }
864 /* Insert the new element. */
865 set->elems[idx] = elem;
866 ++set->nelem;
867 return 1;
868}
869
870/* Compare two node sets SET1 and SET2.
871 return 1 if SET1 and SET2 are equivalent, retrun 0 otherwise. */
872
873static int
874re_node_set_compare (set1, set2)
875 const re_node_set *set1, *set2;
876{
877 int i;
878 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
879 return 0;
880 for (i = 0 ; i < set1->nelem ; i++)
881 if (set1->elems[i] != set2->elems[i])
882 return 0;
883 return 1;
884}
885
886/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
887
888static int
889re_node_set_contains (set, elem)
890 const re_node_set *set;
891 int elem;
892{
893 int idx, right, mid;
894 if (set->nelem <= 0)
895 return 0;
896
897 /* Binary search the element. */
898 idx = 0;
899 right = set->nelem - 1;
900 while (idx < right)
901 {
902 mid = (idx + right) / 2;
903 if (set->elems[mid] < elem)
904 idx = mid + 1;
905 else
906 right = mid;
907 }
908 return set->elems[idx] == elem ? idx + 1 : 0;
909}
910
911static void
912re_node_set_remove_at (set, idx)
913 re_node_set *set;
914 int idx;
915{
916 if (idx < 0 || idx >= set->nelem)
917 return;
918 if (idx < set->nelem - 1)
919 memmove (set->elems + idx, set->elems + idx + 1,
920 sizeof (int) * (set->nelem - idx - 1));
921 --set->nelem;
922}
923\f
924
925/* Add the token TOKEN to dfa->nodes, and return the index of the token.
926 Or return -1, if an error will be occured. */
927
928static int
929re_dfa_add_node (dfa, token, mode)
930 re_dfa_t *dfa;
931 re_token_t token;
932 int mode;
933{
934 if (dfa->nodes_len >= dfa->nodes_alloc)
935 {
936 re_token_t *new_array;
937 dfa->nodes_alloc *= 2;
938 new_array = re_realloc (dfa->nodes, re_token_t, dfa->nodes_alloc);
939 if (BE (new_array == NULL, 0))
940 return -1;
941 else
942 dfa->nodes = new_array;
943 if (mode)
944 {
945 int *new_nexts, *new_indices;
946 re_node_set *new_edests, *new_eclosures, *new_inveclosures;
947
948 new_nexts = re_realloc (dfa->nexts, int, dfa->nodes_alloc);
949 new_indices = re_realloc (dfa->org_indices, int, dfa->nodes_alloc);
950 new_edests = re_realloc (dfa->edests, re_node_set, dfa->nodes_alloc);
951 new_eclosures = re_realloc (dfa->eclosures, re_node_set,
952 dfa->nodes_alloc);
953 new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
954 dfa->nodes_alloc);
955 if (BE (new_nexts == NULL || new_indices == NULL
956 || new_edests == NULL || new_eclosures == NULL
957 || new_inveclosures == NULL, 0))
958 return -1;
959 dfa->nexts = new_nexts;
960 dfa->org_indices = new_indices;
961 dfa->edests = new_edests;
962 dfa->eclosures = new_eclosures;
963 dfa->inveclosures = new_inveclosures;
964 }
965 }
966 dfa->nodes[dfa->nodes_len] = token;
967 dfa->nodes[dfa->nodes_len].duplicated = 0;
968 dfa->nodes[dfa->nodes_len].constraint = 0;
969 return dfa->nodes_len++;
970}
971
972static unsigned int inline
973calc_state_hash (nodes, context)
974 const re_node_set *nodes;
975 unsigned int context;
976{
977 unsigned int hash = nodes->nelem + context;
978 int i;
979 for (i = 0 ; i < nodes->nelem ; i++)
980 hash += nodes->elems[i];
981 return hash;
982}
983
984/* Search for the state whose node_set is equivalent to NODES.
985 Return the pointer to the state, if we found it in the DFA.
986 Otherwise create the new one and return it. In case of an error
987 return NULL and set the error code in ERR.
988 Note: - We assume NULL as the invalid state, then it is possible that
989 return value is NULL and ERR is REG_NOERROR.
990 - We never return non-NULL value in case of any errors, it is for
991 optimization. */
992
993static re_dfastate_t*
994re_acquire_state (err, dfa, nodes)
995 reg_errcode_t *err;
996 re_dfa_t *dfa;
997 const re_node_set *nodes;
998{
999 unsigned int hash;
1000 re_dfastate_t *new_state;
1001 struct re_state_table_entry *spot;
1002 int i;
1003 if (BE (nodes->nelem == 0, 0))
1004 {
1005 *err = REG_NOERROR;
1006 return NULL;
1007 }
1008 hash = calc_state_hash (nodes, 0);
1009 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1010
1011 for (i = 0 ; i < spot->num ; i++)
1012 {
1013 re_dfastate_t *state = spot->array[i];
1014 if (hash != state->hash)
1015 continue;
1016 if (re_node_set_compare (&state->nodes, nodes))
1017 return state;
1018 }
1019
1020 /* There are no appropriate state in the dfa, create the new one. */
1021 new_state = create_ci_newstate (dfa, nodes, hash);
1022 if (BE (new_state != NULL, 1))
1023 return new_state;
1024 else
1025 {
1026 *err = REG_ESPACE;
1027 return NULL;
1028 }
1029}
1030
1031/* Search for the state whose node_set is equivalent to NODES and
1032 whose context is equivalent to CONTEXT.
1033 Return the pointer to the state, if we found it in the DFA.
1034 Otherwise create the new one and return it. In case of an error
1035 return NULL and set the error code in ERR.
1036 Note: - We assume NULL as the invalid state, then it is possible that
1037 return value is NULL and ERR is REG_NOERROR.
1038 - We never return non-NULL value in case of any errors, it is for
1039 optimization. */
1040
1041static re_dfastate_t*
1042re_acquire_state_context (err, dfa, nodes, context)
1043 reg_errcode_t *err;
1044 re_dfa_t *dfa;
1045 const re_node_set *nodes;
1046 unsigned int context;
1047{
1048 unsigned int hash;
1049 re_dfastate_t *new_state;
1050 struct re_state_table_entry *spot;
1051 int i;
1052 if (nodes->nelem == 0)
1053 {
1054 *err = REG_NOERROR;
1055 return NULL;
1056 }
1057 hash = calc_state_hash (nodes, context);
1058 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1059
1060 for (i = 0 ; i < spot->num ; i++)
1061 {
1062 re_dfastate_t *state = spot->array[i];
1063 if (hash != state->hash)
1064 continue;
1065 if (re_node_set_compare (state->entrance_nodes, nodes)
1066 && state->context == context)
1067 return state;
1068 }
1069 /* There are no appropriate state in `dfa', create the new one. */
1070 new_state = create_cd_newstate (dfa, nodes, context, hash);
1071 if (BE (new_state != NULL, 1))
1072 return new_state;
1073 else
1074 {
1075 *err = REG_ESPACE;
1076 return NULL;
1077 }
1078}
1079
1080/* Allocate memory for DFA state and initialize common properties.
1081 Return the new state if succeeded, otherwise return NULL. */
1082
1083static re_dfastate_t *
1084create_newstate_common (dfa, nodes, hash)
1085 re_dfa_t *dfa;
1086 const re_node_set *nodes;
1087 unsigned int hash;
1088{
1089 re_dfastate_t *newstate;
1090 reg_errcode_t err;
1091 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1092 if (BE (newstate == NULL, 0))
1093 return NULL;
1094 err = re_node_set_init_copy (&newstate->nodes, nodes);
1095 if (BE (err != REG_NOERROR, 0))
1096 {
1097 re_free (newstate);
1098 return NULL;
1099 }
1100 newstate->trtable = NULL;
1101 newstate->trtable_search = NULL;
1102 newstate->hash = hash;
1103 return newstate;
1104}
1105
1106/* Store the new state NEWSTATE whose hash value is HASH in appropriate
1107 position. Return value indicate the error code if failed. */
1108
1109static reg_errcode_t
1110register_state (dfa, newstate, hash)
1111 re_dfa_t *dfa;
1112 re_dfastate_t *newstate;
1113 unsigned int hash;
1114{
1115 struct re_state_table_entry *spot;
1116 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1117
1118 if (spot->alloc <= spot->num)
1119 {
1120 re_dfastate_t **new_array;
1121 spot->alloc = 2 * spot->num + 2;
1122 new_array = re_realloc (spot->array, re_dfastate_t *, spot->alloc);
1123 if (BE (new_array == NULL, 0))
1124 return REG_ESPACE;
1125 spot->array = new_array;
1126 }
1127 spot->array[spot->num++] = newstate;
1128 return REG_NOERROR;
1129}
1130
1131/* Create the new state which is independ of contexts.
1132 Return the new state if succeeded, otherwise return NULL. */
1133
1134static re_dfastate_t *
1135create_ci_newstate (dfa, nodes, hash)
1136 re_dfa_t *dfa;
1137 const re_node_set *nodes;
1138 unsigned int hash;
1139{
1140 int i;
1141 reg_errcode_t err;
1142 re_dfastate_t *newstate;
1143 newstate = create_newstate_common (dfa, nodes, hash);
1144 if (BE (newstate == NULL, 0))
1145 return NULL;
1146 newstate->entrance_nodes = &newstate->nodes;
1147
1148 for (i = 0 ; i < nodes->nelem ; i++)
1149 {
1150 re_token_t *node = dfa->nodes + nodes->elems[i];
1151 re_token_type_t type = node->type;
1152 if (type == CHARACTER && !node->constraint)
1153 continue;
1154
1155 /* If the state has the halt node, the state is a halt state. */
1156 else if (type == END_OF_RE)
1157 newstate->halt = 1;
1158#ifdef RE_ENABLE_I18N
1159 else if (type == COMPLEX_BRACKET
1160 || (type == OP_PERIOD && MB_CUR_MAX > 1))
1161 newstate->accept_mb = 1;
1162#endif /* RE_ENABLE_I18N */
1163 else if (type == OP_BACK_REF)
1164 newstate->has_backref = 1;
1165 else if (type == ANCHOR || node->constraint)
1166 newstate->has_constraint = 1;
1167 }
1168 err = register_state (dfa, newstate, hash);
1169 if (BE (err != REG_NOERROR, 0))
1170 {
1171 free_state (newstate);
1172 newstate = NULL;
1173 }
1174 return newstate;
1175}
1176
1177/* Create the new state which is depend on the context CONTEXT.
1178 Return the new state if succeeded, otherwise return NULL. */
1179
1180static re_dfastate_t *
1181create_cd_newstate (dfa, nodes, context, hash)
1182 re_dfa_t *dfa;
1183 const re_node_set *nodes;
1184 unsigned int context, hash;
1185{
1186 int i, nctx_nodes = 0;
1187 reg_errcode_t err;
1188 re_dfastate_t *newstate;
1189
1190 newstate = create_newstate_common (dfa, nodes, hash);
1191 if (BE (newstate == NULL, 0))
1192 return NULL;
1193 newstate->context = context;
1194 newstate->entrance_nodes = &newstate->nodes;
1195
1196 for (i = 0 ; i < nodes->nelem ; i++)
1197 {
1198 unsigned int constraint = 0;
1199 re_token_t *node = dfa->nodes + nodes->elems[i];
1200 re_token_type_t type = node->type;
1201 if (node->constraint)
1202 constraint = node->constraint;
1203
1204 if (type == CHARACTER && !constraint)
1205 continue;
1206 /* If the state has the halt node, the state is a halt state. */
1207 else if (type == END_OF_RE)
1208 newstate->halt = 1;
1209#ifdef RE_ENABLE_I18N
1210 else if (type == COMPLEX_BRACKET
1211 || (type == OP_PERIOD && MB_CUR_MAX > 1))
1212 newstate->accept_mb = 1;
1213#endif /* RE_ENABLE_I18N */
1214 else if (type == OP_BACK_REF)
1215 newstate->has_backref = 1;
1216 else if (type == ANCHOR)
1217 constraint = node->opr.ctx_type;
1218
1219 if (constraint)
1220 {
1221 if (newstate->entrance_nodes == &newstate->nodes)
1222 {
1223 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1224 if (BE (newstate->entrance_nodes == NULL, 0))
1225 {
1226 free_state (newstate);
1227 return NULL;
1228 }
1229 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1230 nctx_nodes = 0;
1231 newstate->has_constraint = 1;
1232 }
1233
1234 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1235 {
1236 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1237 ++nctx_nodes;
1238 }
1239 }
1240 }
1241 err = register_state (dfa, newstate, hash);
1242 if (BE (err != REG_NOERROR, 0))
1243 {
1244 free_state (newstate);
1245 newstate = NULL;
1246 }
1247 return newstate;
1248}
1249
1250static void
1251free_state (state)
1252 re_dfastate_t *state;
1253{
1254 if (state->entrance_nodes != &state->nodes)
1255 {
1256 re_node_set_free (state->entrance_nodes);
1257 re_free (state->entrance_nodes);
1258 }
1259 re_node_set_free (&state->nodes);
1260 re_free (state->trtable);
1261 re_free (state->trtable_search);
1262 re_free (state);
1263}