]> git.proxmox.com Git - mirror_smartmontools-debian.git/blame - regex/regex_internal.c
import smartmontools 7.0
[mirror_smartmontools-debian.git] / regex / regex_internal.c
CommitLineData
832b75ed 1/* Extended regular expression matching and search library.
ff28b140 2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
832b75ed
GG
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
ff28b140
TL
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
832b75ed 19
ff28b140 20static void re_string_construct_common (const char *str, Idx len,
832b75ed 21 re_string_t *pstr,
ff28b140
TL
22 RE_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa);
24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
832b75ed 25 const re_node_set *nodes,
ff28b140
TL
26 re_hashval_t hash);
27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
832b75ed
GG
28 const re_node_set *nodes,
29 unsigned int context,
ff28b140
TL
30 re_hashval_t hash);
31static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32 Idx new_buf_len);
33#ifdef RE_ENABLE_I18N
34static void build_wcs_buffer (re_string_t *pstr);
35static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36#endif /* RE_ENABLE_I18N */
37static void build_upper_buffer (re_string_t *pstr);
38static void re_string_translate_buffer (re_string_t *pstr);
39static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40 int eflags) __attribute__ ((pure));
832b75ed
GG
41\f
42/* Functions for string operation. */
43
44/* This function allocate the buffers. It is necessary to call
45 re_string_reconstruct before using the object. */
46
47static reg_errcode_t
ff28b140
TL
48__attribute_warn_unused_result__
49re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
50 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
832b75ed
GG
51{
52 reg_errcode_t ret;
ff28b140
TL
53 Idx init_buf_len;
54
55 /* Ensure at least one character fits into the buffers. */
56 if (init_len < dfa->mb_cur_max)
57 init_len = dfa->mb_cur_max;
58 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
59 re_string_construct_common (str, len, pstr, trans, icase, dfa);
832b75ed
GG
60
61 ret = re_string_realloc_buffers (pstr, init_buf_len);
62 if (BE (ret != REG_NOERROR, 0))
63 return ret;
64
ff28b140
TL
65 pstr->word_char = dfa->word_char;
66 pstr->word_ops_used = dfa->word_ops_used;
67 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
69 pstr->valid_raw_len = pstr->valid_len;
832b75ed
GG
70 return REG_NOERROR;
71}
72
73/* This function allocate the buffers, and initialize them. */
74
75static reg_errcode_t
ff28b140
TL
76__attribute_warn_unused_result__
77re_string_construct (re_string_t *pstr, const char *str, Idx len,
78 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
832b75ed
GG
79{
80 reg_errcode_t ret;
ff28b140
TL
81 memset (pstr, '\0', sizeof (re_string_t));
82 re_string_construct_common (str, len, pstr, trans, icase, dfa);
832b75ed
GG
83
84 if (len > 0)
85 {
86 ret = re_string_realloc_buffers (pstr, len + 1);
87 if (BE (ret != REG_NOERROR, 0))
88 return ret;
89 }
ff28b140 90 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
832b75ed
GG
91
92 if (icase)
93 {
94#ifdef RE_ENABLE_I18N
ff28b140
TL
95 if (dfa->mb_cur_max > 1)
96 {
97 while (1)
98 {
99 ret = build_wcs_upper_buffer (pstr);
100 if (BE (ret != REG_NOERROR, 0))
101 return ret;
102 if (pstr->valid_raw_len >= len)
103 break;
104 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105 break;
106 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
107 if (BE (ret != REG_NOERROR, 0))
108 return ret;
109 }
110 }
832b75ed
GG
111 else
112#endif /* RE_ENABLE_I18N */
113 build_upper_buffer (pstr);
114 }
115 else
116 {
117#ifdef RE_ENABLE_I18N
ff28b140 118 if (dfa->mb_cur_max > 1)
832b75ed
GG
119 build_wcs_buffer (pstr);
120 else
121#endif /* RE_ENABLE_I18N */
122 {
123 if (trans != NULL)
124 re_string_translate_buffer (pstr);
125 else
ff28b140
TL
126 {
127 pstr->valid_len = pstr->bufs_len;
128 pstr->valid_raw_len = pstr->bufs_len;
129 }
832b75ed
GG
130 }
131 }
132
832b75ed
GG
133 return REG_NOERROR;
134}
135
136/* Helper functions for re_string_allocate, and re_string_construct. */
137
138static reg_errcode_t
ff28b140
TL
139__attribute_warn_unused_result__
140re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
832b75ed
GG
141{
142#ifdef RE_ENABLE_I18N
ff28b140 143 if (pstr->mb_cur_max > 1)
832b75ed 144 {
ff28b140
TL
145 wint_t *new_wcs;
146
147 /* Avoid overflow in realloc. */
148 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
149 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
832b75ed 150 return REG_ESPACE;
ff28b140
TL
151
152 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
153 if (BE (new_wcs == NULL, 0))
832b75ed 154 return REG_ESPACE;
ff28b140
TL
155 pstr->wcs = new_wcs;
156 if (pstr->offsets != NULL)
157 {
158 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
159 if (BE (new_offsets == NULL, 0))
160 return REG_ESPACE;
161 pstr->offsets = new_offsets;
162 }
832b75ed 163 }
ff28b140
TL
164#endif /* RE_ENABLE_I18N */
165 if (pstr->mbs_allocated)
832b75ed 166 {
ff28b140
TL
167 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
168 new_buf_len);
169 if (BE (new_mbs == NULL, 0))
832b75ed 170 return REG_ESPACE;
ff28b140 171 pstr->mbs = new_mbs;
832b75ed
GG
172 }
173 pstr->bufs_len = new_buf_len;
174 return REG_NOERROR;
175}
176
177
178static void
ff28b140
TL
179re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
180 RE_TRANSLATE_TYPE trans, bool icase,
181 const re_dfa_t *dfa)
832b75ed 182{
832b75ed
GG
183 pstr->raw_mbs = (const unsigned char *) str;
184 pstr->len = len;
ff28b140 185 pstr->raw_len = len;
832b75ed 186 pstr->trans = trans;
ff28b140
TL
187 pstr->icase = icase;
188 pstr->mbs_allocated = (trans != NULL || icase);
189 pstr->mb_cur_max = dfa->mb_cur_max;
190 pstr->is_utf8 = dfa->is_utf8;
191 pstr->map_notascii = dfa->map_notascii;
192 pstr->stop = pstr->len;
193 pstr->raw_stop = pstr->stop;
832b75ed
GG
194}
195
196#ifdef RE_ENABLE_I18N
197
198/* Build wide character buffer PSTR->WCS.
199 If the byte sequence of the string are:
200 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
201 Then wide character buffer will be:
202 <wc1> , WEOF , <wc2> , WEOF , <wc3>
203 We use WEOF for padding, they indicate that the position isn't
204 a first byte of a multibyte character.
205
206 Note that this function assumes PSTR->VALID_LEN elements are already
207 built and starts from PSTR->VALID_LEN. */
208
209static void
ff28b140 210build_wcs_buffer (re_string_t *pstr)
832b75ed 211{
ff28b140
TL
212#ifdef _LIBC
213 unsigned char buf[MB_LEN_MAX];
214 assert (MB_LEN_MAX >= pstr->mb_cur_max);
215#else
216 unsigned char buf[64];
217#endif
832b75ed 218 mbstate_t prev_st;
ff28b140
TL
219 Idx byte_idx, end_idx, remain_len;
220 size_t mbclen;
221
832b75ed
GG
222 /* Build the buffers from pstr->valid_len to either pstr->len or
223 pstr->bufs_len. */
ff28b140 224 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
832b75ed
GG
225 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
226 {
227 wchar_t wc;
ff28b140
TL
228 const char *p;
229
832b75ed
GG
230 remain_len = end_idx - byte_idx;
231 prev_st = pstr->cur_state;
ff28b140
TL
232 /* Apply the translation if we need. */
233 if (BE (pstr->trans != NULL, 0))
832b75ed 234 {
ff28b140
TL
235 int i, ch;
236
237 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
238 {
239 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
240 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
241 }
242 p = (const char *) buf;
832b75ed 243 }
ff28b140
TL
244 else
245 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
246 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
247 if (BE (mbclen == (size_t) -1 || mbclen == 0
248 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
832b75ed
GG
249 {
250 /* We treat these cases as a singlebyte character. */
251 mbclen = 1;
252 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
ff28b140
TL
253 if (BE (pstr->trans != NULL, 0))
254 wc = pstr->trans[wc];
832b75ed
GG
255 pstr->cur_state = prev_st;
256 }
ff28b140 257 else if (BE (mbclen == (size_t) -2, 0))
832b75ed 258 {
ff28b140
TL
259 /* The buffer doesn't have enough space, finish to build. */
260 pstr->cur_state = prev_st;
261 break;
832b75ed 262 }
ff28b140 263
832b75ed
GG
264 /* Write wide character and padding. */
265 pstr->wcs[byte_idx++] = wc;
266 /* Write paddings. */
267 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
268 pstr->wcs[byte_idx++] = WEOF;
269 }
270 pstr->valid_len = byte_idx;
ff28b140 271 pstr->valid_raw_len = byte_idx;
832b75ed
GG
272}
273
274/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
275 but for REG_ICASE. */
276
ff28b140
TL
277static reg_errcode_t
278__attribute_warn_unused_result__
279build_wcs_upper_buffer (re_string_t *pstr)
832b75ed
GG
280{
281 mbstate_t prev_st;
ff28b140
TL
282 Idx src_idx, byte_idx, end_idx, remain_len;
283 size_t mbclen;
284#ifdef _LIBC
285 char buf[MB_LEN_MAX];
286 assert (MB_LEN_MAX >= pstr->mb_cur_max);
287#else
288 char buf[64];
289#endif
290
291 byte_idx = pstr->valid_len;
292 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
293
294 /* The following optimization assumes that ASCII characters can be
295 mapped to wide characters with a simple cast. */
296 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
832b75ed 297 {
ff28b140 298 while (byte_idx < end_idx)
832b75ed 299 {
ff28b140
TL
300 wchar_t wc;
301
302 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
303 && mbsinit (&pstr->cur_state))
832b75ed 304 {
ff28b140
TL
305 /* In case of a singlebyte character. */
306 pstr->mbs[byte_idx]
307 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
308 /* The next step uses the assumption that wchar_t is encoded
309 ASCII-safe: all ASCII values can be converted like this. */
310 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
311 ++byte_idx;
312 continue;
313 }
314
315 remain_len = end_idx - byte_idx;
316 prev_st = pstr->cur_state;
317 mbclen = __mbrtowc (&wc,
318 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
319 + byte_idx), remain_len, &pstr->cur_state);
320 if (BE (mbclen < (size_t) -2, 1))
321 {
322 wchar_t wcu = __towupper (wc);
323 if (wcu != wc)
324 {
325 size_t mbcdlen;
326
327 mbcdlen = __wcrtomb (buf, wcu, &prev_st);
328 if (BE (mbclen == mbcdlen, 1))
329 memcpy (pstr->mbs + byte_idx, buf, mbclen);
330 else
331 {
332 src_idx = byte_idx;
333 goto offsets_needed;
334 }
335 }
336 else
337 memcpy (pstr->mbs + byte_idx,
338 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
339 pstr->wcs[byte_idx++] = wcu;
340 /* Write paddings. */
341 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
342 pstr->wcs[byte_idx++] = WEOF;
343 }
344 else if (mbclen == (size_t) -1 || mbclen == 0
345 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
346 {
347 /* It is an invalid character, an incomplete character
348 at the end of the string, or '\0'. Just use the byte. */
349 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
350 pstr->mbs[byte_idx] = ch;
351 /* And also cast it to wide char. */
352 pstr->wcs[byte_idx++] = (wchar_t) ch;
353 if (BE (mbclen == (size_t) -1, 0))
354 pstr->cur_state = prev_st;
832b75ed 355 }
832b75ed 356 else
ff28b140
TL
357 {
358 /* The buffer doesn't have enough space, finish to build. */
359 pstr->cur_state = prev_st;
360 break;
361 }
832b75ed 362 }
ff28b140
TL
363 pstr->valid_len = byte_idx;
364 pstr->valid_raw_len = byte_idx;
365 return REG_NOERROR;
832b75ed 366 }
ff28b140
TL
367 else
368 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
369 {
370 wchar_t wc;
371 const char *p;
372 offsets_needed:
373 remain_len = end_idx - byte_idx;
374 prev_st = pstr->cur_state;
375 if (BE (pstr->trans != NULL, 0))
376 {
377 int i, ch;
378
379 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
380 {
381 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
382 buf[i] = pstr->trans[ch];
383 }
384 p = (const char *) buf;
385 }
386 else
387 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
388 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
389 if (BE (mbclen < (size_t) -2, 1))
390 {
391 wchar_t wcu = __towupper (wc);
392 if (wcu != wc)
393 {
394 size_t mbcdlen;
395
396 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
397 if (BE (mbclen == mbcdlen, 1))
398 memcpy (pstr->mbs + byte_idx, buf, mbclen);
399 else if (mbcdlen != (size_t) -1)
400 {
401 size_t i;
402
403 if (byte_idx + mbcdlen > pstr->bufs_len)
404 {
405 pstr->cur_state = prev_st;
406 break;
407 }
408
409 if (pstr->offsets == NULL)
410 {
411 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
412
413 if (pstr->offsets == NULL)
414 return REG_ESPACE;
415 }
416 if (!pstr->offsets_needed)
417 {
418 for (i = 0; i < (size_t) byte_idx; ++i)
419 pstr->offsets[i] = i;
420 pstr->offsets_needed = 1;
421 }
422
423 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
424 pstr->wcs[byte_idx] = wcu;
425 pstr->offsets[byte_idx] = src_idx;
426 for (i = 1; i < mbcdlen; ++i)
427 {
428 pstr->offsets[byte_idx + i]
429 = src_idx + (i < mbclen ? i : mbclen - 1);
430 pstr->wcs[byte_idx + i] = WEOF;
431 }
432 pstr->len += mbcdlen - mbclen;
433 if (pstr->raw_stop > src_idx)
434 pstr->stop += mbcdlen - mbclen;
435 end_idx = (pstr->bufs_len > pstr->len)
436 ? pstr->len : pstr->bufs_len;
437 byte_idx += mbcdlen;
438 src_idx += mbclen;
439 continue;
440 }
441 else
442 memcpy (pstr->mbs + byte_idx, p, mbclen);
443 }
444 else
445 memcpy (pstr->mbs + byte_idx, p, mbclen);
446
447 if (BE (pstr->offsets_needed != 0, 0))
448 {
449 size_t i;
450 for (i = 0; i < mbclen; ++i)
451 pstr->offsets[byte_idx + i] = src_idx + i;
452 }
453 src_idx += mbclen;
454
455 pstr->wcs[byte_idx++] = wcu;
456 /* Write paddings. */
457 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
458 pstr->wcs[byte_idx++] = WEOF;
459 }
460 else if (mbclen == (size_t) -1 || mbclen == 0
461 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
462 {
463 /* It is an invalid character or '\0'. Just use the byte. */
464 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
465
466 if (BE (pstr->trans != NULL, 0))
467 ch = pstr->trans [ch];
468 pstr->mbs[byte_idx] = ch;
469
470 if (BE (pstr->offsets_needed != 0, 0))
471 pstr->offsets[byte_idx] = src_idx;
472 ++src_idx;
473
474 /* And also cast it to wide char. */
475 pstr->wcs[byte_idx++] = (wchar_t) ch;
476 if (BE (mbclen == (size_t) -1, 0))
477 pstr->cur_state = prev_st;
478 }
479 else
480 {
481 /* The buffer doesn't have enough space, finish to build. */
482 pstr->cur_state = prev_st;
483 break;
484 }
485 }
832b75ed 486 pstr->valid_len = byte_idx;
ff28b140
TL
487 pstr->valid_raw_len = src_idx;
488 return REG_NOERROR;
832b75ed
GG
489}
490
491/* Skip characters until the index becomes greater than NEW_RAW_IDX.
492 Return the index. */
493
ff28b140
TL
494static Idx
495re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
832b75ed
GG
496{
497 mbstate_t prev_st;
ff28b140
TL
498 Idx rawbuf_idx;
499 size_t mbclen;
500 wint_t wc = WEOF;
832b75ed
GG
501
502 /* Skip the characters which are not necessary to check. */
ff28b140 503 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
832b75ed
GG
504 rawbuf_idx < new_raw_idx;)
505 {
ff28b140
TL
506 wchar_t wc2;
507 Idx remain_len = pstr->raw_len - rawbuf_idx;
832b75ed 508 prev_st = pstr->cur_state;
ff28b140
TL
509 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
510 remain_len, &pstr->cur_state);
832b75ed
GG
511 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
512 {
ff28b140
TL
513 /* We treat these cases as a single byte character. */
514 if (mbclen == 0 || remain_len == 0)
515 wc = L'\0';
516 else
517 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
832b75ed
GG
518 mbclen = 1;
519 pstr->cur_state = prev_st;
520 }
ff28b140
TL
521 else
522 wc = wc2;
832b75ed
GG
523 /* Then proceed the next character. */
524 rawbuf_idx += mbclen;
525 }
ff28b140 526 *last_wc = wc;
832b75ed
GG
527 return rawbuf_idx;
528}
529#endif /* RE_ENABLE_I18N */
530
531/* Build the buffer PSTR->MBS, and apply the translation if we need.
532 This function is used in case of REG_ICASE. */
533
534static void
ff28b140 535build_upper_buffer (re_string_t *pstr)
832b75ed 536{
ff28b140 537 Idx char_idx, end_idx;
832b75ed
GG
538 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
539
540 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
541 {
542 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
ff28b140
TL
543 if (BE (pstr->trans != NULL, 0))
544 ch = pstr->trans[ch];
545 pstr->mbs[char_idx] = toupper (ch);
832b75ed
GG
546 }
547 pstr->valid_len = char_idx;
ff28b140 548 pstr->valid_raw_len = char_idx;
832b75ed
GG
549}
550
551/* Apply TRANS to the buffer in PSTR. */
552
553static void
ff28b140 554re_string_translate_buffer (re_string_t *pstr)
832b75ed 555{
ff28b140 556 Idx buf_idx, end_idx;
832b75ed
GG
557 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
558
559 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
560 {
561 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
ff28b140 562 pstr->mbs[buf_idx] = pstr->trans[ch];
832b75ed
GG
563 }
564
565 pstr->valid_len = buf_idx;
ff28b140 566 pstr->valid_raw_len = buf_idx;
832b75ed
GG
567}
568
569/* This function re-construct the buffers.
ff28b140 570 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
832b75ed
GG
571 convert to upper case in case of REG_ICASE, apply translation. */
572
573static reg_errcode_t
ff28b140
TL
574__attribute_warn_unused_result__
575re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
832b75ed 576{
ff28b140
TL
577 Idx offset;
578
579 if (BE (pstr->raw_mbs_idx <= idx, 0))
580 offset = idx - pstr->raw_mbs_idx;
581 else
832b75ed
GG
582 {
583 /* Reset buffer. */
584#ifdef RE_ENABLE_I18N
ff28b140 585 if (pstr->mb_cur_max > 1)
832b75ed
GG
586 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
587#endif /* RE_ENABLE_I18N */
ff28b140
TL
588 pstr->len = pstr->raw_len;
589 pstr->stop = pstr->raw_stop;
590 pstr->valid_len = 0;
591 pstr->raw_mbs_idx = 0;
592 pstr->valid_raw_len = 0;
593 pstr->offsets_needed = 0;
832b75ed
GG
594 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
595 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
ff28b140 596 if (!pstr->mbs_allocated)
832b75ed
GG
597 pstr->mbs = (unsigned char *) pstr->raw_mbs;
598 offset = idx;
599 }
600
ff28b140 601 if (BE (offset != 0, 1))
832b75ed 602 {
ff28b140
TL
603 /* Should the already checked characters be kept? */
604 if (BE (offset < pstr->valid_raw_len, 1))
832b75ed
GG
605 {
606 /* Yes, move them to the front of the buffer. */
832b75ed 607#ifdef RE_ENABLE_I18N
ff28b140
TL
608 if (BE (pstr->offsets_needed, 0))
609 {
610 Idx low = 0, high = pstr->valid_len, mid;
611 do
612 {
613 mid = (high + low) / 2;
614 if (pstr->offsets[mid] > offset)
615 high = mid;
616 else if (pstr->offsets[mid] < offset)
617 low = mid + 1;
618 else
619 break;
620 }
621 while (low < high);
622 if (pstr->offsets[mid] < offset)
623 ++mid;
624 pstr->tip_context = re_string_context_at (pstr, mid - 1,
625 eflags);
626 /* This can be quite complicated, so handle specially
627 only the common and easy case where the character with
628 different length representation of lower and upper
629 case is present at or after offset. */
630 if (pstr->valid_len > offset
631 && mid == offset && pstr->offsets[mid] == offset)
632 {
633 memmove (pstr->wcs, pstr->wcs + offset,
634 (pstr->valid_len - offset) * sizeof (wint_t));
635 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
636 pstr->valid_len -= offset;
637 pstr->valid_raw_len -= offset;
638 for (low = 0; low < pstr->valid_len; low++)
639 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
640 }
641 else
642 {
643 /* Otherwise, just find out how long the partial multibyte
644 character at offset is and fill it with WEOF/255. */
645 pstr->len = pstr->raw_len - idx + offset;
646 pstr->stop = pstr->raw_stop - idx + offset;
647 pstr->offsets_needed = 0;
648 while (mid > 0 && pstr->offsets[mid - 1] == offset)
649 --mid;
650 while (mid < pstr->valid_len)
651 if (pstr->wcs[mid] != WEOF)
652 break;
653 else
654 ++mid;
655 if (mid == pstr->valid_len)
656 pstr->valid_len = 0;
657 else
658 {
659 pstr->valid_len = pstr->offsets[mid] - offset;
660 if (pstr->valid_len)
661 {
662 for (low = 0; low < pstr->valid_len; ++low)
663 pstr->wcs[low] = WEOF;
664 memset (pstr->mbs, 255, pstr->valid_len);
665 }
666 }
667 pstr->valid_raw_len = pstr->valid_len;
668 }
669 }
670 else
671#endif
672 {
673 pstr->tip_context = re_string_context_at (pstr, offset - 1,
674 eflags);
675#ifdef RE_ENABLE_I18N
676 if (pstr->mb_cur_max > 1)
677 memmove (pstr->wcs, pstr->wcs + offset,
678 (pstr->valid_len - offset) * sizeof (wint_t));
832b75ed 679#endif /* RE_ENABLE_I18N */
ff28b140
TL
680 if (BE (pstr->mbs_allocated, 0))
681 memmove (pstr->mbs, pstr->mbs + offset,
682 pstr->valid_len - offset);
683 pstr->valid_len -= offset;
684 pstr->valid_raw_len -= offset;
685#if defined DEBUG && DEBUG
686 assert (pstr->valid_len > 0);
832b75ed 687#endif
ff28b140 688 }
832b75ed
GG
689 }
690 else
691 {
ff28b140 692#ifdef RE_ENABLE_I18N
832b75ed 693 /* No, skip all characters until IDX. */
ff28b140
TL
694 Idx prev_valid_len = pstr->valid_len;
695
696 if (BE (pstr->offsets_needed, 0))
697 {
698 pstr->len = pstr->raw_len - idx + offset;
699 pstr->stop = pstr->raw_stop - idx + offset;
700 pstr->offsets_needed = 0;
701 }
702#endif
832b75ed
GG
703 pstr->valid_len = 0;
704#ifdef RE_ENABLE_I18N
ff28b140 705 if (pstr->mb_cur_max > 1)
832b75ed 706 {
ff28b140
TL
707 Idx wcs_idx;
708 wint_t wc = WEOF;
709
710 if (pstr->is_utf8)
711 {
712 const unsigned char *raw, *p, *end;
713
714 /* Special case UTF-8. Multi-byte chars start with any
715 byte other than 0x80 - 0xbf. */
716 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
717 end = raw + (offset - pstr->mb_cur_max);
718 if (end < pstr->raw_mbs)
719 end = pstr->raw_mbs;
720 p = raw + offset - 1;
721#ifdef _LIBC
722 /* We know the wchar_t encoding is UCS4, so for the simple
723 case, ASCII characters, skip the conversion step. */
724 if (isascii (*p) && BE (pstr->trans == NULL, 1))
725 {
726 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
727 /* pstr->valid_len = 0; */
728 wc = (wchar_t) *p;
729 }
730 else
731#endif
732 for (; p >= end; --p)
733 if ((*p & 0xc0) != 0x80)
734 {
735 mbstate_t cur_state;
736 wchar_t wc2;
737 Idx mlen = raw + pstr->len - p;
738 unsigned char buf[6];
739 size_t mbclen;
740
741 const unsigned char *pp = p;
742 if (BE (pstr->trans != NULL, 0))
743 {
744 int i = mlen < 6 ? mlen : 6;
745 while (--i >= 0)
746 buf[i] = pstr->trans[p[i]];
747 pp = buf;
748 }
749 /* XXX Don't use mbrtowc, we know which conversion
750 to use (UTF-8 -> UCS4). */
751 memset (&cur_state, 0, sizeof (cur_state));
752 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
753 &cur_state);
754 if (raw + offset - p <= mbclen
755 && mbclen < (size_t) -2)
756 {
757 memset (&pstr->cur_state, '\0',
758 sizeof (mbstate_t));
759 pstr->valid_len = mbclen - (raw + offset - p);
760 wc = wc2;
761 }
762 break;
763 }
764 }
765
766 if (wc == WEOF)
767 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
768 if (wc == WEOF)
769 pstr->tip_context
770 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
771 else
772 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
773 && IS_WIDE_WORD_CHAR (wc))
774 ? CONTEXT_WORD
775 : ((IS_WIDE_NEWLINE (wc)
776 && pstr->newline_anchor)
777 ? CONTEXT_NEWLINE : 0));
778 if (BE (pstr->valid_len, 0))
779 {
780 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
781 pstr->wcs[wcs_idx] = WEOF;
782 if (pstr->mbs_allocated)
783 memset (pstr->mbs, 255, pstr->valid_len);
784 }
785 pstr->valid_raw_len = pstr->valid_len;
832b75ed
GG
786 }
787 else
788#endif /* RE_ENABLE_I18N */
789 {
790 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
ff28b140 791 pstr->valid_raw_len = 0;
832b75ed
GG
792 if (pstr->trans)
793 c = pstr->trans[c];
ff28b140
TL
794 pstr->tip_context = (bitset_contain (pstr->word_char, c)
795 ? CONTEXT_WORD
796 : ((IS_NEWLINE (c) && pstr->newline_anchor)
832b75ed
GG
797 ? CONTEXT_NEWLINE : 0));
798 }
799 }
ff28b140
TL
800 if (!BE (pstr->mbs_allocated, 0))
801 pstr->mbs += offset;
832b75ed
GG
802 }
803 pstr->raw_mbs_idx = idx;
804 pstr->len -= offset;
805 pstr->stop -= offset;
806
807 /* Then build the buffers. */
808#ifdef RE_ENABLE_I18N
ff28b140 809 if (pstr->mb_cur_max > 1)
832b75ed
GG
810 {
811 if (pstr->icase)
ff28b140
TL
812 {
813 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
814 if (BE (ret != REG_NOERROR, 0))
815 return ret;
816 }
832b75ed
GG
817 else
818 build_wcs_buffer (pstr);
819 }
820 else
821#endif /* RE_ENABLE_I18N */
ff28b140
TL
822 if (BE (pstr->mbs_allocated, 0))
823 {
824 if (pstr->icase)
825 build_upper_buffer (pstr);
826 else if (pstr->trans != NULL)
827 re_string_translate_buffer (pstr);
828 }
829 else
830 pstr->valid_len = pstr->len;
831
832 pstr->cur_idx = 0;
833 return REG_NOERROR;
834}
835
836static unsigned char
837__attribute__ ((pure))
838re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
839{
840 int ch;
841 Idx off;
842
843 /* Handle the common (easiest) cases first. */
844 if (BE (!pstr->mbs_allocated, 1))
845 return re_string_peek_byte (pstr, idx);
846
847#ifdef RE_ENABLE_I18N
848 if (pstr->mb_cur_max > 1
849 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
850 return re_string_peek_byte (pstr, idx);
851#endif
852
853 off = pstr->cur_idx + idx;
854#ifdef RE_ENABLE_I18N
855 if (pstr->offsets_needed)
856 off = pstr->offsets[off];
857#endif
858
859 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
860
861#ifdef RE_ENABLE_I18N
862 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
863 this function returns CAPITAL LETTER I instead of first byte of
864 DOTLESS SMALL LETTER I. The latter would confuse the parser,
865 since peek_byte_case doesn't advance cur_idx in any way. */
866 if (pstr->offsets_needed && !isascii (ch))
867 return re_string_peek_byte (pstr, idx);
868#endif
869
870 return ch;
871}
872
873static unsigned char
874re_string_fetch_byte_case (re_string_t *pstr)
875{
876 if (BE (!pstr->mbs_allocated, 1))
877 return re_string_fetch_byte (pstr);
878
879#ifdef RE_ENABLE_I18N
880 if (pstr->offsets_needed)
832b75ed 881 {
ff28b140
TL
882 Idx off;
883 int ch;
884
885 /* For tr_TR.UTF-8 [[:islower:]] there is
886 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
887 in that case the whole multi-byte character and return
888 the original letter. On the other side, with
889 [[: DOTLESS SMALL LETTER I return [[:I, as doing
890 anything else would complicate things too much. */
891
892 if (!re_string_first_byte (pstr, pstr->cur_idx))
893 return re_string_fetch_byte (pstr);
894
895 off = pstr->offsets[pstr->cur_idx];
896 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
897
898 if (! isascii (ch))
899 return re_string_fetch_byte (pstr);
900
901 re_string_skip_bytes (pstr,
902 re_string_char_size_at (pstr, pstr->cur_idx));
903 return ch;
832b75ed 904 }
ff28b140 905#endif
832b75ed 906
ff28b140 907 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
832b75ed
GG
908}
909
910static void
ff28b140 911re_string_destruct (re_string_t *pstr)
832b75ed
GG
912{
913#ifdef RE_ENABLE_I18N
914 re_free (pstr->wcs);
ff28b140 915 re_free (pstr->offsets);
832b75ed 916#endif /* RE_ENABLE_I18N */
ff28b140 917 if (pstr->mbs_allocated)
832b75ed 918 re_free (pstr->mbs);
832b75ed
GG
919}
920
921/* Return the context at IDX in INPUT. */
922
923static unsigned int
ff28b140 924re_string_context_at (const re_string_t *input, Idx idx, int eflags)
832b75ed
GG
925{
926 int c;
ff28b140
TL
927 if (BE (idx < 0, 0))
928 /* In this case, we use the value stored in input->tip_context,
929 since we can't know the character in input->mbs[-1] here. */
930 return input->tip_context;
931 if (BE (idx == input->len, 0))
932 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
933 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
832b75ed 934#ifdef RE_ENABLE_I18N
ff28b140 935 if (input->mb_cur_max > 1)
832b75ed
GG
936 {
937 wint_t wc;
ff28b140 938 Idx wc_idx = idx;
832b75ed
GG
939 while(input->wcs[wc_idx] == WEOF)
940 {
ff28b140 941#if defined DEBUG && DEBUG
832b75ed
GG
942 /* It must not happen. */
943 assert (wc_idx >= 0);
944#endif
945 --wc_idx;
946 if (wc_idx < 0)
947 return input->tip_context;
948 }
949 wc = input->wcs[wc_idx];
ff28b140 950 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
832b75ed 951 return CONTEXT_WORD;
ff28b140
TL
952 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
953 ? CONTEXT_NEWLINE : 0);
832b75ed
GG
954 }
955 else
956#endif
957 {
958 c = re_string_byte_at (input, idx);
ff28b140 959 if (bitset_contain (input->word_char, c))
832b75ed 960 return CONTEXT_WORD;
ff28b140 961 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
832b75ed
GG
962 }
963}
964\f
965/* Functions for set operation. */
966
967static reg_errcode_t
ff28b140
TL
968__attribute_warn_unused_result__
969re_node_set_alloc (re_node_set *set, Idx size)
832b75ed
GG
970{
971 set->alloc = size;
972 set->nelem = 0;
ff28b140
TL
973 set->elems = re_malloc (Idx, size);
974 if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL || size != 0))
832b75ed
GG
975 return REG_ESPACE;
976 return REG_NOERROR;
977}
978
979static reg_errcode_t
ff28b140
TL
980__attribute_warn_unused_result__
981re_node_set_init_1 (re_node_set *set, Idx elem)
832b75ed
GG
982{
983 set->alloc = 1;
984 set->nelem = 1;
ff28b140 985 set->elems = re_malloc (Idx, 1);
832b75ed
GG
986 if (BE (set->elems == NULL, 0))
987 {
988 set->alloc = set->nelem = 0;
989 return REG_ESPACE;
990 }
991 set->elems[0] = elem;
992 return REG_NOERROR;
993}
994
995static reg_errcode_t
ff28b140
TL
996__attribute_warn_unused_result__
997re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
832b75ed
GG
998{
999 set->alloc = 2;
ff28b140 1000 set->elems = re_malloc (Idx, 2);
832b75ed
GG
1001 if (BE (set->elems == NULL, 0))
1002 return REG_ESPACE;
1003 if (elem1 == elem2)
1004 {
1005 set->nelem = 1;
1006 set->elems[0] = elem1;
1007 }
1008 else
1009 {
1010 set->nelem = 2;
1011 if (elem1 < elem2)
1012 {
1013 set->elems[0] = elem1;
1014 set->elems[1] = elem2;
1015 }
1016 else
1017 {
1018 set->elems[0] = elem2;
1019 set->elems[1] = elem1;
1020 }
1021 }
1022 return REG_NOERROR;
1023}
1024
1025static reg_errcode_t
ff28b140
TL
1026__attribute_warn_unused_result__
1027re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
832b75ed
GG
1028{
1029 dest->nelem = src->nelem;
1030 if (src->nelem > 0)
1031 {
1032 dest->alloc = dest->nelem;
ff28b140 1033 dest->elems = re_malloc (Idx, dest->alloc);
832b75ed
GG
1034 if (BE (dest->elems == NULL, 0))
1035 {
1036 dest->alloc = dest->nelem = 0;
1037 return REG_ESPACE;
1038 }
ff28b140 1039 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
832b75ed
GG
1040 }
1041 else
1042 re_node_set_init_empty (dest);
1043 return REG_NOERROR;
1044}
1045
1046/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1047 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1048 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1049
1050static reg_errcode_t
ff28b140
TL
1051__attribute_warn_unused_result__
1052re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1053 const re_node_set *src2)
832b75ed 1054{
ff28b140
TL
1055 Idx i1, i2, is, id, delta, sbase;
1056 if (src1->nelem == 0 || src2->nelem == 0)
1057 return REG_NOERROR;
1058
1059 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1060 conservative estimate. */
1061 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
832b75ed 1062 {
ff28b140
TL
1063 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1064 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1065 if (BE (new_elems == NULL, 0))
1066 return REG_ESPACE;
1067 dest->elems = new_elems;
1068 dest->alloc = new_alloc;
832b75ed 1069 }
832b75ed 1070
ff28b140
TL
1071 /* Find the items in the intersection of SRC1 and SRC2, and copy
1072 into the top of DEST those that are not already in DEST itself. */
1073 sbase = dest->nelem + src1->nelem + src2->nelem;
1074 i1 = src1->nelem - 1;
1075 i2 = src2->nelem - 1;
1076 id = dest->nelem - 1;
1077 for (;;)
832b75ed 1078 {
ff28b140 1079 if (src1->elems[i1] == src2->elems[i2])
832b75ed 1080 {
ff28b140
TL
1081 /* Try to find the item in DEST. Maybe we could binary search? */
1082 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1083 --id;
1084
1085 if (id < 0 || dest->elems[id] != src1->elems[i1])
1086 dest->elems[--sbase] = src1->elems[i1];
1087
1088 if (--i1 < 0 || --i2 < 0)
1089 break;
832b75ed 1090 }
ff28b140
TL
1091
1092 /* Lower the highest of the two items. */
1093 else if (src1->elems[i1] < src2->elems[i2])
832b75ed 1094 {
ff28b140
TL
1095 if (--i2 < 0)
1096 break;
1097 }
1098 else
1099 {
1100 if (--i1 < 0)
1101 break;
832b75ed 1102 }
832b75ed 1103 }
ff28b140
TL
1104
1105 id = dest->nelem - 1;
1106 is = dest->nelem + src1->nelem + src2->nelem - 1;
1107 delta = is - sbase + 1;
1108
1109 /* Now copy. When DELTA becomes zero, the remaining
1110 DEST elements are already in place; this is more or
1111 less the same loop that is in re_node_set_merge. */
1112 dest->nelem += delta;
1113 if (delta > 0 && id >= 0)
1114 for (;;)
1115 {
1116 if (dest->elems[is] > dest->elems[id])
1117 {
1118 /* Copy from the top. */
1119 dest->elems[id + delta--] = dest->elems[is--];
1120 if (delta == 0)
1121 break;
1122 }
1123 else
1124 {
1125 /* Slide from the bottom. */
1126 dest->elems[id + delta] = dest->elems[id];
1127 if (--id < 0)
1128 break;
1129 }
1130 }
1131
1132 /* Copy remaining SRC elements. */
1133 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1134
832b75ed
GG
1135 return REG_NOERROR;
1136}
1137
1138/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1139 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1140
1141static reg_errcode_t
ff28b140
TL
1142__attribute_warn_unused_result__
1143re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1144 const re_node_set *src2)
832b75ed 1145{
ff28b140 1146 Idx i1, i2, id;
832b75ed
GG
1147 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1148 {
1149 dest->alloc = src1->nelem + src2->nelem;
ff28b140 1150 dest->elems = re_malloc (Idx, dest->alloc);
832b75ed
GG
1151 if (BE (dest->elems == NULL, 0))
1152 return REG_ESPACE;
1153 }
1154 else
1155 {
1156 if (src1 != NULL && src1->nelem > 0)
1157 return re_node_set_init_copy (dest, src1);
1158 else if (src2 != NULL && src2->nelem > 0)
1159 return re_node_set_init_copy (dest, src2);
1160 else
1161 re_node_set_init_empty (dest);
1162 return REG_NOERROR;
1163 }
1164 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1165 {
1166 if (src1->elems[i1] > src2->elems[i2])
1167 {
1168 dest->elems[id++] = src2->elems[i2++];
1169 continue;
1170 }
1171 if (src1->elems[i1] == src2->elems[i2])
1172 ++i2;
1173 dest->elems[id++] = src1->elems[i1++];
1174 }
1175 if (i1 < src1->nelem)
1176 {
1177 memcpy (dest->elems + id, src1->elems + i1,
ff28b140 1178 (src1->nelem - i1) * sizeof (Idx));
832b75ed
GG
1179 id += src1->nelem - i1;
1180 }
1181 else if (i2 < src2->nelem)
1182 {
1183 memcpy (dest->elems + id, src2->elems + i2,
ff28b140 1184 (src2->nelem - i2) * sizeof (Idx));
832b75ed
GG
1185 id += src2->nelem - i2;
1186 }
1187 dest->nelem = id;
1188 return REG_NOERROR;
1189}
1190
1191/* Calculate the union set of the sets DEST and SRC. And store it to
1192 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1193
1194static reg_errcode_t
ff28b140
TL
1195__attribute_warn_unused_result__
1196re_node_set_merge (re_node_set *dest, const re_node_set *src)
832b75ed 1197{
ff28b140 1198 Idx is, id, sbase, delta;
832b75ed
GG
1199 if (src == NULL || src->nelem == 0)
1200 return REG_NOERROR;
ff28b140 1201 if (dest->alloc < 2 * src->nelem + dest->nelem)
832b75ed 1202 {
ff28b140
TL
1203 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1204 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
832b75ed
GG
1205 if (BE (new_buffer == NULL, 0))
1206 return REG_ESPACE;
1207 dest->elems = new_buffer;
ff28b140 1208 dest->alloc = new_alloc;
832b75ed
GG
1209 }
1210
ff28b140 1211 if (BE (dest->nelem == 0, 0))
832b75ed 1212 {
ff28b140
TL
1213 dest->nelem = src->nelem;
1214 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1215 return REG_NOERROR;
1216 }
832b75ed 1217
ff28b140
TL
1218 /* Copy into the top of DEST the items of SRC that are not
1219 found in DEST. Maybe we could binary search in DEST? */
1220 for (sbase = dest->nelem + 2 * src->nelem,
1221 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1222 {
1223 if (dest->elems[id] == src->elems[is])
1224 is--, id--;
1225 else if (dest->elems[id] < src->elems[is])
1226 dest->elems[--sbase] = src->elems[is--];
1227 else /* if (dest->elems[id] > src->elems[is]) */
1228 --id;
1229 }
832b75ed 1230
ff28b140
TL
1231 if (is >= 0)
1232 {
1233 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1234 sbase -= is + 1;
1235 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
832b75ed
GG
1236 }
1237
ff28b140
TL
1238 id = dest->nelem - 1;
1239 is = dest->nelem + 2 * src->nelem - 1;
1240 delta = is - sbase + 1;
1241 if (delta == 0)
1242 return REG_NOERROR;
1243
1244 /* Now copy. When DELTA becomes zero, the remaining
1245 DEST elements are already in place. */
1246 dest->nelem += delta;
1247 for (;;)
832b75ed 1248 {
ff28b140
TL
1249 if (dest->elems[is] > dest->elems[id])
1250 {
1251 /* Copy from the top. */
1252 dest->elems[id + delta--] = dest->elems[is--];
1253 if (delta == 0)
1254 break;
1255 }
1256 else
1257 {
1258 /* Slide from the bottom. */
1259 dest->elems[id + delta] = dest->elems[id];
1260 if (--id < 0)
1261 {
1262 /* Copy remaining SRC elements. */
1263 memcpy (dest->elems, dest->elems + sbase,
1264 delta * sizeof (Idx));
1265 break;
1266 }
1267 }
832b75ed 1268 }
ff28b140 1269
832b75ed
GG
1270 return REG_NOERROR;
1271}
1272
1273/* Insert the new element ELEM to the re_node_set* SET.
ff28b140
TL
1274 SET should not already have ELEM.
1275 Return true if successful. */
832b75ed 1276
ff28b140
TL
1277static bool
1278__attribute_warn_unused_result__
1279re_node_set_insert (re_node_set *set, Idx elem)
832b75ed 1280{
ff28b140
TL
1281 Idx idx;
1282 /* In case the set is empty. */
1283 if (set->alloc == 0)
1284 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
832b75ed 1285
ff28b140 1286 if (BE (set->nelem, 0) == 0)
832b75ed 1287 {
ff28b140
TL
1288 /* We already guaranteed above that set->alloc != 0. */
1289 set->elems[0] = elem;
1290 ++set->nelem;
1291 return true;
832b75ed
GG
1292 }
1293
1294 /* Realloc if we need. */
ff28b140 1295 if (set->alloc == set->nelem)
832b75ed 1296 {
ff28b140 1297 Idx *new_elems;
832b75ed 1298 set->alloc = set->alloc * 2;
ff28b140
TL
1299 new_elems = re_realloc (set->elems, Idx, set->alloc);
1300 if (BE (new_elems == NULL, 0))
1301 return false;
1302 set->elems = new_elems;
1303 }
1304
1305 /* Move the elements which follows the new element. Test the
1306 first element separately to skip a check in the inner loop. */
1307 if (elem < set->elems[0])
1308 {
1309 idx = 0;
1310 for (idx = set->nelem; idx > 0; idx--)
1311 set->elems[idx] = set->elems[idx - 1];
832b75ed
GG
1312 }
1313 else
1314 {
ff28b140
TL
1315 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1316 set->elems[idx] = set->elems[idx - 1];
832b75ed 1317 }
ff28b140 1318
832b75ed
GG
1319 /* Insert the new element. */
1320 set->elems[idx] = elem;
1321 ++set->nelem;
ff28b140
TL
1322 return true;
1323}
1324
1325/* Insert the new element ELEM to the re_node_set* SET.
1326 SET should not already have any element greater than or equal to ELEM.
1327 Return true if successful. */
1328
1329static bool
1330__attribute_warn_unused_result__
1331re_node_set_insert_last (re_node_set *set, Idx elem)
1332{
1333 /* Realloc if we need. */
1334 if (set->alloc == set->nelem)
1335 {
1336 Idx *new_elems;
1337 set->alloc = (set->alloc + 1) * 2;
1338 new_elems = re_realloc (set->elems, Idx, set->alloc);
1339 if (BE (new_elems == NULL, 0))
1340 return false;
1341 set->elems = new_elems;
1342 }
1343
1344 /* Insert the new element. */
1345 set->elems[set->nelem++] = elem;
1346 return true;
832b75ed
GG
1347}
1348
1349/* Compare two node sets SET1 and SET2.
ff28b140 1350 Return true if SET1 and SET2 are equivalent. */
832b75ed 1351
ff28b140
TL
1352static bool
1353__attribute__ ((pure))
1354re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
832b75ed 1355{
ff28b140 1356 Idx i;
832b75ed 1357 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
ff28b140
TL
1358 return false;
1359 for (i = set1->nelem ; --i >= 0 ; )
832b75ed 1360 if (set1->elems[i] != set2->elems[i])
ff28b140
TL
1361 return false;
1362 return true;
832b75ed
GG
1363}
1364
1365/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1366
ff28b140
TL
1367static Idx
1368__attribute__ ((pure))
1369re_node_set_contains (const re_node_set *set, Idx elem)
832b75ed 1370{
ff28b140 1371 __re_size_t idx, right, mid;
832b75ed
GG
1372 if (set->nelem <= 0)
1373 return 0;
1374
1375 /* Binary search the element. */
1376 idx = 0;
1377 right = set->nelem - 1;
1378 while (idx < right)
1379 {
1380 mid = (idx + right) / 2;
1381 if (set->elems[mid] < elem)
1382 idx = mid + 1;
1383 else
1384 right = mid;
1385 }
1386 return set->elems[idx] == elem ? idx + 1 : 0;
1387}
1388
1389static void
ff28b140 1390re_node_set_remove_at (re_node_set *set, Idx idx)
832b75ed
GG
1391{
1392 if (idx < 0 || idx >= set->nelem)
1393 return;
832b75ed 1394 --set->nelem;
ff28b140
TL
1395 for (; idx < set->nelem; idx++)
1396 set->elems[idx] = set->elems[idx + 1];
832b75ed
GG
1397}
1398\f
1399
1400/* Add the token TOKEN to dfa->nodes, and return the index of the token.
ff28b140 1401 Or return -1 if an error occurred. */
832b75ed 1402
ff28b140
TL
1403static Idx
1404re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
832b75ed 1405{
ff28b140 1406 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
832b75ed 1407 {
ff28b140
TL
1408 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1409 Idx *new_nexts, *new_indices;
1410 re_node_set *new_edests, *new_eclosures;
1411 re_token_t *new_nodes;
1412
1413 /* Avoid overflows in realloc. */
1414 const size_t max_object_size = MAX (sizeof (re_token_t),
1415 MAX (sizeof (re_node_set),
1416 sizeof (Idx)));
1417 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
832b75ed 1418 return -1;
ff28b140
TL
1419
1420 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1421 if (BE (new_nodes == NULL, 0))
1422 return -1;
1423 dfa->nodes = new_nodes;
1424 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1425 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1426 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1427 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1428 if (BE (new_nexts == NULL || new_indices == NULL
1429 || new_edests == NULL || new_eclosures == NULL, 0))
832b75ed 1430 {
ff28b140
TL
1431 re_free (new_nexts);
1432 re_free (new_indices);
1433 re_free (new_edests);
1434 re_free (new_eclosures);
1435 return -1;
832b75ed 1436 }
ff28b140
TL
1437 dfa->nexts = new_nexts;
1438 dfa->org_indices = new_indices;
1439 dfa->edests = new_edests;
1440 dfa->eclosures = new_eclosures;
1441 dfa->nodes_alloc = new_nodes_alloc;
832b75ed
GG
1442 }
1443 dfa->nodes[dfa->nodes_len] = token;
832b75ed 1444 dfa->nodes[dfa->nodes_len].constraint = 0;
ff28b140
TL
1445#ifdef RE_ENABLE_I18N
1446 dfa->nodes[dfa->nodes_len].accept_mb =
1447 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1448 || token.type == COMPLEX_BRACKET);
1449#endif
1450 dfa->nexts[dfa->nodes_len] = -1;
1451 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
832b75ed
GG
1453 return dfa->nodes_len++;
1454}
1455
ff28b140
TL
1456static re_hashval_t
1457calc_state_hash (const re_node_set *nodes, unsigned int context)
832b75ed 1458{
ff28b140
TL
1459 re_hashval_t hash = nodes->nelem + context;
1460 Idx i;
832b75ed
GG
1461 for (i = 0 ; i < nodes->nelem ; i++)
1462 hash += nodes->elems[i];
1463 return hash;
1464}
1465
1466/* Search for the state whose node_set is equivalent to NODES.
1467 Return the pointer to the state, if we found it in the DFA.
1468 Otherwise create the new one and return it. In case of an error
1469 return NULL and set the error code in ERR.
1470 Note: - We assume NULL as the invalid state, then it is possible that
1471 return value is NULL and ERR is REG_NOERROR.
1472 - We never return non-NULL value in case of any errors, it is for
1473 optimization. */
1474
ff28b140
TL
1475static re_dfastate_t *
1476__attribute_warn_unused_result__
1477re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1478 const re_node_set *nodes)
832b75ed 1479{
ff28b140 1480 re_hashval_t hash;
832b75ed
GG
1481 re_dfastate_t *new_state;
1482 struct re_state_table_entry *spot;
ff28b140
TL
1483 Idx i;
1484#if defined GCC_LINT || defined lint
1485 /* Suppress bogus uninitialized-variable warnings. */
1486 *err = REG_NOERROR;
1487#endif
832b75ed
GG
1488 if (BE (nodes->nelem == 0, 0))
1489 {
1490 *err = REG_NOERROR;
1491 return NULL;
1492 }
1493 hash = calc_state_hash (nodes, 0);
1494 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1495
1496 for (i = 0 ; i < spot->num ; i++)
1497 {
1498 re_dfastate_t *state = spot->array[i];
1499 if (hash != state->hash)
1500 continue;
1501 if (re_node_set_compare (&state->nodes, nodes))
1502 return state;
1503 }
1504
1505 /* There are no appropriate state in the dfa, create the new one. */
1506 new_state = create_ci_newstate (dfa, nodes, hash);
ff28b140
TL
1507 if (BE (new_state == NULL, 0))
1508 *err = REG_ESPACE;
1509
1510 return new_state;
832b75ed
GG
1511}
1512
1513/* Search for the state whose node_set is equivalent to NODES and
1514 whose context is equivalent to CONTEXT.
1515 Return the pointer to the state, if we found it in the DFA.
1516 Otherwise create the new one and return it. In case of an error
1517 return NULL and set the error code in ERR.
1518 Note: - We assume NULL as the invalid state, then it is possible that
1519 return value is NULL and ERR is REG_NOERROR.
1520 - We never return non-NULL value in case of any errors, it is for
1521 optimization. */
1522
ff28b140
TL
1523static re_dfastate_t *
1524__attribute_warn_unused_result__
1525re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1526 const re_node_set *nodes, unsigned int context)
832b75ed 1527{
ff28b140 1528 re_hashval_t hash;
832b75ed
GG
1529 re_dfastate_t *new_state;
1530 struct re_state_table_entry *spot;
ff28b140
TL
1531 Idx i;
1532#if defined GCC_LINT || defined lint
1533 /* Suppress bogus uninitialized-variable warnings. */
1534 *err = REG_NOERROR;
1535#endif
832b75ed
GG
1536 if (nodes->nelem == 0)
1537 {
1538 *err = REG_NOERROR;
1539 return NULL;
1540 }
1541 hash = calc_state_hash (nodes, context);
1542 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1543
1544 for (i = 0 ; i < spot->num ; i++)
1545 {
1546 re_dfastate_t *state = spot->array[i];
ff28b140
TL
1547 if (state->hash == hash
1548 && state->context == context
1549 && re_node_set_compare (state->entrance_nodes, nodes))
832b75ed
GG
1550 return state;
1551 }
ff28b140 1552 /* There are no appropriate state in 'dfa', create the new one. */
832b75ed 1553 new_state = create_cd_newstate (dfa, nodes, context, hash);
ff28b140
TL
1554 if (BE (new_state == NULL, 0))
1555 *err = REG_ESPACE;
1556
1557 return new_state;
832b75ed
GG
1558}
1559
ff28b140
TL
1560/* Finish initialization of the new state NEWSTATE, and using its hash value
1561 HASH put in the appropriate bucket of DFA's state table. Return value
1562 indicates the error code if failed. */
832b75ed 1563
ff28b140
TL
1564static reg_errcode_t
1565__attribute_warn_unused_result__
1566register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1567 re_hashval_t hash)
832b75ed 1568{
ff28b140 1569 struct re_state_table_entry *spot;
832b75ed 1570 reg_errcode_t err;
ff28b140
TL
1571 Idx i;
1572
1573 newstate->hash = hash;
1574 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
832b75ed 1575 if (BE (err != REG_NOERROR, 0))
ff28b140
TL
1576 return REG_ESPACE;
1577 for (i = 0; i < newstate->nodes.nelem; i++)
832b75ed 1578 {
ff28b140
TL
1579 Idx elem = newstate->nodes.elems[i];
1580 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1581 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1582 return REG_ESPACE;
832b75ed 1583 }
832b75ed 1584
832b75ed 1585 spot = dfa->state_table + (hash & dfa->state_hash_mask);
ff28b140 1586 if (BE (spot->alloc <= spot->num, 0))
832b75ed 1587 {
ff28b140
TL
1588 Idx new_alloc = 2 * spot->num + 2;
1589 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1590 new_alloc);
832b75ed
GG
1591 if (BE (new_array == NULL, 0))
1592 return REG_ESPACE;
1593 spot->array = new_array;
ff28b140 1594 spot->alloc = new_alloc;
832b75ed
GG
1595 }
1596 spot->array[spot->num++] = newstate;
1597 return REG_NOERROR;
1598}
1599
ff28b140
TL
1600static void
1601free_state (re_dfastate_t *state)
1602{
1603 re_node_set_free (&state->non_eps_nodes);
1604 re_node_set_free (&state->inveclosure);
1605 if (state->entrance_nodes != &state->nodes)
1606 {
1607 re_node_set_free (state->entrance_nodes);
1608 re_free (state->entrance_nodes);
1609 }
1610 re_node_set_free (&state->nodes);
1611 re_free (state->word_trtable);
1612 re_free (state->trtable);
1613 re_free (state);
1614}
1615
1616/* Create the new state which is independent of contexts.
832b75ed
GG
1617 Return the new state if succeeded, otherwise return NULL. */
1618
1619static re_dfastate_t *
ff28b140
TL
1620__attribute_warn_unused_result__
1621create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1622 re_hashval_t hash)
832b75ed 1623{
ff28b140 1624 Idx i;
832b75ed
GG
1625 reg_errcode_t err;
1626 re_dfastate_t *newstate;
ff28b140
TL
1627
1628 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
832b75ed
GG
1629 if (BE (newstate == NULL, 0))
1630 return NULL;
ff28b140
TL
1631 err = re_node_set_init_copy (&newstate->nodes, nodes);
1632 if (BE (err != REG_NOERROR, 0))
1633 {
1634 re_free (newstate);
1635 return NULL;
1636 }
832b75ed 1637
ff28b140 1638 newstate->entrance_nodes = &newstate->nodes;
832b75ed
GG
1639 for (i = 0 ; i < nodes->nelem ; i++)
1640 {
1641 re_token_t *node = dfa->nodes + nodes->elems[i];
1642 re_token_type_t type = node->type;
1643 if (type == CHARACTER && !node->constraint)
1644 continue;
ff28b140
TL
1645#ifdef RE_ENABLE_I18N
1646 newstate->accept_mb |= node->accept_mb;
1647#endif /* RE_ENABLE_I18N */
832b75ed
GG
1648
1649 /* If the state has the halt node, the state is a halt state. */
ff28b140 1650 if (type == END_OF_RE)
832b75ed 1651 newstate->halt = 1;
832b75ed
GG
1652 else if (type == OP_BACK_REF)
1653 newstate->has_backref = 1;
1654 else if (type == ANCHOR || node->constraint)
1655 newstate->has_constraint = 1;
1656 }
1657 err = register_state (dfa, newstate, hash);
1658 if (BE (err != REG_NOERROR, 0))
1659 {
1660 free_state (newstate);
1661 newstate = NULL;
1662 }
1663 return newstate;
1664}
1665
1666/* Create the new state which is depend on the context CONTEXT.
1667 Return the new state if succeeded, otherwise return NULL. */
1668
1669static re_dfastate_t *
ff28b140
TL
1670__attribute_warn_unused_result__
1671create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1672 unsigned int context, re_hashval_t hash)
832b75ed 1673{
ff28b140 1674 Idx i, nctx_nodes = 0;
832b75ed
GG
1675 reg_errcode_t err;
1676 re_dfastate_t *newstate;
1677
ff28b140 1678 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
832b75ed
GG
1679 if (BE (newstate == NULL, 0))
1680 return NULL;
ff28b140
TL
1681 err = re_node_set_init_copy (&newstate->nodes, nodes);
1682 if (BE (err != REG_NOERROR, 0))
1683 {
1684 re_free (newstate);
1685 return NULL;
1686 }
1687
832b75ed
GG
1688 newstate->context = context;
1689 newstate->entrance_nodes = &newstate->nodes;
1690
1691 for (i = 0 ; i < nodes->nelem ; i++)
1692 {
832b75ed
GG
1693 re_token_t *node = dfa->nodes + nodes->elems[i];
1694 re_token_type_t type = node->type;
ff28b140 1695 unsigned int constraint = node->constraint;
832b75ed
GG
1696
1697 if (type == CHARACTER && !constraint)
1698 continue;
832b75ed 1699#ifdef RE_ENABLE_I18N
ff28b140 1700 newstate->accept_mb |= node->accept_mb;
832b75ed 1701#endif /* RE_ENABLE_I18N */
ff28b140
TL
1702
1703 /* If the state has the halt node, the state is a halt state. */
1704 if (type == END_OF_RE)
1705 newstate->halt = 1;
832b75ed
GG
1706 else if (type == OP_BACK_REF)
1707 newstate->has_backref = 1;
832b75ed
GG
1708
1709 if (constraint)
1710 {
1711 if (newstate->entrance_nodes == &newstate->nodes)
1712 {
1713 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1714 if (BE (newstate->entrance_nodes == NULL, 0))
1715 {
1716 free_state (newstate);
1717 return NULL;
1718 }
ff28b140
TL
1719 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1720 != REG_NOERROR)
1721 return NULL;
832b75ed
GG
1722 nctx_nodes = 0;
1723 newstate->has_constraint = 1;
1724 }
1725
1726 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1727 {
1728 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1729 ++nctx_nodes;
1730 }
1731 }
1732 }
1733 err = register_state (dfa, newstate, hash);
1734 if (BE (err != REG_NOERROR, 0))
1735 {
1736 free_state (newstate);
1737 newstate = NULL;
1738 }
1739 return newstate;
1740}