1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
3 Software Foundation, Inc.
4 This file is part of the GNU C Library.
5 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation,
19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 static void re_string_construct_common (const char *str, Idx len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, bool icase,
24 const re_dfa_t *dfa) internal_function;
25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 const re_node_set *nodes,
27 re_hashval_t hash) internal_function;
28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 const re_node_set *nodes,
30 unsigned int context,
31 re_hashval_t hash) internal_function;
32 \f
33 /* Functions for string operation. */
35 /* This function allocate the buffers. It is necessary to call
36 re_string_reconstruct before using the object. */
38 static reg_errcode_t
39 internal_function __attribute_warn_unused_result__
40 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
41 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
42 {
43 reg_errcode_t ret;
44 Idx init_buf_len;
46 /* Ensure at least one character fits into the buffers. */
47 if (init_len < dfa->mb_cur_max)
48 init_len = dfa->mb_cur_max;
49 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50 re_string_construct_common (str, len, pstr, trans, icase, dfa);
52 ret = re_string_realloc_buffers (pstr, init_buf_len);
53 if (BE (ret != REG_NOERROR, 0))
54 return ret;
56 pstr->word_char = dfa->word_char;
57 pstr->word_ops_used = dfa->word_ops_used;
58 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
60 pstr->valid_raw_len = pstr->valid_len;
61 return REG_NOERROR;
62 }
64 /* This function allocate the buffers, and initialize them. */
66 static reg_errcode_t
67 internal_function __attribute_warn_unused_result__
68 re_string_construct (re_string_t *pstr, const char *str, Idx len,
69 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
70 {
71 reg_errcode_t ret;
72 memset (pstr, '\0', sizeof (re_string_t));
73 re_string_construct_common (str, len, pstr, trans, icase, dfa);
75 if (len > 0)
76 {
77 ret = re_string_realloc_buffers (pstr, len + 1);
78 if (BE (ret != REG_NOERROR, 0))
79 return ret;
80 }
81 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
83 if (icase)
84 {
85 #ifdef RE_ENABLE_I18N
86 if (dfa->mb_cur_max > 1)
87 {
88 while (1)
89 {
90 ret = build_wcs_upper_buffer (pstr);
91 if (BE (ret != REG_NOERROR, 0))
92 return ret;
93 if (pstr->valid_raw_len >= len)
94 break;
95 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 break;
97 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98 if (BE (ret != REG_NOERROR, 0))
99 return ret;
100 }
101 }
102 else
103 #endif /* RE_ENABLE_I18N */
104 build_upper_buffer (pstr);
105 }
106 else
107 {
108 #ifdef RE_ENABLE_I18N
109 if (dfa->mb_cur_max > 1)
110 build_wcs_buffer (pstr);
111 else
112 #endif /* RE_ENABLE_I18N */
113 {
114 if (trans != NULL)
115 re_string_translate_buffer (pstr);
116 else
117 {
118 pstr->valid_len = pstr->bufs_len;
119 pstr->valid_raw_len = pstr->bufs_len;
120 }
121 }
122 }
124 return REG_NOERROR;
125 }
127 /* Helper functions for re_string_allocate, and re_string_construct. */
129 static reg_errcode_t
130 internal_function __attribute_warn_unused_result__
131 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
132 {
133 #ifdef RE_ENABLE_I18N
134 if (pstr->mb_cur_max > 1)
135 {
136 wint_t *new_wcs;
138 /* Avoid overflow. */
139 size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
140 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
141 return REG_ESPACE;
143 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
144 if (BE (new_wcs == NULL, 0))
145 return REG_ESPACE;
146 pstr->wcs = new_wcs;
147 if (pstr->offsets != NULL)
148 {
149 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
150 if (BE (new_offsets == NULL, 0))
151 return REG_ESPACE;
152 pstr->offsets = new_offsets;
153 }
154 }
155 #endif /* RE_ENABLE_I18N */
156 if (pstr->mbs_allocated)
157 {
158 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
159 new_buf_len);
160 if (BE (new_mbs == NULL, 0))
161 return REG_ESPACE;
162 pstr->mbs = new_mbs;
163 }
164 pstr->bufs_len = new_buf_len;
165 return REG_NOERROR;
166 }
169 static void
170 internal_function
171 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
172 RE_TRANSLATE_TYPE trans, bool icase,
173 const re_dfa_t *dfa)
174 {
175 pstr->raw_mbs = (const unsigned char *) str;
176 pstr->len = len;
177 pstr->raw_len = len;
178 pstr->trans = trans;
179 pstr->icase = icase;
180 pstr->mbs_allocated = (trans != NULL || icase);
181 pstr->mb_cur_max = dfa->mb_cur_max;
182 pstr->is_utf8 = dfa->is_utf8;
183 pstr->map_notascii = dfa->map_notascii;
184 pstr->stop = pstr->len;
185 pstr->raw_stop = pstr->stop;
186 }
188 #ifdef RE_ENABLE_I18N
190 /* Build wide character buffer PSTR->WCS.
191 If the byte sequence of the string are:
192 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193 Then wide character buffer will be:
194 <wc1> , WEOF , <wc2> , WEOF , <wc3>
195 We use WEOF for padding, they indicate that the position isn't
196 a first byte of a multibyte character.
198 Note that this function assumes PSTR->VALID_LEN elements are already
199 built and starts from PSTR->VALID_LEN. */
201 static void
202 internal_function
203 build_wcs_buffer (re_string_t *pstr)
204 {
205 #ifdef _LIBC
206 unsigned char buf[MB_LEN_MAX];
207 assert (MB_LEN_MAX >= pstr->mb_cur_max);
208 #else
209 unsigned char buf[64];
210 #endif
211 mbstate_t prev_st;
212 Idx byte_idx, end_idx, remain_len;
213 size_t mbclen;
215 /* Build the buffers from pstr->valid_len to either pstr->len or
216 pstr->bufs_len. */
217 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
218 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
219 {
220 wchar_t wc;
221 const char *p;
223 remain_len = end_idx - byte_idx;
224 prev_st = pstr->cur_state;
225 /* Apply the translation if we need. */
226 if (BE (pstr->trans != NULL, 0))
227 {
228 int i, ch;
230 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
231 {
232 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
233 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
234 }
235 p = (const char *) buf;
236 }
237 else
238 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
239 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
240 if (BE (mbclen == (size_t) -2, 0))
241 {
242 /* The buffer doesn't have enough space, finish to build. */
243 pstr->cur_state = prev_st;
244 break;
245 }
246 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
247 {
248 /* We treat these cases as a singlebyte character. */
249 mbclen = 1;
250 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
251 if (BE (pstr->trans != NULL, 0))
252 wc = pstr->trans[wc];
253 pstr->cur_state = prev_st;
254 }
256 /* Write wide character and padding. */
257 pstr->wcs[byte_idx++] = wc;
258 /* Write paddings. */
259 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
260 pstr->wcs[byte_idx++] = WEOF;
261 }
262 pstr->valid_len = byte_idx;
263 pstr->valid_raw_len = byte_idx;
264 }
266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
267 but for REG_ICASE. */
269 static reg_errcode_t
270 internal_function __attribute_warn_unused_result__
271 build_wcs_upper_buffer (re_string_t *pstr)
272 {
273 mbstate_t prev_st;
274 Idx src_idx, byte_idx, end_idx, remain_len;
275 size_t mbclen;
276 #ifdef _LIBC
277 char buf[MB_LEN_MAX];
278 assert (MB_LEN_MAX >= pstr->mb_cur_max);
279 #else
280 char buf[64];
281 #endif
283 byte_idx = pstr->valid_len;
284 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
286 /* The following optimization assumes that ASCII characters can be
287 mapped to wide characters with a simple cast. */
288 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
289 {
290 while (byte_idx < end_idx)
291 {
292 wchar_t wc;
294 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
295 && mbsinit (&pstr->cur_state))
296 {
297 /* In case of a singlebyte character. */
298 pstr->mbs[byte_idx]
299 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
300 /* The next step uses the assumption that wchar_t is encoded
301 ASCII-safe: all ASCII values can be converted like this. */
302 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
303 ++byte_idx;
304 continue;
305 }
307 remain_len = end_idx - byte_idx;
308 prev_st = pstr->cur_state;
309 mbclen = __mbrtowc (&wc,
310 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
311 + byte_idx), remain_len, &pstr->cur_state);
312 if (BE (mbclen < (size_t) -2, 1))
313 {
314 wchar_t wcu = wc;
315 if (iswlower (wc))
316 {
317 size_t mbcdlen;
319 wcu = towupper (wc);
320 mbcdlen = wcrtomb (buf, wcu, &prev_st);
321 if (BE (mbclen == mbcdlen, 1))
322 memcpy (pstr->mbs + byte_idx, buf, mbclen);
323 else
324 {
325 src_idx = byte_idx;
326 goto offsets_needed;
327 }
328 }
329 else
330 memcpy (pstr->mbs + byte_idx,
331 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
332 pstr->wcs[byte_idx++] = wcu;
333 /* Write paddings. */
334 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
335 pstr->wcs[byte_idx++] = WEOF;
336 }
337 else if (mbclen == (size_t) -1 || mbclen == 0)
338 {
339 /* It is an invalid character or '\0'. Just use the byte. */
340 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
341 pstr->mbs[byte_idx] = ch;
342 /* And also cast it to wide char. */
343 pstr->wcs[byte_idx++] = (wchar_t) ch;
344 if (BE (mbclen == (size_t) -1, 0))
345 pstr->cur_state = prev_st;
346 }
347 else
348 {
349 /* The buffer doesn't have enough space, finish to build. */
350 pstr->cur_state = prev_st;
351 break;
352 }
353 }
354 pstr->valid_len = byte_idx;
355 pstr->valid_raw_len = byte_idx;
356 return REG_NOERROR;
357 }
358 else
359 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
360 {
361 wchar_t wc;
362 const char *p;
363 offsets_needed:
364 remain_len = end_idx - byte_idx;
365 prev_st = pstr->cur_state;
366 if (BE (pstr->trans != NULL, 0))
367 {
368 int i, ch;
370 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
371 {
372 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
373 buf[i] = pstr->trans[ch];
374 }
375 p = (const char *) buf;
376 }
377 else
378 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
379 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
380 if (BE (mbclen < (size_t) -2, 1))
381 {
382 wchar_t wcu = wc;
383 if (iswlower (wc))
384 {
385 size_t mbcdlen;
387 wcu = towupper (wc);
388 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
389 if (BE (mbclen == mbcdlen, 1))
390 memcpy (pstr->mbs + byte_idx, buf, mbclen);
391 else if (mbcdlen != (size_t) -1)
392 {
393 size_t i;
395 if (byte_idx + mbcdlen > pstr->bufs_len)
396 {
397 pstr->cur_state = prev_st;
398 break;
399 }
401 if (pstr->offsets == NULL)
402 {
403 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
405 if (pstr->offsets == NULL)
406 return REG_ESPACE;
407 }
408 if (!pstr->offsets_needed)
409 {
410 for (i = 0; i < (size_t) byte_idx; ++i)
411 pstr->offsets[i] = i;
412 pstr->offsets_needed = 1;
413 }
415 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
416 pstr->wcs[byte_idx] = wcu;
417 pstr->offsets[byte_idx] = src_idx;
418 for (i = 1; i < mbcdlen; ++i)
419 {
420 pstr->offsets[byte_idx + i]
421 = src_idx + (i < mbclen ? i : mbclen - 1);
422 pstr->wcs[byte_idx + i] = WEOF;
423 }
424 pstr->len += mbcdlen - mbclen;
425 if (pstr->raw_stop > src_idx)
426 pstr->stop += mbcdlen - mbclen;
427 end_idx = (pstr->bufs_len > pstr->len)
428 ? pstr->len : pstr->bufs_len;
429 byte_idx += mbcdlen;
430 src_idx += mbclen;
431 continue;
432 }
433 else
434 memcpy (pstr->mbs + byte_idx, p, mbclen);
435 }
436 else
437 memcpy (pstr->mbs + byte_idx, p, mbclen);
439 if (BE (pstr->offsets_needed != 0, 0))
440 {
441 size_t i;
442 for (i = 0; i < mbclen; ++i)
443 pstr->offsets[byte_idx + i] = src_idx + i;
444 }
445 src_idx += mbclen;
447 pstr->wcs[byte_idx++] = wcu;
448 /* Write paddings. */
449 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
450 pstr->wcs[byte_idx++] = WEOF;
451 }
452 else if (mbclen == (size_t) -1 || mbclen == 0)
453 {
454 /* It is an invalid character or '\0'. Just use the byte. */
455 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
457 if (BE (pstr->trans != NULL, 0))
458 ch = pstr->trans [ch];
459 pstr->mbs[byte_idx] = ch;
461 if (BE (pstr->offsets_needed != 0, 0))
462 pstr->offsets[byte_idx] = src_idx;
463 ++src_idx;
465 /* And also cast it to wide char. */
466 pstr->wcs[byte_idx++] = (wchar_t) ch;
467 if (BE (mbclen == (size_t) -1, 0))
468 pstr->cur_state = prev_st;
469 }
470 else
471 {
472 /* The buffer doesn't have enough space, finish to build. */
473 pstr->cur_state = prev_st;
474 break;
475 }
476 }
477 pstr->valid_len = byte_idx;
478 pstr->valid_raw_len = src_idx;
479 return REG_NOERROR;
480 }
482 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
483 Return the index. */
485 static Idx
486 internal_function
487 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
488 {
489 mbstate_t prev_st;
490 Idx rawbuf_idx;
491 size_t mbclen;
492 wint_t wc = WEOF;
494 /* Skip the characters which are not necessary to check. */
495 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
496 rawbuf_idx < new_raw_idx;)
497 {
498 wchar_t wc2;
499 Idx remain_len;
500 remain_len = pstr->len - rawbuf_idx;
501 prev_st = pstr->cur_state;
502 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
503 remain_len, &pstr->cur_state);
504 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
505 {
506 /* We treat these cases as a single byte character. */
507 if (mbclen == 0 || remain_len == 0)
508 wc = L'\0';
509 else
510 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
511 mbclen = 1;
512 pstr->cur_state = prev_st;
513 }
514 else
515 wc = wc2;
516 /* Then proceed the next character. */
517 rawbuf_idx += mbclen;
518 }
519 *last_wc = wc;
520 return rawbuf_idx;
521 }
522 #endif /* RE_ENABLE_I18N */
524 /* Build the buffer PSTR->MBS, and apply the translation if we need.
525 This function is used in case of REG_ICASE. */
527 static void
528 internal_function
529 build_upper_buffer (re_string_t *pstr)
530 {
531 Idx char_idx, end_idx;
532 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
534 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
535 {
536 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
537 if (BE (pstr->trans != NULL, 0))
538 ch = pstr->trans[ch];
539 if (islower (ch))
540 pstr->mbs[char_idx] = toupper (ch);
541 else
542 pstr->mbs[char_idx] = ch;
543 }
544 pstr->valid_len = char_idx;
545 pstr->valid_raw_len = char_idx;
546 }
548 /* Apply TRANS to the buffer in PSTR. */
550 static void
551 internal_function
552 re_string_translate_buffer (re_string_t *pstr)
553 {
554 Idx buf_idx, end_idx;
555 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
557 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
558 {
559 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
560 pstr->mbs[buf_idx] = pstr->trans[ch];
561 }
563 pstr->valid_len = buf_idx;
564 pstr->valid_raw_len = buf_idx;
565 }
567 /* This function re-construct the buffers.
568 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
569 convert to upper case in case of REG_ICASE, apply translation. */
571 static reg_errcode_t
572 internal_function __attribute_warn_unused_result__
573 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
574 {
575 Idx offset;
577 if (BE (pstr->raw_mbs_idx <= idx, 0))
578 offset = idx - pstr->raw_mbs_idx;
579 else
580 {
581 /* Reset buffer. */
582 #ifdef RE_ENABLE_I18N
583 if (pstr->mb_cur_max > 1)
584 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
585 #endif /* RE_ENABLE_I18N */
586 pstr->len = pstr->raw_len;
587 pstr->stop = pstr->raw_stop;
588 pstr->valid_len = 0;
589 pstr->raw_mbs_idx = 0;
590 pstr->valid_raw_len = 0;
591 pstr->offsets_needed = 0;
592 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
593 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
594 if (!pstr->mbs_allocated)
595 pstr->mbs = (unsigned char *) pstr->raw_mbs;
596 offset = idx;
597 }
599 if (BE (offset != 0, 1))
600 {
601 /* Should the already checked characters be kept? */
602 if (BE (offset < pstr->valid_raw_len, 1))
603 {
604 /* Yes, move them to the front of the buffer. */
605 #ifdef RE_ENABLE_I18N
606 if (BE (pstr->offsets_needed, 0))
607 {
608 Idx low = 0, high = pstr->valid_len, mid;
609 do
610 {
611 mid = (high + low) / 2;
612 if (pstr->offsets[mid] > offset)
613 high = mid;
614 else if (pstr->offsets[mid] < offset)
615 low = mid + 1;
616 else
617 break;
618 }
619 while (low < high);
620 if (pstr->offsets[mid] < offset)
621 ++mid;
622 pstr->tip_context = re_string_context_at (pstr, mid - 1,
623 eflags);
624 /* This can be quite complicated, so handle specially
625 only the common and easy case where the character with
626 different length representation of lower and upper
627 case is present at or after offset. */
628 if (pstr->valid_len > offset
629 && mid == offset && pstr->offsets[mid] == offset)
630 {
631 memmove (pstr->wcs, pstr->wcs + offset,
632 (pstr->valid_len - offset) * sizeof (wint_t));
633 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
634 pstr->valid_len -= offset;
635 pstr->valid_raw_len -= offset;
636 for (low = 0; low < pstr->valid_len; low++)
637 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
638 }
639 else
640 {
641 /* Otherwise, just find out how long the partial multibyte
642 character at offset is and fill it with WEOF/255. */
643 pstr->len = pstr->raw_len - idx + offset;
644 pstr->stop = pstr->raw_stop - idx + offset;
645 pstr->offsets_needed = 0;
646 while (mid > 0 && pstr->offsets[mid - 1] == offset)
647 --mid;
648 while (mid < pstr->valid_len)
649 if (pstr->wcs[mid] != WEOF)
650 break;
651 else
652 ++mid;
653 if (mid == pstr->valid_len)
654 pstr->valid_len = 0;
655 else
656 {
657 pstr->valid_len = pstr->offsets[mid] - offset;
658 if (pstr->valid_len)
659 {
660 for (low = 0; low < pstr->valid_len; ++low)
661 pstr->wcs[low] = WEOF;
662 memset (pstr->mbs, 255, pstr->valid_len);
663 }
664 }
665 pstr->valid_raw_len = pstr->valid_len;
666 }
667 }
668 else
669 #endif
670 {
671 pstr->tip_context = re_string_context_at (pstr, offset - 1,
672 eflags);
673 #ifdef RE_ENABLE_I18N
674 if (pstr->mb_cur_max > 1)
675 memmove (pstr->wcs, pstr->wcs + offset,
676 (pstr->valid_len - offset) * sizeof (wint_t));
677 #endif /* RE_ENABLE_I18N */
678 if (BE (pstr->mbs_allocated, 0))
679 memmove (pstr->mbs, pstr->mbs + offset,
680 pstr->valid_len - offset);
681 pstr->valid_len -= offset;
682 pstr->valid_raw_len -= offset;
683 #if DEBUG
684 assert (pstr->valid_len > 0);
685 #endif
686 }
687 }
688 else
689 {
690 #ifdef RE_ENABLE_I18N
691 /* No, skip all characters until IDX. */
692 Idx prev_valid_len = pstr->valid_len;
694 if (BE (pstr->offsets_needed, 0))
695 {
696 pstr->len = pstr->raw_len - idx + offset;
697 pstr->stop = pstr->raw_stop - idx + offset;
698 pstr->offsets_needed = 0;
699 }
700 #endif
701 pstr->valid_len = 0;
702 #ifdef RE_ENABLE_I18N
703 if (pstr->mb_cur_max > 1)
704 {
705 Idx wcs_idx;
706 wint_t wc = WEOF;
708 if (pstr->is_utf8)
709 {
710 const unsigned char *raw, *p, *end;
712 /* Special case UTF-8. Multi-byte chars start with any
713 byte other than 0x80 - 0xbf. */
714 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
715 end = raw + (offset - pstr->mb_cur_max);
716 if (end < pstr->raw_mbs)
717 end = pstr->raw_mbs;
718 p = raw + offset - 1;
719 #ifdef _LIBC
720 /* We know the wchar_t encoding is UCS4, so for the simple
721 case, ASCII characters, skip the conversion step. */
722 if (isascii (*p) && BE (pstr->trans == NULL, 1))
723 {
724 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
725 /* pstr->valid_len = 0; */
726 wc = (wchar_t) *p;
727 }
728 else
729 #endif
730 for (; p >= end; --p)
731 if ((*p & 0xc0) != 0x80)
732 {
733 mbstate_t cur_state;
734 wchar_t wc2;
735 Idx mlen = raw + pstr->len - p;
736 unsigned char buf[6];
737 size_t mbclen;
739 if (BE (pstr->trans != NULL, 0))
740 {
741 int i = mlen < 6 ? mlen : 6;
742 while (--i >= 0)
743 buf[i] = pstr->trans[p[i]];
744 }
745 /* XXX Don't use mbrtowc, we know which conversion
746 to use (UTF-8 -> UCS4). */
747 memset (&cur_state, 0, sizeof (cur_state));
748 mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
749 &cur_state);
750 if (raw + offset - p <= mbclen
751 && mbclen < (size_t) -2)
752 {
753 memset (&pstr->cur_state, '\0',
754 sizeof (mbstate_t));
755 pstr->valid_len = mbclen - (raw + offset - p);
756 wc = wc2;
757 }
758 break;
759 }
760 }
762 if (wc == WEOF)
763 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
764 if (wc == WEOF)
765 pstr->tip_context
766 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
767 else
768 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
769 && IS_WIDE_WORD_CHAR (wc))
770 ? CONTEXT_WORD
771 : ((IS_WIDE_NEWLINE (wc)
772 && pstr->newline_anchor)
773 ? CONTEXT_NEWLINE : 0));
774 if (BE (pstr->valid_len, 0))
775 {
776 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
777 pstr->wcs[wcs_idx] = WEOF;
778 if (pstr->mbs_allocated)
779 memset (pstr->mbs, 255, pstr->valid_len);
780 }
781 pstr->valid_raw_len = pstr->valid_len;
782 }
783 else
784 #endif /* RE_ENABLE_I18N */
785 {
786 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
787 pstr->valid_raw_len = 0;
788 if (pstr->trans)
789 c = pstr->trans[c];
790 pstr->tip_context = (bitset_contain (pstr->word_char, c)
791 ? CONTEXT_WORD
792 : ((IS_NEWLINE (c) && pstr->newline_anchor)
793 ? CONTEXT_NEWLINE : 0));
794 }
795 }
796 if (!BE (pstr->mbs_allocated, 0))
797 pstr->mbs += offset;
798 }
799 pstr->raw_mbs_idx = idx;
800 pstr->len -= offset;
801 pstr->stop -= offset;
803 /* Then build the buffers. */
804 #ifdef RE_ENABLE_I18N
805 if (pstr->mb_cur_max > 1)
806 {
807 if (pstr->icase)
808 {
809 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
810 if (BE (ret != REG_NOERROR, 0))
811 return ret;
812 }
813 else
814 build_wcs_buffer (pstr);
815 }
816 else
817 #endif /* RE_ENABLE_I18N */
818 if (BE (pstr->mbs_allocated, 0))
819 {
820 if (pstr->icase)
821 build_upper_buffer (pstr);
822 else if (pstr->trans != NULL)
823 re_string_translate_buffer (pstr);
824 }
825 else
826 pstr->valid_len = pstr->len;
828 pstr->cur_idx = 0;
829 return REG_NOERROR;
830 }
832 static unsigned char
833 internal_function __attribute ((pure))
834 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
835 {
836 int ch;
837 Idx off;
839 /* Handle the common (easiest) cases first. */
840 if (BE (!pstr->mbs_allocated, 1))
841 return re_string_peek_byte (pstr, idx);
843 #ifdef RE_ENABLE_I18N
844 if (pstr->mb_cur_max > 1
845 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
846 return re_string_peek_byte (pstr, idx);
847 #endif
849 off = pstr->cur_idx + idx;
850 #ifdef RE_ENABLE_I18N
851 if (pstr->offsets_needed)
852 off = pstr->offsets[off];
853 #endif
855 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
857 #ifdef RE_ENABLE_I18N
858 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
859 this function returns CAPITAL LETTER I instead of first byte of
860 DOTLESS SMALL LETTER I. The latter would confuse the parser,
861 since peek_byte_case doesn't advance cur_idx in any way. */
862 if (pstr->offsets_needed && !isascii (ch))
863 return re_string_peek_byte (pstr, idx);
864 #endif
866 return ch;
867 }
869 static unsigned char
870 internal_function __attribute ((pure))
871 re_string_fetch_byte_case (re_string_t *pstr)
872 {
873 if (BE (!pstr->mbs_allocated, 1))
874 return re_string_fetch_byte (pstr);
876 #ifdef RE_ENABLE_I18N
877 if (pstr->offsets_needed)
878 {
879 Idx off;
880 int ch;
882 /* For tr_TR.UTF-8 [[:islower:]] there is
883 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
884 in that case the whole multi-byte character and return
885 the original letter. On the other side, with
886 [[: DOTLESS SMALL LETTER I return [[:I, as doing
887 anything else would complicate things too much. */
889 if (!re_string_first_byte (pstr, pstr->cur_idx))
890 return re_string_fetch_byte (pstr);
892 off = pstr->offsets[pstr->cur_idx];
893 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
895 if (! isascii (ch))
896 return re_string_fetch_byte (pstr);
898 re_string_skip_bytes (pstr,
899 re_string_char_size_at (pstr, pstr->cur_idx));
900 return ch;
901 }
902 #endif
904 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
905 }
907 static void
908 internal_function
909 re_string_destruct (re_string_t *pstr)
910 {
911 #ifdef RE_ENABLE_I18N
912 re_free (pstr->wcs);
913 re_free (pstr->offsets);
914 #endif /* RE_ENABLE_I18N */
915 if (pstr->mbs_allocated)
916 re_free (pstr->mbs);
917 }
919 /* Return the context at IDX in INPUT. */
921 static unsigned int
922 internal_function
923 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
924 {
925 int c;
926 if (BE (! REG_VALID_INDEX (idx), 0))
927 /* In this case, we use the value stored in input->tip_context,
928 since we can't know the character in input->mbs[-1] here. */
929 return input->tip_context;
930 if (BE (idx == input->len, 0))
931 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
932 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
933 #ifdef RE_ENABLE_I18N
934 if (input->mb_cur_max > 1)
935 {
936 wint_t wc;
937 Idx wc_idx = idx;
938 while(input->wcs[wc_idx] == WEOF)
939 {
940 #ifdef DEBUG
941 /* It must not happen. */
942 assert (REG_VALID_INDEX (wc_idx));
943 #endif
944 --wc_idx;
945 if (! REG_VALID_INDEX (wc_idx))
946 return input->tip_context;
947 }
948 wc = input->wcs[wc_idx];
949 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
950 return CONTEXT_WORD;
951 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
952 ? CONTEXT_NEWLINE : 0);
953 }
954 else
955 #endif
956 {
957 c = re_string_byte_at (input, idx);
958 if (bitset_contain (input->word_char, c))
959 return CONTEXT_WORD;
960 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
961 }
962 }
963 \f
964 /* Functions for set operation. */
966 static reg_errcode_t
967 internal_function __attribute_warn_unused_result__
968 re_node_set_alloc (re_node_set *set, Idx size)
969 {
970 set->alloc = size;
971 set->nelem = 0;
972 set->elems = re_malloc (Idx, size);
973 if (BE (set->elems == NULL, 0))
974 return REG_ESPACE;
975 return REG_NOERROR;
976 }
978 static reg_errcode_t
979 internal_function __attribute_warn_unused_result__
980 re_node_set_init_1 (re_node_set *set, Idx elem)
981 {
982 set->alloc = 1;
983 set->nelem = 1;
984 set->elems = re_malloc (Idx, 1);
985 if (BE (set->elems == NULL, 0))
986 {
987 set->alloc = set->nelem = 0;
988 return REG_ESPACE;
989 }
990 set->elems[0] = elem;
991 return REG_NOERROR;
992 }
994 static reg_errcode_t
995 internal_function __attribute_warn_unused_result__
996 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
997 {
998 set->alloc = 2;
999 set->elems = re_malloc (Idx, 2);
1000 if (BE (set->elems == NULL, 0))
1001 return REG_ESPACE;
1002 if (elem1 == elem2)
1003 {
1004 set->nelem = 1;
1005 set->elems[0] = elem1;
1006 }
1007 else
1008 {
1009 set->nelem = 2;
1010 if (elem1 < elem2)
1011 {
1012 set->elems[0] = elem1;
1013 set->elems[1] = elem2;
1014 }
1015 else
1016 {
1017 set->elems[0] = elem2;
1018 set->elems[1] = elem1;
1019 }
1020 }
1021 return REG_NOERROR;
1022 }
1024 static reg_errcode_t
1025 internal_function __attribute_warn_unused_result__
1026 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1027 {
1028 dest->nelem = src->nelem;
1029 if (src->nelem > 0)
1030 {
1031 dest->alloc = dest->nelem;
1032 dest->elems = re_malloc (Idx, dest->alloc);
1033 if (BE (dest->elems == NULL, 0))
1034 {
1035 dest->alloc = dest->nelem = 0;
1036 return REG_ESPACE;
1037 }
1038 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1039 }
1040 else
1041 re_node_set_init_empty (dest);
1042 return REG_NOERROR;
1043 }
1045 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1046 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1047 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1049 static reg_errcode_t
1050 internal_function __attribute_warn_unused_result__
1051 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1052 const re_node_set *src2)
1053 {
1054 Idx i1, i2, is, id, delta, sbase;
1055 if (src1->nelem == 0 || src2->nelem == 0)
1056 return REG_NOERROR;
1058 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1059 conservative estimate. */
1060 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1061 {
1062 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1063 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1064 if (BE (new_elems == NULL, 0))
1065 return REG_ESPACE;
1066 dest->elems = new_elems;
1067 dest->alloc = new_alloc;
1068 }
1070 /* Find the items in the intersection of SRC1 and SRC2, and copy
1071 into the top of DEST those that are not already in DEST itself. */
1072 sbase = dest->nelem + src1->nelem + src2->nelem;
1073 i1 = src1->nelem - 1;
1074 i2 = src2->nelem - 1;
1075 id = dest->nelem - 1;
1076 for (;;)
1077 {
1078 if (src1->elems[i1] == src2->elems[i2])
1079 {
1080 /* Try to find the item in DEST. Maybe we could binary search? */
1081 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1082 --id;
1084 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1085 dest->elems[--sbase] = src1->elems[i1];
1087 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1088 break;
1089 }
1091 /* Lower the highest of the two items. */
1092 else if (src1->elems[i1] < src2->elems[i2])
1093 {
1094 if (! REG_VALID_INDEX (--i2))
1095 break;
1096 }
1097 else
1098 {
1099 if (! REG_VALID_INDEX (--i1))
1100 break;
1101 }
1102 }
1104 id = dest->nelem - 1;
1105 is = dest->nelem + src1->nelem + src2->nelem - 1;
1106 delta = is - sbase + 1;
1108 /* Now copy. When DELTA becomes zero, the remaining
1109 DEST elements are already in place; this is more or
1110 less the same loop that is in re_node_set_merge. */
1111 dest->nelem += delta;
1112 if (delta > 0 && REG_VALID_INDEX (id))
1113 for (;;)
1114 {
1115 if (dest->elems[is] > dest->elems[id])
1116 {
1117 /* Copy from the top. */
1118 dest->elems[id + delta--] = dest->elems[is--];
1119 if (delta == 0)
1120 break;
1121 }
1122 else
1123 {
1124 /* Slide from the bottom. */
1125 dest->elems[id + delta] = dest->elems[id];
1126 if (! REG_VALID_INDEX (--id))
1127 break;
1128 }
1129 }
1131 /* Copy remaining SRC elements. */
1132 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1134 return REG_NOERROR;
1135 }
1137 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1138 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1140 static reg_errcode_t
1141 internal_function __attribute_warn_unused_result__
1142 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1143 const re_node_set *src2)
1144 {
1145 Idx i1, i2, id;
1146 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1147 {
1148 dest->alloc = src1->nelem + src2->nelem;
1149 dest->elems = re_malloc (Idx, dest->alloc);
1150 if (BE (dest->elems == NULL, 0))
1151 return REG_ESPACE;
1152 }
1153 else
1154 {
1155 if (src1 != NULL && src1->nelem > 0)
1156 return re_node_set_init_copy (dest, src1);
1157 else if (src2 != NULL && src2->nelem > 0)
1158 return re_node_set_init_copy (dest, src2);
1159 else
1160 re_node_set_init_empty (dest);
1161 return REG_NOERROR;
1162 }
1163 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1164 {
1165 if (src1->elems[i1] > src2->elems[i2])
1166 {
1167 dest->elems[id++] = src2->elems[i2++];
1168 continue;
1169 }
1170 if (src1->elems[i1] == src2->elems[i2])
1171 ++i2;
1172 dest->elems[id++] = src1->elems[i1++];
1173 }
1174 if (i1 < src1->nelem)
1175 {
1176 memcpy (dest->elems + id, src1->elems + i1,
1177 (src1->nelem - i1) * sizeof (Idx));
1178 id += src1->nelem - i1;
1179 }
1180 else if (i2 < src2->nelem)
1181 {
1182 memcpy (dest->elems + id, src2->elems + i2,
1183 (src2->nelem - i2) * sizeof (Idx));
1184 id += src2->nelem - i2;
1185 }
1186 dest->nelem = id;
1187 return REG_NOERROR;
1188 }
1190 /* Calculate the union set of the sets DEST and SRC. And store it to
1191 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1193 static reg_errcode_t
1194 internal_function __attribute_warn_unused_result__
1195 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1196 {
1197 Idx is, id, sbase, delta;
1198 if (src == NULL || src->nelem == 0)
1199 return REG_NOERROR;
1200 if (dest->alloc < 2 * src->nelem + dest->nelem)
1201 {
1202 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1203 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1204 if (BE (new_buffer == NULL, 0))
1205 return REG_ESPACE;
1206 dest->elems = new_buffer;
1207 dest->alloc = new_alloc;
1208 }
1210 if (BE (dest->nelem == 0, 0))
1211 {
1212 dest->nelem = src->nelem;
1213 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1214 return REG_NOERROR;
1215 }
1217 /* Copy into the top of DEST the items of SRC that are not
1218 found in DEST. Maybe we could binary search in DEST? */
1219 for (sbase = dest->nelem + 2 * src->nelem,
1220 is = src->nelem - 1, id = dest->nelem - 1;
1221 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1222 {
1223 if (dest->elems[id] == src->elems[is])
1224 is--, id--;
1225 else if (dest->elems[id] < src->elems[is])
1226 dest->elems[--sbase] = src->elems[is--];
1227 else /* if (dest->elems[id] > src->elems[is]) */
1228 --id;
1229 }
1231 if (REG_VALID_INDEX (is))
1232 {
1233 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1234 sbase -= is + 1;
1235 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1236 }
1238 id = dest->nelem - 1;
1239 is = dest->nelem + 2 * src->nelem - 1;
1240 delta = is - sbase + 1;
1241 if (delta == 0)
1242 return REG_NOERROR;
1244 /* Now copy. When DELTA becomes zero, the remaining
1245 DEST elements are already in place. */
1246 dest->nelem += delta;
1247 for (;;)
1248 {
1249 if (dest->elems[is] > dest->elems[id])
1250 {
1251 /* Copy from the top. */
1252 dest->elems[id + delta--] = dest->elems[is--];
1253 if (delta == 0)
1254 break;
1255 }
1256 else
1257 {
1258 /* Slide from the bottom. */
1259 dest->elems[id + delta] = dest->elems[id];
1260 if (! REG_VALID_INDEX (--id))
1261 {
1262 /* Copy remaining SRC elements. */
1263 memcpy (dest->elems, dest->elems + sbase,
1264 delta * sizeof (Idx));
1265 break;
1266 }
1267 }
1268 }
1270 return REG_NOERROR;
1271 }
1273 /* Insert the new element ELEM to the re_node_set* SET.
1274 SET should not already have ELEM.
1275 Return true if successful. */
1277 static bool
1278 internal_function __attribute_warn_unused_result__
1279 re_node_set_insert (re_node_set *set, Idx elem)
1280 {
1281 Idx idx;
1282 /* In case the set is empty. */
1283 if (set->alloc == 0)
1284 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1286 if (BE (set->nelem, 0) == 0)
1287 {
1288 /* We already guaranteed above that set->alloc != 0. */
1289 set->elems[0] = elem;
1290 ++set->nelem;
1291 return true;
1292 }
1294 /* Realloc if we need. */
1295 if (set->alloc == set->nelem)
1296 {
1297 Idx *new_elems;
1298 set->alloc = set->alloc * 2;
1299 new_elems = re_realloc (set->elems, Idx, set->alloc);
1300 if (BE (new_elems == NULL, 0))
1301 return false;
1302 set->elems = new_elems;
1303 }
1305 /* Move the elements which follows the new element. Test the
1306 first element separately to skip a check in the inner loop. */
1307 if (elem < set->elems[0])
1308 {
1309 idx = 0;
1310 for (idx = set->nelem; idx > 0; idx--)
1311 set->elems[idx] = set->elems[idx - 1];
1312 }
1313 else
1314 {
1315 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1316 set->elems[idx] = set->elems[idx - 1];
1317 }
1319 /* Insert the new element. */
1320 set->elems[idx] = elem;
1321 ++set->nelem;
1322 return true;
1323 }
1325 /* Insert the new element ELEM to the re_node_set* SET.
1326 SET should not already have any element greater than or equal to ELEM.
1327 Return true if successful. */
1329 static bool
1330 internal_function __attribute_warn_unused_result__
1331 re_node_set_insert_last (re_node_set *set, Idx elem)
1332 {
1333 /* Realloc if we need. */
1334 if (set->alloc == set->nelem)
1335 {
1336 Idx *new_elems;
1337 set->alloc = (set->alloc + 1) * 2;
1338 new_elems = re_realloc (set->elems, Idx, set->alloc);
1339 if (BE (new_elems == NULL, 0))
1340 return false;
1341 set->elems = new_elems;
1342 }
1344 /* Insert the new element. */
1345 set->elems[set->nelem++] = elem;
1346 return true;
1347 }
1349 /* Compare two node sets SET1 and SET2.
1350 Return true if SET1 and SET2 are equivalent. */
1352 static bool
1353 internal_function __attribute ((pure))
1354 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1355 {
1356 Idx i;
1357 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1358 return false;
1359 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1360 if (set1->elems[i] != set2->elems[i])
1361 return false;
1362 return true;
1363 }
1365 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1367 static Idx
1368 internal_function __attribute ((pure))
1369 re_node_set_contains (const re_node_set *set, Idx elem)
1370 {
1371 __re_size_t idx, right, mid;
1372 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1373 return 0;
1375 /* Binary search the element. */
1376 idx = 0;
1377 right = set->nelem - 1;
1378 while (idx < right)
1379 {
1380 mid = (idx + right) / 2;
1381 if (set->elems[mid] < elem)
1382 idx = mid + 1;
1383 else
1384 right = mid;
1385 }
1386 return set->elems[idx] == elem ? idx + 1 : 0;
1387 }
1389 static void
1390 internal_function
1391 re_node_set_remove_at (re_node_set *set, Idx idx)
1392 {
1393 if (idx < 0 || idx >= set->nelem)
1394 return;
1395 --set->nelem;
1396 for (; idx < set->nelem; idx++)
1397 set->elems[idx] = set->elems[idx + 1];
1398 }
1399 \f
1401 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1402 Or return REG_MISSING if an error occurred. */
1404 static Idx
1405 internal_function
1406 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1407 {
1408 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1409 {
1410 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1411 Idx *new_nexts, *new_indices;
1412 re_node_set *new_edests, *new_eclosures;
1413 re_token_t *new_nodes;
1414 size_t max_object_size =
1415 MAX (sizeof (re_token_t),
1416 MAX (sizeof (re_node_set),
1417 sizeof (Idx)));
1419 /* Avoid overflows. */
1420 if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1421 return REG_MISSING;
1423 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1424 if (BE (new_nodes == NULL, 0))
1425 return REG_MISSING;
1426 dfa->nodes = new_nodes;
1427 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1428 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1429 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1430 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1431 if (BE (new_nexts == NULL || new_indices == NULL
1432 || new_edests == NULL || new_eclosures == NULL, 0))
1433 return REG_MISSING;
1434 dfa->nexts = new_nexts;
1435 dfa->org_indices = new_indices;
1436 dfa->edests = new_edests;
1437 dfa->eclosures = new_eclosures;
1438 dfa->nodes_alloc = new_nodes_alloc;
1439 }
1440 dfa->nodes[dfa->nodes_len] = token;
1441 dfa->nodes[dfa->nodes_len].constraint = 0;
1442 #ifdef RE_ENABLE_I18N
1443 {
1444 int type = token.type;
1445 dfa->nodes[dfa->nodes_len].accept_mb =
1446 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1447 }
1448 #endif
1449 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1450 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1451 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1452 return dfa->nodes_len++;
1453 }
1455 static inline re_hashval_t
1456 internal_function
1457 calc_state_hash (const re_node_set *nodes, unsigned int context)
1458 {
1459 re_hashval_t hash = nodes->nelem + context;
1460 Idx i;
1461 for (i = 0 ; i < nodes->nelem ; i++)
1462 hash += nodes->elems[i];
1463 return hash;
1464 }
1466 /* Search for the state whose node_set is equivalent to NODES.
1467 Return the pointer to the state, if we found it in the DFA.
1468 Otherwise create the new one and return it. In case of an error
1469 return NULL and set the error code in ERR.
1470 Note: - We assume NULL as the invalid state, then it is possible that
1471 return value is NULL and ERR is REG_NOERROR.
1472 - We never return non-NULL value in case of any errors, it is for
1473 optimization. */
1475 static re_dfastate_t *
1476 internal_function __attribute_warn_unused_result__
1477 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1478 const re_node_set *nodes)
1479 {
1480 re_hashval_t hash;
1481 re_dfastate_t *new_state;
1482 struct re_state_table_entry *spot;
1483 Idx i;
1484 #ifdef lint
1485 /* Suppress bogus uninitialized-variable warnings. */
1486 *err = REG_NOERROR;
1487 #endif
1488 if (BE (nodes->nelem == 0, 0))
1489 {
1490 *err = REG_NOERROR;
1491 return NULL;
1492 }
1493 hash = calc_state_hash (nodes, 0);
1494 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1496 for (i = 0 ; i < spot->num ; i++)
1497 {
1498 re_dfastate_t *state = spot->array[i];
1499 if (hash != state->hash)
1500 continue;
1501 if (re_node_set_compare (&state->nodes, nodes))
1502 return state;
1503 }
1505 /* There are no appropriate state in the dfa, create the new one. */
1506 new_state = create_ci_newstate (dfa, nodes, hash);
1507 if (BE (new_state == NULL, 0))
1508 *err = REG_ESPACE;
1510 return new_state;
1511 }
1513 /* Search for the state whose node_set is equivalent to NODES and
1514 whose context is equivalent to CONTEXT.
1515 Return the pointer to the state, if we found it in the DFA.
1516 Otherwise create the new one and return it. In case of an error
1517 return NULL and set the error code in ERR.
1518 Note: - We assume NULL as the invalid state, then it is possible that
1519 return value is NULL and ERR is REG_NOERROR.
1520 - We never return non-NULL value in case of any errors, it is for
1521 optimization. */
1523 static re_dfastate_t *
1524 internal_function __attribute_warn_unused_result__
1525 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1526 const re_node_set *nodes, unsigned int context)
1527 {
1528 re_hashval_t hash;
1529 re_dfastate_t *new_state;
1530 struct re_state_table_entry *spot;
1531 Idx i;
1532 #ifdef lint
1533 /* Suppress bogus uninitialized-variable warnings. */
1534 *err = REG_NOERROR;
1535 #endif
1536 if (nodes->nelem == 0)
1537 {
1538 *err = REG_NOERROR;
1539 return NULL;
1540 }
1541 hash = calc_state_hash (nodes, context);
1542 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1544 for (i = 0 ; i < spot->num ; i++)
1545 {
1546 re_dfastate_t *state = spot->array[i];
1547 if (state->hash == hash
1548 && state->context == context
1549 && re_node_set_compare (state->entrance_nodes, nodes))
1550 return state;
1551 }
1552 /* There are no appropriate state in `dfa', create the new one. */
1553 new_state = create_cd_newstate (dfa, nodes, context, hash);
1554 if (BE (new_state == NULL, 0))
1555 *err = REG_ESPACE;
1557 return new_state;
1558 }
1560 /* Finish initialization of the new state NEWSTATE, and using its hash value
1561 HASH put in the appropriate bucket of DFA's state table. Return value
1562 indicates the error code if failed. */
1564 static reg_errcode_t
1565 __attribute_warn_unused_result__
1566 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1567 re_hashval_t hash)
1568 {
1569 struct re_state_table_entry *spot;
1570 reg_errcode_t err;
1571 Idx i;
1573 newstate->hash = hash;
1574 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1575 if (BE (err != REG_NOERROR, 0))
1576 return REG_ESPACE;
1577 for (i = 0; i < newstate->nodes.nelem; i++)
1578 {
1579 Idx elem = newstate->nodes.elems[i];
1580 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1581 if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1582 return REG_ESPACE;
1583 }
1585 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1586 if (BE (spot->alloc <= spot->num, 0))
1587 {
1588 Idx new_alloc = 2 * spot->num + 2;
1589 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1590 new_alloc);
1591 if (BE (new_array == NULL, 0))
1592 return REG_ESPACE;
1593 spot->array = new_array;
1594 spot->alloc = new_alloc;
1595 }
1596 spot->array[spot->num++] = newstate;
1597 return REG_NOERROR;
1598 }
1600 static void
1601 free_state (re_dfastate_t *state)
1602 {
1603 re_node_set_free (&state->non_eps_nodes);
1604 re_node_set_free (&state->inveclosure);
1605 if (state->entrance_nodes != &state->nodes)
1606 {
1607 re_node_set_free (state->entrance_nodes);
1608 re_free (state->entrance_nodes);
1609 }
1610 re_node_set_free (&state->nodes);
1611 re_free (state->word_trtable);
1612 re_free (state->trtable);
1613 re_free (state);
1614 }
1616 /* Create the new state which is independ of contexts.
1617 Return the new state if succeeded, otherwise return NULL. */
1619 static re_dfastate_t *
1620 internal_function __attribute_warn_unused_result__
1621 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1622 re_hashval_t hash)
1623 {
1624 Idx i;
1625 reg_errcode_t err;
1626 re_dfastate_t *newstate;
1628 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1629 if (BE (newstate == NULL, 0))
1630 return NULL;
1631 err = re_node_set_init_copy (&newstate->nodes, nodes);
1632 if (BE (err != REG_NOERROR, 0))
1633 {
1634 re_free (newstate);
1635 return NULL;
1636 }
1638 newstate->entrance_nodes = &newstate->nodes;
1639 for (i = 0 ; i < nodes->nelem ; i++)
1640 {
1641 re_token_t *node = dfa->nodes + nodes->elems[i];
1642 re_token_type_t type = node->type;
1643 if (type == CHARACTER && !node->constraint)
1644 continue;
1645 #ifdef RE_ENABLE_I18N
1646 newstate->accept_mb |= node->accept_mb;
1647 #endif /* RE_ENABLE_I18N */
1649 /* If the state has the halt node, the state is a halt state. */
1650 if (type == END_OF_RE)
1651 newstate->halt = 1;
1652 else if (type == OP_BACK_REF)
1653 newstate->has_backref = 1;
1654 else if (type == ANCHOR || node->constraint)
1655 newstate->has_constraint = 1;
1656 }
1657 err = register_state (dfa, newstate, hash);
1658 if (BE (err != REG_NOERROR, 0))
1659 {
1660 free_state (newstate);
1661 newstate = NULL;
1662 }
1663 return newstate;
1664 }
1666 /* Create the new state which is depend on the context CONTEXT.
1667 Return the new state if succeeded, otherwise return NULL. */
1669 static re_dfastate_t *
1670 internal_function __attribute_warn_unused_result__
1671 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1672 unsigned int context, re_hashval_t hash)
1673 {
1674 Idx i, nctx_nodes = 0;
1675 reg_errcode_t err;
1676 re_dfastate_t *newstate;
1678 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1679 if (BE (newstate == NULL, 0))
1680 return NULL;
1681 err = re_node_set_init_copy (&newstate->nodes, nodes);
1682 if (BE (err != REG_NOERROR, 0))
1683 {
1684 re_free (newstate);
1685 return NULL;
1686 }
1688 newstate->context = context;
1689 newstate->entrance_nodes = &newstate->nodes;
1691 for (i = 0 ; i < nodes->nelem ; i++)
1692 {
1693 re_token_t *node = dfa->nodes + nodes->elems[i];
1694 re_token_type_t type = node->type;
1695 unsigned int constraint = node->constraint;
1697 if (type == CHARACTER && !constraint)
1698 continue;
1699 #ifdef RE_ENABLE_I18N
1700 newstate->accept_mb |= node->accept_mb;
1701 #endif /* RE_ENABLE_I18N */
1703 /* If the state has the halt node, the state is a halt state. */
1704 if (type == END_OF_RE)
1705 newstate->halt = 1;
1706 else if (type == OP_BACK_REF)
1707 newstate->has_backref = 1;
1709 if (constraint)
1710 {
1711 if (newstate->entrance_nodes == &newstate->nodes)
1712 {
1713 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1714 if (BE (newstate->entrance_nodes == NULL, 0))
1715 {
1716 free_state (newstate);
1717 return NULL;
1718 }
1719 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1720 != REG_NOERROR)
1721 return NULL;
1722 nctx_nodes = 0;
1723 newstate->has_constraint = 1;
1724 }
1726 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1727 {
1728 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1729 ++nctx_nodes;
1730 }
1731 }
1732 }
1733 err = register_state (dfa, newstate, hash);
1734 if (BE (err != REG_NOERROR, 0))
1735 {
1736 free_state (newstate);
1737 newstate = NULL;
1738 }
1739 return newstate;
1740 }