gl/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
   3    Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License along
  18    with this program; if not, write to the Free Software Foundation,
  19    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  20
  21 static void re_string_construct_common (const char *str, Idx len,
  22                                         re_string_t *pstr,
  23                                         RE_TRANSLATE_TYPE trans, bool icase,
  24                                         const re_dfa_t *dfa) internal_function;
  25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  26                                           const re_node_set *nodes,
  27                                           re_hashval_t hash) internal_function;
  28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  29                                           const re_node_set *nodes,
  30                                           unsigned int context,
  31                                           re_hashval_t hash) internal_function;
  32 \f
  33 /* Functions for string operation.  */
  34
  35 /* This function allocate the buffers.  It is necessary to call
  36    re_string_reconstruct before using the object.  */
  37
  38 static reg_errcode_t
  39 internal_function __attribute_warn_unused_result__
  40 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  41                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  42 {
  43   reg_errcode_t ret;
  44   Idx init_buf_len;
  45
  46   /* Ensure at least one character fits into the buffers.  */
  47   if (init_len < dfa->mb_cur_max)
  48     init_len = dfa->mb_cur_max;
  49   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  50   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  51
  52   ret = re_string_realloc_buffers (pstr, init_buf_len);
  53   if (BE (ret != REG_NOERROR, 0))
  54     return ret;
  55
  56   pstr->word_char = dfa->word_char;
  57   pstr->word_ops_used = dfa->word_ops_used;
  58   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  59   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  60   pstr->valid_raw_len = pstr->valid_len;
  61   return REG_NOERROR;
  62 }
  63
  64 /* This function allocate the buffers, and initialize them.  */
  65
  66 static reg_errcode_t
  67 internal_function __attribute_warn_unused_result__
  68 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  69                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  70 {
  71   reg_errcode_t ret;
  72   memset (pstr, '\0', sizeof (re_string_t));
  73   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  74
  75   if (len > 0)
  76     {
  77       ret = re_string_realloc_buffers (pstr, len + 1);
  78       if (BE (ret != REG_NOERROR, 0))
  79         return ret;
  80     }
  81   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  82
  83   if (icase)
  84     {
  85 #ifdef RE_ENABLE_I18N
  86       if (dfa->mb_cur_max > 1)
  87         {
  88           while (1)
  89             {
  90               ret = build_wcs_upper_buffer (pstr);
  91               if (BE (ret != REG_NOERROR, 0))
  92                 return ret;
  93               if (pstr->valid_raw_len >= len)
  94                 break;
  95               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  96                 break;
  97               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  98               if (BE (ret != REG_NOERROR, 0))
  99                 return ret;
 100             }
 101         }
 102       else
 103 #endif /* RE_ENABLE_I18N  */
 104         build_upper_buffer (pstr);
 105     }
 106   else
 107     {
 108 #ifdef RE_ENABLE_I18N
 109       if (dfa->mb_cur_max > 1)
 110         build_wcs_buffer (pstr);
 111       else
 112 #endif /* RE_ENABLE_I18N  */
 113         {
 114           if (trans != NULL)
 115             re_string_translate_buffer (pstr);
 116           else
 117             {
 118               pstr->valid_len = pstr->bufs_len;
 119               pstr->valid_raw_len = pstr->bufs_len;
 120             }
 121         }
 122     }
 123
 124   return REG_NOERROR;
 125 }
 126
 127 /* Helper functions for re_string_allocate, and re_string_construct.  */
 128
 129 static reg_errcode_t
 130 internal_function __attribute_warn_unused_result__
 131 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 132 {
 133 #ifdef RE_ENABLE_I18N
 134   if (pstr->mb_cur_max > 1)
 135     {
 136       wint_t *new_wcs;
 137
 138       /* Avoid overflow.  */
 139       size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
 140       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
 141         return REG_ESPACE;
 142
 143       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 144       if (BE (new_wcs == NULL, 0))
 145         return REG_ESPACE;
 146       pstr->wcs = new_wcs;
 147       if (pstr->offsets != NULL)
 148         {
 149           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 150           if (BE (new_offsets == NULL, 0))
 151             return REG_ESPACE;
 152           pstr->offsets = new_offsets;
 153         }
 154     }
 155 #endif /* RE_ENABLE_I18N  */
 156   if (pstr->mbs_allocated)
 157     {
 158       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 159                                            new_buf_len);
 160       if (BE (new_mbs == NULL, 0))
 161         return REG_ESPACE;
 162       pstr->mbs = new_mbs;
 163     }
 164   pstr->bufs_len = new_buf_len;
 165   return REG_NOERROR;
 166 }
 167
 168
 169 static void
 170 internal_function
 171 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 172                             RE_TRANSLATE_TYPE trans, bool icase,
 173                             const re_dfa_t *dfa)
 174 {
 175   pstr->raw_mbs = (const unsigned char *) str;
 176   pstr->len = len;
 177   pstr->raw_len = len;
 178   pstr->trans = trans;
 179   pstr->icase = icase;
 180   pstr->mbs_allocated = (trans != NULL || icase);
 181   pstr->mb_cur_max = dfa->mb_cur_max;
 182   pstr->is_utf8 = dfa->is_utf8;
 183   pstr->map_notascii = dfa->map_notascii;
 184   pstr->stop = pstr->len;
 185   pstr->raw_stop = pstr->stop;
 186 }
 187
 188 #ifdef RE_ENABLE_I18N
 189
 190 /* Build wide character buffer PSTR->WCS.
 191    If the byte sequence of the string are:
 192      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 193    Then wide character buffer will be:
 194      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 195    We use WEOF for padding, they indicate that the position isn't
 196    a first byte of a multibyte character.
 197
 198    Note that this function assumes PSTR->VALID_LEN elements are already
 199    built and starts from PSTR->VALID_LEN.  */
 200
 201 static void
 202 internal_function
 203 build_wcs_buffer (re_string_t *pstr)
 204 {
 205 #ifdef _LIBC
 206   unsigned char buf[MB_LEN_MAX];
 207   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 208 #else
 209   unsigned char buf[64];
 210 #endif
 211   mbstate_t prev_st;
 212   Idx byte_idx, end_idx, remain_len;
 213   size_t mbclen;
 214
 215   /* Build the buffers from pstr->valid_len to either pstr->len or
 216      pstr->bufs_len.  */
 217   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 218   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 219     {
 220       wchar_t wc;
 221       const char *p;
 222
 223       remain_len = end_idx - byte_idx;
 224       prev_st = pstr->cur_state;
 225       /* Apply the translation if we need.  */
 226       if (BE (pstr->trans != NULL, 0))
 227         {
 228           int i, ch;
 229
 230           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 231             {
 232               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 233               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 234             }
 235           p = (const char *) buf;
 236         }
 237       else
 238         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 239       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 240       if (BE (mbclen == (size_t) -2, 0))
 241         {
 242           /* The buffer doesn't have enough space, finish to build.  */
 243           pstr->cur_state = prev_st;
 244           break;
 245         }
 246       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 247         {
 248           /* We treat these cases as a singlebyte character.  */
 249           mbclen = 1;
 250           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 251           if (BE (pstr->trans != NULL, 0))
 252             wc = pstr->trans[wc];
 253           pstr->cur_state = prev_st;
 254         }
 255
 256       /* Write wide character and padding.  */
 257       pstr->wcs[byte_idx++] = wc;
 258       /* Write paddings.  */
 259       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 260         pstr->wcs[byte_idx++] = WEOF;
 261     }
 262   pstr->valid_len = byte_idx;
 263   pstr->valid_raw_len = byte_idx;
 264 }
 265
 266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 267    but for REG_ICASE.  */
 268
 269 static reg_errcode_t
 270 internal_function __attribute_warn_unused_result__
 271 build_wcs_upper_buffer (re_string_t *pstr)
 272 {
 273   mbstate_t prev_st;
 274   Idx src_idx, byte_idx, end_idx, remain_len;
 275   size_t mbclen;
 276 #ifdef _LIBC
 277   char buf[MB_LEN_MAX];
 278   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 279 #else
 280   char buf[64];
 281 #endif
 282
 283   byte_idx = pstr->valid_len;
 284   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 285
 286   /* The following optimization assumes that ASCII characters can be
 287      mapped to wide characters with a simple cast.  */
 288   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 289     {
 290       while (byte_idx < end_idx)
 291         {
 292           wchar_t wc;
 293
 294           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 295               && mbsinit (&pstr->cur_state))
 296             {
 297               /* In case of a singlebyte character.  */
 298               pstr->mbs[byte_idx]
 299                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 300               /* The next step uses the assumption that wchar_t is encoded
 301                  ASCII-safe: all ASCII values can be converted like this.  */
 302               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 303               ++byte_idx;
 304               continue;
 305             }
 306
 307           remain_len = end_idx - byte_idx;
 308           prev_st = pstr->cur_state;
 309           mbclen = __mbrtowc (&wc,
 310                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 311                                + byte_idx), remain_len, &pstr->cur_state);
 312           if (BE (mbclen < (size_t) -2, 1))
 313             {
 314               wchar_t wcu = wc;
 315               if (iswlower (wc))
 316                 {
 317                   size_t mbcdlen;
 318
 319                   wcu = towupper (wc);
 320                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 321                   if (BE (mbclen == mbcdlen, 1))
 322                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 323                   else
 324                     {
 325                       src_idx = byte_idx;
 326                       goto offsets_needed;
 327                     }
 328                 }
 329               else
 330                 memcpy (pstr->mbs + byte_idx,
 331                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 332               pstr->wcs[byte_idx++] = wcu;
 333               /* Write paddings.  */
 334               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 335                 pstr->wcs[byte_idx++] = WEOF;
 336             }
 337           else if (mbclen == (size_t) -1 || mbclen == 0)
 338             {
 339               /* It is an invalid character or '\0'.  Just use the byte.  */
 340               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 341               pstr->mbs[byte_idx] = ch;
 342               /* And also cast it to wide char.  */
 343               pstr->wcs[byte_idx++] = (wchar_t) ch;
 344               if (BE (mbclen == (size_t) -1, 0))
 345                 pstr->cur_state = prev_st;
 346             }
 347           else
 348             {
 349               /* The buffer doesn't have enough space, finish to build.  */
 350               pstr->cur_state = prev_st;
 351               break;
 352             }
 353         }
 354       pstr->valid_len = byte_idx;
 355       pstr->valid_raw_len = byte_idx;
 356       return REG_NOERROR;
 357     }
 358   else
 359     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 360       {
 361         wchar_t wc;
 362         const char *p;
 363       offsets_needed:
 364         remain_len = end_idx - byte_idx;
 365         prev_st = pstr->cur_state;
 366         if (BE (pstr->trans != NULL, 0))
 367           {
 368             int i, ch;
 369
 370             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 371               {
 372                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 373                 buf[i] = pstr->trans[ch];
 374               }
 375             p = (const char *) buf;
 376           }
 377         else
 378           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 379         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 380         if (BE (mbclen < (size_t) -2, 1))
 381           {
 382             wchar_t wcu = wc;
 383             if (iswlower (wc))
 384               {
 385                 size_t mbcdlen;
 386
 387                 wcu = towupper (wc);
 388                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 389                 if (BE (mbclen == mbcdlen, 1))
 390                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 391                 else if (mbcdlen != (size_t) -1)
 392                   {
 393                     size_t i;
 394
 395                     if (byte_idx + mbcdlen > pstr->bufs_len)
 396                       {
 397                         pstr->cur_state = prev_st;
 398                         break;
 399                       }
 400
 401                     if (pstr->offsets == NULL)
 402                       {
 403                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 404
 405                         if (pstr->offsets == NULL)
 406                           return REG_ESPACE;
 407                       }
 408                     if (!pstr->offsets_needed)
 409                       {
 410                         for (i = 0; i < (size_t) byte_idx; ++i)
 411                           pstr->offsets[i] = i;
 412                         pstr->offsets_needed = 1;
 413                       }
 414
 415                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 416                     pstr->wcs[byte_idx] = wcu;
 417                     pstr->offsets[byte_idx] = src_idx;
 418                     for (i = 1; i < mbcdlen; ++i)
 419                       {
 420                         pstr->offsets[byte_idx + i]
 421                           = src_idx + (i < mbclen ? i : mbclen - 1);
 422                         pstr->wcs[byte_idx + i] = WEOF;
 423                       }
 424                     pstr->len += mbcdlen - mbclen;
 425                     if (pstr->raw_stop > src_idx)
 426                       pstr->stop += mbcdlen - mbclen;
 427                     end_idx = (pstr->bufs_len > pstr->len)
 428                               ? pstr->len : pstr->bufs_len;
 429                     byte_idx += mbcdlen;
 430                     src_idx += mbclen;
 431                     continue;
 432                   }
 433                 else
 434                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 435               }
 436             else
 437               memcpy (pstr->mbs + byte_idx, p, mbclen);
 438
 439             if (BE (pstr->offsets_needed != 0, 0))
 440               {
 441                 size_t i;
 442                 for (i = 0; i < mbclen; ++i)
 443                   pstr->offsets[byte_idx + i] = src_idx + i;
 444               }
 445             src_idx += mbclen;
 446
 447             pstr->wcs[byte_idx++] = wcu;
 448             /* Write paddings.  */
 449             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 450               pstr->wcs[byte_idx++] = WEOF;
 451           }
 452         else if (mbclen == (size_t) -1 || mbclen == 0)
 453           {
 454             /* It is an invalid character or '\0'.  Just use the byte.  */
 455             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 456
 457             if (BE (pstr->trans != NULL, 0))
 458               ch = pstr->trans [ch];
 459             pstr->mbs[byte_idx] = ch;
 460
 461             if (BE (pstr->offsets_needed != 0, 0))
 462               pstr->offsets[byte_idx] = src_idx;
 463             ++src_idx;
 464
 465             /* And also cast it to wide char.  */
 466             pstr->wcs[byte_idx++] = (wchar_t) ch;
 467             if (BE (mbclen == (size_t) -1, 0))
 468               pstr->cur_state = prev_st;
 469           }
 470         else
 471           {
 472             /* The buffer doesn't have enough space, finish to build.  */
 473             pstr->cur_state = prev_st;
 474             break;
 475           }
 476       }
 477   pstr->valid_len = byte_idx;
 478   pstr->valid_raw_len = src_idx;
 479   return REG_NOERROR;
 480 }
 481
 482 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 483    Return the index.  */
 484
 485 static Idx
 486 internal_function
 487 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 488 {
 489   mbstate_t prev_st;
 490   Idx rawbuf_idx;
 491   size_t mbclen;
 492   wint_t wc = WEOF;
 493
 494   /* Skip the characters which are not necessary to check.  */
 495   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 496        rawbuf_idx < new_raw_idx;)
 497     {
 498       wchar_t wc2;
 499       Idx remain_len;
 500       remain_len = pstr->len - rawbuf_idx;
 501       prev_st = pstr->cur_state;
 502       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 503                           remain_len, &pstr->cur_state);
 504       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 505         {
 506           /* We treat these cases as a single byte character.  */
 507           if (mbclen == 0 || remain_len == 0)
 508             wc = L'\0';
 509           else
 510             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 511           mbclen = 1;
 512           pstr->cur_state = prev_st;
 513         }
 514       else
 515         wc = wc2;
 516       /* Then proceed the next character.  */
 517       rawbuf_idx += mbclen;
 518     }
 519   *last_wc = wc;
 520   return rawbuf_idx;
 521 }
 522 #endif /* RE_ENABLE_I18N  */
 523
 524 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 525    This function is used in case of REG_ICASE.  */
 526
 527 static void
 528 internal_function
 529 build_upper_buffer (re_string_t *pstr)
 530 {
 531   Idx char_idx, end_idx;
 532   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 533
 534   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 535     {
 536       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 537       if (BE (pstr->trans != NULL, 0))
 538         ch = pstr->trans[ch];
 539       if (islower (ch))
 540         pstr->mbs[char_idx] = toupper (ch);
 541       else
 542         pstr->mbs[char_idx] = ch;
 543     }
 544   pstr->valid_len = char_idx;
 545   pstr->valid_raw_len = char_idx;
 546 }
 547
 548 /* Apply TRANS to the buffer in PSTR.  */
 549
 550 static void
 551 internal_function
 552 re_string_translate_buffer (re_string_t *pstr)
 553 {
 554   Idx buf_idx, end_idx;
 555   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 556
 557   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 558     {
 559       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 560       pstr->mbs[buf_idx] = pstr->trans[ch];
 561     }
 562
 563   pstr->valid_len = buf_idx;
 564   pstr->valid_raw_len = buf_idx;
 565 }
 566
 567 /* This function re-construct the buffers.
 568    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 569    convert to upper case in case of REG_ICASE, apply translation.  */
 570
 571 static reg_errcode_t
 572 internal_function __attribute_warn_unused_result__
 573 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 574 {
 575   Idx offset;
 576
 577   if (BE (pstr->raw_mbs_idx <= idx, 0))
 578     offset = idx - pstr->raw_mbs_idx;
 579   else
 580     {
 581       /* Reset buffer.  */
 582 #ifdef RE_ENABLE_I18N
 583       if (pstr->mb_cur_max > 1)
 584         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 585 #endif /* RE_ENABLE_I18N */
 586       pstr->len = pstr->raw_len;
 587       pstr->stop = pstr->raw_stop;
 588       pstr->valid_len = 0;
 589       pstr->raw_mbs_idx = 0;
 590       pstr->valid_raw_len = 0;
 591       pstr->offsets_needed = 0;
 592       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 593                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 594       if (!pstr->mbs_allocated)
 595         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 596       offset = idx;
 597     }
 598
 599   if (BE (offset != 0, 1))
 600     {
 601       /* Should the already checked characters be kept?  */
 602       if (BE (offset < pstr->valid_raw_len, 1))
 603         {
 604           /* Yes, move them to the front of the buffer.  */
 605 #ifdef RE_ENABLE_I18N
 606           if (BE (pstr->offsets_needed, 0))
 607             {
 608               Idx low = 0, high = pstr->valid_len, mid;
 609               do
 610                 {
 611                   mid = (high + low) / 2;
 612                   if (pstr->offsets[mid] > offset)
 613                     high = mid;
 614                   else if (pstr->offsets[mid] < offset)
 615                     low = mid + 1;
 616                   else
 617                     break;
 618                 }
 619               while (low < high);
 620               if (pstr->offsets[mid] < offset)
 621                 ++mid;
 622               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 623                                                         eflags);
 624               /* This can be quite complicated, so handle specially
 625                  only the common and easy case where the character with
 626                  different length representation of lower and upper
 627                  case is present at or after offset.  */
 628               if (pstr->valid_len > offset
 629                   && mid == offset && pstr->offsets[mid] == offset)
 630                 {
 631                   memmove (pstr->wcs, pstr->wcs + offset,
 632                            (pstr->valid_len - offset) * sizeof (wint_t));
 633                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 634                   pstr->valid_len -= offset;
 635                   pstr->valid_raw_len -= offset;
 636                   for (low = 0; low < pstr->valid_len; low++)
 637                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 638                 }
 639               else
 640                 {
 641                   /* Otherwise, just find out how long the partial multibyte
 642                      character at offset is and fill it with WEOF/255.  */
 643                   pstr->len = pstr->raw_len - idx + offset;
 644                   pstr->stop = pstr->raw_stop - idx + offset;
 645                   pstr->offsets_needed = 0;
 646                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 647                     --mid;
 648                   while (mid < pstr->valid_len)
 649                     if (pstr->wcs[mid] != WEOF)
 650                       break;
 651                     else
 652                       ++mid;
 653                   if (mid == pstr->valid_len)
 654                     pstr->valid_len = 0;
 655                   else
 656                     {
 657                       pstr->valid_len = pstr->offsets[mid] - offset;
 658                       if (pstr->valid_len)
 659                         {
 660                           for (low = 0; low < pstr->valid_len; ++low)
 661                             pstr->wcs[low] = WEOF;
 662                           memset (pstr->mbs, 255, pstr->valid_len);
 663                         }
 664                     }
 665                   pstr->valid_raw_len = pstr->valid_len;
 666                 }
 667             }
 668           else
 669 #endif
 670             {
 671               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 672                                                         eflags);
 673 #ifdef RE_ENABLE_I18N
 674               if (pstr->mb_cur_max > 1)
 675                 memmove (pstr->wcs, pstr->wcs + offset,
 676                          (pstr->valid_len - offset) * sizeof (wint_t));
 677 #endif /* RE_ENABLE_I18N */
 678               if (BE (pstr->mbs_allocated, 0))
 679                 memmove (pstr->mbs, pstr->mbs + offset,
 680                          pstr->valid_len - offset);
 681               pstr->valid_len -= offset;
 682               pstr->valid_raw_len -= offset;
 683 #if DEBUG
 684               assert (pstr->valid_len > 0);
 685 #endif
 686             }
 687         }
 688       else
 689         {
 690 #ifdef RE_ENABLE_I18N
 691           /* No, skip all characters until IDX.  */
 692           Idx prev_valid_len = pstr->valid_len;
 693
 694           if (BE (pstr->offsets_needed, 0))
 695             {
 696               pstr->len = pstr->raw_len - idx + offset;
 697               pstr->stop = pstr->raw_stop - idx + offset;
 698               pstr->offsets_needed = 0;
 699             }
 700 #endif
 701           pstr->valid_len = 0;
 702 #ifdef RE_ENABLE_I18N
 703           if (pstr->mb_cur_max > 1)
 704             {
 705               Idx wcs_idx;
 706               wint_t wc = WEOF;
 707
 708               if (pstr->is_utf8)
 709                 {
 710                   const unsigned char *raw, *p, *end;
 711
 712                   /* Special case UTF-8.  Multi-byte chars start with any
 713                      byte other than 0x80 - 0xbf.  */
 714                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 715                   end = raw + (offset - pstr->mb_cur_max);
 716                   if (end < pstr->raw_mbs)
 717                     end = pstr->raw_mbs;
 718                   p = raw + offset - 1;
 719 #ifdef _LIBC
 720                   /* We know the wchar_t encoding is UCS4, so for the simple
 721                      case, ASCII characters, skip the conversion step.  */
 722                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 723                     {
 724                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 725                       /* pstr->valid_len = 0; */
 726                       wc = (wchar_t) *p;
 727                     }
 728                   else
 729 #endif
 730                     for (; p >= end; --p)
 731                       if ((*p & 0xc0) != 0x80)
 732                         {
 733                           mbstate_t cur_state;
 734                           wchar_t wc2;
 735                           Idx mlen = raw + pstr->len - p;
 736                           unsigned char buf[6];
 737                           size_t mbclen;
 738
 739                           if (BE (pstr->trans != NULL, 0))
 740                             {
 741                               int i = mlen < 6 ? mlen : 6;
 742                               while (--i >= 0)
 743                                 buf[i] = pstr->trans[p[i]];
 744                             }
 745                           /* XXX Don't use mbrtowc, we know which conversion
 746                              to use (UTF-8 -> UCS4).  */
 747                           memset (&cur_state, 0, sizeof (cur_state));
 748                           mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
 749                                               &cur_state);
 750                           if (raw + offset - p <= mbclen
 751                               && mbclen < (size_t) -2)
 752                             {
 753                               memset (&pstr->cur_state, '\0',
 754                                       sizeof (mbstate_t));
 755                               pstr->valid_len = mbclen - (raw + offset - p);
 756                               wc = wc2;
 757                             }
 758                           break;
 759                         }
 760                 }
 761
 762               if (wc == WEOF)
 763                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 764               if (wc == WEOF)
 765                 pstr->tip_context
 766                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 767               else
 768                 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 769                                       && IS_WIDE_WORD_CHAR (wc))
 770                                      ? CONTEXT_WORD
 771                                      : ((IS_WIDE_NEWLINE (wc)
 772                                          && pstr->newline_anchor)
 773                                         ? CONTEXT_NEWLINE : 0));
 774               if (BE (pstr->valid_len, 0))
 775                 {
 776                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 777                     pstr->wcs[wcs_idx] = WEOF;
 778                   if (pstr->mbs_allocated)
 779                     memset (pstr->mbs, 255, pstr->valid_len);
 780                 }
 781               pstr->valid_raw_len = pstr->valid_len;
 782             }
 783           else
 784 #endif /* RE_ENABLE_I18N */
 785             {
 786               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 787               pstr->valid_raw_len = 0;
 788               if (pstr->trans)
 789                 c = pstr->trans[c];
 790               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 791                                    ? CONTEXT_WORD
 792                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 793                                       ? CONTEXT_NEWLINE : 0));
 794             }
 795         }
 796       if (!BE (pstr->mbs_allocated, 0))
 797         pstr->mbs += offset;
 798     }
 799   pstr->raw_mbs_idx = idx;
 800   pstr->len -= offset;
 801   pstr->stop -= offset;
 802
 803   /* Then build the buffers.  */
 804 #ifdef RE_ENABLE_I18N
 805   if (pstr->mb_cur_max > 1)
 806     {
 807       if (pstr->icase)
 808         {
 809           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 810           if (BE (ret != REG_NOERROR, 0))
 811             return ret;
 812         }
 813       else
 814         build_wcs_buffer (pstr);
 815     }
 816   else
 817 #endif /* RE_ENABLE_I18N */
 818     if (BE (pstr->mbs_allocated, 0))
 819       {
 820         if (pstr->icase)
 821           build_upper_buffer (pstr);
 822         else if (pstr->trans != NULL)
 823           re_string_translate_buffer (pstr);
 824       }
 825     else
 826       pstr->valid_len = pstr->len;
 827
 828   pstr->cur_idx = 0;
 829   return REG_NOERROR;
 830 }
 831
 832 static unsigned char
 833 internal_function __attribute ((pure))
 834 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 835 {
 836   int ch;
 837   Idx off;
 838
 839   /* Handle the common (easiest) cases first.  */
 840   if (BE (!pstr->mbs_allocated, 1))
 841     return re_string_peek_byte (pstr, idx);
 842
 843 #ifdef RE_ENABLE_I18N
 844   if (pstr->mb_cur_max > 1
 845       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 846     return re_string_peek_byte (pstr, idx);
 847 #endif
 848
 849   off = pstr->cur_idx + idx;
 850 #ifdef RE_ENABLE_I18N
 851   if (pstr->offsets_needed)
 852     off = pstr->offsets[off];
 853 #endif
 854
 855   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 856
 857 #ifdef RE_ENABLE_I18N
 858   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 859      this function returns CAPITAL LETTER I instead of first byte of
 860      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 861      since peek_byte_case doesn't advance cur_idx in any way.  */
 862   if (pstr->offsets_needed && !isascii (ch))
 863     return re_string_peek_byte (pstr, idx);
 864 #endif
 865
 866   return ch;
 867 }
 868
 869 static unsigned char
 870 internal_function __attribute ((pure))
 871 re_string_fetch_byte_case (re_string_t *pstr)
 872 {
 873   if (BE (!pstr->mbs_allocated, 1))
 874     return re_string_fetch_byte (pstr);
 875
 876 #ifdef RE_ENABLE_I18N
 877   if (pstr->offsets_needed)
 878     {
 879       Idx off;
 880       int ch;
 881
 882       /* For tr_TR.UTF-8 [[:islower:]] there is
 883          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 884          in that case the whole multi-byte character and return
 885          the original letter.  On the other side, with
 886          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 887          anything else would complicate things too much.  */
 888
 889       if (!re_string_first_byte (pstr, pstr->cur_idx))
 890         return re_string_fetch_byte (pstr);
 891
 892       off = pstr->offsets[pstr->cur_idx];
 893       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 894
 895       if (! isascii (ch))
 896         return re_string_fetch_byte (pstr);
 897
 898       re_string_skip_bytes (pstr,
 899                             re_string_char_size_at (pstr, pstr->cur_idx));
 900       return ch;
 901     }
 902 #endif
 903
 904   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 905 }
 906
 907 static void
 908 internal_function
 909 re_string_destruct (re_string_t *pstr)
 910 {
 911 #ifdef RE_ENABLE_I18N
 912   re_free (pstr->wcs);
 913   re_free (pstr->offsets);
 914 #endif /* RE_ENABLE_I18N  */
 915   if (pstr->mbs_allocated)
 916     re_free (pstr->mbs);
 917 }
 918
 919 /* Return the context at IDX in INPUT.  */
 920
 921 static unsigned int
 922 internal_function
 923 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 924 {
 925   int c;
 926   if (BE (! REG_VALID_INDEX (idx), 0))
 927     /* In this case, we use the value stored in input->tip_context,
 928        since we can't know the character in input->mbs[-1] here.  */
 929     return input->tip_context;
 930   if (BE (idx == input->len, 0))
 931     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 932             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 933 #ifdef RE_ENABLE_I18N
 934   if (input->mb_cur_max > 1)
 935     {
 936       wint_t wc;
 937       Idx wc_idx = idx;
 938       while(input->wcs[wc_idx] == WEOF)
 939         {
 940 #ifdef DEBUG
 941           /* It must not happen.  */
 942           assert (REG_VALID_INDEX (wc_idx));
 943 #endif
 944           --wc_idx;
 945           if (! REG_VALID_INDEX (wc_idx))
 946             return input->tip_context;
 947         }
 948       wc = input->wcs[wc_idx];
 949       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 950         return CONTEXT_WORD;
 951       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 952               ? CONTEXT_NEWLINE : 0);
 953     }
 954   else
 955 #endif
 956     {
 957       c = re_string_byte_at (input, idx);
 958       if (bitset_contain (input->word_char, c))
 959         return CONTEXT_WORD;
 960       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 961     }
 962 }
 963 \f
 964 /* Functions for set operation.  */
 965
 966 static reg_errcode_t
 967 internal_function __attribute_warn_unused_result__
 968 re_node_set_alloc (re_node_set *set, Idx size)
 969 {
 970   set->alloc = size;
 971   set->nelem = 0;
 972   set->elems = re_malloc (Idx, size);
 973   if (BE (set->elems == NULL, 0))
 974     return REG_ESPACE;
 975   return REG_NOERROR;
 976 }
 977
 978 static reg_errcode_t
 979 internal_function __attribute_warn_unused_result__
 980 re_node_set_init_1 (re_node_set *set, Idx elem)
 981 {
 982   set->alloc = 1;
 983   set->nelem = 1;
 984   set->elems = re_malloc (Idx, 1);
 985   if (BE (set->elems == NULL, 0))
 986     {
 987       set->alloc = set->nelem = 0;
 988       return REG_ESPACE;
 989     }
 990   set->elems[0] = elem;
 991   return REG_NOERROR;
 992 }
 993
 994 static reg_errcode_t
 995 internal_function __attribute_warn_unused_result__
 996 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
 997 {
 998   set->alloc = 2;
 999   set->elems = re_malloc (Idx, 2);
1000   if (BE (set->elems == NULL, 0))
1001     return REG_ESPACE;
1002   if (elem1 == elem2)
1003     {
1004       set->nelem = 1;
1005       set->elems[0] = elem1;
1006     }
1007   else
1008     {
1009       set->nelem = 2;
1010       if (elem1 < elem2)
1011         {
1012           set->elems[0] = elem1;
1013           set->elems[1] = elem2;
1014         }
1015       else
1016         {
1017           set->elems[0] = elem2;
1018           set->elems[1] = elem1;
1019         }
1020     }
1021   return REG_NOERROR;
1022 }
1023
1024 static reg_errcode_t
1025 internal_function __attribute_warn_unused_result__
1026 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1027 {
1028   dest->nelem = src->nelem;
1029   if (src->nelem > 0)
1030     {
1031       dest->alloc = dest->nelem;
1032       dest->elems = re_malloc (Idx, dest->alloc);
1033       if (BE (dest->elems == NULL, 0))
1034         {
1035           dest->alloc = dest->nelem = 0;
1036           return REG_ESPACE;
1037         }
1038       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1039     }
1040   else
1041     re_node_set_init_empty (dest);
1042   return REG_NOERROR;
1043 }
1044
1045 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1046    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1047    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1048
1049 static reg_errcode_t
1050 internal_function __attribute_warn_unused_result__
1051 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1052                            const re_node_set *src2)
1053 {
1054   Idx i1, i2, is, id, delta, sbase;
1055   if (src1->nelem == 0 || src2->nelem == 0)
1056     return REG_NOERROR;
1057
1058   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1059      conservative estimate.  */
1060   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1061     {
1062       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1063       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1064       if (BE (new_elems == NULL, 0))
1065         return REG_ESPACE;
1066       dest->elems = new_elems;
1067       dest->alloc = new_alloc;
1068     }
1069
1070   /* Find the items in the intersection of SRC1 and SRC2, and copy
1071      into the top of DEST those that are not already in DEST itself.  */
1072   sbase = dest->nelem + src1->nelem + src2->nelem;
1073   i1 = src1->nelem - 1;
1074   i2 = src2->nelem - 1;
1075   id = dest->nelem - 1;
1076   for (;;)
1077     {
1078       if (src1->elems[i1] == src2->elems[i2])
1079         {
1080           /* Try to find the item in DEST.  Maybe we could binary search?  */
1081           while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1082             --id;
1083
1084           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1085             dest->elems[--sbase] = src1->elems[i1];
1086
1087           if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1088             break;
1089         }
1090
1091       /* Lower the highest of the two items.  */
1092       else if (src1->elems[i1] < src2->elems[i2])
1093         {
1094           if (! REG_VALID_INDEX (--i2))
1095             break;
1096         }
1097       else
1098         {
1099           if (! REG_VALID_INDEX (--i1))
1100             break;
1101         }
1102     }
1103
1104   id = dest->nelem - 1;
1105   is = dest->nelem + src1->nelem + src2->nelem - 1;
1106   delta = is - sbase + 1;
1107
1108   /* Now copy.  When DELTA becomes zero, the remaining
1109      DEST elements are already in place; this is more or
1110      less the same loop that is in re_node_set_merge.  */
1111   dest->nelem += delta;
1112   if (delta > 0 && REG_VALID_INDEX (id))
1113     for (;;)
1114       {
1115         if (dest->elems[is] > dest->elems[id])
1116           {
1117             /* Copy from the top.  */
1118             dest->elems[id + delta--] = dest->elems[is--];
1119             if (delta == 0)
1120               break;
1121           }
1122         else
1123           {
1124             /* Slide from the bottom.  */
1125             dest->elems[id + delta] = dest->elems[id];
1126             if (! REG_VALID_INDEX (--id))
1127               break;
1128           }
1129       }
1130
1131   /* Copy remaining SRC elements.  */
1132   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1133
1134   return REG_NOERROR;
1135 }
1136
1137 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1138    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1139
1140 static reg_errcode_t
1141 internal_function __attribute_warn_unused_result__
1142 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1143                         const re_node_set *src2)
1144 {
1145   Idx i1, i2, id;
1146   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1147     {
1148       dest->alloc = src1->nelem + src2->nelem;
1149       dest->elems = re_malloc (Idx, dest->alloc);
1150       if (BE (dest->elems == NULL, 0))
1151         return REG_ESPACE;
1152     }
1153   else
1154     {
1155       if (src1 != NULL && src1->nelem > 0)
1156         return re_node_set_init_copy (dest, src1);
1157       else if (src2 != NULL && src2->nelem > 0)
1158         return re_node_set_init_copy (dest, src2);
1159       else
1160         re_node_set_init_empty (dest);
1161       return REG_NOERROR;
1162     }
1163   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1164     {
1165       if (src1->elems[i1] > src2->elems[i2])
1166         {
1167           dest->elems[id++] = src2->elems[i2++];
1168           continue;
1169         }
1170       if (src1->elems[i1] == src2->elems[i2])
1171         ++i2;
1172       dest->elems[id++] = src1->elems[i1++];
1173     }
1174   if (i1 < src1->nelem)
1175     {
1176       memcpy (dest->elems + id, src1->elems + i1,
1177              (src1->nelem - i1) * sizeof (Idx));
1178       id += src1->nelem - i1;
1179     }
1180   else if (i2 < src2->nelem)
1181     {
1182       memcpy (dest->elems + id, src2->elems + i2,
1183              (src2->nelem - i2) * sizeof (Idx));
1184       id += src2->nelem - i2;
1185     }
1186   dest->nelem = id;
1187   return REG_NOERROR;
1188 }
1189
1190 /* Calculate the union set of the sets DEST and SRC. And store it to
1191    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1192
1193 static reg_errcode_t
1194 internal_function __attribute_warn_unused_result__
1195 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1196 {
1197   Idx is, id, sbase, delta;
1198   if (src == NULL || src->nelem == 0)
1199     return REG_NOERROR;
1200   if (dest->alloc < 2 * src->nelem + dest->nelem)
1201     {
1202       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1203       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1204       if (BE (new_buffer == NULL, 0))
1205         return REG_ESPACE;
1206       dest->elems = new_buffer;
1207       dest->alloc = new_alloc;
1208     }
1209
1210   if (BE (dest->nelem == 0, 0))
1211     {
1212       dest->nelem = src->nelem;
1213       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1214       return REG_NOERROR;
1215     }
1216
1217   /* Copy into the top of DEST the items of SRC that are not
1218      found in DEST.  Maybe we could binary search in DEST?  */
1219   for (sbase = dest->nelem + 2 * src->nelem,
1220        is = src->nelem - 1, id = dest->nelem - 1;
1221        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1222     {
1223       if (dest->elems[id] == src->elems[is])
1224         is--, id--;
1225       else if (dest->elems[id] < src->elems[is])
1226         dest->elems[--sbase] = src->elems[is--];
1227       else /* if (dest->elems[id] > src->elems[is]) */
1228         --id;
1229     }
1230
1231   if (REG_VALID_INDEX (is))
1232     {
1233       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1234       sbase -= is + 1;
1235       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1236     }
1237
1238   id = dest->nelem - 1;
1239   is = dest->nelem + 2 * src->nelem - 1;
1240   delta = is - sbase + 1;
1241   if (delta == 0)
1242     return REG_NOERROR;
1243
1244   /* Now copy.  When DELTA becomes zero, the remaining
1245      DEST elements are already in place.  */
1246   dest->nelem += delta;
1247   for (;;)
1248     {
1249       if (dest->elems[is] > dest->elems[id])
1250         {
1251           /* Copy from the top.  */
1252           dest->elems[id + delta--] = dest->elems[is--];
1253           if (delta == 0)
1254             break;
1255         }
1256       else
1257         {
1258           /* Slide from the bottom.  */
1259           dest->elems[id + delta] = dest->elems[id];
1260           if (! REG_VALID_INDEX (--id))
1261             {
1262               /* Copy remaining SRC elements.  */
1263               memcpy (dest->elems, dest->elems + sbase,
1264                       delta * sizeof (Idx));
1265               break;
1266             }
1267         }
1268     }
1269
1270   return REG_NOERROR;
1271 }
1272
1273 /* Insert the new element ELEM to the re_node_set* SET.
1274    SET should not already have ELEM.
1275    Return true if successful.  */
1276
1277 static bool
1278 internal_function __attribute_warn_unused_result__
1279 re_node_set_insert (re_node_set *set, Idx elem)
1280 {
1281   Idx idx;
1282   /* In case the set is empty.  */
1283   if (set->alloc == 0)
1284     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1285
1286   if (BE (set->nelem, 0) == 0)
1287     {
1288       /* We already guaranteed above that set->alloc != 0.  */
1289       set->elems[0] = elem;
1290       ++set->nelem;
1291       return true;
1292     }
1293
1294   /* Realloc if we need.  */
1295   if (set->alloc == set->nelem)
1296     {
1297       Idx *new_elems;
1298       set->alloc = set->alloc * 2;
1299       new_elems = re_realloc (set->elems, Idx, set->alloc);
1300       if (BE (new_elems == NULL, 0))
1301         return false;
1302       set->elems = new_elems;
1303     }
1304
1305   /* Move the elements which follows the new element.  Test the
1306      first element separately to skip a check in the inner loop.  */
1307   if (elem < set->elems[0])
1308     {
1309       idx = 0;
1310       for (idx = set->nelem; idx > 0; idx--)
1311         set->elems[idx] = set->elems[idx - 1];
1312     }
1313   else
1314     {
1315       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1316         set->elems[idx] = set->elems[idx - 1];
1317     }
1318
1319   /* Insert the new element.  */
1320   set->elems[idx] = elem;
1321   ++set->nelem;
1322   return true;
1323 }
1324
1325 /* Insert the new element ELEM to the re_node_set* SET.
1326    SET should not already have any element greater than or equal to ELEM.
1327    Return true if successful.  */
1328
1329 static bool
1330 internal_function __attribute_warn_unused_result__
1331 re_node_set_insert_last (re_node_set *set, Idx elem)
1332 {
1333   /* Realloc if we need.  */
1334   if (set->alloc == set->nelem)
1335     {
1336       Idx *new_elems;
1337       set->alloc = (set->alloc + 1) * 2;
1338       new_elems = re_realloc (set->elems, Idx, set->alloc);
1339       if (BE (new_elems == NULL, 0))
1340         return false;
1341       set->elems = new_elems;
1342     }
1343
1344   /* Insert the new element.  */
1345   set->elems[set->nelem++] = elem;
1346   return true;
1347 }
1348
1349 /* Compare two node sets SET1 and SET2.
1350    Return true if SET1 and SET2 are equivalent.  */
1351
1352 static bool
1353 internal_function __attribute ((pure))
1354 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1355 {
1356   Idx i;
1357   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1358     return false;
1359   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1360     if (set1->elems[i] != set2->elems[i])
1361       return false;
1362   return true;
1363 }
1364
1365 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1366
1367 static Idx
1368 internal_function __attribute ((pure))
1369 re_node_set_contains (const re_node_set *set, Idx elem)
1370 {
1371   __re_size_t idx, right, mid;
1372   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1373     return 0;
1374
1375   /* Binary search the element.  */
1376   idx = 0;
1377   right = set->nelem - 1;
1378   while (idx < right)
1379     {
1380       mid = (idx + right) / 2;
1381       if (set->elems[mid] < elem)
1382         idx = mid + 1;
1383       else
1384         right = mid;
1385     }
1386   return set->elems[idx] == elem ? idx + 1 : 0;
1387 }
1388
1389 static void
1390 internal_function
1391 re_node_set_remove_at (re_node_set *set, Idx idx)
1392 {
1393   if (idx < 0 || idx >= set->nelem)
1394     return;
1395   --set->nelem;
1396   for (; idx < set->nelem; idx++)
1397     set->elems[idx] = set->elems[idx + 1];
1398 }
1399 \f
1400
1401 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1402    Or return REG_MISSING if an error occurred.  */
1403
1404 static Idx
1405 internal_function
1406 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1407 {
1408   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1409     {
1410       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1411       Idx *new_nexts, *new_indices;
1412       re_node_set *new_edests, *new_eclosures;
1413       re_token_t *new_nodes;
1414       size_t max_object_size =
1415         MAX (sizeof (re_token_t),
1416              MAX (sizeof (re_node_set),
1417                   sizeof (Idx)));
1418
1419       /* Avoid overflows.  */
1420       if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1421         return REG_MISSING;
1422
1423       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1424       if (BE (new_nodes == NULL, 0))
1425         return REG_MISSING;
1426       dfa->nodes = new_nodes;
1427       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1428       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1429       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1430       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1431       if (BE (new_nexts == NULL || new_indices == NULL
1432               || new_edests == NULL || new_eclosures == NULL, 0))
1433         return REG_MISSING;
1434       dfa->nexts = new_nexts;
1435       dfa->org_indices = new_indices;
1436       dfa->edests = new_edests;
1437       dfa->eclosures = new_eclosures;
1438       dfa->nodes_alloc = new_nodes_alloc;
1439     }
1440   dfa->nodes[dfa->nodes_len] = token;
1441   dfa->nodes[dfa->nodes_len].constraint = 0;
1442 #ifdef RE_ENABLE_I18N
1443   {
1444   int type = token.type;
1445   dfa->nodes[dfa->nodes_len].accept_mb =
1446     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1447   }
1448 #endif
1449   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1450   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1451   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1452   return dfa->nodes_len++;
1453 }
1454
1455 static inline re_hashval_t
1456 internal_function
1457 calc_state_hash (const re_node_set *nodes, unsigned int context)
1458 {
1459   re_hashval_t hash = nodes->nelem + context;
1460   Idx i;
1461   for (i = 0 ; i < nodes->nelem ; i++)
1462     hash += nodes->elems[i];
1463   return hash;
1464 }
1465
1466 /* Search for the state whose node_set is equivalent to NODES.
1467    Return the pointer to the state, if we found it in the DFA.
1468    Otherwise create the new one and return it.  In case of an error
1469    return NULL and set the error code in ERR.
1470    Note: - We assume NULL as the invalid state, then it is possible that
1471            return value is NULL and ERR is REG_NOERROR.
1472          - We never return non-NULL value in case of any errors, it is for
1473            optimization.  */
1474
1475 static re_dfastate_t *
1476 internal_function __attribute_warn_unused_result__
1477 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1478                   const re_node_set *nodes)
1479 {
1480   re_hashval_t hash;
1481   re_dfastate_t *new_state;
1482   struct re_state_table_entry *spot;
1483   Idx i;
1484 #ifdef lint
1485   /* Suppress bogus uninitialized-variable warnings.  */
1486   *err = REG_NOERROR;
1487 #endif
1488   if (BE (nodes->nelem == 0, 0))
1489     {
1490       *err = REG_NOERROR;
1491       return NULL;
1492     }
1493   hash = calc_state_hash (nodes, 0);
1494   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1495
1496   for (i = 0 ; i < spot->num ; i++)
1497     {
1498       re_dfastate_t *state = spot->array[i];
1499       if (hash != state->hash)
1500         continue;
1501       if (re_node_set_compare (&state->nodes, nodes))
1502         return state;
1503     }
1504
1505   /* There are no appropriate state in the dfa, create the new one.  */
1506   new_state = create_ci_newstate (dfa, nodes, hash);
1507   if (BE (new_state == NULL, 0))
1508     *err = REG_ESPACE;
1509
1510   return new_state;
1511 }
1512
1513 /* Search for the state whose node_set is equivalent to NODES and
1514    whose context is equivalent to CONTEXT.
1515    Return the pointer to the state, if we found it in the DFA.
1516    Otherwise create the new one and return it.  In case of an error
1517    return NULL and set the error code in ERR.
1518    Note: - We assume NULL as the invalid state, then it is possible that
1519            return value is NULL and ERR is REG_NOERROR.
1520          - We never return non-NULL value in case of any errors, it is for
1521            optimization.  */
1522
1523 static re_dfastate_t *
1524 internal_function __attribute_warn_unused_result__
1525 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1526                           const re_node_set *nodes, unsigned int context)
1527 {
1528   re_hashval_t hash;
1529   re_dfastate_t *new_state;
1530   struct re_state_table_entry *spot;
1531   Idx i;
1532 #ifdef lint
1533   /* Suppress bogus uninitialized-variable warnings.  */
1534   *err = REG_NOERROR;
1535 #endif
1536   if (nodes->nelem == 0)
1537     {
1538       *err = REG_NOERROR;
1539       return NULL;
1540     }
1541   hash = calc_state_hash (nodes, context);
1542   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1543
1544   for (i = 0 ; i < spot->num ; i++)
1545     {
1546       re_dfastate_t *state = spot->array[i];
1547       if (state->hash == hash
1548           && state->context == context
1549           && re_node_set_compare (state->entrance_nodes, nodes))
1550         return state;
1551     }
1552   /* There are no appropriate state in `dfa', create the new one.  */
1553   new_state = create_cd_newstate (dfa, nodes, context, hash);
1554   if (BE (new_state == NULL, 0))
1555     *err = REG_ESPACE;
1556
1557   return new_state;
1558 }
1559
1560 /* Finish initialization of the new state NEWSTATE, and using its hash value
1561    HASH put in the appropriate bucket of DFA's state table.  Return value
1562    indicates the error code if failed.  */
1563
1564 static reg_errcode_t
1565 __attribute_warn_unused_result__
1566 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1567                 re_hashval_t hash)
1568 {
1569   struct re_state_table_entry *spot;
1570   reg_errcode_t err;
1571   Idx i;
1572
1573   newstate->hash = hash;
1574   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1575   if (BE (err != REG_NOERROR, 0))
1576     return REG_ESPACE;
1577   for (i = 0; i < newstate->nodes.nelem; i++)
1578     {
1579       Idx elem = newstate->nodes.elems[i];
1580       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1581         if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1582           return REG_ESPACE;
1583     }
1584
1585   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1586   if (BE (spot->alloc <= spot->num, 0))
1587     {
1588       Idx new_alloc = 2 * spot->num + 2;
1589       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1590                                               new_alloc);
1591       if (BE (new_array == NULL, 0))
1592         return REG_ESPACE;
1593       spot->array = new_array;
1594       spot->alloc = new_alloc;
1595     }
1596   spot->array[spot->num++] = newstate;
1597   return REG_NOERROR;
1598 }
1599
1600 static void
1601 free_state (re_dfastate_t *state)
1602 {
1603   re_node_set_free (&state->non_eps_nodes);
1604   re_node_set_free (&state->inveclosure);
1605   if (state->entrance_nodes != &state->nodes)
1606     {
1607       re_node_set_free (state->entrance_nodes);
1608       re_free (state->entrance_nodes);
1609     }
1610   re_node_set_free (&state->nodes);
1611   re_free (state->word_trtable);
1612   re_free (state->trtable);
1613   re_free (state);
1614 }
1615
1616 /* Create the new state which is independ of contexts.
1617    Return the new state if succeeded, otherwise return NULL.  */
1618
1619 static re_dfastate_t *
1620 internal_function __attribute_warn_unused_result__
1621 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1622                     re_hashval_t hash)
1623 {
1624   Idx i;
1625   reg_errcode_t err;
1626   re_dfastate_t *newstate;
1627
1628   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1629   if (BE (newstate == NULL, 0))
1630     return NULL;
1631   err = re_node_set_init_copy (&newstate->nodes, nodes);
1632   if (BE (err != REG_NOERROR, 0))
1633     {
1634       re_free (newstate);
1635       return NULL;
1636     }
1637
1638   newstate->entrance_nodes = &newstate->nodes;
1639   for (i = 0 ; i < nodes->nelem ; i++)
1640     {
1641       re_token_t *node = dfa->nodes + nodes->elems[i];
1642       re_token_type_t type = node->type;
1643       if (type == CHARACTER && !node->constraint)
1644         continue;
1645 #ifdef RE_ENABLE_I18N
1646       newstate->accept_mb |= node->accept_mb;
1647 #endif /* RE_ENABLE_I18N */
1648
1649       /* If the state has the halt node, the state is a halt state.  */
1650       if (type == END_OF_RE)
1651         newstate->halt = 1;
1652       else if (type == OP_BACK_REF)
1653         newstate->has_backref = 1;
1654       else if (type == ANCHOR || node->constraint)
1655         newstate->has_constraint = 1;
1656     }
1657   err = register_state (dfa, newstate, hash);
1658   if (BE (err != REG_NOERROR, 0))
1659     {
1660       free_state (newstate);
1661       newstate = NULL;
1662     }
1663   return newstate;
1664 }
1665
1666 /* Create the new state which is depend on the context CONTEXT.
1667    Return the new state if succeeded, otherwise return NULL.  */
1668
1669 static re_dfastate_t *
1670 internal_function __attribute_warn_unused_result__
1671 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1672                     unsigned int context, re_hashval_t hash)
1673 {
1674   Idx i, nctx_nodes = 0;
1675   reg_errcode_t err;
1676   re_dfastate_t *newstate;
1677
1678   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1679   if (BE (newstate == NULL, 0))
1680     return NULL;
1681   err = re_node_set_init_copy (&newstate->nodes, nodes);
1682   if (BE (err != REG_NOERROR, 0))
1683     {
1684       re_free (newstate);
1685       return NULL;
1686     }
1687
1688   newstate->context = context;
1689   newstate->entrance_nodes = &newstate->nodes;
1690
1691   for (i = 0 ; i < nodes->nelem ; i++)
1692     {
1693       re_token_t *node = dfa->nodes + nodes->elems[i];
1694       re_token_type_t type = node->type;
1695       unsigned int constraint = node->constraint;
1696
1697       if (type == CHARACTER && !constraint)
1698         continue;
1699 #ifdef RE_ENABLE_I18N
1700       newstate->accept_mb |= node->accept_mb;
1701 #endif /* RE_ENABLE_I18N */
1702
1703       /* If the state has the halt node, the state is a halt state.  */
1704       if (type == END_OF_RE)
1705         newstate->halt = 1;
1706       else if (type == OP_BACK_REF)
1707         newstate->has_backref = 1;
1708
1709       if (constraint)
1710         {
1711           if (newstate->entrance_nodes == &newstate->nodes)
1712             {
1713               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1714               if (BE (newstate->entrance_nodes == NULL, 0))
1715                 {
1716                   free_state (newstate);
1717                   return NULL;
1718                 }
1719               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1720                   != REG_NOERROR)
1721                 return NULL;
1722               nctx_nodes = 0;
1723               newstate->has_constraint = 1;
1724             }
1725
1726           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1727             {
1728               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1729               ++nctx_nodes;
1730             }
1731         }
1732     }
1733   err = register_state (dfa, newstate, hash);
1734   if (BE (err != REG_NOERROR, 0))
1735     {
1736       free_state (newstate);
1737       newstate = NULL;
1738     }
1739   return  newstate;
1740 }