compat/regex/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19    02110-1301 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           size_t length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27 #ifdef RE_ENABLE_I18N
  28 static void free_charset (re_charset_t *cset);
  29 #endif /* RE_ENABLE_I18N */
  30 static void free_workarea_compile (regex_t *preg);
  31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32 #ifdef RE_ENABLE_I18N
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 #endif
  35 static reg_errcode_t analyze (regex_t *preg);
  36 static reg_errcode_t preorder (bin_tree_t *root,
  37                                reg_errcode_t (fn (void *, bin_tree_t *)),
  38                                void *extra);
  39 static reg_errcode_t postorder (bin_tree_t *root,
  40                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                 void *extra);
  42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  45                                  bin_tree_t *node);
  46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  51                                    unsigned int constraint);
  52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  54                                          int node, int root);
  55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  56 static int fetch_number (re_string_t *input, re_token_t *token,
  57                          reg_syntax_t syntax);
  58 static int peek_token (re_token_t *token, re_string_t *input,
  59                         reg_syntax_t syntax) internal_function;
  60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                           reg_syntax_t syntax, reg_errcode_t *err);
  62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                   re_token_t *token, reg_syntax_t syntax,
  64                                   int nest, reg_errcode_t *err);
  65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                  re_token_t *token, reg_syntax_t syntax,
  67                                  int nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                      re_token_t *token, reg_syntax_t syntax,
  70                                      int nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   int nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                  re_dfa_t *dfa, re_token_t *token,
  76                                  reg_syntax_t syntax, reg_errcode_t *err);
  77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                       re_token_t *token, reg_syntax_t syntax,
  79                                       reg_errcode_t *err);
  80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                             re_string_t *regexp,
  82                                             re_token_t *token, int token_len,
  83                                             re_dfa_t *dfa,
  84                                             reg_syntax_t syntax,
  85                                             int accept_hyphen);
  86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                           re_string_t *regexp,
  88                                           re_token_t *token);
  89 #ifdef RE_ENABLE_I18N
  90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  91                                         re_charset_t *mbcset,
  92                                         int *equiv_class_alloc,
  93                                         const unsigned char *name);
  94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  95                                       bitset_t sbcset,
  96                                       re_charset_t *mbcset,
  97                                       int *char_class_alloc,
  98                                       const char *class_name,
  99                                       reg_syntax_t syntax);
 100 #else  /* not RE_ENABLE_I18N */
 101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 102                                         const unsigned char *name);
 103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104                                       bitset_t sbcset,
 105                                       const char *class_name,
 106                                       reg_syntax_t syntax);
 107 #endif /* not RE_ENABLE_I18N */
 108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 109                                        RE_TRANSLATE_TYPE trans,
 110                                        const char *class_name,
 111                                        const char *extra,
 112                                        int non_match, reg_errcode_t *err);
 113 static bin_tree_t *create_tree (re_dfa_t *dfa,
 114                                 bin_tree_t *left, bin_tree_t *right,
 115                                 re_token_type_t type);
 116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 117                                       bin_tree_t *left, bin_tree_t *right,
 118                                       const re_token_t *token);
 119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 120 static void free_token (re_token_t *node);
 121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 123 \f
 124 /* This table gives an error message for each of the error codes listed
 125    in regex.h.  Obviously the order here has to be same as there.
 126    POSIX doesn't require that we do anything for REG_NOERROR,
 127    but why not be nice?  */
 128
 129 const char __re_error_msgid[] attribute_hidden =
 130   {
 131 #define REG_NOERROR_IDX 0
 132     gettext_noop ("Success")    /* REG_NOERROR */
 133     "\0"
 134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 135     gettext_noop ("No match")   /* REG_NOMATCH */
 136     "\0"
 137 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 139     "\0"
 140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 142     "\0"
 143 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 145     "\0"
 146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 148     "\0"
 149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 151     "\0"
 152 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 153     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 154     "\0"
 155 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 157     "\0"
 158 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 160     "\0"
 161 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 163     "\0"
 164 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 165     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 166     "\0"
 167 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 169     "\0"
 170 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 172     "\0"
 173 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 175     "\0"
 176 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 178     "\0"
 179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 181   };
 182
 183 const size_t __re_error_msgid_idx[] attribute_hidden =
 184   {
 185     REG_NOERROR_IDX,
 186     REG_NOMATCH_IDX,
 187     REG_BADPAT_IDX,
 188     REG_ECOLLATE_IDX,
 189     REG_ECTYPE_IDX,
 190     REG_EESCAPE_IDX,
 191     REG_ESUBREG_IDX,
 192     REG_EBRACK_IDX,
 193     REG_EPAREN_IDX,
 194     REG_EBRACE_IDX,
 195     REG_BADBR_IDX,
 196     REG_ERANGE_IDX,
 197     REG_ESPACE_IDX,
 198     REG_BADRPT_IDX,
 199     REG_EEND_IDX,
 200     REG_ESIZE_IDX,
 201     REG_ERPAREN_IDX
 202   };
 203 \f
 204 /* Entry points for GNU code.  */
 205
 206
 207 #ifdef ZOS_USS
 208
 209 /* For ZOS USS we must define btowc */
 210
 211 wchar_t
 212 btowc (int c)
 213 {
 214    wchar_t wtmp[2];
 215    char tmp[2];
 216
 217    tmp[0] = c;
 218    tmp[1] = 0;
 219
 220    mbtowc (wtmp, tmp, 1);
 221    return wtmp[0];
 222 }
 223 #endif
 224
 225 /* re_compile_pattern is the GNU regular expression compiler: it
 226    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 227    Returns 0 if the pattern was valid, otherwise an error string.
 228
 229    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 230    are set in BUFP on entry.  */
 231
 232 const char *
 233 re_compile_pattern (pattern, length, bufp)
 234     const char *pattern;
 235     size_t length;
 236     struct re_pattern_buffer *bufp;
 237 {
 238   reg_errcode_t ret;
 239
 240   /* And GNU code determines whether or not to get register information
 241      by passing null for the REGS argument to re_match, etc., not by
 242      setting no_sub, unless RE_NO_SUB is set.  */
 243   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 244
 245   /* Match anchors at newline.  */
 246   bufp->newline_anchor = 1;
 247
 248   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 249
 250   if (!ret)
 251     return NULL;
 252   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 253 }
 254 #ifdef _LIBC
 255 weak_alias (__re_compile_pattern, re_compile_pattern)
 256 #endif
 257
 258 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 259    also be assigned to arbitrarily: each pattern buffer stores its own
 260    syntax, so it can be changed between regex compilations.  */
 261 /* This has no initializer because initialized variables in Emacs
 262    become read-only after dumping.  */
 263 reg_syntax_t re_syntax_options;
 264
 265
 266 /* Specify the precise syntax of regexps for compilation.  This provides
 267    for compatibility for various utilities which historically have
 268    different, incompatible syntaxes.
 269
 270    The argument SYNTAX is a bit mask comprised of the various bits
 271    defined in regex.h.  We return the old syntax.  */
 272
 273 reg_syntax_t
 274 re_set_syntax (syntax)
 275     reg_syntax_t syntax;
 276 {
 277   reg_syntax_t ret = re_syntax_options;
 278
 279   re_syntax_options = syntax;
 280   return ret;
 281 }
 282 #ifdef _LIBC
 283 weak_alias (__re_set_syntax, re_set_syntax)
 284 #endif
 285
 286 int
 287 re_compile_fastmap (bufp)
 288     struct re_pattern_buffer *bufp;
 289 {
 290   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 291   char *fastmap = bufp->fastmap;
 292
 293   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 294   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 295   if (dfa->init_state != dfa->init_state_word)
 296     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 297   if (dfa->init_state != dfa->init_state_nl)
 298     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 299   if (dfa->init_state != dfa->init_state_begbuf)
 300     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 301   bufp->fastmap_accurate = 1;
 302   return 0;
 303 }
 304 #ifdef _LIBC
 305 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 306 #endif
 307
 308 static inline void
 309 __attribute ((always_inline))
 310 re_set_fastmap (char *fastmap, int icase, int ch)
 311 {
 312   fastmap[ch] = 1;
 313   if (icase)
 314     fastmap[tolower (ch)] = 1;
 315 }
 316
 317 /* Helper function for re_compile_fastmap.
 318    Compile fastmap for the initial_state INIT_STATE.  */
 319
 320 static void
 321 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 322                          char *fastmap)
 323 {
 324   volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 325   int node_cnt;
 326   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 327   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 328     {
 329       int node = init_state->nodes.elems[node_cnt];
 330       re_token_type_t type = dfa->nodes[node].type;
 331
 332       if (type == CHARACTER)
 333         {
 334           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 335 #ifdef RE_ENABLE_I18N
 336           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 337             {
 338               unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
 339               wchar_t wc;
 340               mbstate_t state;
 341
 342               p = buf;
 343               *p++ = dfa->nodes[node].opr.c;
 344               while (++node < dfa->nodes_len
 345                      && dfa->nodes[node].type == CHARACTER
 346                      && dfa->nodes[node].mb_partial)
 347                 *p++ = dfa->nodes[node].opr.c;
 348               memset (&state, '\0', sizeof (state));
 349               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 350                              &state) == p - buf
 351                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 352                       != (size_t) -1))
 353                 re_set_fastmap (fastmap, 0, buf[0]);
 354               re_free (buf);
 355             }
 356 #endif
 357         }
 358       else if (type == SIMPLE_BRACKET)
 359         {
 360           int i, ch;
 361           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 362             {
 363               int j;
 364               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 365               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 366                 if (w & ((bitset_word_t) 1 << j))
 367                   re_set_fastmap (fastmap, icase, ch);
 368             }
 369         }
 370 #ifdef RE_ENABLE_I18N
 371       else if (type == COMPLEX_BRACKET)
 372         {
 373           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 374           int i;
 375
 376 # ifdef _LIBC
 377           /* See if we have to try all bytes which start multiple collation
 378              elements.
 379              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 380                   collation element, and don't catch 'b' since 'b' is
 381                   the only collation element which starts from 'b' (and
 382                   it is caught by SIMPLE_BRACKET).  */
 383               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 384                   && (cset->ncoll_syms || cset->nranges))
 385                 {
 386                   const int32_t *table = (const int32_t *)
 387                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 388                   for (i = 0; i < SBC_MAX; ++i)
 389                     if (table[i] < 0)
 390                       re_set_fastmap (fastmap, icase, i);
 391                 }
 392 # endif /* _LIBC */
 393
 394           /* See if we have to start the match at all multibyte characters,
 395              i.e. where we would not find an invalid sequence.  This only
 396              applies to multibyte character sets; for single byte character
 397              sets, the SIMPLE_BRACKET again suffices.  */
 398           if (dfa->mb_cur_max > 1
 399               && (cset->nchar_classes || cset->non_match || cset->nranges
 400 # ifdef _LIBC
 401                   || cset->nequiv_classes
 402 # endif /* _LIBC */
 403                  ))
 404             {
 405               unsigned char c = 0;
 406               do
 407                 {
 408                   mbstate_t mbs;
 409                   memset (&mbs, 0, sizeof (mbs));
 410                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 411                     re_set_fastmap (fastmap, false, (int) c);
 412                 }
 413               while (++c != 0);
 414             }
 415
 416           else
 417             {
 418               /* ... Else catch all bytes which can start the mbchars.  */
 419               for (i = 0; i < cset->nmbchars; ++i)
 420                 {
 421                   char buf[256];
 422                   mbstate_t state;
 423                   memset (&state, '\0', sizeof (state));
 424                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 425                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 426                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 427                     {
 428                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 429                           != (size_t) -1)
 430                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 431                     }
 432                 }
 433             }
 434         }
 435 #endif /* RE_ENABLE_I18N */
 436       else if (type == OP_PERIOD
 437 #ifdef RE_ENABLE_I18N
 438                || type == OP_UTF8_PERIOD
 439 #endif /* RE_ENABLE_I18N */
 440                || type == END_OF_RE)
 441         {
 442           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 443           if (type == END_OF_RE)
 444             bufp->can_be_null = 1;
 445           return;
 446         }
 447     }
 448 }
 449 \f
 450 /* Entry point for POSIX code.  */
 451 /* regcomp takes a regular expression as a string and compiles it.
 452
 453    PREG is a regex_t *.  We do not expect any fields to be initialized,
 454    since POSIX says we shouldn't.  Thus, we set
 455
 456      `buffer' to the compiled pattern;
 457      `used' to the length of the compiled pattern;
 458      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 459        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 460        RE_SYNTAX_POSIX_BASIC;
 461      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 462      `fastmap' to an allocated space for the fastmap;
 463      `fastmap_accurate' to zero;
 464      `re_nsub' to the number of subexpressions in PATTERN.
 465
 466    PATTERN is the address of the pattern string.
 467
 468    CFLAGS is a series of bits which affect compilation.
 469
 470      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 471      use POSIX basic syntax.
 472
 473      If REG_NEWLINE is set, then . and [^...] don't match newline.
 474      Also, regexec will try a match beginning after every newline.
 475
 476      If REG_ICASE is set, then we considers upper- and lowercase
 477      versions of letters to be equivalent when matching.
 478
 479      If REG_NOSUB is set, then when PREG is passed to regexec, that
 480      routine will report only success or failure, and nothing about the
 481      registers.
 482
 483    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 484    the return codes and their meanings.)  */
 485
 486 int
 487 regcomp (preg, pattern, cflags)
 488     regex_t *__restrict preg;
 489     const char *__restrict pattern;
 490     int cflags;
 491 {
 492   reg_errcode_t ret;
 493   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 494                          : RE_SYNTAX_POSIX_BASIC);
 495
 496   preg->buffer = NULL;
 497   preg->allocated = 0;
 498   preg->used = 0;
 499
 500   /* Try to allocate space for the fastmap.  */
 501   preg->fastmap = re_malloc (char, SBC_MAX);
 502   if (BE (preg->fastmap == NULL, 0))
 503     return REG_ESPACE;
 504
 505   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 506
 507   /* If REG_NEWLINE is set, newlines are treated differently.  */
 508   if (cflags & REG_NEWLINE)
 509     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 510       syntax &= ~RE_DOT_NEWLINE;
 511       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 512       /* It also changes the matching behavior.  */
 513       preg->newline_anchor = 1;
 514     }
 515   else
 516     preg->newline_anchor = 0;
 517   preg->no_sub = !!(cflags & REG_NOSUB);
 518   preg->translate = NULL;
 519
 520   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 521
 522   /* POSIX doesn't distinguish between an unmatched open-group and an
 523      unmatched close-group: both are REG_EPAREN.  */
 524   if (ret == REG_ERPAREN)
 525     ret = REG_EPAREN;
 526
 527   /* We have already checked preg->fastmap != NULL.  */
 528   if (BE (ret == REG_NOERROR, 1))
 529     /* Compute the fastmap now, since regexec cannot modify the pattern
 530        buffer.  This function never fails in this implementation.  */
 531     (void) re_compile_fastmap (preg);
 532   else
 533     {
 534       /* Some error occurred while compiling the expression.  */
 535       re_free (preg->fastmap);
 536       preg->fastmap = NULL;
 537     }
 538
 539   return (int) ret;
 540 }
 541 #ifdef _LIBC
 542 weak_alias (__regcomp, regcomp)
 543 #endif
 544
 545 /* Returns a message corresponding to an error code, ERRCODE, returned
 546    from either regcomp or regexec.   We don't use PREG here.  */
 547
 548 size_t
 549 regerror(int errcode, const regex_t *__restrict preg,
 550          char *__restrict errbuf, size_t errbuf_size)
 551 {
 552   const char *msg;
 553   size_t msg_size;
 554
 555   if (BE (errcode < 0
 556           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 557                                / sizeof (__re_error_msgid_idx[0])), 0))
 558     /* Only error codes returned by the rest of the code should be passed
 559        to this routine.  If we are given anything else, or if other regex
 560        code generates an invalid error code, then the program has a bug.
 561        Dump core so we can fix it.  */
 562     abort ();
 563
 564   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 565
 566   msg_size = strlen (msg) + 1; /* Includes the null.  */
 567
 568   if (BE (errbuf_size != 0, 1))
 569     {
 570       if (BE (msg_size > errbuf_size, 0))
 571         {
 572           memcpy (errbuf, msg, errbuf_size - 1);
 573           errbuf[errbuf_size - 1] = 0;
 574         }
 575       else
 576         memcpy (errbuf, msg, msg_size);
 577     }
 578
 579   return msg_size;
 580 }
 581 #ifdef _LIBC
 582 weak_alias (__regerror, regerror)
 583 #endif
 584
 585
 586 #ifdef RE_ENABLE_I18N
 587 /* This static array is used for the map to single-byte characters when
 588    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 589    it the same all the time.  UTF-8 is the preferred encoding so this is
 590    a worthwhile optimization.  */
 591 #if __GNUC__ >= 3
 592 static const bitset_t utf8_sb_map = {
 593   /* Set the first 128 bits.  */
 594   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 595 };
 596 #else /* ! (__GNUC__ >= 3) */
 597 static bitset_t utf8_sb_map;
 598 #endif /* __GNUC__ >= 3 */
 599 #endif /* RE_ENABLE_I18N */
 600
 601
 602 static void
 603 free_dfa_content (re_dfa_t *dfa)
 604 {
 605   int i, j;
 606
 607   if (dfa->nodes)
 608     for (i = 0; i < dfa->nodes_len; ++i)
 609       free_token (dfa->nodes + i);
 610   re_free (dfa->nexts);
 611   for (i = 0; i < dfa->nodes_len; ++i)
 612     {
 613       if (dfa->eclosures != NULL)
 614         re_node_set_free (dfa->eclosures + i);
 615       if (dfa->inveclosures != NULL)
 616         re_node_set_free (dfa->inveclosures + i);
 617       if (dfa->edests != NULL)
 618         re_node_set_free (dfa->edests + i);
 619     }
 620   re_free (dfa->edests);
 621   re_free (dfa->eclosures);
 622   re_free (dfa->inveclosures);
 623   re_free (dfa->nodes);
 624
 625   if (dfa->state_table)
 626     for (i = 0; i <= dfa->state_hash_mask; ++i)
 627       {
 628         struct re_state_table_entry *entry = dfa->state_table + i;
 629         for (j = 0; j < entry->num; ++j)
 630           {
 631             re_dfastate_t *state = entry->array[j];
 632             free_state (state);
 633           }
 634         re_free (entry->array);
 635       }
 636   re_free (dfa->state_table);
 637 #ifdef RE_ENABLE_I18N
 638   if (dfa->sb_char != utf8_sb_map)
 639     re_free (dfa->sb_char);
 640 #endif
 641   re_free (dfa->subexp_map);
 642 #ifdef DEBUG
 643   re_free (dfa->re_str);
 644 #endif
 645
 646   re_free (dfa);
 647 }
 648
 649
 650 /* Free dynamically allocated space used by PREG.  */
 651
 652 void
 653 regfree (preg)
 654     regex_t *preg;
 655 {
 656   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 657   if (BE (dfa != NULL, 1))
 658     free_dfa_content (dfa);
 659   preg->buffer = NULL;
 660   preg->allocated = 0;
 661
 662   re_free (preg->fastmap);
 663   preg->fastmap = NULL;
 664
 665   re_free (preg->translate);
 666   preg->translate = NULL;
 667 }
 668 #ifdef _LIBC
 669 weak_alias (__regfree, regfree)
 670 #endif
 671 \f
 672 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 673    them unless specifically requested.  */
 674
 675 #if defined _REGEX_RE_COMP || defined _LIBC
 676
 677 /* BSD has one and only one pattern buffer.  */
 678 static struct re_pattern_buffer re_comp_buf;
 679
 680 char *
 681 # ifdef _LIBC
 682 /* Make these definitions weak in libc, so POSIX programs can redefine
 683    these names if they don't use our functions, and still use
 684    regcomp/regexec above without link errors.  */
 685 weak_function
 686 # endif
 687 re_comp (s)
 688      const char *s;
 689 {
 690   reg_errcode_t ret;
 691   char *fastmap;
 692
 693   if (!s)
 694     {
 695       if (!re_comp_buf.buffer)
 696         return gettext ("No previous regular expression");
 697       return 0;
 698     }
 699
 700   if (re_comp_buf.buffer)
 701     {
 702       fastmap = re_comp_buf.fastmap;
 703       re_comp_buf.fastmap = NULL;
 704       __regfree (&re_comp_buf);
 705       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 706       re_comp_buf.fastmap = fastmap;
 707     }
 708
 709   if (re_comp_buf.fastmap == NULL)
 710     {
 711       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 712       if (re_comp_buf.fastmap == NULL)
 713         return (char *) gettext (__re_error_msgid
 714                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 715     }
 716
 717   /* Since `re_exec' always passes NULL for the `regs' argument, we
 718      don't need to initialize the pattern buffer fields which affect it.  */
 719
 720   /* Match anchors at newlines.  */
 721   re_comp_buf.newline_anchor = 1;
 722
 723   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 724
 725   if (!ret)
 726     return NULL;
 727
 728   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 729   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 730 }
 731
 732 #ifdef _LIBC
 733 libc_freeres_fn (free_mem)
 734 {
 735   __regfree (&re_comp_buf);
 736 }
 737 #endif
 738
 739 #endif /* _REGEX_RE_COMP */
 740 \f
 741 /* Internal entry point.
 742    Compile the regular expression PATTERN, whose length is LENGTH.
 743    SYNTAX indicate regular expression's syntax.  */
 744
 745 static reg_errcode_t
 746 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 747                      reg_syntax_t syntax)
 748 {
 749   reg_errcode_t err = REG_NOERROR;
 750   re_dfa_t *dfa;
 751   re_string_t regexp;
 752
 753   /* Initialize the pattern buffer.  */
 754   preg->fastmap_accurate = 0;
 755   preg->syntax = syntax;
 756   preg->not_bol = preg->not_eol = 0;
 757   preg->used = 0;
 758   preg->re_nsub = 0;
 759   preg->can_be_null = 0;
 760   preg->regs_allocated = REGS_UNALLOCATED;
 761
 762   /* Initialize the dfa.  */
 763   dfa = (re_dfa_t *) preg->buffer;
 764   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 765     {
 766       /* If zero allocated, but buffer is non-null, try to realloc
 767          enough space.  This loses if buffer's address is bogus, but
 768          that is the user's responsibility.  If ->buffer is NULL this
 769          is a simple allocation.  */
 770       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 771       if (dfa == NULL)
 772         return REG_ESPACE;
 773       preg->allocated = sizeof (re_dfa_t);
 774       preg->buffer = (unsigned char *) dfa;
 775     }
 776   preg->used = sizeof (re_dfa_t);
 777
 778   err = init_dfa (dfa, length);
 779   if (BE (err != REG_NOERROR, 0))
 780     {
 781       free_dfa_content (dfa);
 782       preg->buffer = NULL;
 783       preg->allocated = 0;
 784       return err;
 785     }
 786 #ifdef DEBUG
 787   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 788   dfa->re_str = re_malloc (char, length + 1);
 789   strncpy (dfa->re_str, pattern, length + 1);
 790 #endif
 791
 792   __libc_lock_init (dfa->lock);
 793
 794   err = re_string_construct (&regexp, pattern, length, preg->translate,
 795                              syntax & RE_ICASE, dfa);
 796   if (BE (err != REG_NOERROR, 0))
 797     {
 798     re_compile_internal_free_return:
 799       free_workarea_compile (preg);
 800       re_string_destruct (&regexp);
 801       free_dfa_content (dfa);
 802       preg->buffer = NULL;
 803       preg->allocated = 0;
 804       return err;
 805     }
 806
 807   /* Parse the regular expression, and build a structure tree.  */
 808   preg->re_nsub = 0;
 809   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 810   if (BE (dfa->str_tree == NULL, 0))
 811     goto re_compile_internal_free_return;
 812
 813   /* Analyze the tree and create the nfa.  */
 814   err = analyze (preg);
 815   if (BE (err != REG_NOERROR, 0))
 816     goto re_compile_internal_free_return;
 817
 818 #ifdef RE_ENABLE_I18N
 819   /* If possible, do searching in single byte encoding to speed things up.  */
 820   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 821     optimize_utf8 (dfa);
 822 #endif
 823
 824   /* Then create the initial state of the dfa.  */
 825   err = create_initial_state (dfa);
 826
 827   /* Release work areas.  */
 828   free_workarea_compile (preg);
 829   re_string_destruct (&regexp);
 830
 831   if (BE (err != REG_NOERROR, 0))
 832     {
 833       free_dfa_content (dfa);
 834       preg->buffer = NULL;
 835       preg->allocated = 0;
 836     }
 837
 838   return err;
 839 }
 840
 841 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 842    as the initial length of some arrays.  */
 843
 844 static reg_errcode_t
 845 init_dfa (re_dfa_t *dfa, size_t pat_len)
 846 {
 847   unsigned int table_size;
 848 #ifndef _LIBC
 849   char *codeset_name;
 850 #endif
 851
 852   memset (dfa, '\0', sizeof (re_dfa_t));
 853
 854   /* Force allocation of str_tree_storage the first time.  */
 855   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 856
 857   /* Avoid overflows.  */
 858   if (pat_len == SIZE_MAX)
 859     return REG_ESPACE;
 860
 861   dfa->nodes_alloc = pat_len + 1;
 862   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 863
 864   /*  table_size = 2 ^ ceil(log pat_len) */
 865   for (table_size = 1; ; table_size <<= 1)
 866     if (table_size > pat_len)
 867       break;
 868
 869   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 870   dfa->state_hash_mask = table_size - 1;
 871
 872   dfa->mb_cur_max = MB_CUR_MAX;
 873 #ifdef _LIBC
 874   if (dfa->mb_cur_max == 6
 875       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 876     dfa->is_utf8 = 1;
 877   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 878                        != 0);
 879 #else
 880 # ifdef HAVE_LANGINFO_CODESET
 881   codeset_name = nl_langinfo (CODESET);
 882 # else
 883   codeset_name = getenv ("LC_ALL");
 884   if (codeset_name == NULL || codeset_name[0] == '\0')
 885     codeset_name = getenv ("LC_CTYPE");
 886   if (codeset_name == NULL || codeset_name[0] == '\0')
 887     codeset_name = getenv ("LANG");
 888   if (codeset_name == NULL)
 889     codeset_name = "";
 890   else if (strchr (codeset_name, '.') !=  NULL)
 891     codeset_name = strchr (codeset_name, '.') + 1;
 892 # endif
 893
 894   /* strcasecmp isn't a standard interface. brute force check */
 895 #if 0
 896   if (strcasecmp (codeset_name, "UTF-8") == 0
 897       || strcasecmp (codeset_name, "UTF8") == 0)
 898     dfa->is_utf8 = 1;
 899 #else
 900   if (   (codeset_name[0] == 'U' || codeset_name[0] == 'u')
 901       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 902       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 903       && (codeset_name[3] == '-'
 904           ? codeset_name[4] == '8' && codeset_name[5] == '\0'
 905           : codeset_name[3] == '8' && codeset_name[4] == '\0'))
 906     dfa->is_utf8 = 1;
 907 #endif
 908
 909   /* We check exhaustively in the loop below if this charset is a
 910      superset of ASCII.  */
 911   dfa->map_notascii = 0;
 912 #endif
 913
 914 #ifdef RE_ENABLE_I18N
 915   if (dfa->mb_cur_max > 1)
 916     {
 917       if (dfa->is_utf8)
 918         {
 919 #if !defined(__GNUC__) || __GNUC__ < 3
 920           static short utf8_sb_map_inited = 0;
 921
 922           if (! utf8_sb_map_inited)
 923             {
 924                 int i;
 925
 926                 utf8_sb_map_inited = 0;
 927                 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
 928                   utf8_sb_map[i] = BITSET_WORD_MAX;
 929             }
 930 #endif
 931           dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 932         }
 933       else
 934         {
 935           int i, j, ch;
 936
 937           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 938           if (BE (dfa->sb_char == NULL, 0))
 939             return REG_ESPACE;
 940
 941           /* Set the bits corresponding to single byte chars.  */
 942           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 943             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 944               {
 945                 wint_t wch = __btowc (ch);
 946                 if (wch != WEOF)
 947                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 948 # ifndef _LIBC
 949                 if (isascii (ch) && wch != ch)
 950                   dfa->map_notascii = 1;
 951 # endif
 952               }
 953         }
 954     }
 955 #endif
 956
 957   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 958     return REG_ESPACE;
 959   return REG_NOERROR;
 960 }
 961
 962 /* Initialize WORD_CHAR table, which indicate which character is
 963    "word".  In this case "word" means that it is the word construction
 964    character used by some operators like "\<", "\>", etc.  */
 965
 966 static void
 967 internal_function
 968 init_word_char (re_dfa_t *dfa)
 969 {
 970   int i, j, ch;
 971   dfa->word_ops_used = 1;
 972   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 973     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 974       if (isalnum (ch) || ch == '_')
 975         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 976 }
 977
 978 /* Free the work area which are only used while compiling.  */
 979
 980 static void
 981 free_workarea_compile (regex_t *preg)
 982 {
 983   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 984   bin_tree_storage_t *storage, *next;
 985   for (storage = dfa->str_tree_storage; storage; storage = next)
 986     {
 987       next = storage->next;
 988       re_free (storage);
 989     }
 990   dfa->str_tree_storage = NULL;
 991   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 992   dfa->str_tree = NULL;
 993   re_free (dfa->org_indices);
 994   dfa->org_indices = NULL;
 995 }
 996
 997 /* Create initial states for all contexts.  */
 998
 999 static reg_errcode_t
1000 create_initial_state (re_dfa_t *dfa)
1001 {
1002   int first, i;
1003   reg_errcode_t err;
1004   re_node_set init_nodes;
1005
1006   /* Initial states have the epsilon closure of the node which is
1007      the first node of the regular expression.  */
1008   first = dfa->str_tree->first->node_idx;
1009   dfa->init_node = first;
1010   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1011   if (BE (err != REG_NOERROR, 0))
1012     return err;
1013
1014   /* The back-references which are in initial states can epsilon transit,
1015      since in this case all of the subexpressions can be null.
1016      Then we add epsilon closures of the nodes which are the next nodes of
1017      the back-references.  */
1018   if (dfa->nbackref > 0)
1019     for (i = 0; i < init_nodes.nelem; ++i)
1020       {
1021         int node_idx = init_nodes.elems[i];
1022         re_token_type_t type = dfa->nodes[node_idx].type;
1023
1024         int clexp_idx;
1025         if (type != OP_BACK_REF)
1026           continue;
1027         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1028           {
1029             re_token_t *clexp_node;
1030             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1031             if (clexp_node->type == OP_CLOSE_SUBEXP
1032                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1033               break;
1034           }
1035         if (clexp_idx == init_nodes.nelem)
1036           continue;
1037
1038         if (type == OP_BACK_REF)
1039           {
1040             int dest_idx = dfa->edests[node_idx].elems[0];
1041             if (!re_node_set_contains (&init_nodes, dest_idx))
1042               {
1043                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1044                                                        dfa->eclosures
1045                                                        + dest_idx);
1046                 if (err != REG_NOERROR)
1047                   return err;
1048                 i = 0;
1049               }
1050           }
1051       }
1052
1053   /* It must be the first time to invoke acquire_state.  */
1054   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1055   /* We don't check ERR here, since the initial state must not be NULL.  */
1056   if (BE (dfa->init_state == NULL, 0))
1057     return err;
1058   if (dfa->init_state->has_constraint)
1059     {
1060       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1061                                                        CONTEXT_WORD);
1062       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1063                                                      CONTEXT_NEWLINE);
1064       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1065                                                          &init_nodes,
1066                                                          CONTEXT_NEWLINE
1067                                                          | CONTEXT_BEGBUF);
1068       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1069               || dfa->init_state_begbuf == NULL, 0))
1070         return err;
1071     }
1072   else
1073     dfa->init_state_word = dfa->init_state_nl
1074       = dfa->init_state_begbuf = dfa->init_state;
1075
1076   re_node_set_free (&init_nodes);
1077   return REG_NOERROR;
1078 }
1079 \f
1080 #ifdef RE_ENABLE_I18N
1081 /* If it is possible to do searching in single byte encoding instead of UTF-8
1082    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1083    DFA nodes where needed.  */
1084
1085 static void
1086 optimize_utf8 (re_dfa_t *dfa)
1087 {
1088   int node, i, mb_chars = 0, has_period = 0;
1089
1090   for (node = 0; node < dfa->nodes_len; ++node)
1091     switch (dfa->nodes[node].type)
1092       {
1093       case CHARACTER:
1094         if (dfa->nodes[node].opr.c >= 0x80)
1095           mb_chars = 1;
1096         break;
1097       case ANCHOR:
1098         switch (dfa->nodes[node].opr.ctx_type)
1099           {
1100           case LINE_FIRST:
1101           case LINE_LAST:
1102           case BUF_FIRST:
1103           case BUF_LAST:
1104             break;
1105           default:
1106             /* Word anchors etc. cannot be handled.  It's okay to test
1107                opr.ctx_type since constraints (for all DFA nodes) are
1108                created by ORing one or more opr.ctx_type values.  */
1109             return;
1110           }
1111         break;
1112       case OP_PERIOD:
1113         has_period = 1;
1114         break;
1115       case OP_BACK_REF:
1116       case OP_ALT:
1117       case END_OF_RE:
1118       case OP_DUP_ASTERISK:
1119       case OP_OPEN_SUBEXP:
1120       case OP_CLOSE_SUBEXP:
1121         break;
1122       case COMPLEX_BRACKET:
1123         return;
1124       case SIMPLE_BRACKET:
1125         /* Just double check.  The non-ASCII range starts at 0x80.  */
1126         assert (0x80 % BITSET_WORD_BITS == 0);
1127         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1128           if (dfa->nodes[node].opr.sbcset[i])
1129             return;
1130         break;
1131       default:
1132         abort ();
1133       }
1134
1135   if (mb_chars || has_period)
1136     for (node = 0; node < dfa->nodes_len; ++node)
1137       {
1138         if (dfa->nodes[node].type == CHARACTER
1139             && dfa->nodes[node].opr.c >= 0x80)
1140           dfa->nodes[node].mb_partial = 0;
1141         else if (dfa->nodes[node].type == OP_PERIOD)
1142           dfa->nodes[node].type = OP_UTF8_PERIOD;
1143       }
1144
1145   /* The search can be in single byte locale.  */
1146   dfa->mb_cur_max = 1;
1147   dfa->is_utf8 = 0;
1148   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1149 }
1150 #endif
1151 \f
1152 /* Analyze the structure tree, and calculate "first", "next", "edest",
1153    "eclosure", and "inveclosure".  */
1154
1155 static reg_errcode_t
1156 analyze (regex_t *preg)
1157 {
1158   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1159   reg_errcode_t ret;
1160
1161   /* Allocate arrays.  */
1162   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1163   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1164   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1165   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1166   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1167           || dfa->eclosures == NULL, 0))
1168     return REG_ESPACE;
1169
1170   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1171   if (dfa->subexp_map != NULL)
1172     {
1173       int i;
1174       for (i = 0; i < preg->re_nsub; i++)
1175         dfa->subexp_map[i] = i;
1176       preorder (dfa->str_tree, optimize_subexps, dfa);
1177       for (i = 0; i < preg->re_nsub; i++)
1178         if (dfa->subexp_map[i] != i)
1179           break;
1180       if (i == preg->re_nsub)
1181         {
1182           free (dfa->subexp_map);
1183           dfa->subexp_map = NULL;
1184         }
1185     }
1186
1187   ret = postorder (dfa->str_tree, lower_subexps, preg);
1188   if (BE (ret != REG_NOERROR, 0))
1189     return ret;
1190   ret = postorder (dfa->str_tree, calc_first, dfa);
1191   if (BE (ret != REG_NOERROR, 0))
1192     return ret;
1193   preorder (dfa->str_tree, calc_next, dfa);
1194   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1195   if (BE (ret != REG_NOERROR, 0))
1196     return ret;
1197   ret = calc_eclosure (dfa);
1198   if (BE (ret != REG_NOERROR, 0))
1199     return ret;
1200
1201   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1202      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1203   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1204       || dfa->nbackref)
1205     {
1206       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1207       if (BE (dfa->inveclosures == NULL, 0))
1208         return REG_ESPACE;
1209       ret = calc_inveclosure (dfa);
1210     }
1211
1212   return ret;
1213 }
1214
1215 /* Our parse trees are very unbalanced, so we cannot use a stack to
1216    implement parse tree visits.  Instead, we use parent pointers and
1217    some hairy code in these two functions.  */
1218 static reg_errcode_t
1219 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1220            void *extra)
1221 {
1222   bin_tree_t *node, *prev;
1223
1224   for (node = root; ; )
1225     {
1226       /* Descend down the tree, preferably to the left (or to the right
1227          if that's the only child).  */
1228       while (node->left || node->right)
1229         if (node->left)
1230           node = node->left;
1231         else
1232           node = node->right;
1233
1234       do
1235         {
1236           reg_errcode_t err = fn (extra, node);
1237           if (BE (err != REG_NOERROR, 0))
1238             return err;
1239           if (node->parent == NULL)
1240             return REG_NOERROR;
1241           prev = node;
1242           node = node->parent;
1243         }
1244       /* Go up while we have a node that is reached from the right.  */
1245       while (node->right == prev || node->right == NULL);
1246       node = node->right;
1247     }
1248 }
1249
1250 static reg_errcode_t
1251 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1252           void *extra)
1253 {
1254   bin_tree_t *node;
1255
1256   for (node = root; ; )
1257     {
1258       reg_errcode_t err = fn (extra, node);
1259       if (BE (err != REG_NOERROR, 0))
1260         return err;
1261
1262       /* Go to the left node, or up and to the right.  */
1263       if (node->left)
1264         node = node->left;
1265       else
1266         {
1267           bin_tree_t *prev = NULL;
1268           while (node->right == prev || node->right == NULL)
1269             {
1270               prev = node;
1271               node = node->parent;
1272               if (!node)
1273                 return REG_NOERROR;
1274             }
1275           node = node->right;
1276         }
1277     }
1278 }
1279
1280 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1281    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1282    backreferences as well.  Requires a preorder visit.  */
1283 static reg_errcode_t
1284 optimize_subexps (void *extra, bin_tree_t *node)
1285 {
1286   re_dfa_t *dfa = (re_dfa_t *) extra;
1287
1288   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1289     {
1290       int idx = node->token.opr.idx;
1291       node->token.opr.idx = dfa->subexp_map[idx];
1292       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1293     }
1294
1295   else if (node->token.type == SUBEXP
1296            && node->left && node->left->token.type == SUBEXP)
1297     {
1298       int other_idx = node->left->token.opr.idx;
1299
1300       node->left = node->left->left;
1301       if (node->left)
1302         node->left->parent = node;
1303
1304       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1305       if (other_idx < BITSET_WORD_BITS)
1306           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1307     }
1308
1309   return REG_NOERROR;
1310 }
1311
1312 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1313    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1314 static reg_errcode_t
1315 lower_subexps (void *extra, bin_tree_t *node)
1316 {
1317   regex_t *preg = (regex_t *) extra;
1318   reg_errcode_t err = REG_NOERROR;
1319
1320   if (node->left && node->left->token.type == SUBEXP)
1321     {
1322       node->left = lower_subexp (&err, preg, node->left);
1323       if (node->left)
1324         node->left->parent = node;
1325     }
1326   if (node->right && node->right->token.type == SUBEXP)
1327     {
1328       node->right = lower_subexp (&err, preg, node->right);
1329       if (node->right)
1330         node->right->parent = node;
1331     }
1332
1333   return err;
1334 }
1335
1336 static bin_tree_t *
1337 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1338 {
1339   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1340   bin_tree_t *body = node->left;
1341   bin_tree_t *op, *cls, *tree1, *tree;
1342
1343   if (preg->no_sub
1344       /* We do not optimize empty subexpressions, because otherwise we may
1345          have bad CONCAT nodes with NULL children.  This is obviously not
1346          very common, so we do not lose much.  An example that triggers
1347          this case is the sed "script" /\(\)/x.  */
1348       && node->left != NULL
1349       && (node->token.opr.idx >= BITSET_WORD_BITS
1350           || !(dfa->used_bkref_map
1351                & ((bitset_word_t) 1 << node->token.opr.idx))))
1352     return node->left;
1353
1354   /* Convert the SUBEXP node to the concatenation of an
1355      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1356   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1357   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1358   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1359   tree = create_tree (dfa, op, tree1, CONCAT);
1360   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1361     {
1362       *err = REG_ESPACE;
1363       return NULL;
1364     }
1365
1366   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1367   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1368   return tree;
1369 }
1370
1371 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1372    nodes.  Requires a postorder visit.  */
1373 static reg_errcode_t
1374 calc_first (void *extra, bin_tree_t *node)
1375 {
1376   re_dfa_t *dfa = (re_dfa_t *) extra;
1377   if (node->token.type == CONCAT)
1378     {
1379       node->first = node->left->first;
1380       node->node_idx = node->left->node_idx;
1381     }
1382   else
1383     {
1384       node->first = node;
1385       node->node_idx = re_dfa_add_node (dfa, node->token);
1386       if (BE (node->node_idx == -1, 0))
1387         return REG_ESPACE;
1388       if (node->token.type == ANCHOR)
1389         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1390     }
1391   return REG_NOERROR;
1392 }
1393
1394 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1395 static reg_errcode_t
1396 calc_next (void *extra, bin_tree_t *node)
1397 {
1398   switch (node->token.type)
1399     {
1400     case OP_DUP_ASTERISK:
1401       node->left->next = node;
1402       break;
1403     case CONCAT:
1404       node->left->next = node->right->first;
1405       node->right->next = node->next;
1406       break;
1407     default:
1408       if (node->left)
1409         node->left->next = node->next;
1410       if (node->right)
1411         node->right->next = node->next;
1412       break;
1413     }
1414   return REG_NOERROR;
1415 }
1416
1417 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1418 static reg_errcode_t
1419 link_nfa_nodes (void *extra, bin_tree_t *node)
1420 {
1421   re_dfa_t *dfa = (re_dfa_t *) extra;
1422   int idx = node->node_idx;
1423   reg_errcode_t err = REG_NOERROR;
1424
1425   switch (node->token.type)
1426     {
1427     case CONCAT:
1428       break;
1429
1430     case END_OF_RE:
1431       assert (node->next == NULL);
1432       break;
1433
1434     case OP_DUP_ASTERISK:
1435     case OP_ALT:
1436       {
1437         int left, right;
1438         dfa->has_plural_match = 1;
1439         if (node->left != NULL)
1440           left = node->left->first->node_idx;
1441         else
1442           left = node->next->node_idx;
1443         if (node->right != NULL)
1444           right = node->right->first->node_idx;
1445         else
1446           right = node->next->node_idx;
1447         assert (left > -1);
1448         assert (right > -1);
1449         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1450       }
1451       break;
1452
1453     case ANCHOR:
1454     case OP_OPEN_SUBEXP:
1455     case OP_CLOSE_SUBEXP:
1456       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1457       break;
1458
1459     case OP_BACK_REF:
1460       dfa->nexts[idx] = node->next->node_idx;
1461       if (node->token.type == OP_BACK_REF)
1462         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1463       break;
1464
1465     default:
1466       assert (!IS_EPSILON_NODE (node->token.type));
1467       dfa->nexts[idx] = node->next->node_idx;
1468       break;
1469     }
1470
1471   return err;
1472 }
1473
1474 /* Duplicate the epsilon closure of the node ROOT_NODE.
1475    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1476    to their own constraint.  */
1477
1478 static reg_errcode_t
1479 internal_function
1480 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1481                         int root_node, unsigned int init_constraint)
1482 {
1483   int org_node, clone_node, ret;
1484   unsigned int constraint = init_constraint;
1485   for (org_node = top_org_node, clone_node = top_clone_node;;)
1486     {
1487       int org_dest, clone_dest;
1488       if (dfa->nodes[org_node].type == OP_BACK_REF)
1489         {
1490           /* If the back reference epsilon-transit, its destination must
1491              also have the constraint.  Then duplicate the epsilon closure
1492              of the destination of the back reference, and store it in
1493              edests of the back reference.  */
1494           org_dest = dfa->nexts[org_node];
1495           re_node_set_empty (dfa->edests + clone_node);
1496           clone_dest = duplicate_node (dfa, org_dest, constraint);
1497           if (BE (clone_dest == -1, 0))
1498             return REG_ESPACE;
1499           dfa->nexts[clone_node] = dfa->nexts[org_node];
1500           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1501           if (BE (ret < 0, 0))
1502             return REG_ESPACE;
1503         }
1504       else if (dfa->edests[org_node].nelem == 0)
1505         {
1506           /* In case of the node can't epsilon-transit, don't duplicate the
1507              destination and store the original destination as the
1508              destination of the node.  */
1509           dfa->nexts[clone_node] = dfa->nexts[org_node];
1510           break;
1511         }
1512       else if (dfa->edests[org_node].nelem == 1)
1513         {
1514           /* In case of the node can epsilon-transit, and it has only one
1515              destination.  */
1516           org_dest = dfa->edests[org_node].elems[0];
1517           re_node_set_empty (dfa->edests + clone_node);
1518           /* If the node is root_node itself, it means the epsilon clsoure
1519              has a loop.   Then tie it to the destination of the root_node.  */
1520           if (org_node == root_node && clone_node != org_node)
1521             {
1522               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1523               if (BE (ret < 0, 0))
1524                 return REG_ESPACE;
1525               break;
1526             }
1527           /* In case of the node has another constraint, add it.  */
1528           constraint |= dfa->nodes[org_node].constraint;
1529           clone_dest = duplicate_node (dfa, org_dest, constraint);
1530           if (BE (clone_dest == -1, 0))
1531             return REG_ESPACE;
1532           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1533           if (BE (ret < 0, 0))
1534             return REG_ESPACE;
1535         }
1536       else /* dfa->edests[org_node].nelem == 2 */
1537         {
1538           /* In case of the node can epsilon-transit, and it has two
1539              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1540           org_dest = dfa->edests[org_node].elems[0];
1541           re_node_set_empty (dfa->edests + clone_node);
1542           /* Search for a duplicated node which satisfies the constraint.  */
1543           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1544           if (clone_dest == -1)
1545             {
1546               /* There is no such duplicated node, create a new one.  */
1547               reg_errcode_t err;
1548               clone_dest = duplicate_node (dfa, org_dest, constraint);
1549               if (BE (clone_dest == -1, 0))
1550                 return REG_ESPACE;
1551               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1552               if (BE (ret < 0, 0))
1553                 return REG_ESPACE;
1554               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1555                                             root_node, constraint);
1556               if (BE (err != REG_NOERROR, 0))
1557                 return err;
1558             }
1559           else
1560             {
1561               /* There is a duplicated node which satisfies the constraint,
1562                  use it to avoid infinite loop.  */
1563               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1564               if (BE (ret < 0, 0))
1565                 return REG_ESPACE;
1566             }
1567
1568           org_dest = dfa->edests[org_node].elems[1];
1569           clone_dest = duplicate_node (dfa, org_dest, constraint);
1570           if (BE (clone_dest == -1, 0))
1571             return REG_ESPACE;
1572           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1573           if (BE (ret < 0, 0))
1574             return REG_ESPACE;
1575         }
1576       org_node = org_dest;
1577       clone_node = clone_dest;
1578     }
1579   return REG_NOERROR;
1580 }
1581
1582 /* Search for a node which is duplicated from the node ORG_NODE, and
1583    satisfies the constraint CONSTRAINT.  */
1584
1585 static int
1586 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1587                         unsigned int constraint)
1588 {
1589   int idx;
1590   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1591     {
1592       if (org_node == dfa->org_indices[idx]
1593           && constraint == dfa->nodes[idx].constraint)
1594         return idx; /* Found.  */
1595     }
1596   return -1; /* Not found.  */
1597 }
1598
1599 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1600    Return the index of the new node, or -1 if insufficient storage is
1601    available.  */
1602
1603 static int
1604 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1605 {
1606   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1607   if (BE (dup_idx != -1, 1))
1608     {
1609       dfa->nodes[dup_idx].constraint = constraint;
1610       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1611       dfa->nodes[dup_idx].duplicated = 1;
1612
1613       /* Store the index of the original node.  */
1614       dfa->org_indices[dup_idx] = org_idx;
1615     }
1616   return dup_idx;
1617 }
1618
1619 static reg_errcode_t
1620 calc_inveclosure (re_dfa_t *dfa)
1621 {
1622   int src, idx, ret;
1623   for (idx = 0; idx < dfa->nodes_len; ++idx)
1624     re_node_set_init_empty (dfa->inveclosures + idx);
1625
1626   for (src = 0; src < dfa->nodes_len; ++src)
1627     {
1628       int *elems = dfa->eclosures[src].elems;
1629       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1630         {
1631           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1632           if (BE (ret == -1, 0))
1633             return REG_ESPACE;
1634         }
1635     }
1636
1637   return REG_NOERROR;
1638 }
1639
1640 /* Calculate "eclosure" for all the node in DFA.  */
1641
1642 static reg_errcode_t
1643 calc_eclosure (re_dfa_t *dfa)
1644 {
1645   int node_idx, incomplete;
1646 #ifdef DEBUG
1647   assert (dfa->nodes_len > 0);
1648 #endif
1649   incomplete = 0;
1650   /* For each nodes, calculate epsilon closure.  */
1651   for (node_idx = 0; ; ++node_idx)
1652     {
1653       reg_errcode_t err;
1654       re_node_set eclosure_elem;
1655       if (node_idx == dfa->nodes_len)
1656         {
1657           if (!incomplete)
1658             break;
1659           incomplete = 0;
1660           node_idx = 0;
1661         }
1662
1663 #ifdef DEBUG
1664       assert (dfa->eclosures[node_idx].nelem != -1);
1665 #endif
1666
1667       /* If we have already calculated, skip it.  */
1668       if (dfa->eclosures[node_idx].nelem != 0)
1669         continue;
1670       /* Calculate epsilon closure of `node_idx'.  */
1671       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1672       if (BE (err != REG_NOERROR, 0))
1673         return err;
1674
1675       if (dfa->eclosures[node_idx].nelem == 0)
1676         {
1677           incomplete = 1;
1678           re_node_set_free (&eclosure_elem);
1679         }
1680     }
1681   return REG_NOERROR;
1682 }
1683
1684 /* Calculate epsilon closure of NODE.  */
1685
1686 static reg_errcode_t
1687 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1688 {
1689   reg_errcode_t err;
1690   int i;
1691   re_node_set eclosure;
1692   int ret;
1693   int incomplete = 0;
1694   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1695   if (BE (err != REG_NOERROR, 0))
1696     return err;
1697
1698   /* This indicates that we are calculating this node now.
1699      We reference this value to avoid infinite loop.  */
1700   dfa->eclosures[node].nelem = -1;
1701
1702   /* If the current node has constraints, duplicate all nodes
1703      since they must inherit the constraints.  */
1704   if (dfa->nodes[node].constraint
1705       && dfa->edests[node].nelem
1706       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1707     {
1708       err = duplicate_node_closure (dfa, node, node, node,
1709                                     dfa->nodes[node].constraint);
1710       if (BE (err != REG_NOERROR, 0))
1711         return err;
1712     }
1713
1714   /* Expand each epsilon destination nodes.  */
1715   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1716     for (i = 0; i < dfa->edests[node].nelem; ++i)
1717       {
1718         re_node_set eclosure_elem;
1719         int edest = dfa->edests[node].elems[i];
1720         /* If calculating the epsilon closure of `edest' is in progress,
1721            return intermediate result.  */
1722         if (dfa->eclosures[edest].nelem == -1)
1723           {
1724             incomplete = 1;
1725             continue;
1726           }
1727         /* If we haven't calculated the epsilon closure of `edest' yet,
1728            calculate now. Otherwise use calculated epsilon closure.  */
1729         if (dfa->eclosures[edest].nelem == 0)
1730           {
1731             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1732             if (BE (err != REG_NOERROR, 0))
1733               return err;
1734           }
1735         else
1736           eclosure_elem = dfa->eclosures[edest];
1737         /* Merge the epsilon closure of `edest'.  */
1738         err = re_node_set_merge (&eclosure, &eclosure_elem);
1739         if (BE (err != REG_NOERROR, 0))
1740           return err;
1741         /* If the epsilon closure of `edest' is incomplete,
1742            the epsilon closure of this node is also incomplete.  */
1743         if (dfa->eclosures[edest].nelem == 0)
1744           {
1745             incomplete = 1;
1746             re_node_set_free (&eclosure_elem);
1747           }
1748       }
1749
1750   /* An epsilon closure includes itself.  */
1751   ret = re_node_set_insert (&eclosure, node);
1752   if (BE (ret < 0, 0))
1753     return REG_ESPACE;
1754   if (incomplete && !root)
1755     dfa->eclosures[node].nelem = 0;
1756   else
1757     dfa->eclosures[node] = eclosure;
1758   *new_set = eclosure;
1759   return REG_NOERROR;
1760 }
1761 \f
1762 /* Functions for token which are used in the parser.  */
1763
1764 /* Fetch a token from INPUT.
1765    We must not use this function inside bracket expressions.  */
1766
1767 static void
1768 internal_function
1769 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1770 {
1771   re_string_skip_bytes (input, peek_token (result, input, syntax));
1772 }
1773
1774 /* Peek a token from INPUT, and return the length of the token.
1775    We must not use this function inside bracket expressions.  */
1776
1777 static int
1778 internal_function
1779 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1780 {
1781   unsigned char c;
1782
1783   if (re_string_eoi (input))
1784     {
1785       token->type = END_OF_RE;
1786       return 0;
1787     }
1788
1789   c = re_string_peek_byte (input, 0);
1790   token->opr.c = c;
1791
1792   token->word_char = 0;
1793 #ifdef RE_ENABLE_I18N
1794   token->mb_partial = 0;
1795   if (input->mb_cur_max > 1 &&
1796       !re_string_first_byte (input, re_string_cur_idx (input)))
1797     {
1798       token->type = CHARACTER;
1799       token->mb_partial = 1;
1800       return 1;
1801     }
1802 #endif
1803   if (c == '\\')
1804     {
1805       unsigned char c2;
1806       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1807         {
1808           token->type = BACK_SLASH;
1809           return 1;
1810         }
1811
1812       c2 = re_string_peek_byte_case (input, 1);
1813       token->opr.c = c2;
1814       token->type = CHARACTER;
1815 #ifdef RE_ENABLE_I18N
1816       if (input->mb_cur_max > 1)
1817         {
1818           wint_t wc = re_string_wchar_at (input,
1819                                           re_string_cur_idx (input) + 1);
1820           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1821         }
1822       else
1823 #endif
1824         token->word_char = IS_WORD_CHAR (c2) != 0;
1825
1826       switch (c2)
1827         {
1828         case '|':
1829           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1830             token->type = OP_ALT;
1831           break;
1832         case '1': case '2': case '3': case '4': case '5':
1833         case '6': case '7': case '8': case '9':
1834           if (!(syntax & RE_NO_BK_REFS))
1835             {
1836               token->type = OP_BACK_REF;
1837               token->opr.idx = c2 - '1';
1838             }
1839           break;
1840         case '<':
1841           if (!(syntax & RE_NO_GNU_OPS))
1842             {
1843               token->type = ANCHOR;
1844               token->opr.ctx_type = WORD_FIRST;
1845             }
1846           break;
1847         case '>':
1848           if (!(syntax & RE_NO_GNU_OPS))
1849             {
1850               token->type = ANCHOR;
1851               token->opr.ctx_type = WORD_LAST;
1852             }
1853           break;
1854         case 'b':
1855           if (!(syntax & RE_NO_GNU_OPS))
1856             {
1857               token->type = ANCHOR;
1858               token->opr.ctx_type = WORD_DELIM;
1859             }
1860           break;
1861         case 'B':
1862           if (!(syntax & RE_NO_GNU_OPS))
1863             {
1864               token->type = ANCHOR;
1865               token->opr.ctx_type = NOT_WORD_DELIM;
1866             }
1867           break;
1868         case 'w':
1869           if (!(syntax & RE_NO_GNU_OPS))
1870             token->type = OP_WORD;
1871           break;
1872         case 'W':
1873           if (!(syntax & RE_NO_GNU_OPS))
1874             token->type = OP_NOTWORD;
1875           break;
1876         case 's':
1877           if (!(syntax & RE_NO_GNU_OPS))
1878             token->type = OP_SPACE;
1879           break;
1880         case 'S':
1881           if (!(syntax & RE_NO_GNU_OPS))
1882             token->type = OP_NOTSPACE;
1883           break;
1884         case '`':
1885           if (!(syntax & RE_NO_GNU_OPS))
1886             {
1887               token->type = ANCHOR;
1888               token->opr.ctx_type = BUF_FIRST;
1889             }
1890           break;
1891         case '\'':
1892           if (!(syntax & RE_NO_GNU_OPS))
1893             {
1894               token->type = ANCHOR;
1895               token->opr.ctx_type = BUF_LAST;
1896             }
1897           break;
1898         case '(':
1899           if (!(syntax & RE_NO_BK_PARENS))
1900             token->type = OP_OPEN_SUBEXP;
1901           break;
1902         case ')':
1903           if (!(syntax & RE_NO_BK_PARENS))
1904             token->type = OP_CLOSE_SUBEXP;
1905           break;
1906         case '+':
1907           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1908             token->type = OP_DUP_PLUS;
1909           break;
1910         case '?':
1911           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1912             token->type = OP_DUP_QUESTION;
1913           break;
1914         case '{':
1915           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1916             token->type = OP_OPEN_DUP_NUM;
1917           break;
1918         case '}':
1919           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1920             token->type = OP_CLOSE_DUP_NUM;
1921           break;
1922         default:
1923           break;
1924         }
1925       return 2;
1926     }
1927
1928   token->type = CHARACTER;
1929 #ifdef RE_ENABLE_I18N
1930   if (input->mb_cur_max > 1)
1931     {
1932       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1933       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1934     }
1935   else
1936 #endif
1937     token->word_char = IS_WORD_CHAR (token->opr.c);
1938
1939   switch (c)
1940     {
1941     case '\n':
1942       if (syntax & RE_NEWLINE_ALT)
1943         token->type = OP_ALT;
1944       break;
1945     case '|':
1946       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1947         token->type = OP_ALT;
1948       break;
1949     case '*':
1950       token->type = OP_DUP_ASTERISK;
1951       break;
1952     case '+':
1953       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1954         token->type = OP_DUP_PLUS;
1955       break;
1956     case '?':
1957       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1958         token->type = OP_DUP_QUESTION;
1959       break;
1960     case '{':
1961       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1962         token->type = OP_OPEN_DUP_NUM;
1963       break;
1964     case '}':
1965       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1966         token->type = OP_CLOSE_DUP_NUM;
1967       break;
1968     case '(':
1969       if (syntax & RE_NO_BK_PARENS)
1970         token->type = OP_OPEN_SUBEXP;
1971       break;
1972     case ')':
1973       if (syntax & RE_NO_BK_PARENS)
1974         token->type = OP_CLOSE_SUBEXP;
1975       break;
1976     case '[':
1977       token->type = OP_OPEN_BRACKET;
1978       break;
1979     case '.':
1980       token->type = OP_PERIOD;
1981       break;
1982     case '^':
1983       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1984           re_string_cur_idx (input) != 0)
1985         {
1986           char prev = re_string_peek_byte (input, -1);
1987           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1988             break;
1989         }
1990       token->type = ANCHOR;
1991       token->opr.ctx_type = LINE_FIRST;
1992       break;
1993     case '$':
1994       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1995           re_string_cur_idx (input) + 1 != re_string_length (input))
1996         {
1997           re_token_t next;
1998           re_string_skip_bytes (input, 1);
1999           peek_token (&next, input, syntax);
2000           re_string_skip_bytes (input, -1);
2001           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2002             break;
2003         }
2004       token->type = ANCHOR;
2005       token->opr.ctx_type = LINE_LAST;
2006       break;
2007     default:
2008       break;
2009     }
2010   return 1;
2011 }
2012
2013 /* Peek a token from INPUT, and return the length of the token.
2014    We must not use this function out of bracket expressions.  */
2015
2016 static int
2017 internal_function
2018 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2019 {
2020   unsigned char c;
2021   if (re_string_eoi (input))
2022     {
2023       token->type = END_OF_RE;
2024       return 0;
2025     }
2026   c = re_string_peek_byte (input, 0);
2027   token->opr.c = c;
2028
2029 #ifdef RE_ENABLE_I18N
2030   if (input->mb_cur_max > 1 &&
2031       !re_string_first_byte (input, re_string_cur_idx (input)))
2032     {
2033       token->type = CHARACTER;
2034       return 1;
2035     }
2036 #endif /* RE_ENABLE_I18N */
2037
2038   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2039       && re_string_cur_idx (input) + 1 < re_string_length (input))
2040     {
2041       /* In this case, '\' escape a character.  */
2042       unsigned char c2;
2043       re_string_skip_bytes (input, 1);
2044       c2 = re_string_peek_byte (input, 0);
2045       token->opr.c = c2;
2046       token->type = CHARACTER;
2047       return 1;
2048     }
2049   if (c == '[') /* '[' is a special char in a bracket exps.  */
2050     {
2051       unsigned char c2;
2052       int token_len;
2053       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2054         c2 = re_string_peek_byte (input, 1);
2055       else
2056         c2 = 0;
2057       token->opr.c = c2;
2058       token_len = 2;
2059       switch (c2)
2060         {
2061         case '.':
2062           token->type = OP_OPEN_COLL_ELEM;
2063           break;
2064         case '=':
2065           token->type = OP_OPEN_EQUIV_CLASS;
2066           break;
2067         case ':':
2068           if (syntax & RE_CHAR_CLASSES)
2069             {
2070               token->type = OP_OPEN_CHAR_CLASS;
2071               break;
2072             }
2073           /* else fall through.  */
2074         default:
2075           token->type = CHARACTER;
2076           token->opr.c = c;
2077           token_len = 1;
2078           break;
2079         }
2080       return token_len;
2081     }
2082   switch (c)
2083     {
2084     case '-':
2085       token->type = OP_CHARSET_RANGE;
2086       break;
2087     case ']':
2088       token->type = OP_CLOSE_BRACKET;
2089       break;
2090     case '^':
2091       token->type = OP_NON_MATCH_LIST;
2092       break;
2093     default:
2094       token->type = CHARACTER;
2095     }
2096   return 1;
2097 }
2098 \f
2099 /* Functions for parser.  */
2100
2101 /* Entry point of the parser.
2102    Parse the regular expression REGEXP and return the structure tree.
2103    If an error is occured, ERR is set by error code, and return NULL.
2104    This function build the following tree, from regular expression <reg_exp>:
2105            CAT
2106            / \
2107           /   \
2108    <reg_exp>  EOR
2109
2110    CAT means concatenation.
2111    EOR means end of regular expression.  */
2112
2113 static bin_tree_t *
2114 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2115        reg_errcode_t *err)
2116 {
2117   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2118   bin_tree_t *tree, *eor, *root;
2119   re_token_t current_token;
2120   dfa->syntax = syntax;
2121   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2122   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2123   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2124     return NULL;
2125   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2126   if (tree != NULL)
2127     root = create_tree (dfa, tree, eor, CONCAT);
2128   else
2129     root = eor;
2130   if (BE (eor == NULL || root == NULL, 0))
2131     {
2132       *err = REG_ESPACE;
2133       return NULL;
2134     }
2135   return root;
2136 }
2137
2138 /* This function build the following tree, from regular expression
2139    <branch1>|<branch2>:
2140            ALT
2141            / \
2142           /   \
2143    <branch1> <branch2>
2144
2145    ALT means alternative, which represents the operator `|'.  */
2146
2147 static bin_tree_t *
2148 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2149                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2150 {
2151   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2152   bin_tree_t *tree, *branch = NULL;
2153   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2154   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2155     return NULL;
2156
2157   while (token->type == OP_ALT)
2158     {
2159       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2160       if (token->type != OP_ALT && token->type != END_OF_RE
2161           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2162         {
2163           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2164           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2165             return NULL;
2166         }
2167       else
2168         branch = NULL;
2169       tree = create_tree (dfa, tree, branch, OP_ALT);
2170       if (BE (tree == NULL, 0))
2171         {
2172           *err = REG_ESPACE;
2173           return NULL;
2174         }
2175     }
2176   return tree;
2177 }
2178
2179 /* This function build the following tree, from regular expression
2180    <exp1><exp2>:
2181         CAT
2182         / \
2183        /   \
2184    <exp1> <exp2>
2185
2186    CAT means concatenation.  */
2187
2188 static bin_tree_t *
2189 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2190               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2191 {
2192   bin_tree_t *tree, *exp;
2193   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2194   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2195   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2196     return NULL;
2197
2198   while (token->type != OP_ALT && token->type != END_OF_RE
2199          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2200     {
2201       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2202       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2203         {
2204           return NULL;
2205         }
2206       if (tree != NULL && exp != NULL)
2207         {
2208           tree = create_tree (dfa, tree, exp, CONCAT);
2209           if (tree == NULL)
2210             {
2211               *err = REG_ESPACE;
2212               return NULL;
2213             }
2214         }
2215       else if (tree == NULL)
2216         tree = exp;
2217       /* Otherwise exp == NULL, we don't need to create new tree.  */
2218     }
2219   return tree;
2220 }
2221
2222 /* This function build the following tree, from regular expression a*:
2223          *
2224          |
2225          a
2226 */
2227
2228 static bin_tree_t *
2229 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2230                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2231 {
2232   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2233   bin_tree_t *tree;
2234   switch (token->type)
2235     {
2236     case CHARACTER:
2237       tree = create_token_tree (dfa, NULL, NULL, token);
2238       if (BE (tree == NULL, 0))
2239         {
2240           *err = REG_ESPACE;
2241           return NULL;
2242         }
2243 #ifdef RE_ENABLE_I18N
2244       if (dfa->mb_cur_max > 1)
2245         {
2246           while (!re_string_eoi (regexp)
2247                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2248             {
2249               bin_tree_t *mbc_remain;
2250               fetch_token (token, regexp, syntax);
2251               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2252               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2253               if (BE (mbc_remain == NULL || tree == NULL, 0))
2254                 {
2255                   *err = REG_ESPACE;
2256                   return NULL;
2257                 }
2258             }
2259         }
2260 #endif
2261       break;
2262     case OP_OPEN_SUBEXP:
2263       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2264       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265         return NULL;
2266       break;
2267     case OP_OPEN_BRACKET:
2268       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2269       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2270         return NULL;
2271       break;
2272     case OP_BACK_REF:
2273       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2274         {
2275           *err = REG_ESUBREG;
2276           return NULL;
2277         }
2278       dfa->used_bkref_map |= 1 << token->opr.idx;
2279       tree = create_token_tree (dfa, NULL, NULL, token);
2280       if (BE (tree == NULL, 0))
2281         {
2282           *err = REG_ESPACE;
2283           return NULL;
2284         }
2285       ++dfa->nbackref;
2286       dfa->has_mb_node = 1;
2287       break;
2288     case OP_OPEN_DUP_NUM:
2289       if (syntax & RE_CONTEXT_INVALID_DUP)
2290         {
2291           *err = REG_BADRPT;
2292           return NULL;
2293         }
2294       /* FALLTHROUGH */
2295     case OP_DUP_ASTERISK:
2296     case OP_DUP_PLUS:
2297     case OP_DUP_QUESTION:
2298       if (syntax & RE_CONTEXT_INVALID_OPS)
2299         {
2300           *err = REG_BADRPT;
2301           return NULL;
2302         }
2303       else if (syntax & RE_CONTEXT_INDEP_OPS)
2304         {
2305           fetch_token (token, regexp, syntax);
2306           return parse_expression (regexp, preg, token, syntax, nest, err);
2307         }
2308       /* else fall through  */
2309     case OP_CLOSE_SUBEXP:
2310       if ((token->type == OP_CLOSE_SUBEXP) &&
2311           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2312         {
2313           *err = REG_ERPAREN;
2314           return NULL;
2315         }
2316       /* else fall through  */
2317     case OP_CLOSE_DUP_NUM:
2318       /* We treat it as a normal character.  */
2319
2320       /* Then we can these characters as normal characters.  */
2321       token->type = CHARACTER;
2322       /* mb_partial and word_char bits should be initialized already
2323          by peek_token.  */
2324       tree = create_token_tree (dfa, NULL, NULL, token);
2325       if (BE (tree == NULL, 0))
2326         {
2327           *err = REG_ESPACE;
2328           return NULL;
2329         }
2330       break;
2331     case ANCHOR:
2332       if ((token->opr.ctx_type
2333            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2334           && dfa->word_ops_used == 0)
2335         init_word_char (dfa);
2336       if (token->opr.ctx_type == WORD_DELIM
2337           || token->opr.ctx_type == NOT_WORD_DELIM)
2338         {
2339           bin_tree_t *tree_first, *tree_last;
2340           if (token->opr.ctx_type == WORD_DELIM)
2341             {
2342               token->opr.ctx_type = WORD_FIRST;
2343               tree_first = create_token_tree (dfa, NULL, NULL, token);
2344               token->opr.ctx_type = WORD_LAST;
2345             }
2346           else
2347             {
2348               token->opr.ctx_type = INSIDE_WORD;
2349               tree_first = create_token_tree (dfa, NULL, NULL, token);
2350               token->opr.ctx_type = INSIDE_NOTWORD;
2351             }
2352           tree_last = create_token_tree (dfa, NULL, NULL, token);
2353           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2354           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2355             {
2356               *err = REG_ESPACE;
2357               return NULL;
2358             }
2359         }
2360       else
2361         {
2362           tree = create_token_tree (dfa, NULL, NULL, token);
2363           if (BE (tree == NULL, 0))
2364             {
2365               *err = REG_ESPACE;
2366               return NULL;
2367             }
2368         }
2369       /* We must return here, since ANCHORs can't be followed
2370          by repetition operators.
2371          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2372              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2373       fetch_token (token, regexp, syntax);
2374       return tree;
2375     case OP_PERIOD:
2376       tree = create_token_tree (dfa, NULL, NULL, token);
2377       if (BE (tree == NULL, 0))
2378         {
2379           *err = REG_ESPACE;
2380           return NULL;
2381         }
2382       if (dfa->mb_cur_max > 1)
2383         dfa->has_mb_node = 1;
2384       break;
2385     case OP_WORD:
2386     case OP_NOTWORD:
2387       tree = build_charclass_op (dfa, regexp->trans,
2388                                  "alnum",
2389                                  "_",
2390                                  token->type == OP_NOTWORD, err);
2391       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2392         return NULL;
2393       break;
2394     case OP_SPACE:
2395     case OP_NOTSPACE:
2396       tree = build_charclass_op (dfa, regexp->trans,
2397                                  "space",
2398                                  "",
2399                                  token->type == OP_NOTSPACE, err);
2400       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2401         return NULL;
2402       break;
2403     case OP_ALT:
2404     case END_OF_RE:
2405       return NULL;
2406     case BACK_SLASH:
2407       *err = REG_EESCAPE;
2408       return NULL;
2409     default:
2410       /* Must not happen?  */
2411 #ifdef DEBUG
2412       assert (0);
2413 #endif
2414       return NULL;
2415     }
2416   fetch_token (token, regexp, syntax);
2417
2418   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2419          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2420     {
2421       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2422       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2423         return NULL;
2424       /* In BRE consecutive duplications are not allowed.  */
2425       if ((syntax & RE_CONTEXT_INVALID_DUP)
2426           && (token->type == OP_DUP_ASTERISK
2427               || token->type == OP_OPEN_DUP_NUM))
2428         {
2429           *err = REG_BADRPT;
2430           return NULL;
2431         }
2432     }
2433
2434   return tree;
2435 }
2436
2437 /* This function build the following tree, from regular expression
2438    (<reg_exp>):
2439          SUBEXP
2440             |
2441         <reg_exp>
2442 */
2443
2444 static bin_tree_t *
2445 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2446                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2447 {
2448   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2449   bin_tree_t *tree;
2450   size_t cur_nsub;
2451   cur_nsub = preg->re_nsub++;
2452
2453   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2454
2455   /* The subexpression may be a null string.  */
2456   if (token->type == OP_CLOSE_SUBEXP)
2457     tree = NULL;
2458   else
2459     {
2460       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2461       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2462         *err = REG_EPAREN;
2463       if (BE (*err != REG_NOERROR, 0))
2464         return NULL;
2465     }
2466
2467   if (cur_nsub <= '9' - '1')
2468     dfa->completed_bkref_map |= 1 << cur_nsub;
2469
2470   tree = create_tree (dfa, tree, NULL, SUBEXP);
2471   if (BE (tree == NULL, 0))
2472     {
2473       *err = REG_ESPACE;
2474       return NULL;
2475     }
2476   tree->token.opr.idx = cur_nsub;
2477   return tree;
2478 }
2479
2480 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2481
2482 static bin_tree_t *
2483 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2484               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2485 {
2486   bin_tree_t *tree = NULL, *old_tree = NULL;
2487   int i, start, end, start_idx = re_string_cur_idx (regexp);
2488 #ifndef RE_TOKEN_INIT_BUG
2489   re_token_t start_token = *token;
2490 #else
2491   re_token_t start_token;
2492
2493   memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2494 #endif
2495
2496   if (token->type == OP_OPEN_DUP_NUM)
2497     {
2498       end = 0;
2499       start = fetch_number (regexp, token, syntax);
2500       if (start == -1)
2501         {
2502           if (token->type == CHARACTER && token->opr.c == ',')
2503             start = 0; /* We treat "{,m}" as "{0,m}".  */
2504           else
2505             {
2506               *err = REG_BADBR; /* <re>{} is invalid.  */
2507               return NULL;
2508             }
2509         }
2510       if (BE (start != -2, 1))
2511         {
2512           /* We treat "{n}" as "{n,n}".  */
2513           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2514                  : ((token->type == CHARACTER && token->opr.c == ',')
2515                     ? fetch_number (regexp, token, syntax) : -2));
2516         }
2517       if (BE (start == -2 || end == -2, 0))
2518         {
2519           /* Invalid sequence.  */
2520           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2521             {
2522               if (token->type == END_OF_RE)
2523                 *err = REG_EBRACE;
2524               else
2525                 *err = REG_BADBR;
2526
2527               return NULL;
2528             }
2529
2530           /* If the syntax bit is set, rollback.  */
2531           re_string_set_index (regexp, start_idx);
2532           *token = start_token;
2533           token->type = CHARACTER;
2534           /* mb_partial and word_char bits should be already initialized by
2535              peek_token.  */
2536           return elem;
2537         }
2538
2539       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2540         {
2541           /* First number greater than second.  */
2542           *err = REG_BADBR;
2543           return NULL;
2544         }
2545     }
2546   else
2547     {
2548       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2549       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2550     }
2551
2552   fetch_token (token, regexp, syntax);
2553
2554   if (BE (elem == NULL, 0))
2555     return NULL;
2556   if (BE (start == 0 && end == 0, 0))
2557     {
2558       postorder (elem, free_tree, NULL);
2559       return NULL;
2560     }
2561
2562   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2563   if (BE (start > 0, 0))
2564     {
2565       tree = elem;
2566       for (i = 2; i <= start; ++i)
2567         {
2568           elem = duplicate_tree (elem, dfa);
2569           tree = create_tree (dfa, tree, elem, CONCAT);
2570           if (BE (elem == NULL || tree == NULL, 0))
2571             goto parse_dup_op_espace;
2572         }
2573
2574       if (start == end)
2575         return tree;
2576
2577       /* Duplicate ELEM before it is marked optional.  */
2578       elem = duplicate_tree (elem, dfa);
2579       old_tree = tree;
2580     }
2581   else
2582     old_tree = NULL;
2583
2584   if (elem->token.type == SUBEXP)
2585     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2586
2587   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2588   if (BE (tree == NULL, 0))
2589     goto parse_dup_op_espace;
2590
2591   /* This loop is actually executed only when end != -1,
2592      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2593      already created the start+1-th copy.  */
2594   for (i = start + 2; i <= end; ++i)
2595     {
2596       elem = duplicate_tree (elem, dfa);
2597       tree = create_tree (dfa, tree, elem, CONCAT);
2598       if (BE (elem == NULL || tree == NULL, 0))
2599         goto parse_dup_op_espace;
2600
2601       tree = create_tree (dfa, tree, NULL, OP_ALT);
2602       if (BE (tree == NULL, 0))
2603         goto parse_dup_op_espace;
2604     }
2605
2606   if (old_tree)
2607     tree = create_tree (dfa, old_tree, tree, CONCAT);
2608
2609   return tree;
2610
2611  parse_dup_op_espace:
2612   *err = REG_ESPACE;
2613   return NULL;
2614 }
2615
2616 /* Size of the names for collating symbol/equivalence_class/character_class.
2617    I'm not sure, but maybe enough.  */
2618 #define BRACKET_NAME_BUF_SIZE 32
2619
2620 #ifndef _LIBC
2621   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2622      Build the range expression which starts from START_ELEM, and ends
2623      at END_ELEM.  The result are written to MBCSET and SBCSET.
2624      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2625      mbcset->range_ends, is a pointer argument sinse we may
2626      update it.  */
2627
2628 static reg_errcode_t
2629 internal_function
2630 # ifdef RE_ENABLE_I18N
2631 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2632                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2633 # else /* not RE_ENABLE_I18N */
2634 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2635                  bracket_elem_t *end_elem)
2636 # endif /* not RE_ENABLE_I18N */
2637 {
2638   unsigned int start_ch, end_ch;
2639   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2640   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2641           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2642           0))
2643     return REG_ERANGE;
2644
2645   /* We can handle no multi character collating elements without libc
2646      support.  */
2647   if (BE ((start_elem->type == COLL_SYM
2648            && strlen ((char *) start_elem->opr.name) > 1)
2649           || (end_elem->type == COLL_SYM
2650               && strlen ((char *) end_elem->opr.name) > 1), 0))
2651     return REG_ECOLLATE;
2652
2653 # ifdef RE_ENABLE_I18N
2654   {
2655     wchar_t wc;
2656     wint_t start_wc;
2657     wint_t end_wc;
2658     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2659
2660     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2661                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2662                    : 0));
2663     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2664               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2665                  : 0));
2666 #ifdef GAWK
2667     /*
2668      * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2669      * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2670      * unsigned, so we don't have sign extension problems.
2671      */
2672     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2673                 ? start_ch : start_elem->opr.wch);
2674     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2675               ? end_ch : end_elem->opr.wch);
2676 #else
2677     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2678                 ? __btowc (start_ch) : start_elem->opr.wch);
2679     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2680               ? __btowc (end_ch) : end_elem->opr.wch);
2681 #endif
2682     if (start_wc == WEOF || end_wc == WEOF)
2683       return REG_ECOLLATE;
2684     cmp_buf[0] = start_wc;
2685     cmp_buf[4] = end_wc;
2686     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2687       return REG_ERANGE;
2688
2689     /* Got valid collation sequence values, add them as a new entry.
2690        However, for !_LIBC we have no collation elements: if the
2691        character set is single byte, the single byte character set
2692        that we build below suffices.  parse_bracket_exp passes
2693        no MBCSET if dfa->mb_cur_max == 1.  */
2694     if (mbcset)
2695       {
2696         /* Check the space of the arrays.  */
2697         if (BE (*range_alloc == mbcset->nranges, 0))
2698           {
2699             /* There is not enough space, need realloc.  */
2700             wchar_t *new_array_start, *new_array_end;
2701             int new_nranges;
2702
2703             /* +1 in case of mbcset->nranges is 0.  */
2704             new_nranges = 2 * mbcset->nranges + 1;
2705             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2706                are NULL if *range_alloc == 0.  */
2707             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2708                                           new_nranges);
2709             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2710                                         new_nranges);
2711
2712             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2713               return REG_ESPACE;
2714
2715             mbcset->range_starts = new_array_start;
2716             mbcset->range_ends = new_array_end;
2717             *range_alloc = new_nranges;
2718           }
2719
2720         mbcset->range_starts[mbcset->nranges] = start_wc;
2721         mbcset->range_ends[mbcset->nranges++] = end_wc;
2722       }
2723
2724     /* Build the table for single byte characters.  */
2725     for (wc = 0; wc < SBC_MAX; ++wc)
2726       {
2727         cmp_buf[2] = wc;
2728         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2729             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2730           bitset_set (sbcset, wc);
2731       }
2732   }
2733 # else /* not RE_ENABLE_I18N */
2734   {
2735     unsigned int ch;
2736     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2737                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2738                    : 0));
2739     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2740               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2741                  : 0));
2742     if (start_ch > end_ch)
2743       return REG_ERANGE;
2744     /* Build the table for single byte characters.  */
2745     for (ch = 0; ch < SBC_MAX; ++ch)
2746       if (start_ch <= ch  && ch <= end_ch)
2747         bitset_set (sbcset, ch);
2748   }
2749 # endif /* not RE_ENABLE_I18N */
2750   return REG_NOERROR;
2751 }
2752 #endif /* not _LIBC */
2753
2754 #ifndef _LIBC
2755 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2756    Build the collating element which is represented by NAME.
2757    The result are written to MBCSET and SBCSET.
2758    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2759    pointer argument since we may update it.  */
2760
2761 static reg_errcode_t
2762 internal_function
2763 # ifdef RE_ENABLE_I18N
2764 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2765                         int *coll_sym_alloc, const unsigned char *name)
2766 # else /* not RE_ENABLE_I18N */
2767 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2768 # endif /* not RE_ENABLE_I18N */
2769 {
2770   size_t name_len = strlen ((const char *) name);
2771   if (BE (name_len != 1, 0))
2772     return REG_ECOLLATE;
2773   else
2774     {
2775       bitset_set (sbcset, name[0]);
2776       return REG_NOERROR;
2777     }
2778 }
2779 #endif /* not _LIBC */
2780
2781 /* This function parse bracket expression like "[abc]", "[a-c]",
2782    "[[.a-a.]]" etc.  */
2783
2784 static bin_tree_t *
2785 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2786                    reg_syntax_t syntax, reg_errcode_t *err)
2787 {
2788 #ifdef _LIBC
2789   const unsigned char *collseqmb;
2790   const char *collseqwc;
2791   uint32_t nrules;
2792   int32_t table_size;
2793   const int32_t *symb_table;
2794   const unsigned char *extra;
2795
2796   /* Local function for parse_bracket_exp used in _LIBC environement.
2797      Seek the collating symbol entry correspondings to NAME.
2798      Return the index of the symbol in the SYMB_TABLE.  */
2799
2800   auto inline int32_t
2801   __attribute ((always_inline))
2802   seek_collating_symbol_entry (name, name_len)
2803          const unsigned char *name;
2804          size_t name_len;
2805     {
2806       int32_t hash = elem_hash ((const char *) name, name_len);
2807       int32_t elem = hash % table_size;
2808       if (symb_table[2 * elem] != 0)
2809         {
2810           int32_t second = hash % (table_size - 2) + 1;
2811
2812           do
2813             {
2814               /* First compare the hashing value.  */
2815               if (symb_table[2 * elem] == hash
2816                   /* Compare the length of the name.  */
2817                   && name_len == extra[symb_table[2 * elem + 1]]
2818                   /* Compare the name.  */
2819                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2820                              name_len) == 0)
2821                 {
2822                   /* Yep, this is the entry.  */
2823                   break;
2824                 }
2825
2826               /* Next entry.  */
2827               elem += second;
2828             }
2829           while (symb_table[2 * elem] != 0);
2830         }
2831       return elem;
2832     }
2833
2834   /* Local function for parse_bracket_exp used in _LIBC environment.
2835      Look up the collation sequence value of BR_ELEM.
2836      Return the value if succeeded, UINT_MAX otherwise.  */
2837
2838   auto inline unsigned int
2839   __attribute ((always_inline))
2840   lookup_collation_sequence_value (br_elem)
2841          bracket_elem_t *br_elem;
2842     {
2843       if (br_elem->type == SB_CHAR)
2844         {
2845           /*
2846           if (MB_CUR_MAX == 1)
2847           */
2848           if (nrules == 0)
2849             return collseqmb[br_elem->opr.ch];
2850           else
2851             {
2852               wint_t wc = __btowc (br_elem->opr.ch);
2853               return __collseq_table_lookup (collseqwc, wc);
2854             }
2855         }
2856       else if (br_elem->type == MB_CHAR)
2857         {
2858           if (nrules != 0)
2859             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2860         }
2861       else if (br_elem->type == COLL_SYM)
2862         {
2863           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2864           if (nrules != 0)
2865             {
2866               int32_t elem, idx;
2867               elem = seek_collating_symbol_entry (br_elem->opr.name,
2868                                                   sym_name_len);
2869               if (symb_table[2 * elem] != 0)
2870                 {
2871                   /* We found the entry.  */
2872                   idx = symb_table[2 * elem + 1];
2873                   /* Skip the name of collating element name.  */
2874                   idx += 1 + extra[idx];
2875                   /* Skip the byte sequence of the collating element.  */
2876                   idx += 1 + extra[idx];
2877                   /* Adjust for the alignment.  */
2878                   idx = (idx + 3) & ~3;
2879                   /* Skip the multibyte collation sequence value.  */
2880                   idx += sizeof (unsigned int);
2881                   /* Skip the wide char sequence of the collating element.  */
2882                   idx += sizeof (unsigned int) *
2883                     (1 + *(unsigned int *) (extra + idx));
2884                   /* Return the collation sequence value.  */
2885                   return *(unsigned int *) (extra + idx);
2886                 }
2887               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2888                 {
2889                   /* No valid character.  Match it as a single byte
2890                      character.  */
2891                   return collseqmb[br_elem->opr.name[0]];
2892                 }
2893             }
2894           else if (sym_name_len == 1)
2895             return collseqmb[br_elem->opr.name[0]];
2896         }
2897       return UINT_MAX;
2898     }
2899
2900   /* Local function for parse_bracket_exp used in _LIBC environement.
2901      Build the range expression which starts from START_ELEM, and ends
2902      at END_ELEM.  The result are written to MBCSET and SBCSET.
2903      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2904      mbcset->range_ends, is a pointer argument sinse we may
2905      update it.  */
2906
2907   auto inline reg_errcode_t
2908   __attribute ((always_inline))
2909   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2910          re_charset_t *mbcset;
2911          int *range_alloc;
2912          bitset_t sbcset;
2913          bracket_elem_t *start_elem, *end_elem;
2914     {
2915       unsigned int ch;
2916       uint32_t start_collseq;
2917       uint32_t end_collseq;
2918
2919       /* Equivalence Classes and Character Classes can't be a range
2920          start/end.  */
2921       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2922               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2923               0))
2924         return REG_ERANGE;
2925
2926       start_collseq = lookup_collation_sequence_value (start_elem);
2927       end_collseq = lookup_collation_sequence_value (end_elem);
2928       /* Check start/end collation sequence values.  */
2929       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2930         return REG_ECOLLATE;
2931       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2932         return REG_ERANGE;
2933
2934       /* Got valid collation sequence values, add them as a new entry.
2935          However, if we have no collation elements, and the character set
2936          is single byte, the single byte character set that we
2937          build below suffices. */
2938       if (nrules > 0 || dfa->mb_cur_max > 1)
2939         {
2940           /* Check the space of the arrays.  */
2941           if (BE (*range_alloc == mbcset->nranges, 0))
2942             {
2943               /* There is not enough space, need realloc.  */
2944               uint32_t *new_array_start;
2945               uint32_t *new_array_end;
2946               int new_nranges;
2947
2948               /* +1 in case of mbcset->nranges is 0.  */
2949               new_nranges = 2 * mbcset->nranges + 1;
2950               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2951                                             new_nranges);
2952               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2953                                           new_nranges);
2954
2955               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2956                 return REG_ESPACE;
2957
2958               mbcset->range_starts = new_array_start;
2959               mbcset->range_ends = new_array_end;
2960               *range_alloc = new_nranges;
2961             }
2962
2963           mbcset->range_starts[mbcset->nranges] = start_collseq;
2964           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2965         }
2966
2967       /* Build the table for single byte characters.  */
2968       for (ch = 0; ch < SBC_MAX; ch++)
2969         {
2970           uint32_t ch_collseq;
2971           /*
2972           if (MB_CUR_MAX == 1)
2973           */
2974           if (nrules == 0)
2975             ch_collseq = collseqmb[ch];
2976           else
2977             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2978           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2979             bitset_set (sbcset, ch);
2980         }
2981       return REG_NOERROR;
2982     }
2983
2984   /* Local function for parse_bracket_exp used in _LIBC environement.
2985      Build the collating element which is represented by NAME.
2986      The result are written to MBCSET and SBCSET.
2987      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2988      pointer argument sinse we may update it.  */
2989
2990   auto inline reg_errcode_t
2991   __attribute ((always_inline))
2992   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2993          re_charset_t *mbcset;
2994          int *coll_sym_alloc;
2995          bitset_t sbcset;
2996          const unsigned char *name;
2997     {
2998       int32_t elem, idx;
2999       size_t name_len = strlen ((const char *) name);
3000       if (nrules != 0)
3001         {
3002           elem = seek_collating_symbol_entry (name, name_len);
3003           if (symb_table[2 * elem] != 0)
3004             {
3005               /* We found the entry.  */
3006               idx = symb_table[2 * elem + 1];
3007               /* Skip the name of collating element name.  */
3008               idx += 1 + extra[idx];
3009             }
3010           else if (symb_table[2 * elem] == 0 && name_len == 1)
3011             {
3012               /* No valid character, treat it as a normal
3013                  character.  */
3014               bitset_set (sbcset, name[0]);
3015               return REG_NOERROR;
3016             }
3017           else
3018             return REG_ECOLLATE;
3019
3020           /* Got valid collation sequence, add it as a new entry.  */
3021           /* Check the space of the arrays.  */
3022           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3023             {
3024               /* Not enough, realloc it.  */
3025               /* +1 in case of mbcset->ncoll_syms is 0.  */
3026               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3027               /* Use realloc since mbcset->coll_syms is NULL
3028                  if *alloc == 0.  */
3029               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3030                                                    new_coll_sym_alloc);
3031               if (BE (new_coll_syms == NULL, 0))
3032                 return REG_ESPACE;
3033               mbcset->coll_syms = new_coll_syms;
3034               *coll_sym_alloc = new_coll_sym_alloc;
3035             }
3036           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3037           return REG_NOERROR;
3038         }
3039       else
3040         {
3041           if (BE (name_len != 1, 0))
3042             return REG_ECOLLATE;
3043           else
3044             {
3045               bitset_set (sbcset, name[0]);
3046               return REG_NOERROR;
3047             }
3048         }
3049     }
3050 #endif
3051
3052   re_token_t br_token;
3053   re_bitset_ptr_t sbcset;
3054 #ifdef RE_ENABLE_I18N
3055   re_charset_t *mbcset;
3056   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3057   int equiv_class_alloc = 0, char_class_alloc = 0;
3058 #endif /* not RE_ENABLE_I18N */
3059   int non_match = 0;
3060   bin_tree_t *work_tree;
3061   int token_len;
3062   int first_round = 1;
3063 #ifdef _LIBC
3064   collseqmb = (const unsigned char *)
3065     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3066   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3067   if (nrules)
3068     {
3069       /*
3070       if (MB_CUR_MAX > 1)
3071       */
3072       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3073       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3074       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3075                                                   _NL_COLLATE_SYMB_TABLEMB);
3076       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3077                                                    _NL_COLLATE_SYMB_EXTRAMB);
3078     }
3079 #endif
3080   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3081 #ifdef RE_ENABLE_I18N
3082   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3083 #endif /* RE_ENABLE_I18N */
3084 #ifdef RE_ENABLE_I18N
3085   if (BE (sbcset == NULL || mbcset == NULL, 0))
3086 #else
3087   if (BE (sbcset == NULL, 0))
3088 #endif /* RE_ENABLE_I18N */
3089     {
3090       *err = REG_ESPACE;
3091       return NULL;
3092     }
3093
3094   token_len = peek_token_bracket (token, regexp, syntax);
3095   if (BE (token->type == END_OF_RE, 0))
3096     {
3097       *err = REG_BADPAT;
3098       goto parse_bracket_exp_free_return;
3099     }
3100   if (token->type == OP_NON_MATCH_LIST)
3101     {
3102 #ifdef RE_ENABLE_I18N
3103       mbcset->non_match = 1;
3104 #endif /* not RE_ENABLE_I18N */
3105       non_match = 1;
3106       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3107         bitset_set (sbcset, '\n');
3108       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3109       token_len = peek_token_bracket (token, regexp, syntax);
3110       if (BE (token->type == END_OF_RE, 0))
3111         {
3112           *err = REG_BADPAT;
3113           goto parse_bracket_exp_free_return;
3114         }
3115     }
3116
3117   /* We treat the first ']' as a normal character.  */
3118   if (token->type == OP_CLOSE_BRACKET)
3119     token->type = CHARACTER;
3120
3121   while (1)
3122     {
3123       bracket_elem_t start_elem, end_elem;
3124       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3125       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3126       reg_errcode_t ret;
3127       int token_len2 = 0, is_range_exp = 0;
3128       re_token_t token2;
3129
3130       start_elem.opr.name = start_name_buf;
3131       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3132                                    syntax, first_round);
3133       if (BE (ret != REG_NOERROR, 0))
3134         {
3135           *err = ret;
3136           goto parse_bracket_exp_free_return;
3137         }
3138       first_round = 0;
3139
3140       /* Get information about the next token.  We need it in any case.  */
3141       token_len = peek_token_bracket (token, regexp, syntax);
3142
3143       /* Do not check for ranges if we know they are not allowed.  */
3144       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3145         {
3146           if (BE (token->type == END_OF_RE, 0))
3147             {
3148               *err = REG_EBRACK;
3149               goto parse_bracket_exp_free_return;
3150             }
3151           if (token->type == OP_CHARSET_RANGE)
3152             {
3153               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3154               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3155               if (BE (token2.type == END_OF_RE, 0))
3156                 {
3157                   *err = REG_EBRACK;
3158                   goto parse_bracket_exp_free_return;
3159                 }
3160               if (token2.type == OP_CLOSE_BRACKET)
3161                 {
3162                   /* We treat the last '-' as a normal character.  */
3163                   re_string_skip_bytes (regexp, -token_len);
3164                   token->type = CHARACTER;
3165                 }
3166               else
3167                 is_range_exp = 1;
3168             }
3169         }
3170
3171       if (is_range_exp == 1)
3172         {
3173           end_elem.opr.name = end_name_buf;
3174           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3175                                        dfa, syntax, 1);
3176           if (BE (ret != REG_NOERROR, 0))
3177             {
3178               *err = ret;
3179               goto parse_bracket_exp_free_return;
3180             }
3181
3182           token_len = peek_token_bracket (token, regexp, syntax);
3183
3184 #ifdef _LIBC
3185           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3186                                   &start_elem, &end_elem);
3187 #else
3188 # ifdef RE_ENABLE_I18N
3189           *err = build_range_exp (sbcset,
3190                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3191                                   &range_alloc, &start_elem, &end_elem);
3192 # else
3193           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3194 # endif
3195 #endif /* RE_ENABLE_I18N */
3196           if (BE (*err != REG_NOERROR, 0))
3197             goto parse_bracket_exp_free_return;
3198         }
3199       else
3200         {
3201           switch (start_elem.type)
3202             {
3203             case SB_CHAR:
3204               bitset_set (sbcset, start_elem.opr.ch);
3205               break;
3206 #ifdef RE_ENABLE_I18N
3207             case MB_CHAR:
3208               /* Check whether the array has enough space.  */
3209               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3210                 {
3211                   wchar_t *new_mbchars;
3212                   /* Not enough, realloc it.  */
3213                   /* +1 in case of mbcset->nmbchars is 0.  */
3214                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3215                   /* Use realloc since array is NULL if *alloc == 0.  */
3216                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3217                                             mbchar_alloc);
3218                   if (BE (new_mbchars == NULL, 0))
3219                     goto parse_bracket_exp_espace;
3220                   mbcset->mbchars = new_mbchars;
3221                 }
3222               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3223               break;
3224 #endif /* RE_ENABLE_I18N */
3225             case EQUIV_CLASS:
3226               *err = build_equiv_class (sbcset,
3227 #ifdef RE_ENABLE_I18N
3228                                         mbcset, &equiv_class_alloc,
3229 #endif /* RE_ENABLE_I18N */
3230                                         start_elem.opr.name);
3231               if (BE (*err != REG_NOERROR, 0))
3232                 goto parse_bracket_exp_free_return;
3233               break;
3234             case COLL_SYM:
3235               *err = build_collating_symbol (sbcset,
3236 #ifdef RE_ENABLE_I18N
3237                                              mbcset, &coll_sym_alloc,
3238 #endif /* RE_ENABLE_I18N */
3239                                              start_elem.opr.name);
3240               if (BE (*err != REG_NOERROR, 0))
3241                 goto parse_bracket_exp_free_return;
3242               break;
3243             case CHAR_CLASS:
3244               *err = build_charclass (regexp->trans, sbcset,
3245 #ifdef RE_ENABLE_I18N
3246                                       mbcset, &char_class_alloc,
3247 #endif /* RE_ENABLE_I18N */
3248                                       (const char *) start_elem.opr.name, syntax);
3249               if (BE (*err != REG_NOERROR, 0))
3250                goto parse_bracket_exp_free_return;
3251               break;
3252             default:
3253               assert (0);
3254               break;
3255             }
3256         }
3257       if (BE (token->type == END_OF_RE, 0))
3258         {
3259           *err = REG_EBRACK;
3260           goto parse_bracket_exp_free_return;
3261         }
3262       if (token->type == OP_CLOSE_BRACKET)
3263         break;
3264     }
3265
3266   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3267
3268   /* If it is non-matching list.  */
3269   if (non_match)
3270     bitset_not (sbcset);
3271
3272 #ifdef RE_ENABLE_I18N
3273   /* Ensure only single byte characters are set.  */
3274   if (dfa->mb_cur_max > 1)
3275     bitset_mask (sbcset, dfa->sb_char);
3276
3277   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3278       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3279                                                      || mbcset->non_match)))
3280     {
3281       bin_tree_t *mbc_tree;
3282       int sbc_idx;
3283       /* Build a tree for complex bracket.  */
3284       dfa->has_mb_node = 1;
3285       br_token.type = COMPLEX_BRACKET;
3286       br_token.opr.mbcset = mbcset;
3287       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3288       if (BE (mbc_tree == NULL, 0))
3289         goto parse_bracket_exp_espace;
3290       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3291         if (sbcset[sbc_idx])
3292           break;
3293       /* If there are no bits set in sbcset, there is no point
3294          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3295       if (sbc_idx < BITSET_WORDS)
3296         {
3297           /* Build a tree for simple bracket.  */
3298           br_token.type = SIMPLE_BRACKET;
3299           br_token.opr.sbcset = sbcset;
3300           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3301           if (BE (work_tree == NULL, 0))
3302             goto parse_bracket_exp_espace;
3303
3304           /* Then join them by ALT node.  */
3305           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3306           if (BE (work_tree == NULL, 0))
3307             goto parse_bracket_exp_espace;
3308         }
3309       else
3310         {
3311           re_free (sbcset);
3312           work_tree = mbc_tree;
3313         }
3314     }
3315   else
3316 #endif /* not RE_ENABLE_I18N */
3317     {
3318 #ifdef RE_ENABLE_I18N
3319       free_charset (mbcset);
3320 #endif
3321       /* Build a tree for simple bracket.  */
3322       br_token.type = SIMPLE_BRACKET;
3323       br_token.opr.sbcset = sbcset;
3324       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3325       if (BE (work_tree == NULL, 0))
3326         goto parse_bracket_exp_espace;
3327     }
3328   return work_tree;
3329
3330  parse_bracket_exp_espace:
3331   *err = REG_ESPACE;
3332  parse_bracket_exp_free_return:
3333   re_free (sbcset);
3334 #ifdef RE_ENABLE_I18N
3335   free_charset (mbcset);
3336 #endif /* RE_ENABLE_I18N */
3337   return NULL;
3338 }
3339
3340 /* Parse an element in the bracket expression.  */
3341
3342 static reg_errcode_t
3343 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3344                        re_token_t *token, int token_len, re_dfa_t *dfa,
3345                        reg_syntax_t syntax, int accept_hyphen)
3346 {
3347 #ifdef RE_ENABLE_I18N
3348   int cur_char_size;
3349   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3350   if (cur_char_size > 1)
3351     {
3352       elem->type = MB_CHAR;
3353       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3354       re_string_skip_bytes (regexp, cur_char_size);
3355       return REG_NOERROR;
3356     }
3357 #endif /* RE_ENABLE_I18N */
3358   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3359   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3360       || token->type == OP_OPEN_EQUIV_CLASS)
3361     return parse_bracket_symbol (elem, regexp, token);
3362   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3363     {
3364       /* A '-' must only appear as anything but a range indicator before
3365          the closing bracket.  Everything else is an error.  */
3366       re_token_t token2;
3367       (void) peek_token_bracket (&token2, regexp, syntax);
3368       if (token2.type != OP_CLOSE_BRACKET)
3369         /* The actual error value is not standardized since this whole
3370            case is undefined.  But ERANGE makes good sense.  */
3371         return REG_ERANGE;
3372     }
3373   elem->type = SB_CHAR;
3374   elem->opr.ch = token->opr.c;
3375   return REG_NOERROR;
3376 }
3377
3378 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3379    such as [:<character_class>:], [.<collating_element>.], and
3380    [=<equivalent_class>=].  */
3381
3382 static reg_errcode_t
3383 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3384                       re_token_t *token)
3385 {
3386   unsigned char ch, delim = token->opr.c;
3387   int i = 0;
3388   if (re_string_eoi(regexp))
3389     return REG_EBRACK;
3390   for (;; ++i)
3391     {
3392       if (i >= BRACKET_NAME_BUF_SIZE)
3393         return REG_EBRACK;
3394       if (token->type == OP_OPEN_CHAR_CLASS)
3395         ch = re_string_fetch_byte_case (regexp);
3396       else
3397         ch = re_string_fetch_byte (regexp);
3398       if (re_string_eoi(regexp))
3399         return REG_EBRACK;
3400       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3401         break;
3402       elem->opr.name[i] = ch;
3403     }
3404   re_string_skip_bytes (regexp, 1);
3405   elem->opr.name[i] = '\0';
3406   switch (token->type)
3407     {
3408     case OP_OPEN_COLL_ELEM:
3409       elem->type = COLL_SYM;
3410       break;
3411     case OP_OPEN_EQUIV_CLASS:
3412       elem->type = EQUIV_CLASS;
3413       break;
3414     case OP_OPEN_CHAR_CLASS:
3415       elem->type = CHAR_CLASS;
3416       break;
3417     default:
3418       break;
3419     }
3420   return REG_NOERROR;
3421 }
3422
3423   /* Helper function for parse_bracket_exp.
3424      Build the equivalence class which is represented by NAME.
3425      The result are written to MBCSET and SBCSET.
3426      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3427      is a pointer argument sinse we may update it.  */
3428
3429 static reg_errcode_t
3430 #ifdef RE_ENABLE_I18N
3431 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3432                    int *equiv_class_alloc, const unsigned char *name)
3433 #else /* not RE_ENABLE_I18N */
3434 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3435 #endif /* not RE_ENABLE_I18N */
3436 {
3437 #ifdef _LIBC
3438   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3439   if (nrules != 0)
3440     {
3441       const int32_t *table, *indirect;
3442       const unsigned char *weights, *extra, *cp;
3443       unsigned char char_buf[2];
3444       int32_t idx1, idx2;
3445       unsigned int ch;
3446       size_t len;
3447       /* This #include defines a local function!  */
3448 # include <locale/weight.h>
3449       /* Calculate the index for equivalence class.  */
3450       cp = name;
3451       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3452       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3453                                                _NL_COLLATE_WEIGHTMB);
3454       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3455                                                    _NL_COLLATE_EXTRAMB);
3456       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3457                                                 _NL_COLLATE_INDIRECTMB);
3458       idx1 = findidx (&cp);
3459       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3460         /* This isn't a valid character.  */
3461         return REG_ECOLLATE;
3462
3463       /* Build single byte matcing table for this equivalence class.  */
3464       char_buf[1] = (unsigned char) '\0';
3465       len = weights[idx1 & 0xffffff];
3466       for (ch = 0; ch < SBC_MAX; ++ch)
3467         {
3468           char_buf[0] = ch;
3469           cp = char_buf;
3470           idx2 = findidx (&cp);
3471 /*
3472           idx2 = table[ch];
3473 */
3474           if (idx2 == 0)
3475             /* This isn't a valid character.  */
3476             continue;
3477           /* Compare only if the length matches and the collation rule
3478              index is the same.  */
3479           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3480             {
3481               int cnt = 0;
3482
3483               while (cnt <= len &&
3484                      weights[(idx1 & 0xffffff) + 1 + cnt]
3485                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3486                 ++cnt;
3487
3488               if (cnt > len)
3489                 bitset_set (sbcset, ch);
3490             }
3491         }
3492       /* Check whether the array has enough space.  */
3493       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3494         {
3495           /* Not enough, realloc it.  */
3496           /* +1 in case of mbcset->nequiv_classes is 0.  */
3497           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3498           /* Use realloc since the array is NULL if *alloc == 0.  */
3499           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3500                                                    int32_t,
3501                                                    new_equiv_class_alloc);
3502           if (BE (new_equiv_classes == NULL, 0))
3503             return REG_ESPACE;
3504           mbcset->equiv_classes = new_equiv_classes;
3505           *equiv_class_alloc = new_equiv_class_alloc;
3506         }
3507       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3508     }
3509   else
3510 #endif /* _LIBC */
3511     {
3512       if (BE (strlen ((const char *) name) != 1, 0))
3513         return REG_ECOLLATE;
3514       bitset_set (sbcset, *name);
3515     }
3516   return REG_NOERROR;
3517 }
3518
3519   /* Helper function for parse_bracket_exp.
3520      Build the character class which is represented by NAME.
3521      The result are written to MBCSET and SBCSET.
3522      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3523      is a pointer argument sinse we may update it.  */
3524
3525 static reg_errcode_t
3526 #ifdef RE_ENABLE_I18N
3527 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3528                  re_charset_t *mbcset, int *char_class_alloc,
3529                  const char *class_name, reg_syntax_t syntax)
3530 #else /* not RE_ENABLE_I18N */
3531 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3532                  const char *class_name, reg_syntax_t syntax)
3533 #endif /* not RE_ENABLE_I18N */
3534 {
3535   int i;
3536
3537   /* In case of REG_ICASE "upper" and "lower" match the both of
3538      upper and lower cases.  */
3539   if ((syntax & RE_ICASE)
3540       && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3541     class_name = "alpha";
3542
3543 #ifdef RE_ENABLE_I18N
3544   /* Check the space of the arrays.  */
3545   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3546     {
3547       /* Not enough, realloc it.  */
3548       /* +1 in case of mbcset->nchar_classes is 0.  */
3549       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3550       /* Use realloc since array is NULL if *alloc == 0.  */
3551       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3552                                                new_char_class_alloc);
3553       if (BE (new_char_classes == NULL, 0))
3554         return REG_ESPACE;
3555       mbcset->char_classes = new_char_classes;
3556       *char_class_alloc = new_char_class_alloc;
3557     }
3558   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3559 #endif /* RE_ENABLE_I18N */
3560
3561 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3562   do {                                          \
3563     if (BE (trans != NULL, 0))                  \
3564       {                                         \
3565         for (i = 0; i < SBC_MAX; ++i)           \
3566           if (ctype_func (i))                   \
3567             bitset_set (sbcset, trans[i]);      \
3568       }                                         \
3569     else                                        \
3570       {                                         \
3571         for (i = 0; i < SBC_MAX; ++i)           \
3572           if (ctype_func (i))                   \
3573             bitset_set (sbcset, i);             \
3574       }                                         \
3575   } while (0)
3576
3577   if (strcmp (class_name, "alnum") == 0)
3578     BUILD_CHARCLASS_LOOP (isalnum);
3579   else if (strcmp (class_name, "cntrl") == 0)
3580     BUILD_CHARCLASS_LOOP (iscntrl);
3581   else if (strcmp (class_name, "lower") == 0)
3582     BUILD_CHARCLASS_LOOP (islower);
3583   else if (strcmp (class_name, "space") == 0)
3584     BUILD_CHARCLASS_LOOP (isspace);
3585   else if (strcmp (class_name, "alpha") == 0)
3586     BUILD_CHARCLASS_LOOP (isalpha);
3587   else if (strcmp (class_name, "digit") == 0)
3588     BUILD_CHARCLASS_LOOP (isdigit);
3589   else if (strcmp (class_name, "print") == 0)
3590     BUILD_CHARCLASS_LOOP (isprint);
3591   else if (strcmp (class_name, "upper") == 0)
3592     BUILD_CHARCLASS_LOOP (isupper);
3593   else if (strcmp (class_name, "blank") == 0)
3594 #ifndef GAWK
3595     BUILD_CHARCLASS_LOOP (isblank);
3596 #else
3597     /* see comments above */
3598     BUILD_CHARCLASS_LOOP (is_blank);
3599 #endif
3600   else if (strcmp (class_name, "graph") == 0)
3601     BUILD_CHARCLASS_LOOP (isgraph);
3602   else if (strcmp (class_name, "punct") == 0)
3603     BUILD_CHARCLASS_LOOP (ispunct);
3604   else if (strcmp (class_name, "xdigit") == 0)
3605     BUILD_CHARCLASS_LOOP (isxdigit);
3606   else
3607     return REG_ECTYPE;
3608
3609   return REG_NOERROR;
3610 }
3611
3612 static bin_tree_t *
3613 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3614                     const char *class_name,
3615                     const char *extra, int non_match,
3616                     reg_errcode_t *err)
3617 {
3618   re_bitset_ptr_t sbcset;
3619 #ifdef RE_ENABLE_I18N
3620   re_charset_t *mbcset;
3621   int alloc = 0;
3622 #endif /* not RE_ENABLE_I18N */
3623   reg_errcode_t ret;
3624   re_token_t br_token;
3625   bin_tree_t *tree;
3626
3627   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3628 #ifdef RE_ENABLE_I18N
3629   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3630 #endif /* RE_ENABLE_I18N */
3631
3632 #ifdef RE_ENABLE_I18N
3633   if (BE (sbcset == NULL || mbcset == NULL, 0))
3634 #else /* not RE_ENABLE_I18N */
3635   if (BE (sbcset == NULL, 0))
3636 #endif /* not RE_ENABLE_I18N */
3637     {
3638       *err = REG_ESPACE;
3639       return NULL;
3640     }
3641
3642   if (non_match)
3643     {
3644 #ifdef RE_ENABLE_I18N
3645       mbcset->non_match = 1;
3646 #endif /* not RE_ENABLE_I18N */
3647     }
3648
3649   /* We don't care the syntax in this case.  */
3650   ret = build_charclass (trans, sbcset,
3651 #ifdef RE_ENABLE_I18N
3652                          mbcset, &alloc,
3653 #endif /* RE_ENABLE_I18N */
3654                          class_name, 0);
3655
3656   if (BE (ret != REG_NOERROR, 0))
3657     {
3658       re_free (sbcset);
3659 #ifdef RE_ENABLE_I18N
3660       free_charset (mbcset);
3661 #endif /* RE_ENABLE_I18N */
3662       *err = ret;
3663       return NULL;
3664     }
3665   /* \w match '_' also.  */
3666   for (; *extra; extra++)
3667     bitset_set (sbcset, *extra);
3668
3669   /* If it is non-matching list.  */
3670   if (non_match)
3671     bitset_not (sbcset);
3672
3673 #ifdef RE_ENABLE_I18N
3674   /* Ensure only single byte characters are set.  */
3675   if (dfa->mb_cur_max > 1)
3676     bitset_mask (sbcset, dfa->sb_char);
3677 #endif
3678
3679   /* Build a tree for simple bracket.  */
3680   br_token.type = SIMPLE_BRACKET;
3681   br_token.opr.sbcset = sbcset;
3682   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3683   if (BE (tree == NULL, 0))
3684     goto build_word_op_espace;
3685
3686 #ifdef RE_ENABLE_I18N
3687   if (dfa->mb_cur_max > 1)
3688     {
3689       bin_tree_t *mbc_tree;
3690       /* Build a tree for complex bracket.  */
3691       br_token.type = COMPLEX_BRACKET;
3692       br_token.opr.mbcset = mbcset;
3693       dfa->has_mb_node = 1;
3694       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3695       if (BE (mbc_tree == NULL, 0))
3696         goto build_word_op_espace;
3697       /* Then join them by ALT node.  */
3698       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3699       if (BE (mbc_tree != NULL, 1))
3700         return tree;
3701     }
3702   else
3703     {
3704       free_charset (mbcset);
3705       return tree;
3706     }
3707 #else /* not RE_ENABLE_I18N */
3708   return tree;
3709 #endif /* not RE_ENABLE_I18N */
3710
3711  build_word_op_espace:
3712   re_free (sbcset);
3713 #ifdef RE_ENABLE_I18N
3714   free_charset (mbcset);
3715 #endif /* RE_ENABLE_I18N */
3716   *err = REG_ESPACE;
3717   return NULL;
3718 }
3719
3720 /* This is intended for the expressions like "a{1,3}".
3721    Fetch a number from `input', and return the number.
3722    Return -1, if the number field is empty like "{,1}".
3723    Return -2, If an error is occured.  */
3724
3725 static int
3726 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3727 {
3728   int num = -1;
3729   unsigned char c;
3730   while (1)
3731     {
3732       fetch_token (token, input, syntax);
3733       c = token->opr.c;
3734       if (BE (token->type == END_OF_RE, 0))
3735         return -2;
3736       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3737         break;
3738       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3739              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3740       num = (num > RE_DUP_MAX) ? -2 : num;
3741     }
3742   return num;
3743 }
3744 \f
3745 #ifdef RE_ENABLE_I18N
3746 static void
3747 free_charset (re_charset_t *cset)
3748 {
3749   re_free (cset->mbchars);
3750 # ifdef _LIBC
3751   re_free (cset->coll_syms);
3752   re_free (cset->equiv_classes);
3753   re_free (cset->range_starts);
3754   re_free (cset->range_ends);
3755 # endif
3756   re_free (cset->char_classes);
3757   re_free (cset);
3758 }
3759 #endif /* RE_ENABLE_I18N */
3760 \f
3761 /* Functions for binary tree operation.  */
3762
3763 /* Create a tree node.  */
3764
3765 static bin_tree_t *
3766 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3767              re_token_type_t type)
3768 {
3769   re_token_t t;
3770   t.type = type;
3771   return create_token_tree (dfa, left, right, &t);
3772 }
3773
3774 static bin_tree_t *
3775 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3776                    const re_token_t *token)
3777 {
3778   bin_tree_t *tree;
3779   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3780     {
3781       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3782
3783       if (storage == NULL)
3784         return NULL;
3785       storage->next = dfa->str_tree_storage;
3786       dfa->str_tree_storage = storage;
3787       dfa->str_tree_storage_idx = 0;
3788     }
3789   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3790
3791   tree->parent = NULL;
3792   tree->left = left;
3793   tree->right = right;
3794   tree->token = *token;
3795   tree->token.duplicated = 0;
3796   tree->token.opt_subexp = 0;
3797   tree->first = NULL;
3798   tree->next = NULL;
3799   tree->node_idx = -1;
3800
3801   if (left != NULL)
3802     left->parent = tree;
3803   if (right != NULL)
3804     right->parent = tree;
3805   return tree;
3806 }
3807
3808 /* Mark the tree SRC as an optional subexpression.
3809    To be called from preorder or postorder.  */
3810
3811 static reg_errcode_t
3812 mark_opt_subexp (void *extra, bin_tree_t *node)
3813 {
3814   int idx = (int) (long) extra;
3815   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3816     node->token.opt_subexp = 1;
3817
3818   return REG_NOERROR;
3819 }
3820
3821 /* Free the allocated memory inside NODE. */
3822
3823 static void
3824 free_token (re_token_t *node)
3825 {
3826 #ifdef RE_ENABLE_I18N
3827   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3828     free_charset (node->opr.mbcset);
3829   else
3830 #endif /* RE_ENABLE_I18N */
3831     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3832       re_free (node->opr.sbcset);
3833 }
3834
3835 /* Worker function for tree walking.  Free the allocated memory inside NODE
3836    and its children. */
3837
3838 static reg_errcode_t
3839 free_tree (void *extra, bin_tree_t *node)
3840 {
3841   free_token (&node->token);
3842   return REG_NOERROR;
3843 }
3844
3845
3846 /* Duplicate the node SRC, and return new node.  This is a preorder
3847    visit similar to the one implemented by the generic visitor, but
3848    we need more infrastructure to maintain two parallel trees --- so,
3849    it's easier to duplicate.  */
3850
3851 static bin_tree_t *
3852 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3853 {
3854   const bin_tree_t *node;
3855   bin_tree_t *dup_root;
3856   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3857
3858   for (node = root; ; )
3859     {
3860       /* Create a new tree and link it back to the current parent.  */
3861       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3862       if (*p_new == NULL)
3863         return NULL;
3864       (*p_new)->parent = dup_node;
3865       (*p_new)->token.duplicated = 1;
3866       dup_node = *p_new;
3867
3868       /* Go to the left node, or up and to the right.  */
3869       if (node->left)
3870         {
3871           node = node->left;
3872           p_new = &dup_node->left;
3873         }
3874       else
3875         {
3876           const bin_tree_t *prev = NULL;
3877           while (node->right == prev || node->right == NULL)
3878             {
3879               prev = node;
3880               node = node->parent;
3881               dup_node = dup_node->parent;
3882               if (!node)
3883                 return dup_root;
3884             }
3885           node = node->right;
3886           p_new = &dup_node->right;
3887         }
3888     }
3889 }