compat/regex/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19    02110-1301 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           size_t length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27 #ifdef RE_ENABLE_I18N
  28 static void free_charset (re_charset_t *cset);
  29 #endif /* RE_ENABLE_I18N */
  30 static void free_workarea_compile (regex_t *preg);
  31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32 #ifdef RE_ENABLE_I18N
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 #endif
  35 static reg_errcode_t analyze (regex_t *preg);
  36 static reg_errcode_t preorder (bin_tree_t *root,
  37                                reg_errcode_t (fn (void *, bin_tree_t *)),
  38                                void *extra);
  39 static reg_errcode_t postorder (bin_tree_t *root,
  40                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                 void *extra);
  42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  45                                  bin_tree_t *node);
  46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  51                                    unsigned int constraint);
  52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  54                                          int node, int root);
  55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  56 static int fetch_number (re_string_t *input, re_token_t *token,
  57                          reg_syntax_t syntax);
  58 static int peek_token (re_token_t *token, re_string_t *input,
  59                         reg_syntax_t syntax) internal_function;
  60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                           reg_syntax_t syntax, reg_errcode_t *err);
  62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                   re_token_t *token, reg_syntax_t syntax,
  64                                   int nest, reg_errcode_t *err);
  65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                  re_token_t *token, reg_syntax_t syntax,
  67                                  int nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                      re_token_t *token, reg_syntax_t syntax,
  70                                      int nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   int nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                  re_dfa_t *dfa, re_token_t *token,
  76                                  reg_syntax_t syntax, reg_errcode_t *err);
  77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                       re_token_t *token, reg_syntax_t syntax,
  79                                       reg_errcode_t *err);
  80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                             re_string_t *regexp,
  82                                             re_token_t *token, int token_len,
  83                                             re_dfa_t *dfa,
  84                                             reg_syntax_t syntax,
  85                                             int accept_hyphen);
  86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                           re_string_t *regexp,
  88                                           re_token_t *token);
  89 #ifdef RE_ENABLE_I18N
  90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  91                                         re_charset_t *mbcset,
  92                                         int *equiv_class_alloc,
  93                                         const unsigned char *name);
  94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  95                                       bitset_t sbcset,
  96                                       re_charset_t *mbcset,
  97                                       int *char_class_alloc,
  98                                       const char *class_name,
  99                                       reg_syntax_t syntax);
 100 #else  /* not RE_ENABLE_I18N */
 101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 102                                         const unsigned char *name);
 103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104                                       bitset_t sbcset,
 105                                       const char *class_name,
 106                                       reg_syntax_t syntax);
 107 #endif /* not RE_ENABLE_I18N */
 108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 109                                        RE_TRANSLATE_TYPE trans,
 110                                        const char *class_name,
 111                                        const char *extra,
 112                                        int non_match, reg_errcode_t *err);
 113 static bin_tree_t *create_tree (re_dfa_t *dfa,
 114                                 bin_tree_t *left, bin_tree_t *right,
 115                                 re_token_type_t type);
 116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 117                                       bin_tree_t *left, bin_tree_t *right,
 118                                       const re_token_t *token);
 119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 120 static void free_token (re_token_t *node);
 121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 123 \f
 124 /* This table gives an error message for each of the error codes listed
 125    in regex.h.  Obviously the order here has to be same as there.
 126    POSIX doesn't require that we do anything for REG_NOERROR,
 127    but why not be nice?  */
 128
 129 const char __re_error_msgid[] attribute_hidden =
 130   {
 131 #define REG_NOERROR_IDX 0
 132     gettext_noop ("Success")    /* REG_NOERROR */
 133     "\0"
 134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 135     gettext_noop ("No match")   /* REG_NOMATCH */
 136     "\0"
 137 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 139     "\0"
 140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 142     "\0"
 143 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 145     "\0"
 146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 148     "\0"
 149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 151     "\0"
 152 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 153     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 154     "\0"
 155 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 157     "\0"
 158 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 160     "\0"
 161 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 163     "\0"
 164 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 165     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 166     "\0"
 167 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 169     "\0"
 170 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 172     "\0"
 173 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 175     "\0"
 176 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 178     "\0"
 179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 181   };
 182
 183 const size_t __re_error_msgid_idx[] attribute_hidden =
 184   {
 185     REG_NOERROR_IDX,
 186     REG_NOMATCH_IDX,
 187     REG_BADPAT_IDX,
 188     REG_ECOLLATE_IDX,
 189     REG_ECTYPE_IDX,
 190     REG_EESCAPE_IDX,
 191     REG_ESUBREG_IDX,
 192     REG_EBRACK_IDX,
 193     REG_EPAREN_IDX,
 194     REG_EBRACE_IDX,
 195     REG_BADBR_IDX,
 196     REG_ERANGE_IDX,
 197     REG_ESPACE_IDX,
 198     REG_BADRPT_IDX,
 199     REG_EEND_IDX,
 200     REG_ESIZE_IDX,
 201     REG_ERPAREN_IDX
 202   };
 203 \f
 204 /* Entry points for GNU code.  */
 205
 206
 207 #ifdef ZOS_USS
 208
 209 /* For ZOS USS we must define btowc */
 210
 211 wchar_t
 212 btowc (int c)
 213 {
 214    wchar_t wtmp[2];
 215    char tmp[2];
 216
 217    tmp[0] = c;
 218    tmp[1] = 0;
 219
 220    mbtowc (wtmp, tmp, 1);
 221    return wtmp[0];
 222 }
 223 #endif
 224
 225 /* re_compile_pattern is the GNU regular expression compiler: it
 226    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 227    Returns 0 if the pattern was valid, otherwise an error string.
 228
 229    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 230    are set in BUFP on entry.  */
 231
 232 const char *
 233 re_compile_pattern (pattern, length, bufp)
 234     const char *pattern;
 235     size_t length;
 236     struct re_pattern_buffer *bufp;
 237 {
 238   reg_errcode_t ret;
 239
 240   /* And GNU code determines whether or not to get register information
 241      by passing null for the REGS argument to re_match, etc., not by
 242      setting no_sub, unless RE_NO_SUB is set.  */
 243   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 244
 245   /* Match anchors at newline.  */
 246   bufp->newline_anchor = 1;
 247
 248   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 249
 250   if (!ret)
 251     return NULL;
 252   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 253 }
 254 #ifdef _LIBC
 255 weak_alias (__re_compile_pattern, re_compile_pattern)
 256 #endif
 257
 258 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 259    also be assigned to arbitrarily: each pattern buffer stores its own
 260    syntax, so it can be changed between regex compilations.  */
 261 /* This has no initializer because initialized variables in Emacs
 262    become read-only after dumping.  */
 263 reg_syntax_t re_syntax_options;
 264
 265
 266 /* Specify the precise syntax of regexps for compilation.  This provides
 267    for compatibility for various utilities which historically have
 268    different, incompatible syntaxes.
 269
 270    The argument SYNTAX is a bit mask comprised of the various bits
 271    defined in regex.h.  We return the old syntax.  */
 272
 273 reg_syntax_t
 274 re_set_syntax (syntax)
 275     reg_syntax_t syntax;
 276 {
 277   reg_syntax_t ret = re_syntax_options;
 278
 279   re_syntax_options = syntax;
 280   return ret;
 281 }
 282 #ifdef _LIBC
 283 weak_alias (__re_set_syntax, re_set_syntax)
 284 #endif
 285
 286 int
 287 re_compile_fastmap (bufp)
 288     struct re_pattern_buffer *bufp;
 289 {
 290   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 291   char *fastmap = bufp->fastmap;
 292
 293   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 294   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 295   if (dfa->init_state != dfa->init_state_word)
 296     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 297   if (dfa->init_state != dfa->init_state_nl)
 298     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 299   if (dfa->init_state != dfa->init_state_begbuf)
 300     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 301   bufp->fastmap_accurate = 1;
 302   return 0;
 303 }
 304 #ifdef _LIBC
 305 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 306 #endif
 307
 308 static inline void
 309 __attribute ((always_inline))
 310 re_set_fastmap (char *fastmap, int icase, int ch)
 311 {
 312   fastmap[ch] = 1;
 313   if (icase)
 314     fastmap[tolower (ch)] = 1;
 315 }
 316
 317 /* Helper function for re_compile_fastmap.
 318    Compile fastmap for the initial_state INIT_STATE.  */
 319
 320 static void
 321 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 322                          char *fastmap)
 323 {
 324   volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 325   int node_cnt;
 326   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 327   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 328     {
 329       int node = init_state->nodes.elems[node_cnt];
 330       re_token_type_t type = dfa->nodes[node].type;
 331
 332       if (type == CHARACTER)
 333         {
 334           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 335 #ifdef RE_ENABLE_I18N
 336           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 337             {
 338               unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
 339               wchar_t wc;
 340               mbstate_t state;
 341
 342               p = buf;
 343               *p++ = dfa->nodes[node].opr.c;
 344               while (++node < dfa->nodes_len
 345                      && dfa->nodes[node].type == CHARACTER
 346                      && dfa->nodes[node].mb_partial)
 347                 *p++ = dfa->nodes[node].opr.c;
 348               memset (&state, '\0', sizeof (state));
 349               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 350                              &state) == p - buf
 351                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 352                       != (size_t) -1))
 353                 re_set_fastmap (fastmap, 0, buf[0]);
 354               re_free (buf);
 355             }
 356 #endif
 357         }
 358       else if (type == SIMPLE_BRACKET)
 359         {
 360           int i, ch;
 361           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 362             {
 363               int j;
 364               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 365               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 366                 if (w & ((bitset_word_t) 1 << j))
 367                   re_set_fastmap (fastmap, icase, ch);
 368             }
 369         }
 370 #ifdef RE_ENABLE_I18N
 371       else if (type == COMPLEX_BRACKET)
 372         {
 373           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 374           int i;
 375
 376 # ifdef _LIBC
 377           /* See if we have to try all bytes which start multiple collation
 378              elements.
 379              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 380                   collation element, and don't catch 'b' since 'b' is
 381                   the only collation element which starts from 'b' (and
 382                   it is caught by SIMPLE_BRACKET).  */
 383               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 384                   && (cset->ncoll_syms || cset->nranges))
 385                 {
 386                   const int32_t *table = (const int32_t *)
 387                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 388                   for (i = 0; i < SBC_MAX; ++i)
 389                     if (table[i] < 0)
 390                       re_set_fastmap (fastmap, icase, i);
 391                 }
 392 # endif /* _LIBC */
 393
 394           /* See if we have to start the match at all multibyte characters,
 395              i.e. where we would not find an invalid sequence.  This only
 396              applies to multibyte character sets; for single byte character
 397              sets, the SIMPLE_BRACKET again suffices.  */
 398           if (dfa->mb_cur_max > 1
 399               && (cset->nchar_classes || cset->non_match || cset->nranges
 400 # ifdef _LIBC
 401                   || cset->nequiv_classes
 402 # endif /* _LIBC */
 403                  ))
 404             {
 405               unsigned char c = 0;
 406               do
 407                 {
 408                   mbstate_t mbs;
 409                   memset (&mbs, 0, sizeof (mbs));
 410                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 411                     re_set_fastmap (fastmap, false, (int) c);
 412                 }
 413               while (++c != 0);
 414             }
 415
 416           else
 417             {
 418               /* ... Else catch all bytes which can start the mbchars.  */
 419               for (i = 0; i < cset->nmbchars; ++i)
 420                 {
 421                   char buf[256];
 422                   mbstate_t state;
 423                   memset (&state, '\0', sizeof (state));
 424                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 425                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 426                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 427                     {
 428                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 429                           != (size_t) -1)
 430                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 431                     }
 432                 }
 433             }
 434         }
 435 #endif /* RE_ENABLE_I18N */
 436       else if (type == OP_PERIOD
 437 #ifdef RE_ENABLE_I18N
 438                || type == OP_UTF8_PERIOD
 439 #endif /* RE_ENABLE_I18N */
 440                || type == END_OF_RE)
 441         {
 442           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 443           if (type == END_OF_RE)
 444             bufp->can_be_null = 1;
 445           return;
 446         }
 447     }
 448 }
 449 \f
 450 /* Entry point for POSIX code.  */
 451 /* regcomp takes a regular expression as a string and compiles it.
 452
 453    PREG is a regex_t *.  We do not expect any fields to be initialized,
 454    since POSIX says we shouldn't.  Thus, we set
 455
 456      `buffer' to the compiled pattern;
 457      `used' to the length of the compiled pattern;
 458      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 459        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 460        RE_SYNTAX_POSIX_BASIC;
 461      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 462      `fastmap' to an allocated space for the fastmap;
 463      `fastmap_accurate' to zero;
 464      `re_nsub' to the number of subexpressions in PATTERN.
 465
 466    PATTERN is the address of the pattern string.
 467
 468    CFLAGS is a series of bits which affect compilation.
 469
 470      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 471      use POSIX basic syntax.
 472
 473      If REG_NEWLINE is set, then . and [^...] don't match newline.
 474      Also, regexec will try a match beginning after every newline.
 475
 476      If REG_ICASE is set, then we considers upper- and lowercase
 477      versions of letters to be equivalent when matching.
 478
 479      If REG_NOSUB is set, then when PREG is passed to regexec, that
 480      routine will report only success or failure, and nothing about the
 481      registers.
 482
 483    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 484    the return codes and their meanings.)  */
 485
 486 int
 487 regcomp (preg, pattern, cflags)
 488     regex_t *__restrict preg;
 489     const char *__restrict pattern;
 490     int cflags;
 491 {
 492   reg_errcode_t ret;
 493   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 494                          : RE_SYNTAX_POSIX_BASIC);
 495
 496   preg->buffer = NULL;
 497   preg->allocated = 0;
 498   preg->used = 0;
 499
 500   /* Try to allocate space for the fastmap.  */
 501   preg->fastmap = re_malloc (char, SBC_MAX);
 502   if (BE (preg->fastmap == NULL, 0))
 503     return REG_ESPACE;
 504
 505   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 506
 507   /* If REG_NEWLINE is set, newlines are treated differently.  */
 508   if (cflags & REG_NEWLINE)
 509     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 510       syntax &= ~RE_DOT_NEWLINE;
 511       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 512       /* It also changes the matching behavior.  */
 513       preg->newline_anchor = 1;
 514     }
 515   else
 516     preg->newline_anchor = 0;
 517   preg->no_sub = !!(cflags & REG_NOSUB);
 518   preg->translate = NULL;
 519
 520   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 521
 522   /* POSIX doesn't distinguish between an unmatched open-group and an
 523      unmatched close-group: both are REG_EPAREN.  */
 524   if (ret == REG_ERPAREN)
 525     ret = REG_EPAREN;
 526
 527   /* We have already checked preg->fastmap != NULL.  */
 528   if (BE (ret == REG_NOERROR, 1))
 529     /* Compute the fastmap now, since regexec cannot modify the pattern
 530        buffer.  This function never fails in this implementation.  */
 531     (void) re_compile_fastmap (preg);
 532   else
 533     {
 534       /* Some error occurred while compiling the expression.  */
 535       re_free (preg->fastmap);
 536       preg->fastmap = NULL;
 537     }
 538
 539   return (int) ret;
 540 }
 541 #ifdef _LIBC
 542 weak_alias (__regcomp, regcomp)
 543 #endif
 544
 545 /* Returns a message corresponding to an error code, ERRCODE, returned
 546    from either regcomp or regexec.   We don't use PREG here.  */
 547
 548 size_t
 549 regerror (errcode, preg, errbuf, errbuf_size)
 550     int errcode;
 551     const regex_t *__restrict preg;
 552     char *__restrict errbuf;
 553     size_t errbuf_size;
 554 {
 555   const char *msg;
 556   size_t msg_size;
 557
 558   if (BE (errcode < 0
 559           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 560                                / sizeof (__re_error_msgid_idx[0])), 0))
 561     /* Only error codes returned by the rest of the code should be passed
 562        to this routine.  If we are given anything else, or if other regex
 563        code generates an invalid error code, then the program has a bug.
 564        Dump core so we can fix it.  */
 565     abort ();
 566
 567   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 568
 569   msg_size = strlen (msg) + 1; /* Includes the null.  */
 570
 571   if (BE (errbuf_size != 0, 1))
 572     {
 573       if (BE (msg_size > errbuf_size, 0))
 574         {
 575           memcpy (errbuf, msg, errbuf_size - 1);
 576           errbuf[errbuf_size - 1] = 0;
 577         }
 578       else
 579         memcpy (errbuf, msg, msg_size);
 580     }
 581
 582   return msg_size;
 583 }
 584 #ifdef _LIBC
 585 weak_alias (__regerror, regerror)
 586 #endif
 587
 588
 589 #ifdef RE_ENABLE_I18N
 590 /* This static array is used for the map to single-byte characters when
 591    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 592    it the same all the time.  UTF-8 is the preferred encoding so this is
 593    a worthwhile optimization.  */
 594 #if __GNUC__ >= 3
 595 static const bitset_t utf8_sb_map = {
 596   /* Set the first 128 bits.  */
 597   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 598 };
 599 #else /* ! (__GNUC__ >= 3) */
 600 static bitset_t utf8_sb_map;
 601 #endif /* __GNUC__ >= 3 */
 602 #endif /* RE_ENABLE_I18N */
 603
 604
 605 static void
 606 free_dfa_content (re_dfa_t *dfa)
 607 {
 608   int i, j;
 609
 610   if (dfa->nodes)
 611     for (i = 0; i < dfa->nodes_len; ++i)
 612       free_token (dfa->nodes + i);
 613   re_free (dfa->nexts);
 614   for (i = 0; i < dfa->nodes_len; ++i)
 615     {
 616       if (dfa->eclosures != NULL)
 617         re_node_set_free (dfa->eclosures + i);
 618       if (dfa->inveclosures != NULL)
 619         re_node_set_free (dfa->inveclosures + i);
 620       if (dfa->edests != NULL)
 621         re_node_set_free (dfa->edests + i);
 622     }
 623   re_free (dfa->edests);
 624   re_free (dfa->eclosures);
 625   re_free (dfa->inveclosures);
 626   re_free (dfa->nodes);
 627
 628   if (dfa->state_table)
 629     for (i = 0; i <= dfa->state_hash_mask; ++i)
 630       {
 631         struct re_state_table_entry *entry = dfa->state_table + i;
 632         for (j = 0; j < entry->num; ++j)
 633           {
 634             re_dfastate_t *state = entry->array[j];
 635             free_state (state);
 636           }
 637         re_free (entry->array);
 638       }
 639   re_free (dfa->state_table);
 640 #ifdef RE_ENABLE_I18N
 641   if (dfa->sb_char != utf8_sb_map)
 642     re_free (dfa->sb_char);
 643 #endif
 644   re_free (dfa->subexp_map);
 645 #ifdef DEBUG
 646   re_free (dfa->re_str);
 647 #endif
 648
 649   re_free (dfa);
 650 }
 651
 652
 653 /* Free dynamically allocated space used by PREG.  */
 654
 655 void
 656 regfree (preg)
 657     regex_t *preg;
 658 {
 659   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 660   if (BE (dfa != NULL, 1))
 661     free_dfa_content (dfa);
 662   preg->buffer = NULL;
 663   preg->allocated = 0;
 664
 665   re_free (preg->fastmap);
 666   preg->fastmap = NULL;
 667
 668   re_free (preg->translate);
 669   preg->translate = NULL;
 670 }
 671 #ifdef _LIBC
 672 weak_alias (__regfree, regfree)
 673 #endif
 674 \f
 675 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 676    them unless specifically requested.  */
 677
 678 #if defined _REGEX_RE_COMP || defined _LIBC
 679
 680 /* BSD has one and only one pattern buffer.  */
 681 static struct re_pattern_buffer re_comp_buf;
 682
 683 char *
 684 # ifdef _LIBC
 685 /* Make these definitions weak in libc, so POSIX programs can redefine
 686    these names if they don't use our functions, and still use
 687    regcomp/regexec above without link errors.  */
 688 weak_function
 689 # endif
 690 re_comp (s)
 691      const char *s;
 692 {
 693   reg_errcode_t ret;
 694   char *fastmap;
 695
 696   if (!s)
 697     {
 698       if (!re_comp_buf.buffer)
 699         return gettext ("No previous regular expression");
 700       return 0;
 701     }
 702
 703   if (re_comp_buf.buffer)
 704     {
 705       fastmap = re_comp_buf.fastmap;
 706       re_comp_buf.fastmap = NULL;
 707       __regfree (&re_comp_buf);
 708       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 709       re_comp_buf.fastmap = fastmap;
 710     }
 711
 712   if (re_comp_buf.fastmap == NULL)
 713     {
 714       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 715       if (re_comp_buf.fastmap == NULL)
 716         return (char *) gettext (__re_error_msgid
 717                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 718     }
 719
 720   /* Since `re_exec' always passes NULL for the `regs' argument, we
 721      don't need to initialize the pattern buffer fields which affect it.  */
 722
 723   /* Match anchors at newlines.  */
 724   re_comp_buf.newline_anchor = 1;
 725
 726   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 727
 728   if (!ret)
 729     return NULL;
 730
 731   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 732   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 733 }
 734
 735 #ifdef _LIBC
 736 libc_freeres_fn (free_mem)
 737 {
 738   __regfree (&re_comp_buf);
 739 }
 740 #endif
 741
 742 #endif /* _REGEX_RE_COMP */
 743 \f
 744 /* Internal entry point.
 745    Compile the regular expression PATTERN, whose length is LENGTH.
 746    SYNTAX indicate regular expression's syntax.  */
 747
 748 static reg_errcode_t
 749 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 750                      reg_syntax_t syntax)
 751 {
 752   reg_errcode_t err = REG_NOERROR;
 753   re_dfa_t *dfa;
 754   re_string_t regexp;
 755
 756   /* Initialize the pattern buffer.  */
 757   preg->fastmap_accurate = 0;
 758   preg->syntax = syntax;
 759   preg->not_bol = preg->not_eol = 0;
 760   preg->used = 0;
 761   preg->re_nsub = 0;
 762   preg->can_be_null = 0;
 763   preg->regs_allocated = REGS_UNALLOCATED;
 764
 765   /* Initialize the dfa.  */
 766   dfa = (re_dfa_t *) preg->buffer;
 767   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 768     {
 769       /* If zero allocated, but buffer is non-null, try to realloc
 770          enough space.  This loses if buffer's address is bogus, but
 771          that is the user's responsibility.  If ->buffer is NULL this
 772          is a simple allocation.  */
 773       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 774       if (dfa == NULL)
 775         return REG_ESPACE;
 776       preg->allocated = sizeof (re_dfa_t);
 777       preg->buffer = (unsigned char *) dfa;
 778     }
 779   preg->used = sizeof (re_dfa_t);
 780
 781   err = init_dfa (dfa, length);
 782   if (BE (err != REG_NOERROR, 0))
 783     {
 784       free_dfa_content (dfa);
 785       preg->buffer = NULL;
 786       preg->allocated = 0;
 787       return err;
 788     }
 789 #ifdef DEBUG
 790   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 791   dfa->re_str = re_malloc (char, length + 1);
 792   strncpy (dfa->re_str, pattern, length + 1);
 793 #endif
 794
 795   __libc_lock_init (dfa->lock);
 796
 797   err = re_string_construct (&regexp, pattern, length, preg->translate,
 798                              syntax & RE_ICASE, dfa);
 799   if (BE (err != REG_NOERROR, 0))
 800     {
 801     re_compile_internal_free_return:
 802       free_workarea_compile (preg);
 803       re_string_destruct (&regexp);
 804       free_dfa_content (dfa);
 805       preg->buffer = NULL;
 806       preg->allocated = 0;
 807       return err;
 808     }
 809
 810   /* Parse the regular expression, and build a structure tree.  */
 811   preg->re_nsub = 0;
 812   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 813   if (BE (dfa->str_tree == NULL, 0))
 814     goto re_compile_internal_free_return;
 815
 816   /* Analyze the tree and create the nfa.  */
 817   err = analyze (preg);
 818   if (BE (err != REG_NOERROR, 0))
 819     goto re_compile_internal_free_return;
 820
 821 #ifdef RE_ENABLE_I18N
 822   /* If possible, do searching in single byte encoding to speed things up.  */
 823   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 824     optimize_utf8 (dfa);
 825 #endif
 826
 827   /* Then create the initial state of the dfa.  */
 828   err = create_initial_state (dfa);
 829
 830   /* Release work areas.  */
 831   free_workarea_compile (preg);
 832   re_string_destruct (&regexp);
 833
 834   if (BE (err != REG_NOERROR, 0))
 835     {
 836       free_dfa_content (dfa);
 837       preg->buffer = NULL;
 838       preg->allocated = 0;
 839     }
 840
 841   return err;
 842 }
 843
 844 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 845    as the initial length of some arrays.  */
 846
 847 static reg_errcode_t
 848 init_dfa (re_dfa_t *dfa, size_t pat_len)
 849 {
 850   unsigned int table_size;
 851 #ifndef _LIBC
 852   char *codeset_name;
 853 #endif
 854
 855   memset (dfa, '\0', sizeof (re_dfa_t));
 856
 857   /* Force allocation of str_tree_storage the first time.  */
 858   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 859
 860   /* Avoid overflows.  */
 861   if (pat_len == SIZE_MAX)
 862     return REG_ESPACE;
 863
 864   dfa->nodes_alloc = pat_len + 1;
 865   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 866
 867   /*  table_size = 2 ^ ceil(log pat_len) */
 868   for (table_size = 1; ; table_size <<= 1)
 869     if (table_size > pat_len)
 870       break;
 871
 872   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 873   dfa->state_hash_mask = table_size - 1;
 874
 875   dfa->mb_cur_max = MB_CUR_MAX;
 876 #ifdef _LIBC
 877   if (dfa->mb_cur_max == 6
 878       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 879     dfa->is_utf8 = 1;
 880   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 881                        != 0);
 882 #else
 883 # ifdef HAVE_LANGINFO_CODESET
 884   codeset_name = nl_langinfo (CODESET);
 885 # else
 886   codeset_name = getenv ("LC_ALL");
 887   if (codeset_name == NULL || codeset_name[0] == '\0')
 888     codeset_name = getenv ("LC_CTYPE");
 889   if (codeset_name == NULL || codeset_name[0] == '\0')
 890     codeset_name = getenv ("LANG");
 891   if (codeset_name == NULL)
 892     codeset_name = "";
 893   else if (strchr (codeset_name, '.') !=  NULL)
 894     codeset_name = strchr (codeset_name, '.') + 1;
 895 # endif
 896
 897   /* strcasecmp isn't a standard interface. brute force check */
 898 #if 0
 899   if (strcasecmp (codeset_name, "UTF-8") == 0
 900       || strcasecmp (codeset_name, "UTF8") == 0)
 901     dfa->is_utf8 = 1;
 902 #else
 903   if (   (codeset_name[0] == 'U' || codeset_name[0] == 'u')
 904       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 905       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 906       && (codeset_name[3] == '-'
 907           ? codeset_name[4] == '8' && codeset_name[5] == '\0'
 908           : codeset_name[3] == '8' && codeset_name[4] == '\0'))
 909     dfa->is_utf8 = 1;
 910 #endif
 911
 912   /* We check exhaustively in the loop below if this charset is a
 913      superset of ASCII.  */
 914   dfa->map_notascii = 0;
 915 #endif
 916
 917 #ifdef RE_ENABLE_I18N
 918   if (dfa->mb_cur_max > 1)
 919     {
 920       if (dfa->is_utf8)
 921         {
 922 #if !defined(__GNUC__) || __GNUC__ < 3
 923           static short utf8_sb_map_inited = 0;
 924
 925           if (! utf8_sb_map_inited)
 926             {
 927                 int i;
 928
 929                 utf8_sb_map_inited = 0;
 930                 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
 931                   utf8_sb_map[i] = BITSET_WORD_MAX;
 932             }
 933 #endif
 934           dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 935         }
 936       else
 937         {
 938           int i, j, ch;
 939
 940           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 941           if (BE (dfa->sb_char == NULL, 0))
 942             return REG_ESPACE;
 943
 944           /* Set the bits corresponding to single byte chars.  */
 945           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 946             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 947               {
 948                 wint_t wch = __btowc (ch);
 949                 if (wch != WEOF)
 950                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 951 # ifndef _LIBC
 952                 if (isascii (ch) && wch != ch)
 953                   dfa->map_notascii = 1;
 954 # endif
 955               }
 956         }
 957     }
 958 #endif
 959
 960   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 961     return REG_ESPACE;
 962   return REG_NOERROR;
 963 }
 964
 965 /* Initialize WORD_CHAR table, which indicate which character is
 966    "word".  In this case "word" means that it is the word construction
 967    character used by some operators like "\<", "\>", etc.  */
 968
 969 static void
 970 internal_function
 971 init_word_char (re_dfa_t *dfa)
 972 {
 973   int i, j, ch;
 974   dfa->word_ops_used = 1;
 975   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 976     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 977       if (isalnum (ch) || ch == '_')
 978         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 979 }
 980
 981 /* Free the work area which are only used while compiling.  */
 982
 983 static void
 984 free_workarea_compile (regex_t *preg)
 985 {
 986   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 987   bin_tree_storage_t *storage, *next;
 988   for (storage = dfa->str_tree_storage; storage; storage = next)
 989     {
 990       next = storage->next;
 991       re_free (storage);
 992     }
 993   dfa->str_tree_storage = NULL;
 994   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 995   dfa->str_tree = NULL;
 996   re_free (dfa->org_indices);
 997   dfa->org_indices = NULL;
 998 }
 999
1000 /* Create initial states for all contexts.  */
1001
1002 static reg_errcode_t
1003 create_initial_state (re_dfa_t *dfa)
1004 {
1005   int first, i;
1006   reg_errcode_t err;
1007   re_node_set init_nodes;
1008
1009   /* Initial states have the epsilon closure of the node which is
1010      the first node of the regular expression.  */
1011   first = dfa->str_tree->first->node_idx;
1012   dfa->init_node = first;
1013   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1014   if (BE (err != REG_NOERROR, 0))
1015     return err;
1016
1017   /* The back-references which are in initial states can epsilon transit,
1018      since in this case all of the subexpressions can be null.
1019      Then we add epsilon closures of the nodes which are the next nodes of
1020      the back-references.  */
1021   if (dfa->nbackref > 0)
1022     for (i = 0; i < init_nodes.nelem; ++i)
1023       {
1024         int node_idx = init_nodes.elems[i];
1025         re_token_type_t type = dfa->nodes[node_idx].type;
1026
1027         int clexp_idx;
1028         if (type != OP_BACK_REF)
1029           continue;
1030         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1031           {
1032             re_token_t *clexp_node;
1033             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1034             if (clexp_node->type == OP_CLOSE_SUBEXP
1035                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1036               break;
1037           }
1038         if (clexp_idx == init_nodes.nelem)
1039           continue;
1040
1041         if (type == OP_BACK_REF)
1042           {
1043             int dest_idx = dfa->edests[node_idx].elems[0];
1044             if (!re_node_set_contains (&init_nodes, dest_idx))
1045               {
1046                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1047                                                        dfa->eclosures
1048                                                        + dest_idx);
1049                 if (err != REG_NOERROR)
1050                   return err;
1051                 i = 0;
1052               }
1053           }
1054       }
1055
1056   /* It must be the first time to invoke acquire_state.  */
1057   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1058   /* We don't check ERR here, since the initial state must not be NULL.  */
1059   if (BE (dfa->init_state == NULL, 0))
1060     return err;
1061   if (dfa->init_state->has_constraint)
1062     {
1063       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1064                                                        CONTEXT_WORD);
1065       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1066                                                      CONTEXT_NEWLINE);
1067       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1068                                                          &init_nodes,
1069                                                          CONTEXT_NEWLINE
1070                                                          | CONTEXT_BEGBUF);
1071       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1072               || dfa->init_state_begbuf == NULL, 0))
1073         return err;
1074     }
1075   else
1076     dfa->init_state_word = dfa->init_state_nl
1077       = dfa->init_state_begbuf = dfa->init_state;
1078
1079   re_node_set_free (&init_nodes);
1080   return REG_NOERROR;
1081 }
1082 \f
1083 #ifdef RE_ENABLE_I18N
1084 /* If it is possible to do searching in single byte encoding instead of UTF-8
1085    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1086    DFA nodes where needed.  */
1087
1088 static void
1089 optimize_utf8 (re_dfa_t *dfa)
1090 {
1091   int node, i, mb_chars = 0, has_period = 0;
1092
1093   for (node = 0; node < dfa->nodes_len; ++node)
1094     switch (dfa->nodes[node].type)
1095       {
1096       case CHARACTER:
1097         if (dfa->nodes[node].opr.c >= 0x80)
1098           mb_chars = 1;
1099         break;
1100       case ANCHOR:
1101         switch (dfa->nodes[node].opr.ctx_type)
1102           {
1103           case LINE_FIRST:
1104           case LINE_LAST:
1105           case BUF_FIRST:
1106           case BUF_LAST:
1107             break;
1108           default:
1109             /* Word anchors etc. cannot be handled.  It's okay to test
1110                opr.ctx_type since constraints (for all DFA nodes) are
1111                created by ORing one or more opr.ctx_type values.  */
1112             return;
1113           }
1114         break;
1115       case OP_PERIOD:
1116         has_period = 1;
1117         break;
1118       case OP_BACK_REF:
1119       case OP_ALT:
1120       case END_OF_RE:
1121       case OP_DUP_ASTERISK:
1122       case OP_OPEN_SUBEXP:
1123       case OP_CLOSE_SUBEXP:
1124         break;
1125       case COMPLEX_BRACKET:
1126         return;
1127       case SIMPLE_BRACKET:
1128         /* Just double check.  The non-ASCII range starts at 0x80.  */
1129         assert (0x80 % BITSET_WORD_BITS == 0);
1130         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1131           if (dfa->nodes[node].opr.sbcset[i])
1132             return;
1133         break;
1134       default:
1135         abort ();
1136       }
1137
1138   if (mb_chars || has_period)
1139     for (node = 0; node < dfa->nodes_len; ++node)
1140       {
1141         if (dfa->nodes[node].type == CHARACTER
1142             && dfa->nodes[node].opr.c >= 0x80)
1143           dfa->nodes[node].mb_partial = 0;
1144         else if (dfa->nodes[node].type == OP_PERIOD)
1145           dfa->nodes[node].type = OP_UTF8_PERIOD;
1146       }
1147
1148   /* The search can be in single byte locale.  */
1149   dfa->mb_cur_max = 1;
1150   dfa->is_utf8 = 0;
1151   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1152 }
1153 #endif
1154 \f
1155 /* Analyze the structure tree, and calculate "first", "next", "edest",
1156    "eclosure", and "inveclosure".  */
1157
1158 static reg_errcode_t
1159 analyze (regex_t *preg)
1160 {
1161   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1162   reg_errcode_t ret;
1163
1164   /* Allocate arrays.  */
1165   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1166   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1167   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1168   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1169   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1170           || dfa->eclosures == NULL, 0))
1171     return REG_ESPACE;
1172
1173   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1174   if (dfa->subexp_map != NULL)
1175     {
1176       int i;
1177       for (i = 0; i < preg->re_nsub; i++)
1178         dfa->subexp_map[i] = i;
1179       preorder (dfa->str_tree, optimize_subexps, dfa);
1180       for (i = 0; i < preg->re_nsub; i++)
1181         if (dfa->subexp_map[i] != i)
1182           break;
1183       if (i == preg->re_nsub)
1184         {
1185           free (dfa->subexp_map);
1186           dfa->subexp_map = NULL;
1187         }
1188     }
1189
1190   ret = postorder (dfa->str_tree, lower_subexps, preg);
1191   if (BE (ret != REG_NOERROR, 0))
1192     return ret;
1193   ret = postorder (dfa->str_tree, calc_first, dfa);
1194   if (BE (ret != REG_NOERROR, 0))
1195     return ret;
1196   preorder (dfa->str_tree, calc_next, dfa);
1197   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1198   if (BE (ret != REG_NOERROR, 0))
1199     return ret;
1200   ret = calc_eclosure (dfa);
1201   if (BE (ret != REG_NOERROR, 0))
1202     return ret;
1203
1204   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1205      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1206   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1207       || dfa->nbackref)
1208     {
1209       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1210       if (BE (dfa->inveclosures == NULL, 0))
1211         return REG_ESPACE;
1212       ret = calc_inveclosure (dfa);
1213     }
1214
1215   return ret;
1216 }
1217
1218 /* Our parse trees are very unbalanced, so we cannot use a stack to
1219    implement parse tree visits.  Instead, we use parent pointers and
1220    some hairy code in these two functions.  */
1221 static reg_errcode_t
1222 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1223            void *extra)
1224 {
1225   bin_tree_t *node, *prev;
1226
1227   for (node = root; ; )
1228     {
1229       /* Descend down the tree, preferably to the left (or to the right
1230          if that's the only child).  */
1231       while (node->left || node->right)
1232         if (node->left)
1233           node = node->left;
1234         else
1235           node = node->right;
1236
1237       do
1238         {
1239           reg_errcode_t err = fn (extra, node);
1240           if (BE (err != REG_NOERROR, 0))
1241             return err;
1242           if (node->parent == NULL)
1243             return REG_NOERROR;
1244           prev = node;
1245           node = node->parent;
1246         }
1247       /* Go up while we have a node that is reached from the right.  */
1248       while (node->right == prev || node->right == NULL);
1249       node = node->right;
1250     }
1251 }
1252
1253 static reg_errcode_t
1254 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1255           void *extra)
1256 {
1257   bin_tree_t *node;
1258
1259   for (node = root; ; )
1260     {
1261       reg_errcode_t err = fn (extra, node);
1262       if (BE (err != REG_NOERROR, 0))
1263         return err;
1264
1265       /* Go to the left node, or up and to the right.  */
1266       if (node->left)
1267         node = node->left;
1268       else
1269         {
1270           bin_tree_t *prev = NULL;
1271           while (node->right == prev || node->right == NULL)
1272             {
1273               prev = node;
1274               node = node->parent;
1275               if (!node)
1276                 return REG_NOERROR;
1277             }
1278           node = node->right;
1279         }
1280     }
1281 }
1282
1283 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1284    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1285    backreferences as well.  Requires a preorder visit.  */
1286 static reg_errcode_t
1287 optimize_subexps (void *extra, bin_tree_t *node)
1288 {
1289   re_dfa_t *dfa = (re_dfa_t *) extra;
1290
1291   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1292     {
1293       int idx = node->token.opr.idx;
1294       node->token.opr.idx = dfa->subexp_map[idx];
1295       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1296     }
1297
1298   else if (node->token.type == SUBEXP
1299            && node->left && node->left->token.type == SUBEXP)
1300     {
1301       int other_idx = node->left->token.opr.idx;
1302
1303       node->left = node->left->left;
1304       if (node->left)
1305         node->left->parent = node;
1306
1307       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1308       if (other_idx < BITSET_WORD_BITS)
1309           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1310     }
1311
1312   return REG_NOERROR;
1313 }
1314
1315 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1316    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1317 static reg_errcode_t
1318 lower_subexps (void *extra, bin_tree_t *node)
1319 {
1320   regex_t *preg = (regex_t *) extra;
1321   reg_errcode_t err = REG_NOERROR;
1322
1323   if (node->left && node->left->token.type == SUBEXP)
1324     {
1325       node->left = lower_subexp (&err, preg, node->left);
1326       if (node->left)
1327         node->left->parent = node;
1328     }
1329   if (node->right && node->right->token.type == SUBEXP)
1330     {
1331       node->right = lower_subexp (&err, preg, node->right);
1332       if (node->right)
1333         node->right->parent = node;
1334     }
1335
1336   return err;
1337 }
1338
1339 static bin_tree_t *
1340 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1341 {
1342   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1343   bin_tree_t *body = node->left;
1344   bin_tree_t *op, *cls, *tree1, *tree;
1345
1346   if (preg->no_sub
1347       /* We do not optimize empty subexpressions, because otherwise we may
1348          have bad CONCAT nodes with NULL children.  This is obviously not
1349          very common, so we do not lose much.  An example that triggers
1350          this case is the sed "script" /\(\)/x.  */
1351       && node->left != NULL
1352       && (node->token.opr.idx >= BITSET_WORD_BITS
1353           || !(dfa->used_bkref_map
1354                & ((bitset_word_t) 1 << node->token.opr.idx))))
1355     return node->left;
1356
1357   /* Convert the SUBEXP node to the concatenation of an
1358      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1359   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1360   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1361   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1362   tree = create_tree (dfa, op, tree1, CONCAT);
1363   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1364     {
1365       *err = REG_ESPACE;
1366       return NULL;
1367     }
1368
1369   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1370   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1371   return tree;
1372 }
1373
1374 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1375    nodes.  Requires a postorder visit.  */
1376 static reg_errcode_t
1377 calc_first (void *extra, bin_tree_t *node)
1378 {
1379   re_dfa_t *dfa = (re_dfa_t *) extra;
1380   if (node->token.type == CONCAT)
1381     {
1382       node->first = node->left->first;
1383       node->node_idx = node->left->node_idx;
1384     }
1385   else
1386     {
1387       node->first = node;
1388       node->node_idx = re_dfa_add_node (dfa, node->token);
1389       if (BE (node->node_idx == -1, 0))
1390         return REG_ESPACE;
1391       if (node->token.type == ANCHOR)
1392         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1393     }
1394   return REG_NOERROR;
1395 }
1396
1397 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1398 static reg_errcode_t
1399 calc_next (void *extra, bin_tree_t *node)
1400 {
1401   switch (node->token.type)
1402     {
1403     case OP_DUP_ASTERISK:
1404       node->left->next = node;
1405       break;
1406     case CONCAT:
1407       node->left->next = node->right->first;
1408       node->right->next = node->next;
1409       break;
1410     default:
1411       if (node->left)
1412         node->left->next = node->next;
1413       if (node->right)
1414         node->right->next = node->next;
1415       break;
1416     }
1417   return REG_NOERROR;
1418 }
1419
1420 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1421 static reg_errcode_t
1422 link_nfa_nodes (void *extra, bin_tree_t *node)
1423 {
1424   re_dfa_t *dfa = (re_dfa_t *) extra;
1425   int idx = node->node_idx;
1426   reg_errcode_t err = REG_NOERROR;
1427
1428   switch (node->token.type)
1429     {
1430     case CONCAT:
1431       break;
1432
1433     case END_OF_RE:
1434       assert (node->next == NULL);
1435       break;
1436
1437     case OP_DUP_ASTERISK:
1438     case OP_ALT:
1439       {
1440         int left, right;
1441         dfa->has_plural_match = 1;
1442         if (node->left != NULL)
1443           left = node->left->first->node_idx;
1444         else
1445           left = node->next->node_idx;
1446         if (node->right != NULL)
1447           right = node->right->first->node_idx;
1448         else
1449           right = node->next->node_idx;
1450         assert (left > -1);
1451         assert (right > -1);
1452         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1453       }
1454       break;
1455
1456     case ANCHOR:
1457     case OP_OPEN_SUBEXP:
1458     case OP_CLOSE_SUBEXP:
1459       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1460       break;
1461
1462     case OP_BACK_REF:
1463       dfa->nexts[idx] = node->next->node_idx;
1464       if (node->token.type == OP_BACK_REF)
1465         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1466       break;
1467
1468     default:
1469       assert (!IS_EPSILON_NODE (node->token.type));
1470       dfa->nexts[idx] = node->next->node_idx;
1471       break;
1472     }
1473
1474   return err;
1475 }
1476
1477 /* Duplicate the epsilon closure of the node ROOT_NODE.
1478    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1479    to their own constraint.  */
1480
1481 static reg_errcode_t
1482 internal_function
1483 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1484                         int root_node, unsigned int init_constraint)
1485 {
1486   int org_node, clone_node, ret;
1487   unsigned int constraint = init_constraint;
1488   for (org_node = top_org_node, clone_node = top_clone_node;;)
1489     {
1490       int org_dest, clone_dest;
1491       if (dfa->nodes[org_node].type == OP_BACK_REF)
1492         {
1493           /* If the back reference epsilon-transit, its destination must
1494              also have the constraint.  Then duplicate the epsilon closure
1495              of the destination of the back reference, and store it in
1496              edests of the back reference.  */
1497           org_dest = dfa->nexts[org_node];
1498           re_node_set_empty (dfa->edests + clone_node);
1499           clone_dest = duplicate_node (dfa, org_dest, constraint);
1500           if (BE (clone_dest == -1, 0))
1501             return REG_ESPACE;
1502           dfa->nexts[clone_node] = dfa->nexts[org_node];
1503           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1504           if (BE (ret < 0, 0))
1505             return REG_ESPACE;
1506         }
1507       else if (dfa->edests[org_node].nelem == 0)
1508         {
1509           /* In case of the node can't epsilon-transit, don't duplicate the
1510              destination and store the original destination as the
1511              destination of the node.  */
1512           dfa->nexts[clone_node] = dfa->nexts[org_node];
1513           break;
1514         }
1515       else if (dfa->edests[org_node].nelem == 1)
1516         {
1517           /* In case of the node can epsilon-transit, and it has only one
1518              destination.  */
1519           org_dest = dfa->edests[org_node].elems[0];
1520           re_node_set_empty (dfa->edests + clone_node);
1521           /* If the node is root_node itself, it means the epsilon clsoure
1522              has a loop.   Then tie it to the destination of the root_node.  */
1523           if (org_node == root_node && clone_node != org_node)
1524             {
1525               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1526               if (BE (ret < 0, 0))
1527                 return REG_ESPACE;
1528               break;
1529             }
1530           /* In case of the node has another constraint, add it.  */
1531           constraint |= dfa->nodes[org_node].constraint;
1532           clone_dest = duplicate_node (dfa, org_dest, constraint);
1533           if (BE (clone_dest == -1, 0))
1534             return REG_ESPACE;
1535           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536           if (BE (ret < 0, 0))
1537             return REG_ESPACE;
1538         }
1539       else /* dfa->edests[org_node].nelem == 2 */
1540         {
1541           /* In case of the node can epsilon-transit, and it has two
1542              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1543           org_dest = dfa->edests[org_node].elems[0];
1544           re_node_set_empty (dfa->edests + clone_node);
1545           /* Search for a duplicated node which satisfies the constraint.  */
1546           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1547           if (clone_dest == -1)
1548             {
1549               /* There is no such duplicated node, create a new one.  */
1550               reg_errcode_t err;
1551               clone_dest = duplicate_node (dfa, org_dest, constraint);
1552               if (BE (clone_dest == -1, 0))
1553                 return REG_ESPACE;
1554               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1555               if (BE (ret < 0, 0))
1556                 return REG_ESPACE;
1557               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1558                                             root_node, constraint);
1559               if (BE (err != REG_NOERROR, 0))
1560                 return err;
1561             }
1562           else
1563             {
1564               /* There is a duplicated node which satisfies the constraint,
1565                  use it to avoid infinite loop.  */
1566               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1567               if (BE (ret < 0, 0))
1568                 return REG_ESPACE;
1569             }
1570
1571           org_dest = dfa->edests[org_node].elems[1];
1572           clone_dest = duplicate_node (dfa, org_dest, constraint);
1573           if (BE (clone_dest == -1, 0))
1574             return REG_ESPACE;
1575           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1576           if (BE (ret < 0, 0))
1577             return REG_ESPACE;
1578         }
1579       org_node = org_dest;
1580       clone_node = clone_dest;
1581     }
1582   return REG_NOERROR;
1583 }
1584
1585 /* Search for a node which is duplicated from the node ORG_NODE, and
1586    satisfies the constraint CONSTRAINT.  */
1587
1588 static int
1589 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1590                         unsigned int constraint)
1591 {
1592   int idx;
1593   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1594     {
1595       if (org_node == dfa->org_indices[idx]
1596           && constraint == dfa->nodes[idx].constraint)
1597         return idx; /* Found.  */
1598     }
1599   return -1; /* Not found.  */
1600 }
1601
1602 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1603    Return the index of the new node, or -1 if insufficient storage is
1604    available.  */
1605
1606 static int
1607 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1608 {
1609   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1610   if (BE (dup_idx != -1, 1))
1611     {
1612       dfa->nodes[dup_idx].constraint = constraint;
1613       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1614       dfa->nodes[dup_idx].duplicated = 1;
1615
1616       /* Store the index of the original node.  */
1617       dfa->org_indices[dup_idx] = org_idx;
1618     }
1619   return dup_idx;
1620 }
1621
1622 static reg_errcode_t
1623 calc_inveclosure (re_dfa_t *dfa)
1624 {
1625   int src, idx, ret;
1626   for (idx = 0; idx < dfa->nodes_len; ++idx)
1627     re_node_set_init_empty (dfa->inveclosures + idx);
1628
1629   for (src = 0; src < dfa->nodes_len; ++src)
1630     {
1631       int *elems = dfa->eclosures[src].elems;
1632       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1633         {
1634           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1635           if (BE (ret == -1, 0))
1636             return REG_ESPACE;
1637         }
1638     }
1639
1640   return REG_NOERROR;
1641 }
1642
1643 /* Calculate "eclosure" for all the node in DFA.  */
1644
1645 static reg_errcode_t
1646 calc_eclosure (re_dfa_t *dfa)
1647 {
1648   int node_idx, incomplete;
1649 #ifdef DEBUG
1650   assert (dfa->nodes_len > 0);
1651 #endif
1652   incomplete = 0;
1653   /* For each nodes, calculate epsilon closure.  */
1654   for (node_idx = 0; ; ++node_idx)
1655     {
1656       reg_errcode_t err;
1657       re_node_set eclosure_elem;
1658       if (node_idx == dfa->nodes_len)
1659         {
1660           if (!incomplete)
1661             break;
1662           incomplete = 0;
1663           node_idx = 0;
1664         }
1665
1666 #ifdef DEBUG
1667       assert (dfa->eclosures[node_idx].nelem != -1);
1668 #endif
1669
1670       /* If we have already calculated, skip it.  */
1671       if (dfa->eclosures[node_idx].nelem != 0)
1672         continue;
1673       /* Calculate epsilon closure of `node_idx'.  */
1674       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1675       if (BE (err != REG_NOERROR, 0))
1676         return err;
1677
1678       if (dfa->eclosures[node_idx].nelem == 0)
1679         {
1680           incomplete = 1;
1681           re_node_set_free (&eclosure_elem);
1682         }
1683     }
1684   return REG_NOERROR;
1685 }
1686
1687 /* Calculate epsilon closure of NODE.  */
1688
1689 static reg_errcode_t
1690 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1691 {
1692   reg_errcode_t err;
1693   int i;
1694   re_node_set eclosure;
1695   int ret;
1696   int incomplete = 0;
1697   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1698   if (BE (err != REG_NOERROR, 0))
1699     return err;
1700
1701   /* This indicates that we are calculating this node now.
1702      We reference this value to avoid infinite loop.  */
1703   dfa->eclosures[node].nelem = -1;
1704
1705   /* If the current node has constraints, duplicate all nodes
1706      since they must inherit the constraints.  */
1707   if (dfa->nodes[node].constraint
1708       && dfa->edests[node].nelem
1709       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1710     {
1711       err = duplicate_node_closure (dfa, node, node, node,
1712                                     dfa->nodes[node].constraint);
1713       if (BE (err != REG_NOERROR, 0))
1714         return err;
1715     }
1716
1717   /* Expand each epsilon destination nodes.  */
1718   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1719     for (i = 0; i < dfa->edests[node].nelem; ++i)
1720       {
1721         re_node_set eclosure_elem;
1722         int edest = dfa->edests[node].elems[i];
1723         /* If calculating the epsilon closure of `edest' is in progress,
1724            return intermediate result.  */
1725         if (dfa->eclosures[edest].nelem == -1)
1726           {
1727             incomplete = 1;
1728             continue;
1729           }
1730         /* If we haven't calculated the epsilon closure of `edest' yet,
1731            calculate now. Otherwise use calculated epsilon closure.  */
1732         if (dfa->eclosures[edest].nelem == 0)
1733           {
1734             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1735             if (BE (err != REG_NOERROR, 0))
1736               return err;
1737           }
1738         else
1739           eclosure_elem = dfa->eclosures[edest];
1740         /* Merge the epsilon closure of `edest'.  */
1741         err = re_node_set_merge (&eclosure, &eclosure_elem);
1742         if (BE (err != REG_NOERROR, 0))
1743           return err;
1744         /* If the epsilon closure of `edest' is incomplete,
1745            the epsilon closure of this node is also incomplete.  */
1746         if (dfa->eclosures[edest].nelem == 0)
1747           {
1748             incomplete = 1;
1749             re_node_set_free (&eclosure_elem);
1750           }
1751       }
1752
1753   /* An epsilon closure includes itself.  */
1754   ret = re_node_set_insert (&eclosure, node);
1755   if (BE (ret < 0, 0))
1756     return REG_ESPACE;
1757   if (incomplete && !root)
1758     dfa->eclosures[node].nelem = 0;
1759   else
1760     dfa->eclosures[node] = eclosure;
1761   *new_set = eclosure;
1762   return REG_NOERROR;
1763 }
1764 \f
1765 /* Functions for token which are used in the parser.  */
1766
1767 /* Fetch a token from INPUT.
1768    We must not use this function inside bracket expressions.  */
1769
1770 static void
1771 internal_function
1772 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1773 {
1774   re_string_skip_bytes (input, peek_token (result, input, syntax));
1775 }
1776
1777 /* Peek a token from INPUT, and return the length of the token.
1778    We must not use this function inside bracket expressions.  */
1779
1780 static int
1781 internal_function
1782 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1783 {
1784   unsigned char c;
1785
1786   if (re_string_eoi (input))
1787     {
1788       token->type = END_OF_RE;
1789       return 0;
1790     }
1791
1792   c = re_string_peek_byte (input, 0);
1793   token->opr.c = c;
1794
1795   token->word_char = 0;
1796 #ifdef RE_ENABLE_I18N
1797   token->mb_partial = 0;
1798   if (input->mb_cur_max > 1 &&
1799       !re_string_first_byte (input, re_string_cur_idx (input)))
1800     {
1801       token->type = CHARACTER;
1802       token->mb_partial = 1;
1803       return 1;
1804     }
1805 #endif
1806   if (c == '\\')
1807     {
1808       unsigned char c2;
1809       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1810         {
1811           token->type = BACK_SLASH;
1812           return 1;
1813         }
1814
1815       c2 = re_string_peek_byte_case (input, 1);
1816       token->opr.c = c2;
1817       token->type = CHARACTER;
1818 #ifdef RE_ENABLE_I18N
1819       if (input->mb_cur_max > 1)
1820         {
1821           wint_t wc = re_string_wchar_at (input,
1822                                           re_string_cur_idx (input) + 1);
1823           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1824         }
1825       else
1826 #endif
1827         token->word_char = IS_WORD_CHAR (c2) != 0;
1828
1829       switch (c2)
1830         {
1831         case '|':
1832           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1833             token->type = OP_ALT;
1834           break;
1835         case '1': case '2': case '3': case '4': case '5':
1836         case '6': case '7': case '8': case '9':
1837           if (!(syntax & RE_NO_BK_REFS))
1838             {
1839               token->type = OP_BACK_REF;
1840               token->opr.idx = c2 - '1';
1841             }
1842           break;
1843         case '<':
1844           if (!(syntax & RE_NO_GNU_OPS))
1845             {
1846               token->type = ANCHOR;
1847               token->opr.ctx_type = WORD_FIRST;
1848             }
1849           break;
1850         case '>':
1851           if (!(syntax & RE_NO_GNU_OPS))
1852             {
1853               token->type = ANCHOR;
1854               token->opr.ctx_type = WORD_LAST;
1855             }
1856           break;
1857         case 'b':
1858           if (!(syntax & RE_NO_GNU_OPS))
1859             {
1860               token->type = ANCHOR;
1861               token->opr.ctx_type = WORD_DELIM;
1862             }
1863           break;
1864         case 'B':
1865           if (!(syntax & RE_NO_GNU_OPS))
1866             {
1867               token->type = ANCHOR;
1868               token->opr.ctx_type = NOT_WORD_DELIM;
1869             }
1870           break;
1871         case 'w':
1872           if (!(syntax & RE_NO_GNU_OPS))
1873             token->type = OP_WORD;
1874           break;
1875         case 'W':
1876           if (!(syntax & RE_NO_GNU_OPS))
1877             token->type = OP_NOTWORD;
1878           break;
1879         case 's':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             token->type = OP_SPACE;
1882           break;
1883         case 'S':
1884           if (!(syntax & RE_NO_GNU_OPS))
1885             token->type = OP_NOTSPACE;
1886           break;
1887         case '`':
1888           if (!(syntax & RE_NO_GNU_OPS))
1889             {
1890               token->type = ANCHOR;
1891               token->opr.ctx_type = BUF_FIRST;
1892             }
1893           break;
1894         case '\'':
1895           if (!(syntax & RE_NO_GNU_OPS))
1896             {
1897               token->type = ANCHOR;
1898               token->opr.ctx_type = BUF_LAST;
1899             }
1900           break;
1901         case '(':
1902           if (!(syntax & RE_NO_BK_PARENS))
1903             token->type = OP_OPEN_SUBEXP;
1904           break;
1905         case ')':
1906           if (!(syntax & RE_NO_BK_PARENS))
1907             token->type = OP_CLOSE_SUBEXP;
1908           break;
1909         case '+':
1910           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1911             token->type = OP_DUP_PLUS;
1912           break;
1913         case '?':
1914           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1915             token->type = OP_DUP_QUESTION;
1916           break;
1917         case '{':
1918           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1919             token->type = OP_OPEN_DUP_NUM;
1920           break;
1921         case '}':
1922           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1923             token->type = OP_CLOSE_DUP_NUM;
1924           break;
1925         default:
1926           break;
1927         }
1928       return 2;
1929     }
1930
1931   token->type = CHARACTER;
1932 #ifdef RE_ENABLE_I18N
1933   if (input->mb_cur_max > 1)
1934     {
1935       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1936       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1937     }
1938   else
1939 #endif
1940     token->word_char = IS_WORD_CHAR (token->opr.c);
1941
1942   switch (c)
1943     {
1944     case '\n':
1945       if (syntax & RE_NEWLINE_ALT)
1946         token->type = OP_ALT;
1947       break;
1948     case '|':
1949       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1950         token->type = OP_ALT;
1951       break;
1952     case '*':
1953       token->type = OP_DUP_ASTERISK;
1954       break;
1955     case '+':
1956       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1957         token->type = OP_DUP_PLUS;
1958       break;
1959     case '?':
1960       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1961         token->type = OP_DUP_QUESTION;
1962       break;
1963     case '{':
1964       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1965         token->type = OP_OPEN_DUP_NUM;
1966       break;
1967     case '}':
1968       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1969         token->type = OP_CLOSE_DUP_NUM;
1970       break;
1971     case '(':
1972       if (syntax & RE_NO_BK_PARENS)
1973         token->type = OP_OPEN_SUBEXP;
1974       break;
1975     case ')':
1976       if (syntax & RE_NO_BK_PARENS)
1977         token->type = OP_CLOSE_SUBEXP;
1978       break;
1979     case '[':
1980       token->type = OP_OPEN_BRACKET;
1981       break;
1982     case '.':
1983       token->type = OP_PERIOD;
1984       break;
1985     case '^':
1986       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1987           re_string_cur_idx (input) != 0)
1988         {
1989           char prev = re_string_peek_byte (input, -1);
1990           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1991             break;
1992         }
1993       token->type = ANCHOR;
1994       token->opr.ctx_type = LINE_FIRST;
1995       break;
1996     case '$':
1997       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1998           re_string_cur_idx (input) + 1 != re_string_length (input))
1999         {
2000           re_token_t next;
2001           re_string_skip_bytes (input, 1);
2002           peek_token (&next, input, syntax);
2003           re_string_skip_bytes (input, -1);
2004           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2005             break;
2006         }
2007       token->type = ANCHOR;
2008       token->opr.ctx_type = LINE_LAST;
2009       break;
2010     default:
2011       break;
2012     }
2013   return 1;
2014 }
2015
2016 /* Peek a token from INPUT, and return the length of the token.
2017    We must not use this function out of bracket expressions.  */
2018
2019 static int
2020 internal_function
2021 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2022 {
2023   unsigned char c;
2024   if (re_string_eoi (input))
2025     {
2026       token->type = END_OF_RE;
2027       return 0;
2028     }
2029   c = re_string_peek_byte (input, 0);
2030   token->opr.c = c;
2031
2032 #ifdef RE_ENABLE_I18N
2033   if (input->mb_cur_max > 1 &&
2034       !re_string_first_byte (input, re_string_cur_idx (input)))
2035     {
2036       token->type = CHARACTER;
2037       return 1;
2038     }
2039 #endif /* RE_ENABLE_I18N */
2040
2041   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2042       && re_string_cur_idx (input) + 1 < re_string_length (input))
2043     {
2044       /* In this case, '\' escape a character.  */
2045       unsigned char c2;
2046       re_string_skip_bytes (input, 1);
2047       c2 = re_string_peek_byte (input, 0);
2048       token->opr.c = c2;
2049       token->type = CHARACTER;
2050       return 1;
2051     }
2052   if (c == '[') /* '[' is a special char in a bracket exps.  */
2053     {
2054       unsigned char c2;
2055       int token_len;
2056       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2057         c2 = re_string_peek_byte (input, 1);
2058       else
2059         c2 = 0;
2060       token->opr.c = c2;
2061       token_len = 2;
2062       switch (c2)
2063         {
2064         case '.':
2065           token->type = OP_OPEN_COLL_ELEM;
2066           break;
2067         case '=':
2068           token->type = OP_OPEN_EQUIV_CLASS;
2069           break;
2070         case ':':
2071           if (syntax & RE_CHAR_CLASSES)
2072             {
2073               token->type = OP_OPEN_CHAR_CLASS;
2074               break;
2075             }
2076           /* else fall through.  */
2077         default:
2078           token->type = CHARACTER;
2079           token->opr.c = c;
2080           token_len = 1;
2081           break;
2082         }
2083       return token_len;
2084     }
2085   switch (c)
2086     {
2087     case '-':
2088       token->type = OP_CHARSET_RANGE;
2089       break;
2090     case ']':
2091       token->type = OP_CLOSE_BRACKET;
2092       break;
2093     case '^':
2094       token->type = OP_NON_MATCH_LIST;
2095       break;
2096     default:
2097       token->type = CHARACTER;
2098     }
2099   return 1;
2100 }
2101 \f
2102 /* Functions for parser.  */
2103
2104 /* Entry point of the parser.
2105    Parse the regular expression REGEXP and return the structure tree.
2106    If an error is occured, ERR is set by error code, and return NULL.
2107    This function build the following tree, from regular expression <reg_exp>:
2108            CAT
2109            / \
2110           /   \
2111    <reg_exp>  EOR
2112
2113    CAT means concatenation.
2114    EOR means end of regular expression.  */
2115
2116 static bin_tree_t *
2117 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2118        reg_errcode_t *err)
2119 {
2120   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2121   bin_tree_t *tree, *eor, *root;
2122   re_token_t current_token;
2123   dfa->syntax = syntax;
2124   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2125   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2126   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2127     return NULL;
2128   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2129   if (tree != NULL)
2130     root = create_tree (dfa, tree, eor, CONCAT);
2131   else
2132     root = eor;
2133   if (BE (eor == NULL || root == NULL, 0))
2134     {
2135       *err = REG_ESPACE;
2136       return NULL;
2137     }
2138   return root;
2139 }
2140
2141 /* This function build the following tree, from regular expression
2142    <branch1>|<branch2>:
2143            ALT
2144            / \
2145           /   \
2146    <branch1> <branch2>
2147
2148    ALT means alternative, which represents the operator `|'.  */
2149
2150 static bin_tree_t *
2151 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2152                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2153 {
2154   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2155   bin_tree_t *tree, *branch = NULL;
2156   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2157   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2158     return NULL;
2159
2160   while (token->type == OP_ALT)
2161     {
2162       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2163       if (token->type != OP_ALT && token->type != END_OF_RE
2164           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2165         {
2166           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2167           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2168             return NULL;
2169         }
2170       else
2171         branch = NULL;
2172       tree = create_tree (dfa, tree, branch, OP_ALT);
2173       if (BE (tree == NULL, 0))
2174         {
2175           *err = REG_ESPACE;
2176           return NULL;
2177         }
2178     }
2179   return tree;
2180 }
2181
2182 /* This function build the following tree, from regular expression
2183    <exp1><exp2>:
2184         CAT
2185         / \
2186        /   \
2187    <exp1> <exp2>
2188
2189    CAT means concatenation.  */
2190
2191 static bin_tree_t *
2192 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2193               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2194 {
2195   bin_tree_t *tree, *exp;
2196   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2197   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2198   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2199     return NULL;
2200
2201   while (token->type != OP_ALT && token->type != END_OF_RE
2202          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2203     {
2204       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2205       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2206         {
2207           return NULL;
2208         }
2209       if (tree != NULL && exp != NULL)
2210         {
2211           tree = create_tree (dfa, tree, exp, CONCAT);
2212           if (tree == NULL)
2213             {
2214               *err = REG_ESPACE;
2215               return NULL;
2216             }
2217         }
2218       else if (tree == NULL)
2219         tree = exp;
2220       /* Otherwise exp == NULL, we don't need to create new tree.  */
2221     }
2222   return tree;
2223 }
2224
2225 /* This function build the following tree, from regular expression a*:
2226          *
2227          |
2228          a
2229 */
2230
2231 static bin_tree_t *
2232 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2233                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2234 {
2235   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2236   bin_tree_t *tree;
2237   switch (token->type)
2238     {
2239     case CHARACTER:
2240       tree = create_token_tree (dfa, NULL, NULL, token);
2241       if (BE (tree == NULL, 0))
2242         {
2243           *err = REG_ESPACE;
2244           return NULL;
2245         }
2246 #ifdef RE_ENABLE_I18N
2247       if (dfa->mb_cur_max > 1)
2248         {
2249           while (!re_string_eoi (regexp)
2250                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2251             {
2252               bin_tree_t *mbc_remain;
2253               fetch_token (token, regexp, syntax);
2254               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2255               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2256               if (BE (mbc_remain == NULL || tree == NULL, 0))
2257                 {
2258                   *err = REG_ESPACE;
2259                   return NULL;
2260                 }
2261             }
2262         }
2263 #endif
2264       break;
2265     case OP_OPEN_SUBEXP:
2266       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2267       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2268         return NULL;
2269       break;
2270     case OP_OPEN_BRACKET:
2271       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2272       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2273         return NULL;
2274       break;
2275     case OP_BACK_REF:
2276       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2277         {
2278           *err = REG_ESUBREG;
2279           return NULL;
2280         }
2281       dfa->used_bkref_map |= 1 << token->opr.idx;
2282       tree = create_token_tree (dfa, NULL, NULL, token);
2283       if (BE (tree == NULL, 0))
2284         {
2285           *err = REG_ESPACE;
2286           return NULL;
2287         }
2288       ++dfa->nbackref;
2289       dfa->has_mb_node = 1;
2290       break;
2291     case OP_OPEN_DUP_NUM:
2292       if (syntax & RE_CONTEXT_INVALID_DUP)
2293         {
2294           *err = REG_BADRPT;
2295           return NULL;
2296         }
2297       /* FALLTHROUGH */
2298     case OP_DUP_ASTERISK:
2299     case OP_DUP_PLUS:
2300     case OP_DUP_QUESTION:
2301       if (syntax & RE_CONTEXT_INVALID_OPS)
2302         {
2303           *err = REG_BADRPT;
2304           return NULL;
2305         }
2306       else if (syntax & RE_CONTEXT_INDEP_OPS)
2307         {
2308           fetch_token (token, regexp, syntax);
2309           return parse_expression (regexp, preg, token, syntax, nest, err);
2310         }
2311       /* else fall through  */
2312     case OP_CLOSE_SUBEXP:
2313       if ((token->type == OP_CLOSE_SUBEXP) &&
2314           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2315         {
2316           *err = REG_ERPAREN;
2317           return NULL;
2318         }
2319       /* else fall through  */
2320     case OP_CLOSE_DUP_NUM:
2321       /* We treat it as a normal character.  */
2322
2323       /* Then we can these characters as normal characters.  */
2324       token->type = CHARACTER;
2325       /* mb_partial and word_char bits should be initialized already
2326          by peek_token.  */
2327       tree = create_token_tree (dfa, NULL, NULL, token);
2328       if (BE (tree == NULL, 0))
2329         {
2330           *err = REG_ESPACE;
2331           return NULL;
2332         }
2333       break;
2334     case ANCHOR:
2335       if ((token->opr.ctx_type
2336            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2337           && dfa->word_ops_used == 0)
2338         init_word_char (dfa);
2339       if (token->opr.ctx_type == WORD_DELIM
2340           || token->opr.ctx_type == NOT_WORD_DELIM)
2341         {
2342           bin_tree_t *tree_first, *tree_last;
2343           if (token->opr.ctx_type == WORD_DELIM)
2344             {
2345               token->opr.ctx_type = WORD_FIRST;
2346               tree_first = create_token_tree (dfa, NULL, NULL, token);
2347               token->opr.ctx_type = WORD_LAST;
2348             }
2349           else
2350             {
2351               token->opr.ctx_type = INSIDE_WORD;
2352               tree_first = create_token_tree (dfa, NULL, NULL, token);
2353               token->opr.ctx_type = INSIDE_NOTWORD;
2354             }
2355           tree_last = create_token_tree (dfa, NULL, NULL, token);
2356           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2357           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2358             {
2359               *err = REG_ESPACE;
2360               return NULL;
2361             }
2362         }
2363       else
2364         {
2365           tree = create_token_tree (dfa, NULL, NULL, token);
2366           if (BE (tree == NULL, 0))
2367             {
2368               *err = REG_ESPACE;
2369               return NULL;
2370             }
2371         }
2372       /* We must return here, since ANCHORs can't be followed
2373          by repetition operators.
2374          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2375              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2376       fetch_token (token, regexp, syntax);
2377       return tree;
2378     case OP_PERIOD:
2379       tree = create_token_tree (dfa, NULL, NULL, token);
2380       if (BE (tree == NULL, 0))
2381         {
2382           *err = REG_ESPACE;
2383           return NULL;
2384         }
2385       if (dfa->mb_cur_max > 1)
2386         dfa->has_mb_node = 1;
2387       break;
2388     case OP_WORD:
2389     case OP_NOTWORD:
2390       tree = build_charclass_op (dfa, regexp->trans,
2391                                  "alnum",
2392                                  "_",
2393                                  token->type == OP_NOTWORD, err);
2394       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2395         return NULL;
2396       break;
2397     case OP_SPACE:
2398     case OP_NOTSPACE:
2399       tree = build_charclass_op (dfa, regexp->trans,
2400                                  "space",
2401                                  "",
2402                                  token->type == OP_NOTSPACE, err);
2403       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2404         return NULL;
2405       break;
2406     case OP_ALT:
2407     case END_OF_RE:
2408       return NULL;
2409     case BACK_SLASH:
2410       *err = REG_EESCAPE;
2411       return NULL;
2412     default:
2413       /* Must not happen?  */
2414 #ifdef DEBUG
2415       assert (0);
2416 #endif
2417       return NULL;
2418     }
2419   fetch_token (token, regexp, syntax);
2420
2421   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2422          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2423     {
2424       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2425       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2426         return NULL;
2427       /* In BRE consecutive duplications are not allowed.  */
2428       if ((syntax & RE_CONTEXT_INVALID_DUP)
2429           && (token->type == OP_DUP_ASTERISK
2430               || token->type == OP_OPEN_DUP_NUM))
2431         {
2432           *err = REG_BADRPT;
2433           return NULL;
2434         }
2435     }
2436
2437   return tree;
2438 }
2439
2440 /* This function build the following tree, from regular expression
2441    (<reg_exp>):
2442          SUBEXP
2443             |
2444         <reg_exp>
2445 */
2446
2447 static bin_tree_t *
2448 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2449                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2450 {
2451   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2452   bin_tree_t *tree;
2453   size_t cur_nsub;
2454   cur_nsub = preg->re_nsub++;
2455
2456   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2457
2458   /* The subexpression may be a null string.  */
2459   if (token->type == OP_CLOSE_SUBEXP)
2460     tree = NULL;
2461   else
2462     {
2463       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2464       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2465         *err = REG_EPAREN;
2466       if (BE (*err != REG_NOERROR, 0))
2467         return NULL;
2468     }
2469
2470   if (cur_nsub <= '9' - '1')
2471     dfa->completed_bkref_map |= 1 << cur_nsub;
2472
2473   tree = create_tree (dfa, tree, NULL, SUBEXP);
2474   if (BE (tree == NULL, 0))
2475     {
2476       *err = REG_ESPACE;
2477       return NULL;
2478     }
2479   tree->token.opr.idx = cur_nsub;
2480   return tree;
2481 }
2482
2483 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2484
2485 static bin_tree_t *
2486 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2487               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2488 {
2489   bin_tree_t *tree = NULL, *old_tree = NULL;
2490   int i, start, end, start_idx = re_string_cur_idx (regexp);
2491 #ifndef RE_TOKEN_INIT_BUG
2492   re_token_t start_token = *token;
2493 #else
2494   re_token_t start_token;
2495
2496   memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2497 #endif
2498
2499   if (token->type == OP_OPEN_DUP_NUM)
2500     {
2501       end = 0;
2502       start = fetch_number (regexp, token, syntax);
2503       if (start == -1)
2504         {
2505           if (token->type == CHARACTER && token->opr.c == ',')
2506             start = 0; /* We treat "{,m}" as "{0,m}".  */
2507           else
2508             {
2509               *err = REG_BADBR; /* <re>{} is invalid.  */
2510               return NULL;
2511             }
2512         }
2513       if (BE (start != -2, 1))
2514         {
2515           /* We treat "{n}" as "{n,n}".  */
2516           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2517                  : ((token->type == CHARACTER && token->opr.c == ',')
2518                     ? fetch_number (regexp, token, syntax) : -2));
2519         }
2520       if (BE (start == -2 || end == -2, 0))
2521         {
2522           /* Invalid sequence.  */
2523           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2524             {
2525               if (token->type == END_OF_RE)
2526                 *err = REG_EBRACE;
2527               else
2528                 *err = REG_BADBR;
2529
2530               return NULL;
2531             }
2532
2533           /* If the syntax bit is set, rollback.  */
2534           re_string_set_index (regexp, start_idx);
2535           *token = start_token;
2536           token->type = CHARACTER;
2537           /* mb_partial and word_char bits should be already initialized by
2538              peek_token.  */
2539           return elem;
2540         }
2541
2542       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2543         {
2544           /* First number greater than second.  */
2545           *err = REG_BADBR;
2546           return NULL;
2547         }
2548     }
2549   else
2550     {
2551       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2552       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2553     }
2554
2555   fetch_token (token, regexp, syntax);
2556
2557   if (BE (elem == NULL, 0))
2558     return NULL;
2559   if (BE (start == 0 && end == 0, 0))
2560     {
2561       postorder (elem, free_tree, NULL);
2562       return NULL;
2563     }
2564
2565   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2566   if (BE (start > 0, 0))
2567     {
2568       tree = elem;
2569       for (i = 2; i <= start; ++i)
2570         {
2571           elem = duplicate_tree (elem, dfa);
2572           tree = create_tree (dfa, tree, elem, CONCAT);
2573           if (BE (elem == NULL || tree == NULL, 0))
2574             goto parse_dup_op_espace;
2575         }
2576
2577       if (start == end)
2578         return tree;
2579
2580       /* Duplicate ELEM before it is marked optional.  */
2581       elem = duplicate_tree (elem, dfa);
2582       old_tree = tree;
2583     }
2584   else
2585     old_tree = NULL;
2586
2587   if (elem->token.type == SUBEXP)
2588     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2589
2590   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2591   if (BE (tree == NULL, 0))
2592     goto parse_dup_op_espace;
2593
2594   /* This loop is actually executed only when end != -1,
2595      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2596      already created the start+1-th copy.  */
2597   for (i = start + 2; i <= end; ++i)
2598     {
2599       elem = duplicate_tree (elem, dfa);
2600       tree = create_tree (dfa, tree, elem, CONCAT);
2601       if (BE (elem == NULL || tree == NULL, 0))
2602         goto parse_dup_op_espace;
2603
2604       tree = create_tree (dfa, tree, NULL, OP_ALT);
2605       if (BE (tree == NULL, 0))
2606         goto parse_dup_op_espace;
2607     }
2608
2609   if (old_tree)
2610     tree = create_tree (dfa, old_tree, tree, CONCAT);
2611
2612   return tree;
2613
2614  parse_dup_op_espace:
2615   *err = REG_ESPACE;
2616   return NULL;
2617 }
2618
2619 /* Size of the names for collating symbol/equivalence_class/character_class.
2620    I'm not sure, but maybe enough.  */
2621 #define BRACKET_NAME_BUF_SIZE 32
2622
2623 #ifndef _LIBC
2624   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2625      Build the range expression which starts from START_ELEM, and ends
2626      at END_ELEM.  The result are written to MBCSET and SBCSET.
2627      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2628      mbcset->range_ends, is a pointer argument sinse we may
2629      update it.  */
2630
2631 static reg_errcode_t
2632 internal_function
2633 # ifdef RE_ENABLE_I18N
2634 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2635                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2636 # else /* not RE_ENABLE_I18N */
2637 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2638                  bracket_elem_t *end_elem)
2639 # endif /* not RE_ENABLE_I18N */
2640 {
2641   unsigned int start_ch, end_ch;
2642   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2643   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2644           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2645           0))
2646     return REG_ERANGE;
2647
2648   /* We can handle no multi character collating elements without libc
2649      support.  */
2650   if (BE ((start_elem->type == COLL_SYM
2651            && strlen ((char *) start_elem->opr.name) > 1)
2652           || (end_elem->type == COLL_SYM
2653               && strlen ((char *) end_elem->opr.name) > 1), 0))
2654     return REG_ECOLLATE;
2655
2656 # ifdef RE_ENABLE_I18N
2657   {
2658     wchar_t wc;
2659     wint_t start_wc;
2660     wint_t end_wc;
2661     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2662
2663     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2664                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2665                    : 0));
2666     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2667               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2668                  : 0));
2669 #ifdef GAWK
2670     /*
2671      * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2672      * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2673      * unsigned, so we don't have sign extension problems.
2674      */
2675     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2676                 ? start_ch : start_elem->opr.wch);
2677     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2678               ? end_ch : end_elem->opr.wch);
2679 #else
2680     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2681                 ? __btowc (start_ch) : start_elem->opr.wch);
2682     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2683               ? __btowc (end_ch) : end_elem->opr.wch);
2684 #endif
2685     if (start_wc == WEOF || end_wc == WEOF)
2686       return REG_ECOLLATE;
2687     cmp_buf[0] = start_wc;
2688     cmp_buf[4] = end_wc;
2689     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2690       return REG_ERANGE;
2691
2692     /* Got valid collation sequence values, add them as a new entry.
2693        However, for !_LIBC we have no collation elements: if the
2694        character set is single byte, the single byte character set
2695        that we build below suffices.  parse_bracket_exp passes
2696        no MBCSET if dfa->mb_cur_max == 1.  */
2697     if (mbcset)
2698       {
2699         /* Check the space of the arrays.  */
2700         if (BE (*range_alloc == mbcset->nranges, 0))
2701           {
2702             /* There is not enough space, need realloc.  */
2703             wchar_t *new_array_start, *new_array_end;
2704             int new_nranges;
2705
2706             /* +1 in case of mbcset->nranges is 0.  */
2707             new_nranges = 2 * mbcset->nranges + 1;
2708             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2709                are NULL if *range_alloc == 0.  */
2710             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2711                                           new_nranges);
2712             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2713                                         new_nranges);
2714
2715             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2716               return REG_ESPACE;
2717
2718             mbcset->range_starts = new_array_start;
2719             mbcset->range_ends = new_array_end;
2720             *range_alloc = new_nranges;
2721           }
2722
2723         mbcset->range_starts[mbcset->nranges] = start_wc;
2724         mbcset->range_ends[mbcset->nranges++] = end_wc;
2725       }
2726
2727     /* Build the table for single byte characters.  */
2728     for (wc = 0; wc < SBC_MAX; ++wc)
2729       {
2730         cmp_buf[2] = wc;
2731         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2732             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2733           bitset_set (sbcset, wc);
2734       }
2735   }
2736 # else /* not RE_ENABLE_I18N */
2737   {
2738     unsigned int ch;
2739     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2740                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2741                    : 0));
2742     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2743               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2744                  : 0));
2745     if (start_ch > end_ch)
2746       return REG_ERANGE;
2747     /* Build the table for single byte characters.  */
2748     for (ch = 0; ch < SBC_MAX; ++ch)
2749       if (start_ch <= ch  && ch <= end_ch)
2750         bitset_set (sbcset, ch);
2751   }
2752 # endif /* not RE_ENABLE_I18N */
2753   return REG_NOERROR;
2754 }
2755 #endif /* not _LIBC */
2756
2757 #ifndef _LIBC
2758 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2759    Build the collating element which is represented by NAME.
2760    The result are written to MBCSET and SBCSET.
2761    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2762    pointer argument since we may update it.  */
2763
2764 static reg_errcode_t
2765 internal_function
2766 # ifdef RE_ENABLE_I18N
2767 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2768                         int *coll_sym_alloc, const unsigned char *name)
2769 # else /* not RE_ENABLE_I18N */
2770 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2771 # endif /* not RE_ENABLE_I18N */
2772 {
2773   size_t name_len = strlen ((const char *) name);
2774   if (BE (name_len != 1, 0))
2775     return REG_ECOLLATE;
2776   else
2777     {
2778       bitset_set (sbcset, name[0]);
2779       return REG_NOERROR;
2780     }
2781 }
2782 #endif /* not _LIBC */
2783
2784 /* This function parse bracket expression like "[abc]", "[a-c]",
2785    "[[.a-a.]]" etc.  */
2786
2787 static bin_tree_t *
2788 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2789                    reg_syntax_t syntax, reg_errcode_t *err)
2790 {
2791 #ifdef _LIBC
2792   const unsigned char *collseqmb;
2793   const char *collseqwc;
2794   uint32_t nrules;
2795   int32_t table_size;
2796   const int32_t *symb_table;
2797   const unsigned char *extra;
2798
2799   /* Local function for parse_bracket_exp used in _LIBC environement.
2800      Seek the collating symbol entry correspondings to NAME.
2801      Return the index of the symbol in the SYMB_TABLE.  */
2802
2803   auto inline int32_t
2804   __attribute ((always_inline))
2805   seek_collating_symbol_entry (name, name_len)
2806          const unsigned char *name;
2807          size_t name_len;
2808     {
2809       int32_t hash = elem_hash ((const char *) name, name_len);
2810       int32_t elem = hash % table_size;
2811       if (symb_table[2 * elem] != 0)
2812         {
2813           int32_t second = hash % (table_size - 2) + 1;
2814
2815           do
2816             {
2817               /* First compare the hashing value.  */
2818               if (symb_table[2 * elem] == hash
2819                   /* Compare the length of the name.  */
2820                   && name_len == extra[symb_table[2 * elem + 1]]
2821                   /* Compare the name.  */
2822                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2823                              name_len) == 0)
2824                 {
2825                   /* Yep, this is the entry.  */
2826                   break;
2827                 }
2828
2829               /* Next entry.  */
2830               elem += second;
2831             }
2832           while (symb_table[2 * elem] != 0);
2833         }
2834       return elem;
2835     }
2836
2837   /* Local function for parse_bracket_exp used in _LIBC environment.
2838      Look up the collation sequence value of BR_ELEM.
2839      Return the value if succeeded, UINT_MAX otherwise.  */
2840
2841   auto inline unsigned int
2842   __attribute ((always_inline))
2843   lookup_collation_sequence_value (br_elem)
2844          bracket_elem_t *br_elem;
2845     {
2846       if (br_elem->type == SB_CHAR)
2847         {
2848           /*
2849           if (MB_CUR_MAX == 1)
2850           */
2851           if (nrules == 0)
2852             return collseqmb[br_elem->opr.ch];
2853           else
2854             {
2855               wint_t wc = __btowc (br_elem->opr.ch);
2856               return __collseq_table_lookup (collseqwc, wc);
2857             }
2858         }
2859       else if (br_elem->type == MB_CHAR)
2860         {
2861           if (nrules != 0)
2862             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2863         }
2864       else if (br_elem->type == COLL_SYM)
2865         {
2866           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2867           if (nrules != 0)
2868             {
2869               int32_t elem, idx;
2870               elem = seek_collating_symbol_entry (br_elem->opr.name,
2871                                                   sym_name_len);
2872               if (symb_table[2 * elem] != 0)
2873                 {
2874                   /* We found the entry.  */
2875                   idx = symb_table[2 * elem + 1];
2876                   /* Skip the name of collating element name.  */
2877                   idx += 1 + extra[idx];
2878                   /* Skip the byte sequence of the collating element.  */
2879                   idx += 1 + extra[idx];
2880                   /* Adjust for the alignment.  */
2881                   idx = (idx + 3) & ~3;
2882                   /* Skip the multibyte collation sequence value.  */
2883                   idx += sizeof (unsigned int);
2884                   /* Skip the wide char sequence of the collating element.  */
2885                   idx += sizeof (unsigned int) *
2886                     (1 + *(unsigned int *) (extra + idx));
2887                   /* Return the collation sequence value.  */
2888                   return *(unsigned int *) (extra + idx);
2889                 }
2890               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2891                 {
2892                   /* No valid character.  Match it as a single byte
2893                      character.  */
2894                   return collseqmb[br_elem->opr.name[0]];
2895                 }
2896             }
2897           else if (sym_name_len == 1)
2898             return collseqmb[br_elem->opr.name[0]];
2899         }
2900       return UINT_MAX;
2901     }
2902
2903   /* Local function for parse_bracket_exp used in _LIBC environement.
2904      Build the range expression which starts from START_ELEM, and ends
2905      at END_ELEM.  The result are written to MBCSET and SBCSET.
2906      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2907      mbcset->range_ends, is a pointer argument sinse we may
2908      update it.  */
2909
2910   auto inline reg_errcode_t
2911   __attribute ((always_inline))
2912   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2913          re_charset_t *mbcset;
2914          int *range_alloc;
2915          bitset_t sbcset;
2916          bracket_elem_t *start_elem, *end_elem;
2917     {
2918       unsigned int ch;
2919       uint32_t start_collseq;
2920       uint32_t end_collseq;
2921
2922       /* Equivalence Classes and Character Classes can't be a range
2923          start/end.  */
2924       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2925               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2926               0))
2927         return REG_ERANGE;
2928
2929       start_collseq = lookup_collation_sequence_value (start_elem);
2930       end_collseq = lookup_collation_sequence_value (end_elem);
2931       /* Check start/end collation sequence values.  */
2932       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2933         return REG_ECOLLATE;
2934       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2935         return REG_ERANGE;
2936
2937       /* Got valid collation sequence values, add them as a new entry.
2938          However, if we have no collation elements, and the character set
2939          is single byte, the single byte character set that we
2940          build below suffices. */
2941       if (nrules > 0 || dfa->mb_cur_max > 1)
2942         {
2943           /* Check the space of the arrays.  */
2944           if (BE (*range_alloc == mbcset->nranges, 0))
2945             {
2946               /* There is not enough space, need realloc.  */
2947               uint32_t *new_array_start;
2948               uint32_t *new_array_end;
2949               int new_nranges;
2950
2951               /* +1 in case of mbcset->nranges is 0.  */
2952               new_nranges = 2 * mbcset->nranges + 1;
2953               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2954                                             new_nranges);
2955               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2956                                           new_nranges);
2957
2958               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2959                 return REG_ESPACE;
2960
2961               mbcset->range_starts = new_array_start;
2962               mbcset->range_ends = new_array_end;
2963               *range_alloc = new_nranges;
2964             }
2965
2966           mbcset->range_starts[mbcset->nranges] = start_collseq;
2967           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2968         }
2969
2970       /* Build the table for single byte characters.  */
2971       for (ch = 0; ch < SBC_MAX; ch++)
2972         {
2973           uint32_t ch_collseq;
2974           /*
2975           if (MB_CUR_MAX == 1)
2976           */
2977           if (nrules == 0)
2978             ch_collseq = collseqmb[ch];
2979           else
2980             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2981           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2982             bitset_set (sbcset, ch);
2983         }
2984       return REG_NOERROR;
2985     }
2986
2987   /* Local function for parse_bracket_exp used in _LIBC environement.
2988      Build the collating element which is represented by NAME.
2989      The result are written to MBCSET and SBCSET.
2990      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2991      pointer argument sinse we may update it.  */
2992
2993   auto inline reg_errcode_t
2994   __attribute ((always_inline))
2995   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2996          re_charset_t *mbcset;
2997          int *coll_sym_alloc;
2998          bitset_t sbcset;
2999          const unsigned char *name;
3000     {
3001       int32_t elem, idx;
3002       size_t name_len = strlen ((const char *) name);
3003       if (nrules != 0)
3004         {
3005           elem = seek_collating_symbol_entry (name, name_len);
3006           if (symb_table[2 * elem] != 0)
3007             {
3008               /* We found the entry.  */
3009               idx = symb_table[2 * elem + 1];
3010               /* Skip the name of collating element name.  */
3011               idx += 1 + extra[idx];
3012             }
3013           else if (symb_table[2 * elem] == 0 && name_len == 1)
3014             {
3015               /* No valid character, treat it as a normal
3016                  character.  */
3017               bitset_set (sbcset, name[0]);
3018               return REG_NOERROR;
3019             }
3020           else
3021             return REG_ECOLLATE;
3022
3023           /* Got valid collation sequence, add it as a new entry.  */
3024           /* Check the space of the arrays.  */
3025           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3026             {
3027               /* Not enough, realloc it.  */
3028               /* +1 in case of mbcset->ncoll_syms is 0.  */
3029               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3030               /* Use realloc since mbcset->coll_syms is NULL
3031                  if *alloc == 0.  */
3032               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3033                                                    new_coll_sym_alloc);
3034               if (BE (new_coll_syms == NULL, 0))
3035                 return REG_ESPACE;
3036               mbcset->coll_syms = new_coll_syms;
3037               *coll_sym_alloc = new_coll_sym_alloc;
3038             }
3039           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3040           return REG_NOERROR;
3041         }
3042       else
3043         {
3044           if (BE (name_len != 1, 0))
3045             return REG_ECOLLATE;
3046           else
3047             {
3048               bitset_set (sbcset, name[0]);
3049               return REG_NOERROR;
3050             }
3051         }
3052     }
3053 #endif
3054
3055   re_token_t br_token;
3056   re_bitset_ptr_t sbcset;
3057 #ifdef RE_ENABLE_I18N
3058   re_charset_t *mbcset;
3059   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3060   int equiv_class_alloc = 0, char_class_alloc = 0;
3061 #endif /* not RE_ENABLE_I18N */
3062   int non_match = 0;
3063   bin_tree_t *work_tree;
3064   int token_len;
3065   int first_round = 1;
3066 #ifdef _LIBC
3067   collseqmb = (const unsigned char *)
3068     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3069   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3070   if (nrules)
3071     {
3072       /*
3073       if (MB_CUR_MAX > 1)
3074       */
3075       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3076       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3077       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3078                                                   _NL_COLLATE_SYMB_TABLEMB);
3079       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3080                                                    _NL_COLLATE_SYMB_EXTRAMB);
3081     }
3082 #endif
3083   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3084 #ifdef RE_ENABLE_I18N
3085   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3086 #endif /* RE_ENABLE_I18N */
3087 #ifdef RE_ENABLE_I18N
3088   if (BE (sbcset == NULL || mbcset == NULL, 0))
3089 #else
3090   if (BE (sbcset == NULL, 0))
3091 #endif /* RE_ENABLE_I18N */
3092     {
3093       *err = REG_ESPACE;
3094       return NULL;
3095     }
3096
3097   token_len = peek_token_bracket (token, regexp, syntax);
3098   if (BE (token->type == END_OF_RE, 0))
3099     {
3100       *err = REG_BADPAT;
3101       goto parse_bracket_exp_free_return;
3102     }
3103   if (token->type == OP_NON_MATCH_LIST)
3104     {
3105 #ifdef RE_ENABLE_I18N
3106       mbcset->non_match = 1;
3107 #endif /* not RE_ENABLE_I18N */
3108       non_match = 1;
3109       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3110         bitset_set (sbcset, '\n');
3111       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3112       token_len = peek_token_bracket (token, regexp, syntax);
3113       if (BE (token->type == END_OF_RE, 0))
3114         {
3115           *err = REG_BADPAT;
3116           goto parse_bracket_exp_free_return;
3117         }
3118     }
3119
3120   /* We treat the first ']' as a normal character.  */
3121   if (token->type == OP_CLOSE_BRACKET)
3122     token->type = CHARACTER;
3123
3124   while (1)
3125     {
3126       bracket_elem_t start_elem, end_elem;
3127       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3128       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3129       reg_errcode_t ret;
3130       int token_len2 = 0, is_range_exp = 0;
3131       re_token_t token2;
3132
3133       start_elem.opr.name = start_name_buf;
3134       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3135                                    syntax, first_round);
3136       if (BE (ret != REG_NOERROR, 0))
3137         {
3138           *err = ret;
3139           goto parse_bracket_exp_free_return;
3140         }
3141       first_round = 0;
3142
3143       /* Get information about the next token.  We need it in any case.  */
3144       token_len = peek_token_bracket (token, regexp, syntax);
3145
3146       /* Do not check for ranges if we know they are not allowed.  */
3147       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3148         {
3149           if (BE (token->type == END_OF_RE, 0))
3150             {
3151               *err = REG_EBRACK;
3152               goto parse_bracket_exp_free_return;
3153             }
3154           if (token->type == OP_CHARSET_RANGE)
3155             {
3156               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3157               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3158               if (BE (token2.type == END_OF_RE, 0))
3159                 {
3160                   *err = REG_EBRACK;
3161                   goto parse_bracket_exp_free_return;
3162                 }
3163               if (token2.type == OP_CLOSE_BRACKET)
3164                 {
3165                   /* We treat the last '-' as a normal character.  */
3166                   re_string_skip_bytes (regexp, -token_len);
3167                   token->type = CHARACTER;
3168                 }
3169               else
3170                 is_range_exp = 1;
3171             }
3172         }
3173
3174       if (is_range_exp == 1)
3175         {
3176           end_elem.opr.name = end_name_buf;
3177           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3178                                        dfa, syntax, 1);
3179           if (BE (ret != REG_NOERROR, 0))
3180             {
3181               *err = ret;
3182               goto parse_bracket_exp_free_return;
3183             }
3184
3185           token_len = peek_token_bracket (token, regexp, syntax);
3186
3187 #ifdef _LIBC
3188           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3189                                   &start_elem, &end_elem);
3190 #else
3191 # ifdef RE_ENABLE_I18N
3192           *err = build_range_exp (sbcset,
3193                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3194                                   &range_alloc, &start_elem, &end_elem);
3195 # else
3196           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3197 # endif
3198 #endif /* RE_ENABLE_I18N */
3199           if (BE (*err != REG_NOERROR, 0))
3200             goto parse_bracket_exp_free_return;
3201         }
3202       else
3203         {
3204           switch (start_elem.type)
3205             {
3206             case SB_CHAR:
3207               bitset_set (sbcset, start_elem.opr.ch);
3208               break;
3209 #ifdef RE_ENABLE_I18N
3210             case MB_CHAR:
3211               /* Check whether the array has enough space.  */
3212               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3213                 {
3214                   wchar_t *new_mbchars;
3215                   /* Not enough, realloc it.  */
3216                   /* +1 in case of mbcset->nmbchars is 0.  */
3217                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3218                   /* Use realloc since array is NULL if *alloc == 0.  */
3219                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3220                                             mbchar_alloc);
3221                   if (BE (new_mbchars == NULL, 0))
3222                     goto parse_bracket_exp_espace;
3223                   mbcset->mbchars = new_mbchars;
3224                 }
3225               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3226               break;
3227 #endif /* RE_ENABLE_I18N */
3228             case EQUIV_CLASS:
3229               *err = build_equiv_class (sbcset,
3230 #ifdef RE_ENABLE_I18N
3231                                         mbcset, &equiv_class_alloc,
3232 #endif /* RE_ENABLE_I18N */
3233                                         start_elem.opr.name);
3234               if (BE (*err != REG_NOERROR, 0))
3235                 goto parse_bracket_exp_free_return;
3236               break;
3237             case COLL_SYM:
3238               *err = build_collating_symbol (sbcset,
3239 #ifdef RE_ENABLE_I18N
3240                                              mbcset, &coll_sym_alloc,
3241 #endif /* RE_ENABLE_I18N */
3242                                              start_elem.opr.name);
3243               if (BE (*err != REG_NOERROR, 0))
3244                 goto parse_bracket_exp_free_return;
3245               break;
3246             case CHAR_CLASS:
3247               *err = build_charclass (regexp->trans, sbcset,
3248 #ifdef RE_ENABLE_I18N
3249                                       mbcset, &char_class_alloc,
3250 #endif /* RE_ENABLE_I18N */
3251                                       (const char *) start_elem.opr.name, syntax);
3252               if (BE (*err != REG_NOERROR, 0))
3253                goto parse_bracket_exp_free_return;
3254               break;
3255             default:
3256               assert (0);
3257               break;
3258             }
3259         }
3260       if (BE (token->type == END_OF_RE, 0))
3261         {
3262           *err = REG_EBRACK;
3263           goto parse_bracket_exp_free_return;
3264         }
3265       if (token->type == OP_CLOSE_BRACKET)
3266         break;
3267     }
3268
3269   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3270
3271   /* If it is non-matching list.  */
3272   if (non_match)
3273     bitset_not (sbcset);
3274
3275 #ifdef RE_ENABLE_I18N
3276   /* Ensure only single byte characters are set.  */
3277   if (dfa->mb_cur_max > 1)
3278     bitset_mask (sbcset, dfa->sb_char);
3279
3280   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3281       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3282                                                      || mbcset->non_match)))
3283     {
3284       bin_tree_t *mbc_tree;
3285       int sbc_idx;
3286       /* Build a tree for complex bracket.  */
3287       dfa->has_mb_node = 1;
3288       br_token.type = COMPLEX_BRACKET;
3289       br_token.opr.mbcset = mbcset;
3290       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3291       if (BE (mbc_tree == NULL, 0))
3292         goto parse_bracket_exp_espace;
3293       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3294         if (sbcset[sbc_idx])
3295           break;
3296       /* If there are no bits set in sbcset, there is no point
3297          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3298       if (sbc_idx < BITSET_WORDS)
3299         {
3300           /* Build a tree for simple bracket.  */
3301           br_token.type = SIMPLE_BRACKET;
3302           br_token.opr.sbcset = sbcset;
3303           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3304           if (BE (work_tree == NULL, 0))
3305             goto parse_bracket_exp_espace;
3306
3307           /* Then join them by ALT node.  */
3308           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3309           if (BE (work_tree == NULL, 0))
3310             goto parse_bracket_exp_espace;
3311         }
3312       else
3313         {
3314           re_free (sbcset);
3315           work_tree = mbc_tree;
3316         }
3317     }
3318   else
3319 #endif /* not RE_ENABLE_I18N */
3320     {
3321 #ifdef RE_ENABLE_I18N
3322       free_charset (mbcset);
3323 #endif
3324       /* Build a tree for simple bracket.  */
3325       br_token.type = SIMPLE_BRACKET;
3326       br_token.opr.sbcset = sbcset;
3327       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3328       if (BE (work_tree == NULL, 0))
3329         goto parse_bracket_exp_espace;
3330     }
3331   return work_tree;
3332
3333  parse_bracket_exp_espace:
3334   *err = REG_ESPACE;
3335  parse_bracket_exp_free_return:
3336   re_free (sbcset);
3337 #ifdef RE_ENABLE_I18N
3338   free_charset (mbcset);
3339 #endif /* RE_ENABLE_I18N */
3340   return NULL;
3341 }
3342
3343 /* Parse an element in the bracket expression.  */
3344
3345 static reg_errcode_t
3346 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3347                        re_token_t *token, int token_len, re_dfa_t *dfa,
3348                        reg_syntax_t syntax, int accept_hyphen)
3349 {
3350 #ifdef RE_ENABLE_I18N
3351   int cur_char_size;
3352   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3353   if (cur_char_size > 1)
3354     {
3355       elem->type = MB_CHAR;
3356       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3357       re_string_skip_bytes (regexp, cur_char_size);
3358       return REG_NOERROR;
3359     }
3360 #endif /* RE_ENABLE_I18N */
3361   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3362   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3363       || token->type == OP_OPEN_EQUIV_CLASS)
3364     return parse_bracket_symbol (elem, regexp, token);
3365   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3366     {
3367       /* A '-' must only appear as anything but a range indicator before
3368          the closing bracket.  Everything else is an error.  */
3369       re_token_t token2;
3370       (void) peek_token_bracket (&token2, regexp, syntax);
3371       if (token2.type != OP_CLOSE_BRACKET)
3372         /* The actual error value is not standardized since this whole
3373            case is undefined.  But ERANGE makes good sense.  */
3374         return REG_ERANGE;
3375     }
3376   elem->type = SB_CHAR;
3377   elem->opr.ch = token->opr.c;
3378   return REG_NOERROR;
3379 }
3380
3381 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3382    such as [:<character_class>:], [.<collating_element>.], and
3383    [=<equivalent_class>=].  */
3384
3385 static reg_errcode_t
3386 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3387                       re_token_t *token)
3388 {
3389   unsigned char ch, delim = token->opr.c;
3390   int i = 0;
3391   if (re_string_eoi(regexp))
3392     return REG_EBRACK;
3393   for (;; ++i)
3394     {
3395       if (i >= BRACKET_NAME_BUF_SIZE)
3396         return REG_EBRACK;
3397       if (token->type == OP_OPEN_CHAR_CLASS)
3398         ch = re_string_fetch_byte_case (regexp);
3399       else
3400         ch = re_string_fetch_byte (regexp);
3401       if (re_string_eoi(regexp))
3402         return REG_EBRACK;
3403       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3404         break;
3405       elem->opr.name[i] = ch;
3406     }
3407   re_string_skip_bytes (regexp, 1);
3408   elem->opr.name[i] = '\0';
3409   switch (token->type)
3410     {
3411     case OP_OPEN_COLL_ELEM:
3412       elem->type = COLL_SYM;
3413       break;
3414     case OP_OPEN_EQUIV_CLASS:
3415       elem->type = EQUIV_CLASS;
3416       break;
3417     case OP_OPEN_CHAR_CLASS:
3418       elem->type = CHAR_CLASS;
3419       break;
3420     default:
3421       break;
3422     }
3423   return REG_NOERROR;
3424 }
3425
3426   /* Helper function for parse_bracket_exp.
3427      Build the equivalence class which is represented by NAME.
3428      The result are written to MBCSET and SBCSET.
3429      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3430      is a pointer argument sinse we may update it.  */
3431
3432 static reg_errcode_t
3433 #ifdef RE_ENABLE_I18N
3434 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3435                    int *equiv_class_alloc, const unsigned char *name)
3436 #else /* not RE_ENABLE_I18N */
3437 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3438 #endif /* not RE_ENABLE_I18N */
3439 {
3440 #ifdef _LIBC
3441   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3442   if (nrules != 0)
3443     {
3444       const int32_t *table, *indirect;
3445       const unsigned char *weights, *extra, *cp;
3446       unsigned char char_buf[2];
3447       int32_t idx1, idx2;
3448       unsigned int ch;
3449       size_t len;
3450       /* This #include defines a local function!  */
3451 # include <locale/weight.h>
3452       /* Calculate the index for equivalence class.  */
3453       cp = name;
3454       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3455       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3456                                                _NL_COLLATE_WEIGHTMB);
3457       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3458                                                    _NL_COLLATE_EXTRAMB);
3459       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3460                                                 _NL_COLLATE_INDIRECTMB);
3461       idx1 = findidx (&cp);
3462       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3463         /* This isn't a valid character.  */
3464         return REG_ECOLLATE;
3465
3466       /* Build single byte matcing table for this equivalence class.  */
3467       char_buf[1] = (unsigned char) '\0';
3468       len = weights[idx1 & 0xffffff];
3469       for (ch = 0; ch < SBC_MAX; ++ch)
3470         {
3471           char_buf[0] = ch;
3472           cp = char_buf;
3473           idx2 = findidx (&cp);
3474 /*
3475           idx2 = table[ch];
3476 */
3477           if (idx2 == 0)
3478             /* This isn't a valid character.  */
3479             continue;
3480           /* Compare only if the length matches and the collation rule
3481              index is the same.  */
3482           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3483             {
3484               int cnt = 0;
3485
3486               while (cnt <= len &&
3487                      weights[(idx1 & 0xffffff) + 1 + cnt]
3488                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3489                 ++cnt;
3490
3491               if (cnt > len)
3492                 bitset_set (sbcset, ch);
3493             }
3494         }
3495       /* Check whether the array has enough space.  */
3496       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3497         {
3498           /* Not enough, realloc it.  */
3499           /* +1 in case of mbcset->nequiv_classes is 0.  */
3500           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3501           /* Use realloc since the array is NULL if *alloc == 0.  */
3502           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3503                                                    int32_t,
3504                                                    new_equiv_class_alloc);
3505           if (BE (new_equiv_classes == NULL, 0))
3506             return REG_ESPACE;
3507           mbcset->equiv_classes = new_equiv_classes;
3508           *equiv_class_alloc = new_equiv_class_alloc;
3509         }
3510       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3511     }
3512   else
3513 #endif /* _LIBC */
3514     {
3515       if (BE (strlen ((const char *) name) != 1, 0))
3516         return REG_ECOLLATE;
3517       bitset_set (sbcset, *name);
3518     }
3519   return REG_NOERROR;
3520 }
3521
3522   /* Helper function for parse_bracket_exp.
3523      Build the character class which is represented by NAME.
3524      The result are written to MBCSET and SBCSET.
3525      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3526      is a pointer argument sinse we may update it.  */
3527
3528 static reg_errcode_t
3529 #ifdef RE_ENABLE_I18N
3530 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3531                  re_charset_t *mbcset, int *char_class_alloc,
3532                  const char *class_name, reg_syntax_t syntax)
3533 #else /* not RE_ENABLE_I18N */
3534 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3535                  const char *class_name, reg_syntax_t syntax)
3536 #endif /* not RE_ENABLE_I18N */
3537 {
3538   int i;
3539
3540   /* In case of REG_ICASE "upper" and "lower" match the both of
3541      upper and lower cases.  */
3542   if ((syntax & RE_ICASE)
3543       && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3544     class_name = "alpha";
3545
3546 #ifdef RE_ENABLE_I18N
3547   /* Check the space of the arrays.  */
3548   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3549     {
3550       /* Not enough, realloc it.  */
3551       /* +1 in case of mbcset->nchar_classes is 0.  */
3552       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3553       /* Use realloc since array is NULL if *alloc == 0.  */
3554       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3555                                                new_char_class_alloc);
3556       if (BE (new_char_classes == NULL, 0))
3557         return REG_ESPACE;
3558       mbcset->char_classes = new_char_classes;
3559       *char_class_alloc = new_char_class_alloc;
3560     }
3561   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3562 #endif /* RE_ENABLE_I18N */
3563
3564 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3565   do {                                          \
3566     if (BE (trans != NULL, 0))                  \
3567       {                                         \
3568         for (i = 0; i < SBC_MAX; ++i)           \
3569           if (ctype_func (i))                   \
3570             bitset_set (sbcset, trans[i]);      \
3571       }                                         \
3572     else                                        \
3573       {                                         \
3574         for (i = 0; i < SBC_MAX; ++i)           \
3575           if (ctype_func (i))                   \
3576             bitset_set (sbcset, i);             \
3577       }                                         \
3578   } while (0)
3579
3580   if (strcmp (class_name, "alnum") == 0)
3581     BUILD_CHARCLASS_LOOP (isalnum);
3582   else if (strcmp (class_name, "cntrl") == 0)
3583     BUILD_CHARCLASS_LOOP (iscntrl);
3584   else if (strcmp (class_name, "lower") == 0)
3585     BUILD_CHARCLASS_LOOP (islower);
3586   else if (strcmp (class_name, "space") == 0)
3587     BUILD_CHARCLASS_LOOP (isspace);
3588   else if (strcmp (class_name, "alpha") == 0)
3589     BUILD_CHARCLASS_LOOP (isalpha);
3590   else if (strcmp (class_name, "digit") == 0)
3591     BUILD_CHARCLASS_LOOP (isdigit);
3592   else if (strcmp (class_name, "print") == 0)
3593     BUILD_CHARCLASS_LOOP (isprint);
3594   else if (strcmp (class_name, "upper") == 0)
3595     BUILD_CHARCLASS_LOOP (isupper);
3596   else if (strcmp (class_name, "blank") == 0)
3597 #ifndef GAWK
3598     BUILD_CHARCLASS_LOOP (isblank);
3599 #else
3600     /* see comments above */
3601     BUILD_CHARCLASS_LOOP (is_blank);
3602 #endif
3603   else if (strcmp (class_name, "graph") == 0)
3604     BUILD_CHARCLASS_LOOP (isgraph);
3605   else if (strcmp (class_name, "punct") == 0)
3606     BUILD_CHARCLASS_LOOP (ispunct);
3607   else if (strcmp (class_name, "xdigit") == 0)
3608     BUILD_CHARCLASS_LOOP (isxdigit);
3609   else
3610     return REG_ECTYPE;
3611
3612   return REG_NOERROR;
3613 }
3614
3615 static bin_tree_t *
3616 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3617                     const char *class_name,
3618                     const char *extra, int non_match,
3619                     reg_errcode_t *err)
3620 {
3621   re_bitset_ptr_t sbcset;
3622 #ifdef RE_ENABLE_I18N
3623   re_charset_t *mbcset;
3624   int alloc = 0;
3625 #endif /* not RE_ENABLE_I18N */
3626   reg_errcode_t ret;
3627   re_token_t br_token;
3628   bin_tree_t *tree;
3629
3630   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3631 #ifdef RE_ENABLE_I18N
3632   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3633 #endif /* RE_ENABLE_I18N */
3634
3635 #ifdef RE_ENABLE_I18N
3636   if (BE (sbcset == NULL || mbcset == NULL, 0))
3637 #else /* not RE_ENABLE_I18N */
3638   if (BE (sbcset == NULL, 0))
3639 #endif /* not RE_ENABLE_I18N */
3640     {
3641       *err = REG_ESPACE;
3642       return NULL;
3643     }
3644
3645   if (non_match)
3646     {
3647 #ifdef RE_ENABLE_I18N
3648       mbcset->non_match = 1;
3649 #endif /* not RE_ENABLE_I18N */
3650     }
3651
3652   /* We don't care the syntax in this case.  */
3653   ret = build_charclass (trans, sbcset,
3654 #ifdef RE_ENABLE_I18N
3655                          mbcset, &alloc,
3656 #endif /* RE_ENABLE_I18N */
3657                          class_name, 0);
3658
3659   if (BE (ret != REG_NOERROR, 0))
3660     {
3661       re_free (sbcset);
3662 #ifdef RE_ENABLE_I18N
3663       free_charset (mbcset);
3664 #endif /* RE_ENABLE_I18N */
3665       *err = ret;
3666       return NULL;
3667     }
3668   /* \w match '_' also.  */
3669   for (; *extra; extra++)
3670     bitset_set (sbcset, *extra);
3671
3672   /* If it is non-matching list.  */
3673   if (non_match)
3674     bitset_not (sbcset);
3675
3676 #ifdef RE_ENABLE_I18N
3677   /* Ensure only single byte characters are set.  */
3678   if (dfa->mb_cur_max > 1)
3679     bitset_mask (sbcset, dfa->sb_char);
3680 #endif
3681
3682   /* Build a tree for simple bracket.  */
3683   br_token.type = SIMPLE_BRACKET;
3684   br_token.opr.sbcset = sbcset;
3685   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3686   if (BE (tree == NULL, 0))
3687     goto build_word_op_espace;
3688
3689 #ifdef RE_ENABLE_I18N
3690   if (dfa->mb_cur_max > 1)
3691     {
3692       bin_tree_t *mbc_tree;
3693       /* Build a tree for complex bracket.  */
3694       br_token.type = COMPLEX_BRACKET;
3695       br_token.opr.mbcset = mbcset;
3696       dfa->has_mb_node = 1;
3697       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3698       if (BE (mbc_tree == NULL, 0))
3699         goto build_word_op_espace;
3700       /* Then join them by ALT node.  */
3701       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3702       if (BE (mbc_tree != NULL, 1))
3703         return tree;
3704     }
3705   else
3706     {
3707       free_charset (mbcset);
3708       return tree;
3709     }
3710 #else /* not RE_ENABLE_I18N */
3711   return tree;
3712 #endif /* not RE_ENABLE_I18N */
3713
3714  build_word_op_espace:
3715   re_free (sbcset);
3716 #ifdef RE_ENABLE_I18N
3717   free_charset (mbcset);
3718 #endif /* RE_ENABLE_I18N */
3719   *err = REG_ESPACE;
3720   return NULL;
3721 }
3722
3723 /* This is intended for the expressions like "a{1,3}".
3724    Fetch a number from `input', and return the number.
3725    Return -1, if the number field is empty like "{,1}".
3726    Return -2, If an error is occured.  */
3727
3728 static int
3729 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3730 {
3731   int num = -1;
3732   unsigned char c;
3733   while (1)
3734     {
3735       fetch_token (token, input, syntax);
3736       c = token->opr.c;
3737       if (BE (token->type == END_OF_RE, 0))
3738         return -2;
3739       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3740         break;
3741       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3742              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3743       num = (num > RE_DUP_MAX) ? -2 : num;
3744     }
3745   return num;
3746 }
3747 \f
3748 #ifdef RE_ENABLE_I18N
3749 static void
3750 free_charset (re_charset_t *cset)
3751 {
3752   re_free (cset->mbchars);
3753 # ifdef _LIBC
3754   re_free (cset->coll_syms);
3755   re_free (cset->equiv_classes);
3756   re_free (cset->range_starts);
3757   re_free (cset->range_ends);
3758 # endif
3759   re_free (cset->char_classes);
3760   re_free (cset);
3761 }
3762 #endif /* RE_ENABLE_I18N */
3763 \f
3764 /* Functions for binary tree operation.  */
3765
3766 /* Create a tree node.  */
3767
3768 static bin_tree_t *
3769 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3770              re_token_type_t type)
3771 {
3772   re_token_t t;
3773   t.type = type;
3774   return create_token_tree (dfa, left, right, &t);
3775 }
3776
3777 static bin_tree_t *
3778 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3779                    const re_token_t *token)
3780 {
3781   bin_tree_t *tree;
3782   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3783     {
3784       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3785
3786       if (storage == NULL)
3787         return NULL;
3788       storage->next = dfa->str_tree_storage;
3789       dfa->str_tree_storage = storage;
3790       dfa->str_tree_storage_idx = 0;
3791     }
3792   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3793
3794   tree->parent = NULL;
3795   tree->left = left;
3796   tree->right = right;
3797   tree->token = *token;
3798   tree->token.duplicated = 0;
3799   tree->token.opt_subexp = 0;
3800   tree->first = NULL;
3801   tree->next = NULL;
3802   tree->node_idx = -1;
3803
3804   if (left != NULL)
3805     left->parent = tree;
3806   if (right != NULL)
3807     right->parent = tree;
3808   return tree;
3809 }
3810
3811 /* Mark the tree SRC as an optional subexpression.
3812    To be called from preorder or postorder.  */
3813
3814 static reg_errcode_t
3815 mark_opt_subexp (void *extra, bin_tree_t *node)
3816 {
3817   int idx = (int) (long) extra;
3818   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3819     node->token.opt_subexp = 1;
3820
3821   return REG_NOERROR;
3822 }
3823
3824 /* Free the allocated memory inside NODE. */
3825
3826 static void
3827 free_token (re_token_t *node)
3828 {
3829 #ifdef RE_ENABLE_I18N
3830   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3831     free_charset (node->opr.mbcset);
3832   else
3833 #endif /* RE_ENABLE_I18N */
3834     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3835       re_free (node->opr.sbcset);
3836 }
3837
3838 /* Worker function for tree walking.  Free the allocated memory inside NODE
3839    and its children. */
3840
3841 static reg_errcode_t
3842 free_tree (void *extra, bin_tree_t *node)
3843 {
3844   free_token (&node->token);
3845   return REG_NOERROR;
3846 }
3847
3848
3849 /* Duplicate the node SRC, and return new node.  This is a preorder
3850    visit similar to the one implemented by the generic visitor, but
3851    we need more infrastructure to maintain two parallel trees --- so,
3852    it's easier to duplicate.  */
3853
3854 static bin_tree_t *
3855 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3856 {
3857   const bin_tree_t *node;
3858   bin_tree_t *dup_root;
3859   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3860
3861   for (node = root; ; )
3862     {
3863       /* Create a new tree and link it back to the current parent.  */
3864       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3865       if (*p_new == NULL)
3866         return NULL;
3867       (*p_new)->parent = dup_node;
3868       (*p_new)->token.duplicated = 1;
3869       dup_node = *p_new;
3870
3871       /* Go to the left node, or up and to the right.  */
3872       if (node->left)
3873         {
3874           node = node->left;
3875           p_new = &dup_node->left;
3876         }
3877       else
3878         {
3879           const bin_tree_t *prev = NULL;
3880           while (node->right == prev || node->right == NULL)
3881             {
3882               prev = node;
3883               node = node->parent;
3884               dup_node = dup_node->parent;
3885               if (!node)
3886                 return dup_root;
3887             }
3888           node = node->right;
3889           p_new = &dup_node->right;
3890         }
3891     }
3892 }