gl/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           size_t length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          Idx node, bool root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static Idx fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax) internal_function;
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   Idx nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             bool accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         Idx *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  94                                       bitset_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       Idx *char_class_alloc,
  97                                       const unsigned char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 103                                       bitset_t sbcset,
 104                                       const unsigned char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        RE_TRANSLATE_TYPE trans,
 109                                        const unsigned char *class_name,
 110                                        const unsigned char *extra,
 111                                        bool non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 static const char __re_error_msgid[] =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 static const size_t __re_error_msgid_idx[] =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 210    are set in BUFP on entry.  */
 211
 212 #ifdef _LIBC
 213 const char *
 214 re_compile_pattern (pattern, length, bufp)
 215     const char *pattern;
 216     size_t length;
 217     struct re_pattern_buffer *bufp;
 218 #else /* size_t might promote */
 219 const char *
 220 re_compile_pattern (const char *pattern, size_t length,
 221                     struct re_pattern_buffer *bufp)
 222 #endif
 223 {
 224   reg_errcode_t ret;
 225
 226   /* And GNU code determines whether or not to get register information
 227      by passing null for the REGS argument to re_match, etc., not by
 228      setting no_sub, unless RE_NO_SUB is set.  */
 229   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 230
 231   /* Match anchors at newline.  */
 232   bufp->newline_anchor = 1;
 233
 234   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 235
 236   if (!ret)
 237     return NULL;
 238   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 239 }
 240 #ifdef _LIBC
 241 weak_alias (__re_compile_pattern, re_compile_pattern)
 242 #endif
 243
 244 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 245    also be assigned to arbitrarily: each pattern buffer stores its own
 246    syntax, so it can be changed between regex compilations.  */
 247 /* This has no initializer because initialized variables in Emacs
 248    become read-only after dumping.  */
 249 reg_syntax_t re_syntax_options;
 250
 251
 252 /* Specify the precise syntax of regexps for compilation.  This provides
 253    for compatibility for various utilities which historically have
 254    different, incompatible syntaxes.
 255
 256    The argument SYNTAX is a bit mask comprised of the various bits
 257    defined in regex.h.  We return the old syntax.  */
 258
 259 reg_syntax_t
 260 re_set_syntax (syntax)
 261     reg_syntax_t syntax;
 262 {
 263   reg_syntax_t ret = re_syntax_options;
 264
 265   re_syntax_options = syntax;
 266   return ret;
 267 }
 268 #ifdef _LIBC
 269 weak_alias (__re_set_syntax, re_set_syntax)
 270 #endif
 271
 272 int
 273 re_compile_fastmap (bufp)
 274     struct re_pattern_buffer *bufp;
 275 {
 276   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 277   char *fastmap = bufp->fastmap;
 278
 279   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 280   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 281   if (dfa->init_state != dfa->init_state_word)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 283   if (dfa->init_state != dfa->init_state_nl)
 284     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 285   if (dfa->init_state != dfa->init_state_begbuf)
 286     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 287   bufp->fastmap_accurate = 1;
 288   return 0;
 289 }
 290 #ifdef _LIBC
 291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 292 #endif
 293
 294 static inline void
 295 __attribute ((always_inline))
 296 re_set_fastmap (char *fastmap, bool icase, int ch)
 297 {
 298   fastmap[ch] = 1;
 299   if (icase)
 300     fastmap[tolower (ch)] = 1;
 301 }
 302
 303 /* Helper function for re_compile_fastmap.
 304    Compile fastmap for the initial_state INIT_STATE.  */
 305
 306 static void
 307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 308                          char *fastmap)
 309 {
 310   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 311   Idx node_cnt;
 312   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 313   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 314     {
 315       Idx node = init_state->nodes.elems[node_cnt];
 316       re_token_type_t type = dfa->nodes[node].type;
 317
 318       if (type == CHARACTER)
 319         {
 320           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 321 #ifdef RE_ENABLE_I18N
 322           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 323             {
 324               unsigned char buf[MB_LEN_MAX];
 325               unsigned char *p;
 326               wchar_t wc;
 327               mbstate_t state;
 328
 329               p = buf;
 330               *p++ = dfa->nodes[node].opr.c;
 331               while (++node < dfa->nodes_len
 332                      && dfa->nodes[node].type == CHARACTER
 333                      && dfa->nodes[node].mb_partial)
 334                 *p++ = dfa->nodes[node].opr.c;
 335               memset (&state, '\0', sizeof (state));
 336               if (mbrtowc (&wc, (const char *) buf, p - buf,
 337                            &state) == p - buf
 338                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 339                       != (size_t) -1))
 340                 re_set_fastmap (fastmap, false, buf[0]);
 341             }
 342 #endif
 343         }
 344       else if (type == SIMPLE_BRACKET)
 345         {
 346           int i, ch;
 347           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 348             {
 349               int j;
 350               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 351               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 352                 if (w & ((bitset_word_t) 1 << j))
 353                   re_set_fastmap (fastmap, icase, ch);
 354             }
 355         }
 356 #ifdef RE_ENABLE_I18N
 357       else if (type == COMPLEX_BRACKET)
 358         {
 359           Idx i;
 360           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 361           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 362               || cset->nranges || cset->nchar_classes)
 363             {
 364 # ifdef _LIBC
 365               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 366                 {
 367                   /* In this case we want to catch the bytes which are
 368                      the first byte of any collation elements.
 369                      e.g. In da_DK, we want to catch 'a' since "aa"
 370                           is a valid collation element, and don't catch
 371                           'b' since 'b' is the only collation element
 372                           which starts from 'b'.  */
 373                   const int32_t *table = (const int32_t *)
 374                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 375                   for (i = 0; i < SBC_MAX; ++i)
 376                     if (table[i] < 0)
 377                       re_set_fastmap (fastmap, icase, i);
 378                 }
 379 # else
 380               if (dfa->mb_cur_max > 1)
 381                 for (i = 0; i < SBC_MAX; ++i)
 382                   if (__btowc (i) == WEOF)
 383                     re_set_fastmap (fastmap, icase, i);
 384 # endif /* not _LIBC */
 385             }
 386           for (i = 0; i < cset->nmbchars; ++i)
 387             {
 388               char buf[256];
 389               mbstate_t state;
 390               memset (&state, '\0', sizeof (state));
 391               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 392                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 393               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 394                 {
 395                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 396                       != (size_t) -1)
 397                     re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 398                 }
 399             }
 400         }
 401 #endif /* RE_ENABLE_I18N */
 402       else if (type == OP_PERIOD
 403 #ifdef RE_ENABLE_I18N
 404                || type == OP_UTF8_PERIOD
 405 #endif /* RE_ENABLE_I18N */
 406                || type == END_OF_RE)
 407         {
 408           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 409           if (type == END_OF_RE)
 410             bufp->can_be_null = 1;
 411           return;
 412         }
 413     }
 414 }
 415 \f
 416 /* Entry point for POSIX code.  */
 417 /* regcomp takes a regular expression as a string and compiles it.
 418
 419    PREG is a regex_t *.  We do not expect any fields to be initialized,
 420    since POSIX says we shouldn't.  Thus, we set
 421
 422      `buffer' to the compiled pattern;
 423      `used' to the length of the compiled pattern;
 424      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 425        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 426        RE_SYNTAX_POSIX_BASIC;
 427      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 428      `fastmap' to an allocated space for the fastmap;
 429      `fastmap_accurate' to zero;
 430      `re_nsub' to the number of subexpressions in PATTERN.
 431
 432    PATTERN is the address of the pattern string.
 433
 434    CFLAGS is a series of bits which affect compilation.
 435
 436      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 437      use POSIX basic syntax.
 438
 439      If REG_NEWLINE is set, then . and [^...] don't match newline.
 440      Also, regexec will try a match beginning after every newline.
 441
 442      If REG_ICASE is set, then we considers upper- and lowercase
 443      versions of letters to be equivalent when matching.
 444
 445      If REG_NOSUB is set, then when PREG is passed to regexec, that
 446      routine will report only success or failure, and nothing about the
 447      registers.
 448
 449    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 450    the return codes and their meanings.)  */
 451
 452 int
 453 regcomp (preg, pattern, cflags)
 454     regex_t *_Restrict_ preg;
 455     const char *_Restrict_ pattern;
 456     int cflags;
 457 {
 458   reg_errcode_t ret;
 459   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 460                          : RE_SYNTAX_POSIX_BASIC);
 461
 462   preg->buffer = NULL;
 463   preg->allocated = 0;
 464   preg->used = 0;
 465
 466   /* Try to allocate space for the fastmap.  */
 467   preg->fastmap = re_malloc (char, SBC_MAX);
 468   if (BE (preg->fastmap == NULL, 0))
 469     return REG_ESPACE;
 470
 471   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 472
 473   /* If REG_NEWLINE is set, newlines are treated differently.  */
 474   if (cflags & REG_NEWLINE)
 475     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 476       syntax &= ~RE_DOT_NEWLINE;
 477       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 478       /* It also changes the matching behavior.  */
 479       preg->newline_anchor = 1;
 480     }
 481   else
 482     preg->newline_anchor = 0;
 483   preg->no_sub = !!(cflags & REG_NOSUB);
 484   preg->translate = NULL;
 485
 486   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 487
 488   /* POSIX doesn't distinguish between an unmatched open-group and an
 489      unmatched close-group: both are REG_EPAREN.  */
 490   if (ret == REG_ERPAREN)
 491     ret = REG_EPAREN;
 492
 493   /* We have already checked preg->fastmap != NULL.  */
 494   if (BE (ret == REG_NOERROR, 1))
 495     /* Compute the fastmap now, since regexec cannot modify the pattern
 496        buffer.  This function never fails in this implementation.  */
 497     (void) re_compile_fastmap (preg);
 498   else
 499     {
 500       /* Some error occurred while compiling the expression.  */
 501       re_free (preg->fastmap);
 502       preg->fastmap = NULL;
 503     }
 504
 505   return (int) ret;
 506 }
 507 #ifdef _LIBC
 508 weak_alias (__regcomp, regcomp)
 509 #endif
 510
 511 /* Returns a message corresponding to an error code, ERRCODE, returned
 512    from either regcomp or regexec.   We don't use PREG here.  */
 513
 514 #ifdef _LIBC
 515 size_t
 516 regerror (errcode, preg, errbuf, errbuf_size)
 517     int errcode;
 518     const regex_t *_Restrict_ preg;
 519     char *_Restrict_ errbuf;
 520     size_t errbuf_size;
 521 #else /* size_t might promote */
 522 size_t
 523 regerror (int errcode, const regex_t *_Restrict_ preg,
 524           char *_Restrict_ errbuf, size_t errbuf_size)
 525 #endif
 526 {
 527   const char *msg;
 528   size_t msg_size;
 529
 530   if (BE (errcode < 0
 531           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 532                                / sizeof (__re_error_msgid_idx[0])), 0))
 533     /* Only error codes returned by the rest of the code should be passed
 534        to this routine.  If we are given anything else, or if other regex
 535        code generates an invalid error code, then the program has a bug.
 536        Dump core so we can fix it.  */
 537     abort ();
 538
 539   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 540
 541   msg_size = strlen (msg) + 1; /* Includes the null.  */
 542
 543   if (BE (errbuf_size != 0, 1))
 544     {
 545       size_t cpy_size = msg_size;
 546       if (BE (msg_size > errbuf_size, 0))
 547         {
 548           cpy_size = errbuf_size - 1;
 549           errbuf[cpy_size] = '\0';
 550         }
 551       memcpy (errbuf, msg, cpy_size);
 552     }
 553
 554   return msg_size;
 555 }
 556 #ifdef _LIBC
 557 weak_alias (__regerror, regerror)
 558 #endif
 559
 560
 561 #ifdef RE_ENABLE_I18N
 562 /* This static array is used for the map to single-byte characters when
 563    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 564    it the same all the time.  UTF-8 is the preferred encoding so this is
 565    a worthwhile optimization.  */
 566 static const bitset_t utf8_sb_map =
 567 {
 568   /* Set the first 128 bits.  */
 569 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 570 #  error "bitset_word_t is narrower than 32 bits"
 571 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 572   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 573 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 574   BITSET_WORD_MAX, BITSET_WORD_MAX,
 575 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 576   BITSET_WORD_MAX,
 577 # endif
 578   (BITSET_WORD_MAX
 579    >> (SBC_MAX % BITSET_WORD_BITS == 0
 580        ? 0
 581        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 582 };
 583 #endif
 584
 585
 586 static void
 587 free_dfa_content (re_dfa_t *dfa)
 588 {
 589   Idx i, j;
 590
 591   if (dfa->nodes)
 592     for (i = 0; i < dfa->nodes_len; ++i)
 593       free_token (dfa->nodes + i);
 594   re_free (dfa->nexts);
 595   for (i = 0; i < dfa->nodes_len; ++i)
 596     {
 597       if (dfa->eclosures != NULL)
 598         re_node_set_free (dfa->eclosures + i);
 599       if (dfa->inveclosures != NULL)
 600         re_node_set_free (dfa->inveclosures + i);
 601       if (dfa->edests != NULL)
 602         re_node_set_free (dfa->edests + i);
 603     }
 604   re_free (dfa->edests);
 605   re_free (dfa->eclosures);
 606   re_free (dfa->inveclosures);
 607   re_free (dfa->nodes);
 608
 609   if (dfa->state_table)
 610     for (i = 0; i <= dfa->state_hash_mask; ++i)
 611       {
 612         struct re_state_table_entry *entry = dfa->state_table + i;
 613         for (j = 0; j < entry->num; ++j)
 614           {
 615             re_dfastate_t *state = entry->array[j];
 616             free_state (state);
 617           }
 618         re_free (entry->array);
 619       }
 620   re_free (dfa->state_table);
 621 #ifdef RE_ENABLE_I18N
 622   if (dfa->sb_char != utf8_sb_map)
 623     re_free (dfa->sb_char);
 624 #endif
 625   re_free (dfa->subexp_map);
 626 #ifdef DEBUG
 627   re_free (dfa->re_str);
 628 #endif
 629
 630   re_free (dfa);
 631 }
 632
 633
 634 /* Free dynamically allocated space used by PREG.  */
 635
 636 void
 637 regfree (preg)
 638     regex_t *preg;
 639 {
 640   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 641   if (BE (dfa != NULL, 1))
 642     free_dfa_content (dfa);
 643   preg->buffer = NULL;
 644   preg->allocated = 0;
 645
 646   re_free (preg->fastmap);
 647   preg->fastmap = NULL;
 648
 649   re_free (preg->translate);
 650   preg->translate = NULL;
 651 }
 652 #ifdef _LIBC
 653 weak_alias (__regfree, regfree)
 654 #endif
 655 \f
 656 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 657    them unless specifically requested.  */
 658
 659 #if defined _REGEX_RE_COMP || defined _LIBC
 660
 661 /* BSD has one and only one pattern buffer.  */
 662 static struct re_pattern_buffer re_comp_buf;
 663
 664 char *
 665 # ifdef _LIBC
 666 /* Make these definitions weak in libc, so POSIX programs can redefine
 667    these names if they don't use our functions, and still use
 668    regcomp/regexec above without link errors.  */
 669 weak_function
 670 # endif
 671 re_comp (s)
 672      const char *s;
 673 {
 674   reg_errcode_t ret;
 675   char *fastmap;
 676
 677   if (!s)
 678     {
 679       if (!re_comp_buf.buffer)
 680         return gettext ("No previous regular expression");
 681       return 0;
 682     }
 683
 684   if (re_comp_buf.buffer)
 685     {
 686       fastmap = re_comp_buf.fastmap;
 687       re_comp_buf.fastmap = NULL;
 688       __regfree (&re_comp_buf);
 689       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 690       re_comp_buf.fastmap = fastmap;
 691     }
 692
 693   if (re_comp_buf.fastmap == NULL)
 694     {
 695       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 696       if (re_comp_buf.fastmap == NULL)
 697         return (char *) gettext (__re_error_msgid
 698                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 699     }
 700
 701   /* Since `re_exec' always passes NULL for the `regs' argument, we
 702      don't need to initialize the pattern buffer fields which affect it.  */
 703
 704   /* Match anchors at newlines.  */
 705   re_comp_buf.newline_anchor = 1;
 706
 707   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 708
 709   if (!ret)
 710     return NULL;
 711
 712   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 713   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 714 }
 715
 716 #ifdef _LIBC
 717 libc_freeres_fn (free_mem)
 718 {
 719   __regfree (&re_comp_buf);
 720 }
 721 #endif
 722
 723 #endif /* _REGEX_RE_COMP */
 724 \f
 725 /* Internal entry point.
 726    Compile the regular expression PATTERN, whose length is LENGTH.
 727    SYNTAX indicate regular expression's syntax.  */
 728
 729 static reg_errcode_t
 730 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 731                      reg_syntax_t syntax)
 732 {
 733   reg_errcode_t err = REG_NOERROR;
 734   re_dfa_t *dfa;
 735   re_string_t regexp;
 736
 737   /* Initialize the pattern buffer.  */
 738   preg->fastmap_accurate = 0;
 739   preg->syntax = syntax;
 740   preg->not_bol = preg->not_eol = 0;
 741   preg->used = 0;
 742   preg->re_nsub = 0;
 743   preg->can_be_null = 0;
 744   preg->regs_allocated = REGS_UNALLOCATED;
 745
 746   /* Initialize the dfa.  */
 747   dfa = (re_dfa_t *) preg->buffer;
 748   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 749     {
 750       /* If zero allocated, but buffer is non-null, try to realloc
 751          enough space.  This loses if buffer's address is bogus, but
 752          that is the user's responsibility.  If ->buffer is NULL this
 753          is a simple allocation.  */
 754       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 755       if (dfa == NULL)
 756         return REG_ESPACE;
 757       preg->allocated = sizeof (re_dfa_t);
 758       preg->buffer = (unsigned char *) dfa;
 759     }
 760   preg->used = sizeof (re_dfa_t);
 761
 762   err = init_dfa (dfa, length);
 763   if (BE (err != REG_NOERROR, 0))
 764     {
 765       free_dfa_content (dfa);
 766       preg->buffer = NULL;
 767       preg->allocated = 0;
 768       return err;
 769     }
 770 #ifdef DEBUG
 771   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 772   dfa->re_str = re_malloc (char, length + 1);
 773   strncpy (dfa->re_str, pattern, length + 1);
 774 #endif
 775
 776   __libc_lock_init (dfa->lock);
 777
 778   err = re_string_construct (&regexp, pattern, length, preg->translate,
 779                              syntax & RE_ICASE, dfa);
 780   if (BE (err != REG_NOERROR, 0))
 781     {
 782     re_compile_internal_free_return:
 783       free_workarea_compile (preg);
 784       re_string_destruct (&regexp);
 785       free_dfa_content (dfa);
 786       preg->buffer = NULL;
 787       preg->allocated = 0;
 788       return err;
 789     }
 790
 791   /* Parse the regular expression, and build a structure tree.  */
 792   preg->re_nsub = 0;
 793   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 794   if (BE (dfa->str_tree == NULL, 0))
 795     goto re_compile_internal_free_return;
 796
 797   /* Analyze the tree and create the nfa.  */
 798   err = analyze (preg);
 799   if (BE (err != REG_NOERROR, 0))
 800     goto re_compile_internal_free_return;
 801
 802 #ifdef RE_ENABLE_I18N
 803   /* If possible, do searching in single byte encoding to speed things up.  */
 804   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 805     optimize_utf8 (dfa);
 806 #endif
 807
 808   /* Then create the initial state of the dfa.  */
 809   err = create_initial_state (dfa);
 810
 811   /* Release work areas.  */
 812   free_workarea_compile (preg);
 813   re_string_destruct (&regexp);
 814
 815   if (BE (err != REG_NOERROR, 0))
 816     {
 817       free_dfa_content (dfa);
 818       preg->buffer = NULL;
 819       preg->allocated = 0;
 820     }
 821
 822   return err;
 823 }
 824
 825 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 826    as the initial length of some arrays.  */
 827
 828 static reg_errcode_t
 829 init_dfa (re_dfa_t *dfa, size_t pat_len)
 830 {
 831   __re_size_t table_size;
 832 #ifdef RE_ENABLE_I18N
 833   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 834 #else
 835   size_t max_i18n_object_size = 0;
 836 #endif
 837   size_t max_object_size =
 838     MAX (sizeof (struct re_state_table_entry),
 839          MAX (sizeof (re_token_t),
 840               MAX (sizeof (re_node_set),
 841                    MAX (sizeof (regmatch_t),
 842                         max_i18n_object_size))));
 843
 844   memset (dfa, '\0', sizeof (re_dfa_t));
 845
 846   /* Force allocation of str_tree_storage the first time.  */
 847   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 848
 849   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 850      calculation below, and for similar doubling calculations
 851      elsewhere.  And it's <= rather than <, because some of the
 852      doubling calculations add 1 afterwards.  */
 853   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
 854     return REG_ESPACE;
 855
 856   dfa->nodes_alloc = pat_len + 1;
 857   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 858
 859   /*  table_size = 2 ^ ceil(log pat_len) */
 860   for (table_size = 1; ; table_size <<= 1)
 861     if (table_size > pat_len)
 862       break;
 863
 864   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 865   dfa->state_hash_mask = table_size - 1;
 866
 867   dfa->mb_cur_max = MB_CUR_MAX;
 868 #ifdef _LIBC
 869   if (dfa->mb_cur_max == 6
 870       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 871     dfa->is_utf8 = 1;
 872   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 873                        != 0);
 874 #else
 875   if (strcmp (locale_charset (), "UTF-8") == 0)
 876     dfa->is_utf8 = 1;
 877
 878   /* We check exhaustively in the loop below if this charset is a
 879      superset of ASCII.  */
 880   dfa->map_notascii = 0;
 881 #endif
 882
 883 #ifdef RE_ENABLE_I18N
 884   if (dfa->mb_cur_max > 1)
 885     {
 886       if (dfa->is_utf8)
 887         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 888       else
 889         {
 890           int i, j, ch;
 891
 892           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 893           if (BE (dfa->sb_char == NULL, 0))
 894             return REG_ESPACE;
 895
 896           /* Set the bits corresponding to single byte chars.  */
 897           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 898             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 899               {
 900                 wint_t wch = __btowc (ch);
 901                 if (wch != WEOF)
 902                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 903 # ifndef _LIBC
 904                 if (isascii (ch) && wch != ch)
 905                   dfa->map_notascii = 1;
 906 # endif
 907               }
 908         }
 909     }
 910 #endif
 911
 912   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 913     return REG_ESPACE;
 914   return REG_NOERROR;
 915 }
 916
 917 /* Initialize WORD_CHAR table, which indicate which character is
 918    "word".  In this case "word" means that it is the word construction
 919    character used by some operators like "\<", "\>", etc.  */
 920
 921 static void
 922 internal_function
 923 init_word_char (re_dfa_t *dfa)
 924 {
 925   int i, j, ch;
 926   dfa->word_ops_used = 1;
 927   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 928     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 929       if (isalnum (ch) || ch == '_')
 930         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 931 }
 932
 933 /* Free the work area which are only used while compiling.  */
 934
 935 static void
 936 free_workarea_compile (regex_t *preg)
 937 {
 938   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 939   bin_tree_storage_t *storage, *next;
 940   for (storage = dfa->str_tree_storage; storage; storage = next)
 941     {
 942       next = storage->next;
 943       re_free (storage);
 944     }
 945   dfa->str_tree_storage = NULL;
 946   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 947   dfa->str_tree = NULL;
 948   re_free (dfa->org_indices);
 949   dfa->org_indices = NULL;
 950 }
 951
 952 /* Create initial states for all contexts.  */
 953
 954 static reg_errcode_t
 955 create_initial_state (re_dfa_t *dfa)
 956 {
 957   Idx first, i;
 958   reg_errcode_t err;
 959   re_node_set init_nodes;
 960
 961   /* Initial states have the epsilon closure of the node which is
 962      the first node of the regular expression.  */
 963   first = dfa->str_tree->first->node_idx;
 964   dfa->init_node = first;
 965   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 966   if (BE (err != REG_NOERROR, 0))
 967     return err;
 968
 969   /* The back-references which are in initial states can epsilon transit,
 970      since in this case all of the subexpressions can be null.
 971      Then we add epsilon closures of the nodes which are the next nodes of
 972      the back-references.  */
 973   if (dfa->nbackref > 0)
 974     for (i = 0; i < init_nodes.nelem; ++i)
 975       {
 976         Idx node_idx = init_nodes.elems[i];
 977         re_token_type_t type = dfa->nodes[node_idx].type;
 978
 979         Idx clexp_idx;
 980         if (type != OP_BACK_REF)
 981           continue;
 982         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 983           {
 984             re_token_t *clexp_node;
 985             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 986             if (clexp_node->type == OP_CLOSE_SUBEXP
 987                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 988               break;
 989           }
 990         if (clexp_idx == init_nodes.nelem)
 991           continue;
 992
 993         if (type == OP_BACK_REF)
 994           {
 995             Idx dest_idx = dfa->edests[node_idx].elems[0];
 996             if (!re_node_set_contains (&init_nodes, dest_idx))
 997               {
 998                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 999                 i = 0;
1000               }
1001           }
1002       }
1003
1004   /* It must be the first time to invoke acquire_state.  */
1005   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1006   /* We don't check ERR here, since the initial state must not be NULL.  */
1007   if (BE (dfa->init_state == NULL, 0))
1008     return err;
1009   if (dfa->init_state->has_constraint)
1010     {
1011       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1012                                                        CONTEXT_WORD);
1013       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1014                                                      CONTEXT_NEWLINE);
1015       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1016                                                          &init_nodes,
1017                                                          CONTEXT_NEWLINE
1018                                                          | CONTEXT_BEGBUF);
1019       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1020               || dfa->init_state_begbuf == NULL, 0))
1021         return err;
1022     }
1023   else
1024     dfa->init_state_word = dfa->init_state_nl
1025       = dfa->init_state_begbuf = dfa->init_state;
1026
1027   re_node_set_free (&init_nodes);
1028   return REG_NOERROR;
1029 }
1030 \f
1031 #ifdef RE_ENABLE_I18N
1032 /* If it is possible to do searching in single byte encoding instead of UTF-8
1033    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1034    DFA nodes where needed.  */
1035
1036 static void
1037 optimize_utf8 (re_dfa_t *dfa)
1038 {
1039   Idx node;
1040   int i;
1041   bool mb_chars = false;
1042   bool has_period = false;
1043
1044   for (node = 0; node < dfa->nodes_len; ++node)
1045     switch (dfa->nodes[node].type)
1046       {
1047       case CHARACTER:
1048         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1049           mb_chars = true;
1050         break;
1051       case ANCHOR:
1052         switch (dfa->nodes[node].opr.ctx_type)
1053           {
1054           case LINE_FIRST:
1055           case LINE_LAST:
1056           case BUF_FIRST:
1057           case BUF_LAST:
1058             break;
1059           default:
1060             /* Word anchors etc. cannot be handled.  */
1061             return;
1062           }
1063         break;
1064       case OP_PERIOD:
1065         has_period = true;
1066         break;
1067       case OP_BACK_REF:
1068       case OP_ALT:
1069       case END_OF_RE:
1070       case OP_DUP_ASTERISK:
1071       case OP_OPEN_SUBEXP:
1072       case OP_CLOSE_SUBEXP:
1073         break;
1074       case COMPLEX_BRACKET:
1075         return;
1076       case SIMPLE_BRACKET:
1077         /* Just double check.  */
1078         {
1079           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1080                         ? 0
1081                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1082           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1083             {
1084               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1085                 return;
1086               rshift = 0;
1087             }
1088         }
1089         break;
1090       default:
1091         abort ();
1092       }
1093
1094   if (mb_chars || has_period)
1095     for (node = 0; node < dfa->nodes_len; ++node)
1096       {
1097         if (dfa->nodes[node].type == CHARACTER
1098             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1099           dfa->nodes[node].mb_partial = 0;
1100         else if (dfa->nodes[node].type == OP_PERIOD)
1101           dfa->nodes[node].type = OP_UTF8_PERIOD;
1102       }
1103
1104   /* The search can be in single byte locale.  */
1105   dfa->mb_cur_max = 1;
1106   dfa->is_utf8 = 0;
1107   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1108 }
1109 #endif
1110 \f
1111 /* Analyze the structure tree, and calculate "first", "next", "edest",
1112    "eclosure", and "inveclosure".  */
1113
1114 static reg_errcode_t
1115 analyze (regex_t *preg)
1116 {
1117   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1118   reg_errcode_t ret;
1119
1120   /* Allocate arrays.  */
1121   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1122   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1123   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1124   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1125   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1126           || dfa->eclosures == NULL, 0))
1127     return REG_ESPACE;
1128
1129   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1130   if (dfa->subexp_map != NULL)
1131     {
1132       Idx i;
1133       for (i = 0; i < preg->re_nsub; i++)
1134         dfa->subexp_map[i] = i;
1135       preorder (dfa->str_tree, optimize_subexps, dfa);
1136       for (i = 0; i < preg->re_nsub; i++)
1137         if (dfa->subexp_map[i] != i)
1138           break;
1139       if (i == preg->re_nsub)
1140         {
1141           free (dfa->subexp_map);
1142           dfa->subexp_map = NULL;
1143         }
1144     }
1145
1146   ret = postorder (dfa->str_tree, lower_subexps, preg);
1147   if (BE (ret != REG_NOERROR, 0))
1148     return ret;
1149   ret = postorder (dfa->str_tree, calc_first, dfa);
1150   if (BE (ret != REG_NOERROR, 0))
1151     return ret;
1152   preorder (dfa->str_tree, calc_next, dfa);
1153   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1154   if (BE (ret != REG_NOERROR, 0))
1155     return ret;
1156   ret = calc_eclosure (dfa);
1157   if (BE (ret != REG_NOERROR, 0))
1158     return ret;
1159
1160   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1161      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1162   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1163       || dfa->nbackref)
1164     {
1165       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1166       if (BE (dfa->inveclosures == NULL, 0))
1167         return REG_ESPACE;
1168       ret = calc_inveclosure (dfa);
1169     }
1170
1171   return ret;
1172 }
1173
1174 /* Our parse trees are very unbalanced, so we cannot use a stack to
1175    implement parse tree visits.  Instead, we use parent pointers and
1176    some hairy code in these two functions.  */
1177 static reg_errcode_t
1178 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1179            void *extra)
1180 {
1181   bin_tree_t *node, *prev;
1182
1183   for (node = root; ; )
1184     {
1185       /* Descend down the tree, preferably to the left (or to the right
1186          if that's the only child).  */
1187       while (node->left || node->right)
1188         if (node->left)
1189           node = node->left;
1190         else
1191           node = node->right;
1192
1193       do
1194         {
1195           reg_errcode_t err = fn (extra, node);
1196           if (BE (err != REG_NOERROR, 0))
1197             return err;
1198           if (node->parent == NULL)
1199             return REG_NOERROR;
1200           prev = node;
1201           node = node->parent;
1202         }
1203       /* Go up while we have a node that is reached from the right.  */
1204       while (node->right == prev || node->right == NULL);
1205       node = node->right;
1206     }
1207 }
1208
1209 static reg_errcode_t
1210 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1211           void *extra)
1212 {
1213   bin_tree_t *node;
1214
1215   for (node = root; ; )
1216     {
1217       reg_errcode_t err = fn (extra, node);
1218       if (BE (err != REG_NOERROR, 0))
1219         return err;
1220
1221       /* Go to the left node, or up and to the right.  */
1222       if (node->left)
1223         node = node->left;
1224       else
1225         {
1226           bin_tree_t *prev = NULL;
1227           while (node->right == prev || node->right == NULL)
1228             {
1229               prev = node;
1230               node = node->parent;
1231               if (!node)
1232                 return REG_NOERROR;
1233             }
1234           node = node->right;
1235         }
1236     }
1237 }
1238
1239 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1240    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1241    backreferences as well.  Requires a preorder visit.  */
1242 static reg_errcode_t
1243 optimize_subexps (void *extra, bin_tree_t *node)
1244 {
1245   re_dfa_t *dfa = (re_dfa_t *) extra;
1246
1247   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1248     {
1249       int idx = node->token.opr.idx;
1250       node->token.opr.idx = dfa->subexp_map[idx];
1251       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1252     }
1253
1254   else if (node->token.type == SUBEXP
1255            && node->left && node->left->token.type == SUBEXP)
1256     {
1257       Idx other_idx = node->left->token.opr.idx;
1258
1259       node->left = node->left->left;
1260       if (node->left)
1261         node->left->parent = node;
1262
1263       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1264       if (other_idx < BITSET_WORD_BITS)
1265         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1266     }
1267
1268   return REG_NOERROR;
1269 }
1270
1271 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1272    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1273 static reg_errcode_t
1274 lower_subexps (void *extra, bin_tree_t *node)
1275 {
1276   regex_t *preg = (regex_t *) extra;
1277   reg_errcode_t err = REG_NOERROR;
1278
1279   if (node->left && node->left->token.type == SUBEXP)
1280     {
1281       node->left = lower_subexp (&err, preg, node->left);
1282       if (node->left)
1283         node->left->parent = node;
1284     }
1285   if (node->right && node->right->token.type == SUBEXP)
1286     {
1287       node->right = lower_subexp (&err, preg, node->right);
1288       if (node->right)
1289         node->right->parent = node;
1290     }
1291
1292   return err;
1293 }
1294
1295 static bin_tree_t *
1296 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1297 {
1298   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1299   bin_tree_t *body = node->left;
1300   bin_tree_t *op, *cls, *tree1, *tree;
1301
1302   if (preg->no_sub
1303       /* We do not optimize empty subexpressions, because otherwise we may
1304          have bad CONCAT nodes with NULL children.  This is obviously not
1305          very common, so we do not lose much.  An example that triggers
1306          this case is the sed "script" /\(\)/x.  */
1307       && node->left != NULL
1308       && (node->token.opr.idx >= BITSET_WORD_BITS
1309           || !(dfa->used_bkref_map
1310                & ((bitset_word_t) 1 << node->token.opr.idx))))
1311     return node->left;
1312
1313   /* Convert the SUBEXP node to the concatenation of an
1314      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1315   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1316   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1317   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1318   tree = create_tree (dfa, op, tree1, CONCAT);
1319   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1320     {
1321       *err = REG_ESPACE;
1322       return NULL;
1323     }
1324
1325   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1326   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1327   return tree;
1328 }
1329
1330 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1331    nodes.  Requires a postorder visit.  */
1332 static reg_errcode_t
1333 calc_first (void *extra, bin_tree_t *node)
1334 {
1335   re_dfa_t *dfa = (re_dfa_t *) extra;
1336   if (node->token.type == CONCAT)
1337     {
1338       node->first = node->left->first;
1339       node->node_idx = node->left->node_idx;
1340     }
1341   else
1342     {
1343       node->first = node;
1344       node->node_idx = re_dfa_add_node (dfa, node->token);
1345       if (BE (node->node_idx == REG_MISSING, 0))
1346         return REG_ESPACE;
1347     }
1348   return REG_NOERROR;
1349 }
1350
1351 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1352 static reg_errcode_t
1353 calc_next (void *extra, bin_tree_t *node)
1354 {
1355   switch (node->token.type)
1356     {
1357     case OP_DUP_ASTERISK:
1358       node->left->next = node;
1359       break;
1360     case CONCAT:
1361       node->left->next = node->right->first;
1362       node->right->next = node->next;
1363       break;
1364     default:
1365       if (node->left)
1366         node->left->next = node->next;
1367       if (node->right)
1368         node->right->next = node->next;
1369       break;
1370     }
1371   return REG_NOERROR;
1372 }
1373
1374 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1375 static reg_errcode_t
1376 link_nfa_nodes (void *extra, bin_tree_t *node)
1377 {
1378   re_dfa_t *dfa = (re_dfa_t *) extra;
1379   Idx idx = node->node_idx;
1380   reg_errcode_t err = REG_NOERROR;
1381
1382   switch (node->token.type)
1383     {
1384     case CONCAT:
1385       break;
1386
1387     case END_OF_RE:
1388       assert (node->next == NULL);
1389       break;
1390
1391     case OP_DUP_ASTERISK:
1392     case OP_ALT:
1393       {
1394         Idx left, right;
1395         dfa->has_plural_match = 1;
1396         if (node->left != NULL)
1397           left = node->left->first->node_idx;
1398         else
1399           left = node->next->node_idx;
1400         if (node->right != NULL)
1401           right = node->right->first->node_idx;
1402         else
1403           right = node->next->node_idx;
1404         assert (REG_VALID_INDEX (left));
1405         assert (REG_VALID_INDEX (right));
1406         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1407       }
1408       break;
1409
1410     case ANCHOR:
1411     case OP_OPEN_SUBEXP:
1412     case OP_CLOSE_SUBEXP:
1413       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1414       break;
1415
1416     case OP_BACK_REF:
1417       dfa->nexts[idx] = node->next->node_idx;
1418       if (node->token.type == OP_BACK_REF)
1419         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1420       break;
1421
1422     default:
1423       assert (!IS_EPSILON_NODE (node->token.type));
1424       dfa->nexts[idx] = node->next->node_idx;
1425       break;
1426     }
1427
1428   return err;
1429 }
1430
1431 /* Duplicate the epsilon closure of the node ROOT_NODE.
1432    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1433    to their own constraint.  */
1434
1435 static reg_errcode_t
1436 internal_function
1437 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1438                         Idx root_node, unsigned int init_constraint)
1439 {
1440   Idx org_node, clone_node;
1441   bool ok;
1442   unsigned int constraint = init_constraint;
1443   for (org_node = top_org_node, clone_node = top_clone_node;;)
1444     {
1445       Idx org_dest, clone_dest;
1446       if (dfa->nodes[org_node].type == OP_BACK_REF)
1447         {
1448           /* If the back reference epsilon-transit, its destination must
1449              also have the constraint.  Then duplicate the epsilon closure
1450              of the destination of the back reference, and store it in
1451              edests of the back reference.  */
1452           org_dest = dfa->nexts[org_node];
1453           re_node_set_empty (dfa->edests + clone_node);
1454           clone_dest = duplicate_node (dfa, org_dest, constraint);
1455           if (BE (clone_dest == REG_MISSING, 0))
1456             return REG_ESPACE;
1457           dfa->nexts[clone_node] = dfa->nexts[org_node];
1458           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1459           if (BE (! ok, 0))
1460             return REG_ESPACE;
1461         }
1462       else if (dfa->edests[org_node].nelem == 0)
1463         {
1464           /* In case of the node can't epsilon-transit, don't duplicate the
1465              destination and store the original destination as the
1466              destination of the node.  */
1467           dfa->nexts[clone_node] = dfa->nexts[org_node];
1468           break;
1469         }
1470       else if (dfa->edests[org_node].nelem == 1)
1471         {
1472           /* In case of the node can epsilon-transit, and it has only one
1473              destination.  */
1474           org_dest = dfa->edests[org_node].elems[0];
1475           re_node_set_empty (dfa->edests + clone_node);
1476           if (dfa->nodes[org_node].type == ANCHOR)
1477             {
1478               /* In case of the node has another constraint, append it.  */
1479               if (org_node == root_node && clone_node != org_node)
1480                 {
1481                   /* ...but if the node is root_node itself, it means the
1482                      epsilon closure have a loop, then tie it to the
1483                      destination of the root_node.  */
1484                   ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1485                   if (BE (! ok, 0))
1486                     return REG_ESPACE;
1487                   break;
1488                 }
1489               constraint |= dfa->nodes[org_node].opr.ctx_type;
1490             }
1491           clone_dest = duplicate_node (dfa, org_dest, constraint);
1492           if (BE (clone_dest == REG_MISSING, 0))
1493             return REG_ESPACE;
1494           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1495           if (BE (! ok, 0))
1496             return REG_ESPACE;
1497         }
1498       else /* dfa->edests[org_node].nelem == 2 */
1499         {
1500           /* In case of the node can epsilon-transit, and it has two
1501              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1502           org_dest = dfa->edests[org_node].elems[0];
1503           re_node_set_empty (dfa->edests + clone_node);
1504           /* Search for a duplicated node which satisfies the constraint.  */
1505           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1506           if (clone_dest == REG_MISSING)
1507             {
1508               /* There are no such a duplicated node, create a new one.  */
1509               reg_errcode_t err;
1510               clone_dest = duplicate_node (dfa, org_dest, constraint);
1511               if (BE (clone_dest == REG_MISSING, 0))
1512                 return REG_ESPACE;
1513               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1514               if (BE (! ok, 0))
1515                 return REG_ESPACE;
1516               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1517                                             root_node, constraint);
1518               if (BE (err != REG_NOERROR, 0))
1519                 return err;
1520             }
1521           else
1522             {
1523               /* There are a duplicated node which satisfy the constraint,
1524                  use it to avoid infinite loop.  */
1525               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1526               if (BE (! ok, 0))
1527                 return REG_ESPACE;
1528             }
1529
1530           org_dest = dfa->edests[org_node].elems[1];
1531           clone_dest = duplicate_node (dfa, org_dest, constraint);
1532           if (BE (clone_dest == REG_MISSING, 0))
1533             return REG_ESPACE;
1534           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1535           if (BE (! ok, 0))
1536             return REG_ESPACE;
1537         }
1538       org_node = org_dest;
1539       clone_node = clone_dest;
1540     }
1541   return REG_NOERROR;
1542 }
1543
1544 /* Search for a node which is duplicated from the node ORG_NODE, and
1545    satisfies the constraint CONSTRAINT.  */
1546
1547 static Idx
1548 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1549                         unsigned int constraint)
1550 {
1551   Idx idx;
1552   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1553     {
1554       if (org_node == dfa->org_indices[idx]
1555           && constraint == dfa->nodes[idx].constraint)
1556         return idx; /* Found.  */
1557     }
1558   return REG_MISSING; /* Not found.  */
1559 }
1560
1561 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1562    Return the index of the new node, or REG_MISSING if insufficient storage is
1563    available.  */
1564
1565 static Idx
1566 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1567 {
1568   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1569   if (BE (dup_idx != REG_MISSING, 1))
1570     {
1571       dfa->nodes[dup_idx].constraint = constraint;
1572       if (dfa->nodes[org_idx].type == ANCHOR)
1573         dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1574       dfa->nodes[dup_idx].duplicated = 1;
1575
1576       /* Store the index of the original node.  */
1577       dfa->org_indices[dup_idx] = org_idx;
1578     }
1579   return dup_idx;
1580 }
1581
1582 static reg_errcode_t
1583 calc_inveclosure (re_dfa_t *dfa)
1584 {
1585   Idx src, idx;
1586   bool ok;
1587   for (idx = 0; idx < dfa->nodes_len; ++idx)
1588     re_node_set_init_empty (dfa->inveclosures + idx);
1589
1590   for (src = 0; src < dfa->nodes_len; ++src)
1591     {
1592       Idx *elems = dfa->eclosures[src].elems;
1593       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1594         {
1595           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1596           if (BE (! ok, 0))
1597             return REG_ESPACE;
1598         }
1599     }
1600
1601   return REG_NOERROR;
1602 }
1603
1604 /* Calculate "eclosure" for all the node in DFA.  */
1605
1606 static reg_errcode_t
1607 calc_eclosure (re_dfa_t *dfa)
1608 {
1609   Idx node_idx;
1610   bool incomplete;
1611 #ifdef DEBUG
1612   assert (dfa->nodes_len > 0);
1613 #endif
1614   incomplete = false;
1615   /* For each nodes, calculate epsilon closure.  */
1616   for (node_idx = 0; ; ++node_idx)
1617     {
1618       reg_errcode_t err;
1619       re_node_set eclosure_elem;
1620       if (node_idx == dfa->nodes_len)
1621         {
1622           if (!incomplete)
1623             break;
1624           incomplete = false;
1625           node_idx = 0;
1626         }
1627
1628 #ifdef DEBUG
1629       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1630 #endif
1631
1632       /* If we have already calculated, skip it.  */
1633       if (dfa->eclosures[node_idx].nelem != 0)
1634         continue;
1635       /* Calculate epsilon closure of `node_idx'.  */
1636       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1637       if (BE (err != REG_NOERROR, 0))
1638         return err;
1639
1640       if (dfa->eclosures[node_idx].nelem == 0)
1641         {
1642           incomplete = true;
1643           re_node_set_free (&eclosure_elem);
1644         }
1645     }
1646   return REG_NOERROR;
1647 }
1648
1649 /* Calculate epsilon closure of NODE.  */
1650
1651 static reg_errcode_t
1652 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1653 {
1654   reg_errcode_t err;
1655   unsigned int constraint;
1656   Idx i;
1657   bool incomplete;
1658   bool ok;
1659   re_node_set eclosure;
1660   incomplete = false;
1661   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1662   if (BE (err != REG_NOERROR, 0))
1663     return err;
1664
1665   /* This indicates that we are calculating this node now.
1666      We reference this value to avoid infinite loop.  */
1667   dfa->eclosures[node].nelem = REG_MISSING;
1668
1669   constraint = ((dfa->nodes[node].type == ANCHOR)
1670                 ? dfa->nodes[node].opr.ctx_type : 0);
1671   /* If the current node has constraints, duplicate all nodes.
1672      Since they must inherit the constraints.  */
1673   if (constraint
1674       && dfa->edests[node].nelem
1675       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1676     {
1677       err = duplicate_node_closure (dfa, node, node, node, constraint);
1678       if (BE (err != REG_NOERROR, 0))
1679         return err;
1680     }
1681
1682   /* Expand each epsilon destination nodes.  */
1683   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1684     for (i = 0; i < dfa->edests[node].nelem; ++i)
1685       {
1686         re_node_set eclosure_elem;
1687         Idx edest = dfa->edests[node].elems[i];
1688         /* If calculating the epsilon closure of `edest' is in progress,
1689            return intermediate result.  */
1690         if (dfa->eclosures[edest].nelem == REG_MISSING)
1691           {
1692             incomplete = true;
1693             continue;
1694           }
1695         /* If we haven't calculated the epsilon closure of `edest' yet,
1696            calculate now. Otherwise use calculated epsilon closure.  */
1697         if (dfa->eclosures[edest].nelem == 0)
1698           {
1699             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1700             if (BE (err != REG_NOERROR, 0))
1701               return err;
1702           }
1703         else
1704           eclosure_elem = dfa->eclosures[edest];
1705         /* Merge the epsilon closure of `edest'.  */
1706         re_node_set_merge (&eclosure, &eclosure_elem);
1707         /* If the epsilon closure of `edest' is incomplete,
1708            the epsilon closure of this node is also incomplete.  */
1709         if (dfa->eclosures[edest].nelem == 0)
1710           {
1711             incomplete = true;
1712             re_node_set_free (&eclosure_elem);
1713           }
1714       }
1715
1716   /* Epsilon closures include itself.  */
1717   ok = re_node_set_insert (&eclosure, node);
1718   if (BE (! ok, 0))
1719     return REG_ESPACE;
1720   if (incomplete && !root)
1721     dfa->eclosures[node].nelem = 0;
1722   else
1723     dfa->eclosures[node] = eclosure;
1724   *new_set = eclosure;
1725   return REG_NOERROR;
1726 }
1727 \f
1728 /* Functions for token which are used in the parser.  */
1729
1730 /* Fetch a token from INPUT.
1731    We must not use this function inside bracket expressions.  */
1732
1733 static void
1734 internal_function
1735 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1736 {
1737   re_string_skip_bytes (input, peek_token (result, input, syntax));
1738 }
1739
1740 /* Peek a token from INPUT, and return the length of the token.
1741    We must not use this function inside bracket expressions.  */
1742
1743 static int
1744 internal_function
1745 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1746 {
1747   unsigned char c;
1748
1749   if (re_string_eoi (input))
1750     {
1751       token->type = END_OF_RE;
1752       return 0;
1753     }
1754
1755   c = re_string_peek_byte (input, 0);
1756   token->opr.c = c;
1757
1758   token->word_char = 0;
1759 #ifdef RE_ENABLE_I18N
1760   token->mb_partial = 0;
1761   if (input->mb_cur_max > 1 &&
1762       !re_string_first_byte (input, re_string_cur_idx (input)))
1763     {
1764       token->type = CHARACTER;
1765       token->mb_partial = 1;
1766       return 1;
1767     }
1768 #endif
1769   if (c == '\\')
1770     {
1771       unsigned char c2;
1772       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1773         {
1774           token->type = BACK_SLASH;
1775           return 1;
1776         }
1777
1778       c2 = re_string_peek_byte_case (input, 1);
1779       token->opr.c = c2;
1780       token->type = CHARACTER;
1781 #ifdef RE_ENABLE_I18N
1782       if (input->mb_cur_max > 1)
1783         {
1784           wint_t wc = re_string_wchar_at (input,
1785                                           re_string_cur_idx (input) + 1);
1786           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1787         }
1788       else
1789 #endif
1790         token->word_char = IS_WORD_CHAR (c2) != 0;
1791
1792       switch (c2)
1793         {
1794         case '|':
1795           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1796             token->type = OP_ALT;
1797           break;
1798         case '1': case '2': case '3': case '4': case '5':
1799         case '6': case '7': case '8': case '9':
1800           if (!(syntax & RE_NO_BK_REFS))
1801             {
1802               token->type = OP_BACK_REF;
1803               token->opr.idx = c2 - '1';
1804             }
1805           break;
1806         case '<':
1807           if (!(syntax & RE_NO_GNU_OPS))
1808             {
1809               token->type = ANCHOR;
1810               token->opr.ctx_type = WORD_FIRST;
1811             }
1812           break;
1813         case '>':
1814           if (!(syntax & RE_NO_GNU_OPS))
1815             {
1816               token->type = ANCHOR;
1817               token->opr.ctx_type = WORD_LAST;
1818             }
1819           break;
1820         case 'b':
1821           if (!(syntax & RE_NO_GNU_OPS))
1822             {
1823               token->type = ANCHOR;
1824               token->opr.ctx_type = WORD_DELIM;
1825             }
1826           break;
1827         case 'B':
1828           if (!(syntax & RE_NO_GNU_OPS))
1829             {
1830               token->type = ANCHOR;
1831               token->opr.ctx_type = NOT_WORD_DELIM;
1832             }
1833           break;
1834         case 'w':
1835           if (!(syntax & RE_NO_GNU_OPS))
1836             token->type = OP_WORD;
1837           break;
1838         case 'W':
1839           if (!(syntax & RE_NO_GNU_OPS))
1840             token->type = OP_NOTWORD;
1841           break;
1842         case 's':
1843           if (!(syntax & RE_NO_GNU_OPS))
1844             token->type = OP_SPACE;
1845           break;
1846         case 'S':
1847           if (!(syntax & RE_NO_GNU_OPS))
1848             token->type = OP_NOTSPACE;
1849           break;
1850         case '`':
1851           if (!(syntax & RE_NO_GNU_OPS))
1852             {
1853               token->type = ANCHOR;
1854               token->opr.ctx_type = BUF_FIRST;
1855             }
1856           break;
1857         case '\'':
1858           if (!(syntax & RE_NO_GNU_OPS))
1859             {
1860               token->type = ANCHOR;
1861               token->opr.ctx_type = BUF_LAST;
1862             }
1863           break;
1864         case '(':
1865           if (!(syntax & RE_NO_BK_PARENS))
1866             token->type = OP_OPEN_SUBEXP;
1867           break;
1868         case ')':
1869           if (!(syntax & RE_NO_BK_PARENS))
1870             token->type = OP_CLOSE_SUBEXP;
1871           break;
1872         case '+':
1873           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1874             token->type = OP_DUP_PLUS;
1875           break;
1876         case '?':
1877           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1878             token->type = OP_DUP_QUESTION;
1879           break;
1880         case '{':
1881           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1882             token->type = OP_OPEN_DUP_NUM;
1883           break;
1884         case '}':
1885           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1886             token->type = OP_CLOSE_DUP_NUM;
1887           break;
1888         default:
1889           break;
1890         }
1891       return 2;
1892     }
1893
1894   token->type = CHARACTER;
1895 #ifdef RE_ENABLE_I18N
1896   if (input->mb_cur_max > 1)
1897     {
1898       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1899       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1900     }
1901   else
1902 #endif
1903     token->word_char = IS_WORD_CHAR (token->opr.c);
1904
1905   switch (c)
1906     {
1907     case '\n':
1908       if (syntax & RE_NEWLINE_ALT)
1909         token->type = OP_ALT;
1910       break;
1911     case '|':
1912       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1913         token->type = OP_ALT;
1914       break;
1915     case '*':
1916       token->type = OP_DUP_ASTERISK;
1917       break;
1918     case '+':
1919       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1920         token->type = OP_DUP_PLUS;
1921       break;
1922     case '?':
1923       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1924         token->type = OP_DUP_QUESTION;
1925       break;
1926     case '{':
1927       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1928         token->type = OP_OPEN_DUP_NUM;
1929       break;
1930     case '}':
1931       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1932         token->type = OP_CLOSE_DUP_NUM;
1933       break;
1934     case '(':
1935       if (syntax & RE_NO_BK_PARENS)
1936         token->type = OP_OPEN_SUBEXP;
1937       break;
1938     case ')':
1939       if (syntax & RE_NO_BK_PARENS)
1940         token->type = OP_CLOSE_SUBEXP;
1941       break;
1942     case '[':
1943       token->type = OP_OPEN_BRACKET;
1944       break;
1945     case '.':
1946       token->type = OP_PERIOD;
1947       break;
1948     case '^':
1949       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1950           re_string_cur_idx (input) != 0)
1951         {
1952           char prev = re_string_peek_byte (input, -1);
1953           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1954             break;
1955         }
1956       token->type = ANCHOR;
1957       token->opr.ctx_type = LINE_FIRST;
1958       break;
1959     case '$':
1960       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1961           re_string_cur_idx (input) + 1 != re_string_length (input))
1962         {
1963           re_token_t next;
1964           re_string_skip_bytes (input, 1);
1965           peek_token (&next, input, syntax);
1966           re_string_skip_bytes (input, -1);
1967           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1968             break;
1969         }
1970       token->type = ANCHOR;
1971       token->opr.ctx_type = LINE_LAST;
1972       break;
1973     default:
1974       break;
1975     }
1976   return 1;
1977 }
1978
1979 /* Peek a token from INPUT, and return the length of the token.
1980    We must not use this function out of bracket expressions.  */
1981
1982 static int
1983 internal_function
1984 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1985 {
1986   unsigned char c;
1987   if (re_string_eoi (input))
1988     {
1989       token->type = END_OF_RE;
1990       return 0;
1991     }
1992   c = re_string_peek_byte (input, 0);
1993   token->opr.c = c;
1994
1995 #ifdef RE_ENABLE_I18N
1996   if (input->mb_cur_max > 1 &&
1997       !re_string_first_byte (input, re_string_cur_idx (input)))
1998     {
1999       token->type = CHARACTER;
2000       return 1;
2001     }
2002 #endif /* RE_ENABLE_I18N */
2003
2004   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2005       && re_string_cur_idx (input) + 1 < re_string_length (input))
2006     {
2007       /* In this case, '\' escape a character.  */
2008       unsigned char c2;
2009       re_string_skip_bytes (input, 1);
2010       c2 = re_string_peek_byte (input, 0);
2011       token->opr.c = c2;
2012       token->type = CHARACTER;
2013       return 1;
2014     }
2015   if (c == '[') /* '[' is a special char in a bracket exps.  */
2016     {
2017       unsigned char c2;
2018       int token_len;
2019       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2020         c2 = re_string_peek_byte (input, 1);
2021       else
2022         c2 = 0;
2023       token->opr.c = c2;
2024       token_len = 2;
2025       switch (c2)
2026         {
2027         case '.':
2028           token->type = OP_OPEN_COLL_ELEM;
2029           break;
2030         case '=':
2031           token->type = OP_OPEN_EQUIV_CLASS;
2032           break;
2033         case ':':
2034           if (syntax & RE_CHAR_CLASSES)
2035             {
2036               token->type = OP_OPEN_CHAR_CLASS;
2037               break;
2038             }
2039           /* else fall through.  */
2040         default:
2041           token->type = CHARACTER;
2042           token->opr.c = c;
2043           token_len = 1;
2044           break;
2045         }
2046       return token_len;
2047     }
2048   switch (c)
2049     {
2050     case '-':
2051       token->type = OP_CHARSET_RANGE;
2052       break;
2053     case ']':
2054       token->type = OP_CLOSE_BRACKET;
2055       break;
2056     case '^':
2057       token->type = OP_NON_MATCH_LIST;
2058       break;
2059     default:
2060       token->type = CHARACTER;
2061     }
2062   return 1;
2063 }
2064 \f
2065 /* Functions for parser.  */
2066
2067 /* Entry point of the parser.
2068    Parse the regular expression REGEXP and return the structure tree.
2069    If an error is occured, ERR is set by error code, and return NULL.
2070    This function build the following tree, from regular expression <reg_exp>:
2071            CAT
2072            / \
2073           /   \
2074    <reg_exp>  EOR
2075
2076    CAT means concatenation.
2077    EOR means end of regular expression.  */
2078
2079 static bin_tree_t *
2080 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2081        reg_errcode_t *err)
2082 {
2083   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2084   bin_tree_t *tree, *eor, *root;
2085   re_token_t current_token;
2086   dfa->syntax = syntax;
2087   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2088   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2089   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2090     return NULL;
2091   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2092   if (tree != NULL)
2093     root = create_tree (dfa, tree, eor, CONCAT);
2094   else
2095     root = eor;
2096   if (BE (eor == NULL || root == NULL, 0))
2097     {
2098       *err = REG_ESPACE;
2099       return NULL;
2100     }
2101   return root;
2102 }
2103
2104 /* This function build the following tree, from regular expression
2105    <branch1>|<branch2>:
2106            ALT
2107            / \
2108           /   \
2109    <branch1> <branch2>
2110
2111    ALT means alternative, which represents the operator `|'.  */
2112
2113 static bin_tree_t *
2114 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2115                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2116 {
2117   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2118   bin_tree_t *tree, *branch = NULL;
2119   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2120   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2121     return NULL;
2122
2123   while (token->type == OP_ALT)
2124     {
2125       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2126       if (token->type != OP_ALT && token->type != END_OF_RE
2127           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2128         {
2129           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2130           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2131             return NULL;
2132         }
2133       else
2134         branch = NULL;
2135       tree = create_tree (dfa, tree, branch, OP_ALT);
2136       if (BE (tree == NULL, 0))
2137         {
2138           *err = REG_ESPACE;
2139           return NULL;
2140         }
2141     }
2142   return tree;
2143 }
2144
2145 /* This function build the following tree, from regular expression
2146    <exp1><exp2>:
2147         CAT
2148         / \
2149        /   \
2150    <exp1> <exp2>
2151
2152    CAT means concatenation.  */
2153
2154 static bin_tree_t *
2155 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2156               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2157 {
2158   bin_tree_t *tree, *expr;
2159   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2160   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2161   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2162     return NULL;
2163
2164   while (token->type != OP_ALT && token->type != END_OF_RE
2165          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2166     {
2167       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2168       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2169         {
2170           return NULL;
2171         }
2172       if (tree != NULL && expr != NULL)
2173         {
2174           tree = create_tree (dfa, tree, expr, CONCAT);
2175           if (tree == NULL)
2176             {
2177               *err = REG_ESPACE;
2178               return NULL;
2179             }
2180         }
2181       else if (tree == NULL)
2182         tree = expr;
2183       /* Otherwise expr == NULL, we don't need to create new tree.  */
2184     }
2185   return tree;
2186 }
2187
2188 /* This function build the following tree, from regular expression a*:
2189          *
2190          |
2191          a
2192 */
2193
2194 static bin_tree_t *
2195 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2196                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2197 {
2198   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2199   bin_tree_t *tree;
2200   switch (token->type)
2201     {
2202     case CHARACTER:
2203       tree = create_token_tree (dfa, NULL, NULL, token);
2204       if (BE (tree == NULL, 0))
2205         {
2206           *err = REG_ESPACE;
2207           return NULL;
2208         }
2209 #ifdef RE_ENABLE_I18N
2210       if (dfa->mb_cur_max > 1)
2211         {
2212           while (!re_string_eoi (regexp)
2213                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2214             {
2215               bin_tree_t *mbc_remain;
2216               fetch_token (token, regexp, syntax);
2217               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2218               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2219               if (BE (mbc_remain == NULL || tree == NULL, 0))
2220                 {
2221                   *err = REG_ESPACE;
2222                   return NULL;
2223                 }
2224             }
2225         }
2226 #endif
2227       break;
2228     case OP_OPEN_SUBEXP:
2229       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2230       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2231         return NULL;
2232       break;
2233     case OP_OPEN_BRACKET:
2234       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2235       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2236         return NULL;
2237       break;
2238     case OP_BACK_REF:
2239       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2240         {
2241           *err = REG_ESUBREG;
2242           return NULL;
2243         }
2244       dfa->used_bkref_map |= 1 << token->opr.idx;
2245       tree = create_token_tree (dfa, NULL, NULL, token);
2246       if (BE (tree == NULL, 0))
2247         {
2248           *err = REG_ESPACE;
2249           return NULL;
2250         }
2251       ++dfa->nbackref;
2252       dfa->has_mb_node = 1;
2253       break;
2254     case OP_OPEN_DUP_NUM:
2255       if (syntax & RE_CONTEXT_INVALID_DUP)
2256         {
2257           *err = REG_BADRPT;
2258           return NULL;
2259         }
2260       /* FALLTHROUGH */
2261     case OP_DUP_ASTERISK:
2262     case OP_DUP_PLUS:
2263     case OP_DUP_QUESTION:
2264       if (syntax & RE_CONTEXT_INVALID_OPS)
2265         {
2266           *err = REG_BADRPT;
2267           return NULL;
2268         }
2269       else if (syntax & RE_CONTEXT_INDEP_OPS)
2270         {
2271           fetch_token (token, regexp, syntax);
2272           return parse_expression (regexp, preg, token, syntax, nest, err);
2273         }
2274       /* else fall through  */
2275     case OP_CLOSE_SUBEXP:
2276       if ((token->type == OP_CLOSE_SUBEXP) &&
2277           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2278         {
2279           *err = REG_ERPAREN;
2280           return NULL;
2281         }
2282       /* else fall through  */
2283     case OP_CLOSE_DUP_NUM:
2284       /* We treat it as a normal character.  */
2285
2286       /* Then we can these characters as normal characters.  */
2287       token->type = CHARACTER;
2288       /* mb_partial and word_char bits should be initialized already
2289          by peek_token.  */
2290       tree = create_token_tree (dfa, NULL, NULL, token);
2291       if (BE (tree == NULL, 0))
2292         {
2293           *err = REG_ESPACE;
2294           return NULL;
2295         }
2296       break;
2297     case ANCHOR:
2298       if ((token->opr.ctx_type
2299            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2300           && dfa->word_ops_used == 0)
2301         init_word_char (dfa);
2302       if (token->opr.ctx_type == WORD_DELIM
2303           || token->opr.ctx_type == NOT_WORD_DELIM)
2304         {
2305           bin_tree_t *tree_first, *tree_last;
2306           if (token->opr.ctx_type == WORD_DELIM)
2307             {
2308               token->opr.ctx_type = WORD_FIRST;
2309               tree_first = create_token_tree (dfa, NULL, NULL, token);
2310               token->opr.ctx_type = WORD_LAST;
2311             }
2312           else
2313             {
2314               token->opr.ctx_type = INSIDE_WORD;
2315               tree_first = create_token_tree (dfa, NULL, NULL, token);
2316               token->opr.ctx_type = INSIDE_NOTWORD;
2317             }
2318           tree_last = create_token_tree (dfa, NULL, NULL, token);
2319           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2320           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2321             {
2322               *err = REG_ESPACE;
2323               return NULL;
2324             }
2325         }
2326       else
2327         {
2328           tree = create_token_tree (dfa, NULL, NULL, token);
2329           if (BE (tree == NULL, 0))
2330             {
2331               *err = REG_ESPACE;
2332               return NULL;
2333             }
2334         }
2335       /* We must return here, since ANCHORs can't be followed
2336          by repetition operators.
2337          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2338              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2339       fetch_token (token, regexp, syntax);
2340       return tree;
2341     case OP_PERIOD:
2342       tree = create_token_tree (dfa, NULL, NULL, token);
2343       if (BE (tree == NULL, 0))
2344         {
2345           *err = REG_ESPACE;
2346           return NULL;
2347         }
2348       if (dfa->mb_cur_max > 1)
2349         dfa->has_mb_node = 1;
2350       break;
2351     case OP_WORD:
2352     case OP_NOTWORD:
2353       tree = build_charclass_op (dfa, regexp->trans,
2354                                  (const unsigned char *) "alnum",
2355                                  (const unsigned char *) "_",
2356                                  token->type == OP_NOTWORD, err);
2357       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2358         return NULL;
2359       break;
2360     case OP_SPACE:
2361     case OP_NOTSPACE:
2362       tree = build_charclass_op (dfa, regexp->trans,
2363                                  (const unsigned char *) "space",
2364                                  (const unsigned char *) "",
2365                                  token->type == OP_NOTSPACE, err);
2366       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2367         return NULL;
2368       break;
2369     case OP_ALT:
2370     case END_OF_RE:
2371       return NULL;
2372     case BACK_SLASH:
2373       *err = REG_EESCAPE;
2374       return NULL;
2375     default:
2376       /* Must not happen?  */
2377 #ifdef DEBUG
2378       assert (0);
2379 #endif
2380       return NULL;
2381     }
2382   fetch_token (token, regexp, syntax);
2383
2384   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2385          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2386     {
2387       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2388       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389         return NULL;
2390       /* In BRE consecutive duplications are not allowed.  */
2391       if ((syntax & RE_CONTEXT_INVALID_DUP)
2392           && (token->type == OP_DUP_ASTERISK
2393               || token->type == OP_OPEN_DUP_NUM))
2394         {
2395           *err = REG_BADRPT;
2396           return NULL;
2397         }
2398     }
2399
2400   return tree;
2401 }
2402
2403 /* This function build the following tree, from regular expression
2404    (<reg_exp>):
2405          SUBEXP
2406             |
2407         <reg_exp>
2408 */
2409
2410 static bin_tree_t *
2411 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2412                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2413 {
2414   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2415   bin_tree_t *tree;
2416   size_t cur_nsub;
2417   cur_nsub = preg->re_nsub++;
2418
2419   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2420
2421   /* The subexpression may be a null string.  */
2422   if (token->type == OP_CLOSE_SUBEXP)
2423     tree = NULL;
2424   else
2425     {
2426       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2427       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2428         *err = REG_EPAREN;
2429       if (BE (*err != REG_NOERROR, 0))
2430         return NULL;
2431     }
2432
2433   if (cur_nsub <= '9' - '1')
2434     dfa->completed_bkref_map |= 1 << cur_nsub;
2435
2436   tree = create_tree (dfa, tree, NULL, SUBEXP);
2437   if (BE (tree == NULL, 0))
2438     {
2439       *err = REG_ESPACE;
2440       return NULL;
2441     }
2442   tree->token.opr.idx = cur_nsub;
2443   return tree;
2444 }
2445
2446 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2447
2448 static bin_tree_t *
2449 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2450               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2451 {
2452   bin_tree_t *tree = NULL, *old_tree = NULL;
2453   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2454   re_token_t start_token = *token;
2455
2456   if (token->type == OP_OPEN_DUP_NUM)
2457     {
2458       end = 0;
2459       start = fetch_number (regexp, token, syntax);
2460       if (start == REG_MISSING)
2461         {
2462           if (token->type == CHARACTER && token->opr.c == ',')
2463             start = 0; /* We treat "{,m}" as "{0,m}".  */
2464           else
2465             {
2466               *err = REG_BADBR; /* <re>{} is invalid.  */
2467               return NULL;
2468             }
2469         }
2470       if (BE (start != REG_ERROR, 1))
2471         {
2472           /* We treat "{n}" as "{n,n}".  */
2473           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2474                  : ((token->type == CHARACTER && token->opr.c == ',')
2475                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2476         }
2477       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2478         {
2479           /* Invalid sequence.  */
2480           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2481             {
2482               if (token->type == END_OF_RE)
2483                 *err = REG_EBRACE;
2484               else
2485                 *err = REG_BADBR;
2486
2487               return NULL;
2488             }
2489
2490           /* If the syntax bit is set, rollback.  */
2491           re_string_set_index (regexp, start_idx);
2492           *token = start_token;
2493           token->type = CHARACTER;
2494           /* mb_partial and word_char bits should be already initialized by
2495              peek_token.  */
2496           return elem;
2497         }
2498
2499       if (BE (end != REG_MISSING && start > end, 0))
2500         {
2501           /* First number greater than second.  */
2502           *err = REG_BADBR;
2503           return NULL;
2504         }
2505     }
2506   else
2507     {
2508       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2509       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2510     }
2511
2512   fetch_token (token, regexp, syntax);
2513
2514   if (BE (elem == NULL, 0))
2515     return NULL;
2516   if (BE (start == 0 && end == 0, 0))
2517     {
2518       postorder (elem, free_tree, NULL);
2519       return NULL;
2520     }
2521
2522   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2523   if (BE (start > 0, 0))
2524     {
2525       tree = elem;
2526       for (i = 2; i <= start; ++i)
2527         {
2528           elem = duplicate_tree (elem, dfa);
2529           tree = create_tree (dfa, tree, elem, CONCAT);
2530           if (BE (elem == NULL || tree == NULL, 0))
2531             goto parse_dup_op_espace;
2532         }
2533
2534       if (start == end)
2535         return tree;
2536
2537       /* Duplicate ELEM before it is marked optional.  */
2538       elem = duplicate_tree (elem, dfa);
2539       old_tree = tree;
2540     }
2541   else
2542     old_tree = NULL;
2543
2544   if (elem->token.type == SUBEXP)
2545     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2546
2547   tree = create_tree (dfa, elem, NULL,
2548                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2549   if (BE (tree == NULL, 0))
2550     goto parse_dup_op_espace;
2551
2552   /* This loop is actually executed only when end != REG_MISSING,
2553      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2554      already created the start+1-th copy.  */
2555   if ((Idx) -1 < 0 || end != REG_MISSING)
2556     for (i = start + 2; i <= end; ++i)
2557       {
2558         elem = duplicate_tree (elem, dfa);
2559         tree = create_tree (dfa, tree, elem, CONCAT);
2560         if (BE (elem == NULL || tree == NULL, 0))
2561           goto parse_dup_op_espace;
2562
2563         tree = create_tree (dfa, tree, NULL, OP_ALT);
2564         if (BE (tree == NULL, 0))
2565           goto parse_dup_op_espace;
2566       }
2567
2568   if (old_tree)
2569     tree = create_tree (dfa, old_tree, tree, CONCAT);
2570
2571   return tree;
2572
2573  parse_dup_op_espace:
2574   *err = REG_ESPACE;
2575   return NULL;
2576 }
2577
2578 /* Size of the names for collating symbol/equivalence_class/character_class.
2579    I'm not sure, but maybe enough.  */
2580 #define BRACKET_NAME_BUF_SIZE 32
2581
2582 #ifndef _LIBC
2583   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2584      Build the range expression which starts from START_ELEM, and ends
2585      at END_ELEM.  The result are written to MBCSET and SBCSET.
2586      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2587      mbcset->range_ends, is a pointer argument sinse we may
2588      update it.  */
2589
2590 static reg_errcode_t
2591 internal_function
2592 # ifdef RE_ENABLE_I18N
2593 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2594                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2595 # else /* not RE_ENABLE_I18N */
2596 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2597                  bracket_elem_t *end_elem)
2598 # endif /* not RE_ENABLE_I18N */
2599 {
2600   unsigned int start_ch, end_ch;
2601   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2602   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2603           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2604           0))
2605     return REG_ERANGE;
2606
2607   /* We can handle no multi character collating elements without libc
2608      support.  */
2609   if (BE ((start_elem->type == COLL_SYM
2610            && strlen ((char *) start_elem->opr.name) > 1)
2611           || (end_elem->type == COLL_SYM
2612               && strlen ((char *) end_elem->opr.name) > 1), 0))
2613     return REG_ECOLLATE;
2614
2615 # ifdef RE_ENABLE_I18N
2616   {
2617     wchar_t wc;
2618     wint_t start_wc;
2619     wint_t end_wc;
2620     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2621
2622     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2623                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2624                    : 0));
2625     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2626               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2627                  : 0));
2628     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2629                 ? __btowc (start_ch) : start_elem->opr.wch);
2630     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2631               ? __btowc (end_ch) : end_elem->opr.wch);
2632     if (start_wc == WEOF || end_wc == WEOF)
2633       return REG_ECOLLATE;
2634     cmp_buf[0] = start_wc;
2635     cmp_buf[4] = end_wc;
2636     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2637       return REG_ERANGE;
2638
2639     /* Got valid collation sequence values, add them as a new entry.
2640        However, for !_LIBC we have no collation elements: if the
2641        character set is single byte, the single byte character set
2642        that we build below suffices.  parse_bracket_exp passes
2643        no MBCSET if dfa->mb_cur_max == 1.  */
2644     if (mbcset)
2645       {
2646         /* Check the space of the arrays.  */
2647         if (BE (*range_alloc == mbcset->nranges, 0))
2648           {
2649             /* There is not enough space, need realloc.  */
2650             wchar_t *new_array_start, *new_array_end;
2651             Idx new_nranges;
2652
2653             /* +1 in case of mbcset->nranges is 0.  */
2654             new_nranges = 2 * mbcset->nranges + 1;
2655             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2656                are NULL if *range_alloc == 0.  */
2657             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2658                                           new_nranges);
2659             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2660                                         new_nranges);
2661
2662             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2663               return REG_ESPACE;
2664
2665             mbcset->range_starts = new_array_start;
2666             mbcset->range_ends = new_array_end;
2667             *range_alloc = new_nranges;
2668           }
2669
2670         mbcset->range_starts[mbcset->nranges] = start_wc;
2671         mbcset->range_ends[mbcset->nranges++] = end_wc;
2672       }
2673
2674     /* Build the table for single byte characters.  */
2675     for (wc = 0; wc < SBC_MAX; ++wc)
2676       {
2677         cmp_buf[2] = wc;
2678         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2679             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2680           bitset_set (sbcset, wc);
2681       }
2682   }
2683 # else /* not RE_ENABLE_I18N */
2684   {
2685     unsigned int ch;
2686     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2687                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2688                    : 0));
2689     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2690               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2691                  : 0));
2692     if (start_ch > end_ch)
2693       return REG_ERANGE;
2694     /* Build the table for single byte characters.  */
2695     for (ch = 0; ch < SBC_MAX; ++ch)
2696       if (start_ch <= ch  && ch <= end_ch)
2697         bitset_set (sbcset, ch);
2698   }
2699 # endif /* not RE_ENABLE_I18N */
2700   return REG_NOERROR;
2701 }
2702 #endif /* not _LIBC */
2703
2704 #ifndef _LIBC
2705 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2706    Build the collating element which is represented by NAME.
2707    The result are written to MBCSET and SBCSET.
2708    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2709    pointer argument since we may update it.  */
2710
2711 static reg_errcode_t
2712 internal_function
2713 build_collating_symbol (bitset_t sbcset,
2714 # ifdef RE_ENABLE_I18N
2715                         re_charset_t *mbcset, Idx *coll_sym_alloc,
2716 # endif
2717                         const unsigned char *name)
2718 {
2719   size_t name_len = strlen ((const char *) name);
2720   if (BE (name_len != 1, 0))
2721     return REG_ECOLLATE;
2722   else
2723     {
2724       bitset_set (sbcset, name[0]);
2725       return REG_NOERROR;
2726     }
2727 }
2728 #endif /* not _LIBC */
2729
2730 /* This function parse bracket expression like "[abc]", "[a-c]",
2731    "[[.a-a.]]" etc.  */
2732
2733 static bin_tree_t *
2734 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2735                    reg_syntax_t syntax, reg_errcode_t *err)
2736 {
2737 #ifdef _LIBC
2738   const unsigned char *collseqmb;
2739   const char *collseqwc;
2740   uint32_t nrules;
2741   int32_t table_size;
2742   const int32_t *symb_table;
2743   const unsigned char *extra;
2744
2745   /* Local function for parse_bracket_exp used in _LIBC environement.
2746      Seek the collating symbol entry correspondings to NAME.
2747      Return the index of the symbol in the SYMB_TABLE.  */
2748
2749   auto inline int32_t
2750   __attribute ((always_inline))
2751   seek_collating_symbol_entry (name, name_len)
2752          const unsigned char *name;
2753          size_t name_len;
2754     {
2755       int32_t hash = elem_hash ((const char *) name, name_len);
2756       int32_t elem = hash % table_size;
2757       if (symb_table[2 * elem] != 0)
2758         {
2759           int32_t second = hash % (table_size - 2) + 1;
2760
2761           do
2762             {
2763               /* First compare the hashing value.  */
2764               if (symb_table[2 * elem] == hash
2765                   /* Compare the length of the name.  */
2766                   && name_len == extra[symb_table[2 * elem + 1]]
2767                   /* Compare the name.  */
2768                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2769                              name_len) == 0)
2770                 {
2771                   /* Yep, this is the entry.  */
2772                   break;
2773                 }
2774
2775               /* Next entry.  */
2776               elem += second;
2777             }
2778           while (symb_table[2 * elem] != 0);
2779         }
2780       return elem;
2781     }
2782
2783   /* Local function for parse_bracket_exp used in _LIBC environement.
2784      Look up the collation sequence value of BR_ELEM.
2785      Return the value if succeeded, UINT_MAX otherwise.  */
2786
2787   auto inline unsigned int
2788   __attribute ((always_inline))
2789   lookup_collation_sequence_value (br_elem)
2790          bracket_elem_t *br_elem;
2791     {
2792       if (br_elem->type == SB_CHAR)
2793         {
2794           /*
2795           if (MB_CUR_MAX == 1)
2796           */
2797           if (nrules == 0)
2798             return collseqmb[br_elem->opr.ch];
2799           else
2800             {
2801               wint_t wc = __btowc (br_elem->opr.ch);
2802               return __collseq_table_lookup (collseqwc, wc);
2803             }
2804         }
2805       else if (br_elem->type == MB_CHAR)
2806         {
2807           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2808         }
2809       else if (br_elem->type == COLL_SYM)
2810         {
2811           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2812           if (nrules != 0)
2813             {
2814               int32_t elem, idx;
2815               elem = seek_collating_symbol_entry (br_elem->opr.name,
2816                                                   sym_name_len);
2817               if (symb_table[2 * elem] != 0)
2818                 {
2819                   /* We found the entry.  */
2820                   idx = symb_table[2 * elem + 1];
2821                   /* Skip the name of collating element name.  */
2822                   idx += 1 + extra[idx];
2823                   /* Skip the byte sequence of the collating element.  */
2824                   idx += 1 + extra[idx];
2825                   /* Adjust for the alignment.  */
2826                   idx = (idx + 3) & ~3;
2827                   /* Skip the multibyte collation sequence value.  */
2828                   idx += sizeof (unsigned int);
2829                   /* Skip the wide char sequence of the collating element.  */
2830                   idx += sizeof (unsigned int) *
2831                     (1 + *(unsigned int *) (extra + idx));
2832                   /* Return the collation sequence value.  */
2833                   return *(unsigned int *) (extra + idx);
2834                 }
2835               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2836                 {
2837                   /* No valid character.  Match it as a single byte
2838                      character.  */
2839                   return collseqmb[br_elem->opr.name[0]];
2840                 }
2841             }
2842           else if (sym_name_len == 1)
2843             return collseqmb[br_elem->opr.name[0]];
2844         }
2845       return UINT_MAX;
2846     }
2847
2848   /* Local function for parse_bracket_exp used in _LIBC environement.
2849      Build the range expression which starts from START_ELEM, and ends
2850      at END_ELEM.  The result are written to MBCSET and SBCSET.
2851      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2852      mbcset->range_ends, is a pointer argument sinse we may
2853      update it.  */
2854
2855   auto inline reg_errcode_t
2856   __attribute ((always_inline))
2857   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2858          re_charset_t *mbcset;
2859          Idx *range_alloc;
2860          bitset_t sbcset;
2861          bracket_elem_t *start_elem, *end_elem;
2862     {
2863       unsigned int ch;
2864       uint32_t start_collseq;
2865       uint32_t end_collseq;
2866
2867       /* Equivalence Classes and Character Classes can't be a range
2868          start/end.  */
2869       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2870               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2871               0))
2872         return REG_ERANGE;
2873
2874       start_collseq = lookup_collation_sequence_value (start_elem);
2875       end_collseq = lookup_collation_sequence_value (end_elem);
2876       /* Check start/end collation sequence values.  */
2877       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2878         return REG_ECOLLATE;
2879       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2880         return REG_ERANGE;
2881
2882       /* Got valid collation sequence values, add them as a new entry.
2883          However, if we have no collation elements, and the character set
2884          is single byte, the single byte character set that we
2885          build below suffices. */
2886       if (nrules > 0 || dfa->mb_cur_max > 1)
2887         {
2888           /* Check the space of the arrays.  */
2889           if (BE (*range_alloc == mbcset->nranges, 0))
2890             {
2891               /* There is not enough space, need realloc.  */
2892               uint32_t *new_array_start;
2893               uint32_t *new_array_end;
2894               Idx new_nranges;
2895
2896               /* +1 in case of mbcset->nranges is 0.  */
2897               new_nranges = 2 * mbcset->nranges + 1;
2898               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2899                                             new_nranges);
2900               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2901                                           new_nranges);
2902
2903               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2904                 return REG_ESPACE;
2905
2906               mbcset->range_starts = new_array_start;
2907               mbcset->range_ends = new_array_end;
2908               *range_alloc = new_nranges;
2909             }
2910
2911           mbcset->range_starts[mbcset->nranges] = start_collseq;
2912           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2913         }
2914
2915       /* Build the table for single byte characters.  */
2916       for (ch = 0; ch < SBC_MAX; ch++)
2917         {
2918           uint32_t ch_collseq;
2919           /*
2920           if (MB_CUR_MAX == 1)
2921           */
2922           if (nrules == 0)
2923             ch_collseq = collseqmb[ch];
2924           else
2925             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2926           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2927             bitset_set (sbcset, ch);
2928         }
2929       return REG_NOERROR;
2930     }
2931
2932   /* Local function for parse_bracket_exp used in _LIBC environement.
2933      Build the collating element which is represented by NAME.
2934      The result are written to MBCSET and SBCSET.
2935      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2936      pointer argument sinse we may update it.  */
2937
2938   auto inline reg_errcode_t
2939   __attribute ((always_inline))
2940   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2941          re_charset_t *mbcset;
2942          Idx *coll_sym_alloc;
2943          bitset_t sbcset;
2944          const unsigned char *name;
2945     {
2946       int32_t elem, idx;
2947       size_t name_len = strlen ((const char *) name);
2948       if (nrules != 0)
2949         {
2950           elem = seek_collating_symbol_entry (name, name_len);
2951           if (symb_table[2 * elem] != 0)
2952             {
2953               /* We found the entry.  */
2954               idx = symb_table[2 * elem + 1];
2955               /* Skip the name of collating element name.  */
2956               idx += 1 + extra[idx];
2957             }
2958           else if (symb_table[2 * elem] == 0 && name_len == 1)
2959             {
2960               /* No valid character, treat it as a normal
2961                  character.  */
2962               bitset_set (sbcset, name[0]);
2963               return REG_NOERROR;
2964             }
2965           else
2966             return REG_ECOLLATE;
2967
2968           /* Got valid collation sequence, add it as a new entry.  */
2969           /* Check the space of the arrays.  */
2970           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2971             {
2972               /* Not enough, realloc it.  */
2973               /* +1 in case of mbcset->ncoll_syms is 0.  */
2974               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2975               /* Use realloc since mbcset->coll_syms is NULL
2976                  if *alloc == 0.  */
2977               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2978                                                    new_coll_sym_alloc);
2979               if (BE (new_coll_syms == NULL, 0))
2980                 return REG_ESPACE;
2981               mbcset->coll_syms = new_coll_syms;
2982               *coll_sym_alloc = new_coll_sym_alloc;
2983             }
2984           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2985           return REG_NOERROR;
2986         }
2987       else
2988         {
2989           if (BE (name_len != 1, 0))
2990             return REG_ECOLLATE;
2991           else
2992             {
2993               bitset_set (sbcset, name[0]);
2994               return REG_NOERROR;
2995             }
2996         }
2997     }
2998 #endif
2999
3000   re_token_t br_token;
3001   re_bitset_ptr_t sbcset;
3002 #ifdef RE_ENABLE_I18N
3003   re_charset_t *mbcset;
3004   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3005   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3006 #endif /* not RE_ENABLE_I18N */
3007   bool non_match = false;
3008   bin_tree_t *work_tree;
3009   int token_len;
3010   bool first_round = true;
3011 #ifdef _LIBC
3012   collseqmb = (const unsigned char *)
3013     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3014   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3015   if (nrules)
3016     {
3017       /*
3018       if (MB_CUR_MAX > 1)
3019       */
3020       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3021       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3022       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3023                                                   _NL_COLLATE_SYMB_TABLEMB);
3024       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3025                                                    _NL_COLLATE_SYMB_EXTRAMB);
3026     }
3027 #endif
3028   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3029 #ifdef RE_ENABLE_I18N
3030   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3031 #endif /* RE_ENABLE_I18N */
3032 #ifdef RE_ENABLE_I18N
3033   if (BE (sbcset == NULL || mbcset == NULL, 0))
3034 #else
3035   if (BE (sbcset == NULL, 0))
3036 #endif /* RE_ENABLE_I18N */
3037     {
3038       *err = REG_ESPACE;
3039       return NULL;
3040     }
3041
3042   token_len = peek_token_bracket (token, regexp, syntax);
3043   if (BE (token->type == END_OF_RE, 0))
3044     {
3045       *err = REG_BADPAT;
3046       goto parse_bracket_exp_free_return;
3047     }
3048   if (token->type == OP_NON_MATCH_LIST)
3049     {
3050 #ifdef RE_ENABLE_I18N
3051       mbcset->non_match = 1;
3052 #endif /* not RE_ENABLE_I18N */
3053       non_match = true;
3054       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3055         bitset_set (sbcset, '\n');
3056       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3057       token_len = peek_token_bracket (token, regexp, syntax);
3058       if (BE (token->type == END_OF_RE, 0))
3059         {
3060           *err = REG_BADPAT;
3061           goto parse_bracket_exp_free_return;
3062         }
3063     }
3064
3065   /* We treat the first ']' as a normal character.  */
3066   if (token->type == OP_CLOSE_BRACKET)
3067     token->type = CHARACTER;
3068
3069   while (1)
3070     {
3071       bracket_elem_t start_elem, end_elem;
3072       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3073       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3074       reg_errcode_t ret;
3075       int token_len2 = 0;
3076       bool is_range_exp = false;
3077       re_token_t token2;
3078
3079       start_elem.opr.name = start_name_buf;
3080       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3081                                    syntax, first_round);
3082       if (BE (ret != REG_NOERROR, 0))
3083         {
3084           *err = ret;
3085           goto parse_bracket_exp_free_return;
3086         }
3087       first_round = false;
3088
3089       /* Get information about the next token.  We need it in any case.  */
3090       token_len = peek_token_bracket (token, regexp, syntax);
3091
3092       /* Do not check for ranges if we know they are not allowed.  */
3093       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3094         {
3095           if (BE (token->type == END_OF_RE, 0))
3096             {
3097               *err = REG_EBRACK;
3098               goto parse_bracket_exp_free_return;
3099             }
3100           if (token->type == OP_CHARSET_RANGE)
3101             {
3102               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3103               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3104               if (BE (token2.type == END_OF_RE, 0))
3105                 {
3106                   *err = REG_EBRACK;
3107                   goto parse_bracket_exp_free_return;
3108                 }
3109               if (token2.type == OP_CLOSE_BRACKET)
3110                 {
3111                   /* We treat the last '-' as a normal character.  */
3112                   re_string_skip_bytes (regexp, -token_len);
3113                   token->type = CHARACTER;
3114                 }
3115               else
3116                 is_range_exp = true;
3117             }
3118         }
3119
3120       if (is_range_exp == true)
3121         {
3122           end_elem.opr.name = end_name_buf;
3123           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3124                                        dfa, syntax, true);
3125           if (BE (ret != REG_NOERROR, 0))
3126             {
3127               *err = ret;
3128               goto parse_bracket_exp_free_return;
3129             }
3130
3131           token_len = peek_token_bracket (token, regexp, syntax);
3132
3133 #ifdef _LIBC
3134           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3135                                   &start_elem, &end_elem);
3136 #else
3137 # ifdef RE_ENABLE_I18N
3138           *err = build_range_exp (sbcset,
3139                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3140                                   &range_alloc, &start_elem, &end_elem);
3141 # else
3142           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3143 # endif
3144 #endif /* RE_ENABLE_I18N */
3145           if (BE (*err != REG_NOERROR, 0))
3146             goto parse_bracket_exp_free_return;
3147         }
3148       else
3149         {
3150           switch (start_elem.type)
3151             {
3152             case SB_CHAR:
3153               bitset_set (sbcset, start_elem.opr.ch);
3154               break;
3155 #ifdef RE_ENABLE_I18N
3156             case MB_CHAR:
3157               /* Check whether the array has enough space.  */
3158               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3159                 {
3160                   wchar_t *new_mbchars;
3161                   /* Not enough, realloc it.  */
3162                   /* +1 in case of mbcset->nmbchars is 0.  */
3163                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3164                   /* Use realloc since array is NULL if *alloc == 0.  */
3165                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3166                                             mbchar_alloc);
3167                   if (BE (new_mbchars == NULL, 0))
3168                     goto parse_bracket_exp_espace;
3169                   mbcset->mbchars = new_mbchars;
3170                 }
3171               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3172               break;
3173 #endif /* RE_ENABLE_I18N */
3174             case EQUIV_CLASS:
3175               *err = build_equiv_class (sbcset,
3176 #ifdef RE_ENABLE_I18N
3177                                         mbcset, &equiv_class_alloc,
3178 #endif /* RE_ENABLE_I18N */
3179                                         start_elem.opr.name);
3180               if (BE (*err != REG_NOERROR, 0))
3181                 goto parse_bracket_exp_free_return;
3182               break;
3183             case COLL_SYM:
3184               *err = build_collating_symbol (sbcset,
3185 #ifdef RE_ENABLE_I18N
3186                                              mbcset, &coll_sym_alloc,
3187 #endif /* RE_ENABLE_I18N */
3188                                              start_elem.opr.name);
3189               if (BE (*err != REG_NOERROR, 0))
3190                 goto parse_bracket_exp_free_return;
3191               break;
3192             case CHAR_CLASS:
3193               *err = build_charclass (regexp->trans, sbcset,
3194 #ifdef RE_ENABLE_I18N
3195                                       mbcset, &char_class_alloc,
3196 #endif /* RE_ENABLE_I18N */
3197                                       start_elem.opr.name, syntax);
3198               if (BE (*err != REG_NOERROR, 0))
3199                goto parse_bracket_exp_free_return;
3200               break;
3201             default:
3202               assert (0);
3203               break;
3204             }
3205         }
3206       if (BE (token->type == END_OF_RE, 0))
3207         {
3208           *err = REG_EBRACK;
3209           goto parse_bracket_exp_free_return;
3210         }
3211       if (token->type == OP_CLOSE_BRACKET)
3212         break;
3213     }
3214
3215   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3216
3217   /* If it is non-matching list.  */
3218   if (non_match)
3219     bitset_not (sbcset);
3220
3221 #ifdef RE_ENABLE_I18N
3222   /* Ensure only single byte characters are set.  */
3223   if (dfa->mb_cur_max > 1)
3224     bitset_mask (sbcset, dfa->sb_char);
3225
3226   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3227       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3228                                                      || mbcset->non_match)))
3229     {
3230       bin_tree_t *mbc_tree;
3231       int sbc_idx;
3232       /* Build a tree for complex bracket.  */
3233       dfa->has_mb_node = 1;
3234       br_token.type = COMPLEX_BRACKET;
3235       br_token.opr.mbcset = mbcset;
3236       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3237       if (BE (mbc_tree == NULL, 0))
3238         goto parse_bracket_exp_espace;
3239       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3240         if (sbcset[sbc_idx])
3241           break;
3242       /* If there are no bits set in sbcset, there is no point
3243          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3244       if (sbc_idx < BITSET_WORDS)
3245         {
3246           /* Build a tree for simple bracket.  */
3247           br_token.type = SIMPLE_BRACKET;
3248           br_token.opr.sbcset = sbcset;
3249           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3250           if (BE (work_tree == NULL, 0))
3251             goto parse_bracket_exp_espace;
3252
3253           /* Then join them by ALT node.  */
3254           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3255           if (BE (work_tree == NULL, 0))
3256             goto parse_bracket_exp_espace;
3257         }
3258       else
3259         {
3260           re_free (sbcset);
3261           work_tree = mbc_tree;
3262         }
3263     }
3264   else
3265 #endif /* not RE_ENABLE_I18N */
3266     {
3267 #ifdef RE_ENABLE_I18N
3268       free_charset (mbcset);
3269 #endif
3270       /* Build a tree for simple bracket.  */
3271       br_token.type = SIMPLE_BRACKET;
3272       br_token.opr.sbcset = sbcset;
3273       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3274       if (BE (work_tree == NULL, 0))
3275         goto parse_bracket_exp_espace;
3276     }
3277   return work_tree;
3278
3279  parse_bracket_exp_espace:
3280   *err = REG_ESPACE;
3281  parse_bracket_exp_free_return:
3282   re_free (sbcset);
3283 #ifdef RE_ENABLE_I18N
3284   free_charset (mbcset);
3285 #endif /* RE_ENABLE_I18N */
3286   return NULL;
3287 }
3288
3289 /* Parse an element in the bracket expression.  */
3290
3291 static reg_errcode_t
3292 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3293                        re_token_t *token, int token_len, re_dfa_t *dfa,
3294                        reg_syntax_t syntax, bool accept_hyphen)
3295 {
3296 #ifdef RE_ENABLE_I18N
3297   int cur_char_size;
3298   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3299   if (cur_char_size > 1)
3300     {
3301       elem->type = MB_CHAR;
3302       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3303       re_string_skip_bytes (regexp, cur_char_size);
3304       return REG_NOERROR;
3305     }
3306 #endif /* RE_ENABLE_I18N */
3307   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3308   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3309       || token->type == OP_OPEN_EQUIV_CLASS)
3310     return parse_bracket_symbol (elem, regexp, token);
3311   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3312     {
3313       /* A '-' must only appear as anything but a range indicator before
3314          the closing bracket.  Everything else is an error.  */
3315       re_token_t token2;
3316       (void) peek_token_bracket (&token2, regexp, syntax);
3317       if (token2.type != OP_CLOSE_BRACKET)
3318         /* The actual error value is not standardized since this whole
3319            case is undefined.  But ERANGE makes good sense.  */
3320         return REG_ERANGE;
3321     }
3322   elem->type = SB_CHAR;
3323   elem->opr.ch = token->opr.c;
3324   return REG_NOERROR;
3325 }
3326
3327 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3328    such as [:<character_class>:], [.<collating_element>.], and
3329    [=<equivalent_class>=].  */
3330
3331 static reg_errcode_t
3332 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3333                       re_token_t *token)
3334 {
3335   unsigned char ch, delim = token->opr.c;
3336   int i = 0;
3337   if (re_string_eoi(regexp))
3338     return REG_EBRACK;
3339   for (;; ++i)
3340     {
3341       if (i >= BRACKET_NAME_BUF_SIZE)
3342         return REG_EBRACK;
3343       if (token->type == OP_OPEN_CHAR_CLASS)
3344         ch = re_string_fetch_byte_case (regexp);
3345       else
3346         ch = re_string_fetch_byte (regexp);
3347       if (re_string_eoi(regexp))
3348         return REG_EBRACK;
3349       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3350         break;
3351       elem->opr.name[i] = ch;
3352     }
3353   re_string_skip_bytes (regexp, 1);
3354   elem->opr.name[i] = '\0';
3355   switch (token->type)
3356     {
3357     case OP_OPEN_COLL_ELEM:
3358       elem->type = COLL_SYM;
3359       break;
3360     case OP_OPEN_EQUIV_CLASS:
3361       elem->type = EQUIV_CLASS;
3362       break;
3363     case OP_OPEN_CHAR_CLASS:
3364       elem->type = CHAR_CLASS;
3365       break;
3366     default:
3367       break;
3368     }
3369   return REG_NOERROR;
3370 }
3371
3372   /* Helper function for parse_bracket_exp.
3373      Build the equivalence class which is represented by NAME.
3374      The result are written to MBCSET and SBCSET.
3375      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3376      is a pointer argument sinse we may update it.  */
3377
3378 static reg_errcode_t
3379 #ifdef RE_ENABLE_I18N
3380 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3381                    Idx *equiv_class_alloc, const unsigned char *name)
3382 #else /* not RE_ENABLE_I18N */
3383 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3384 #endif /* not RE_ENABLE_I18N */
3385 {
3386 #ifdef _LIBC
3387   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3388   if (nrules != 0)
3389     {
3390       const int32_t *table, *indirect;
3391       const unsigned char *weights, *extra, *cp;
3392       unsigned char char_buf[2];
3393       int32_t idx1, idx2;
3394       unsigned int ch;
3395       size_t len;
3396       /* This #include defines a local function!  */
3397 # include <locale/weight.h>
3398       /* Calculate the index for equivalence class.  */
3399       cp = name;
3400       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3401       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3402                                                _NL_COLLATE_WEIGHTMB);
3403       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3404                                                    _NL_COLLATE_EXTRAMB);
3405       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3406                                                 _NL_COLLATE_INDIRECTMB);
3407       idx1 = findidx (&cp);
3408       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3409         /* This isn't a valid character.  */
3410         return REG_ECOLLATE;
3411
3412       /* Build single byte matcing table for this equivalence class.  */
3413       char_buf[1] = (unsigned char) '\0';
3414       len = weights[idx1];
3415       for (ch = 0; ch < SBC_MAX; ++ch)
3416         {
3417           char_buf[0] = ch;
3418           cp = char_buf;
3419           idx2 = findidx (&cp);
3420 /*
3421           idx2 = table[ch];
3422 */
3423           if (idx2 == 0)
3424             /* This isn't a valid character.  */
3425             continue;
3426           if (len == weights[idx2])
3427             {
3428               int cnt = 0;
3429               while (cnt <= len &&
3430                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3431                 ++cnt;
3432
3433               if (cnt > len)
3434                 bitset_set (sbcset, ch);
3435             }
3436         }
3437       /* Check whether the array has enough space.  */
3438       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3439         {
3440           /* Not enough, realloc it.  */
3441           /* +1 in case of mbcset->nequiv_classes is 0.  */
3442           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3443           /* Use realloc since the array is NULL if *alloc == 0.  */
3444           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3445                                                    int32_t,
3446                                                    new_equiv_class_alloc);
3447           if (BE (new_equiv_classes == NULL, 0))
3448             return REG_ESPACE;
3449           mbcset->equiv_classes = new_equiv_classes;
3450           *equiv_class_alloc = new_equiv_class_alloc;
3451         }
3452       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3453     }
3454   else
3455 #endif /* _LIBC */
3456     {
3457       if (BE (strlen ((const char *) name) != 1, 0))
3458         return REG_ECOLLATE;
3459       bitset_set (sbcset, *name);
3460     }
3461   return REG_NOERROR;
3462 }
3463
3464   /* Helper function for parse_bracket_exp.
3465      Build the character class which is represented by NAME.
3466      The result are written to MBCSET and SBCSET.
3467      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3468      is a pointer argument sinse we may update it.  */
3469
3470 static reg_errcode_t
3471 #ifdef RE_ENABLE_I18N
3472 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3473                  re_charset_t *mbcset, Idx *char_class_alloc,
3474                  const unsigned char *class_name, reg_syntax_t syntax)
3475 #else /* not RE_ENABLE_I18N */
3476 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3477                  const unsigned char *class_name, reg_syntax_t syntax)
3478 #endif /* not RE_ENABLE_I18N */
3479 {
3480   int i;
3481   const char *name = (const char *) class_name;
3482
3483   /* In case of REG_ICASE "upper" and "lower" match the both of
3484      upper and lower cases.  */
3485   if ((syntax & RE_ICASE)
3486       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3487     name = "alpha";
3488
3489 #ifdef RE_ENABLE_I18N
3490   /* Check the space of the arrays.  */
3491   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3492     {
3493       /* Not enough, realloc it.  */
3494       /* +1 in case of mbcset->nchar_classes is 0.  */
3495       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3496       /* Use realloc since array is NULL if *alloc == 0.  */
3497       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3498                                                new_char_class_alloc);
3499       if (BE (new_char_classes == NULL, 0))
3500         return REG_ESPACE;
3501       mbcset->char_classes = new_char_classes;
3502       *char_class_alloc = new_char_class_alloc;
3503     }
3504   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3505 #endif /* RE_ENABLE_I18N */
3506
3507 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3508   do {                                          \
3509     if (BE (trans != NULL, 0))                  \
3510       {                                         \
3511         for (i = 0; i < SBC_MAX; ++i)           \
3512           if (ctype_func (i))                   \
3513             bitset_set (sbcset, trans[i]);      \
3514       }                                         \
3515     else                                        \
3516       {                                         \
3517         for (i = 0; i < SBC_MAX; ++i)           \
3518           if (ctype_func (i))                   \
3519             bitset_set (sbcset, i);             \
3520       }                                         \
3521   } while (0)
3522
3523   if (strcmp (name, "alnum") == 0)
3524     BUILD_CHARCLASS_LOOP (isalnum);
3525   else if (strcmp (name, "cntrl") == 0)
3526     BUILD_CHARCLASS_LOOP (iscntrl);
3527   else if (strcmp (name, "lower") == 0)
3528     BUILD_CHARCLASS_LOOP (islower);
3529   else if (strcmp (name, "space") == 0)
3530     BUILD_CHARCLASS_LOOP (isspace);
3531   else if (strcmp (name, "alpha") == 0)
3532     BUILD_CHARCLASS_LOOP (isalpha);
3533   else if (strcmp (name, "digit") == 0)
3534     BUILD_CHARCLASS_LOOP (isdigit);
3535   else if (strcmp (name, "print") == 0)
3536     BUILD_CHARCLASS_LOOP (isprint);
3537   else if (strcmp (name, "upper") == 0)
3538     BUILD_CHARCLASS_LOOP (isupper);
3539   else if (strcmp (name, "blank") == 0)
3540     BUILD_CHARCLASS_LOOP (isblank);
3541   else if (strcmp (name, "graph") == 0)
3542     BUILD_CHARCLASS_LOOP (isgraph);
3543   else if (strcmp (name, "punct") == 0)
3544     BUILD_CHARCLASS_LOOP (ispunct);
3545   else if (strcmp (name, "xdigit") == 0)
3546     BUILD_CHARCLASS_LOOP (isxdigit);
3547   else
3548     return REG_ECTYPE;
3549
3550   return REG_NOERROR;
3551 }
3552
3553 static bin_tree_t *
3554 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3555                     const unsigned char *class_name,
3556                     const unsigned char *extra, bool non_match,
3557                     reg_errcode_t *err)
3558 {
3559   re_bitset_ptr_t sbcset;
3560 #ifdef RE_ENABLE_I18N
3561   re_charset_t *mbcset;
3562   Idx alloc = 0;
3563 #endif /* not RE_ENABLE_I18N */
3564   reg_errcode_t ret;
3565   re_token_t br_token;
3566   bin_tree_t *tree;
3567
3568   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3569 #ifdef RE_ENABLE_I18N
3570   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3571 #endif /* RE_ENABLE_I18N */
3572
3573 #ifdef RE_ENABLE_I18N
3574   if (BE (sbcset == NULL || mbcset == NULL, 0))
3575 #else /* not RE_ENABLE_I18N */
3576   if (BE (sbcset == NULL, 0))
3577 #endif /* not RE_ENABLE_I18N */
3578     {
3579       *err = REG_ESPACE;
3580       return NULL;
3581     }
3582
3583   if (non_match)
3584     {
3585 #ifdef RE_ENABLE_I18N
3586       mbcset->non_match = 1;
3587 #endif /* not RE_ENABLE_I18N */
3588     }
3589
3590   /* We don't care the syntax in this case.  */
3591   ret = build_charclass (trans, sbcset,
3592 #ifdef RE_ENABLE_I18N
3593                          mbcset, &alloc,
3594 #endif /* RE_ENABLE_I18N */
3595                          class_name, 0);
3596
3597   if (BE (ret != REG_NOERROR, 0))
3598     {
3599       re_free (sbcset);
3600 #ifdef RE_ENABLE_I18N
3601       free_charset (mbcset);
3602 #endif /* RE_ENABLE_I18N */
3603       *err = ret;
3604       return NULL;
3605     }
3606   /* \w match '_' also.  */
3607   for (; *extra; extra++)
3608     bitset_set (sbcset, *extra);
3609
3610   /* If it is non-matching list.  */
3611   if (non_match)
3612     bitset_not (sbcset);
3613
3614 #ifdef RE_ENABLE_I18N
3615   /* Ensure only single byte characters are set.  */
3616   if (dfa->mb_cur_max > 1)
3617     bitset_mask (sbcset, dfa->sb_char);
3618 #endif
3619
3620   /* Build a tree for simple bracket.  */
3621   br_token.type = SIMPLE_BRACKET;
3622   br_token.opr.sbcset = sbcset;
3623   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3624   if (BE (tree == NULL, 0))
3625     goto build_word_op_espace;
3626
3627 #ifdef RE_ENABLE_I18N
3628   if (dfa->mb_cur_max > 1)
3629     {
3630       bin_tree_t *mbc_tree;
3631       /* Build a tree for complex bracket.  */
3632       br_token.type = COMPLEX_BRACKET;
3633       br_token.opr.mbcset = mbcset;
3634       dfa->has_mb_node = 1;
3635       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3636       if (BE (mbc_tree == NULL, 0))
3637         goto build_word_op_espace;
3638       /* Then join them by ALT node.  */
3639       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3640       if (BE (mbc_tree != NULL, 1))
3641         return tree;
3642     }
3643   else
3644     {
3645       free_charset (mbcset);
3646       return tree;
3647     }
3648 #else /* not RE_ENABLE_I18N */
3649   return tree;
3650 #endif /* not RE_ENABLE_I18N */
3651
3652  build_word_op_espace:
3653   re_free (sbcset);
3654 #ifdef RE_ENABLE_I18N
3655   free_charset (mbcset);
3656 #endif /* RE_ENABLE_I18N */
3657   *err = REG_ESPACE;
3658   return NULL;
3659 }
3660
3661 /* This is intended for the expressions like "a{1,3}".
3662    Fetch a number from `input', and return the number.
3663    Return REG_MISSING if the number field is empty like "{,1}".
3664    Return REG_ERROR if an error occurred.  */
3665
3666 static Idx
3667 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3668 {
3669   Idx num = REG_MISSING;
3670   unsigned char c;
3671   while (1)
3672     {
3673       fetch_token (token, input, syntax);
3674       c = token->opr.c;
3675       if (BE (token->type == END_OF_RE, 0))
3676         return REG_ERROR;
3677       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3678         break;
3679       num = ((token->type != CHARACTER || c < '0' || '9' < c
3680               || num == REG_ERROR)
3681              ? REG_ERROR
3682              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3683       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3684     }
3685   return num;
3686 }
3687 \f
3688 #ifdef RE_ENABLE_I18N
3689 static void
3690 free_charset (re_charset_t *cset)
3691 {
3692   re_free (cset->mbchars);
3693 # ifdef _LIBC
3694   re_free (cset->coll_syms);
3695   re_free (cset->equiv_classes);
3696   re_free (cset->range_starts);
3697   re_free (cset->range_ends);
3698 # endif
3699   re_free (cset->char_classes);
3700   re_free (cset);
3701 }
3702 #endif /* RE_ENABLE_I18N */
3703 \f
3704 /* Functions for binary tree operation.  */
3705
3706 /* Create a tree node.  */
3707
3708 static bin_tree_t *
3709 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3710              re_token_type_t type)
3711 {
3712   re_token_t t;
3713   t.type = type;
3714   return create_token_tree (dfa, left, right, &t);
3715 }
3716
3717 static bin_tree_t *
3718 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3719                    const re_token_t *token)
3720 {
3721   bin_tree_t *tree;
3722   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3723     {
3724       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3725
3726       if (storage == NULL)
3727         return NULL;
3728       storage->next = dfa->str_tree_storage;
3729       dfa->str_tree_storage = storage;
3730       dfa->str_tree_storage_idx = 0;
3731     }
3732   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3733
3734   tree->parent = NULL;
3735   tree->left = left;
3736   tree->right = right;
3737   tree->token = *token;
3738   tree->token.duplicated = 0;
3739   tree->token.opt_subexp = 0;
3740   tree->first = NULL;
3741   tree->next = NULL;
3742   tree->node_idx = REG_MISSING;
3743
3744   if (left != NULL)
3745     left->parent = tree;
3746   if (right != NULL)
3747     right->parent = tree;
3748   return tree;
3749 }
3750
3751 /* Mark the tree SRC as an optional subexpression.
3752    To be called from preorder or postorder.  */
3753
3754 static reg_errcode_t
3755 mark_opt_subexp (void *extra, bin_tree_t *node)
3756 {
3757   Idx idx = (Idx) (long) extra;
3758   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3759     node->token.opt_subexp = 1;
3760
3761   return REG_NOERROR;
3762 }
3763
3764 /* Free the allocated memory inside NODE. */
3765
3766 static void
3767 free_token (re_token_t *node)
3768 {
3769 #ifdef RE_ENABLE_I18N
3770   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3771     free_charset (node->opr.mbcset);
3772   else
3773 #endif /* RE_ENABLE_I18N */
3774     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3775       re_free (node->opr.sbcset);
3776 }
3777
3778 /* Worker function for tree walking.  Free the allocated memory inside NODE
3779    and its children. */
3780
3781 static reg_errcode_t
3782 free_tree (void *extra, bin_tree_t *node)
3783 {
3784   free_token (&node->token);
3785   return REG_NOERROR;
3786 }
3787
3788
3789 /* Duplicate the node SRC, and return new node.  This is a preorder
3790    visit similar to the one implemented by the generic visitor, but
3791    we need more infrastructure to maintain two parallel trees --- so,
3792    it's easier to duplicate.  */
3793
3794 static bin_tree_t *
3795 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3796 {
3797   const bin_tree_t *node;
3798   bin_tree_t *dup_root;
3799   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3800
3801   for (node = root; ; )
3802     {
3803       /* Create a new tree and link it back to the current parent.  */
3804       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3805       if (*p_new == NULL)
3806         return NULL;
3807       (*p_new)->parent = dup_node;
3808       (*p_new)->token.duplicated = 1;
3809       dup_node = *p_new;
3810
3811       /* Go to the left node, or up and to the right.  */
3812       if (node->left)
3813         {
3814           node = node->left;
3815           p_new = &dup_node->left;
3816         }
3817       else
3818         {
3819           const bin_tree_t *prev = NULL;
3820           while (node->right == prev || node->right == NULL)
3821             {
3822               prev = node;
3823               node = node->parent;
3824               dup_node = dup_node->parent;
3825               if (!node)
3826                 return dup_root;
3827             }
3828           node = node->right;
3829           p_new = &dup_node->right;
3830         }
3831     }
3832 }