4a80056e1f8444dffe194e541376ea88bfc55fa1
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 #ifdef RE_ENABLE_I18N
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 #ifdef RE_ENABLE_I18N
33 static void optimize_utf8 (re_dfa_t *dfa);
34 #endif
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89 #ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const char *class_name,
99 reg_syntax_t syntax);
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const char *class_name,
111 const char *extra,
112 int non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 \f
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 const char __re_error_msgid[] attribute_hidden =
130 {
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181 };
183 const size_t __re_error_msgid_idx[] attribute_hidden =
184 {
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
202 };
203 \f
204 /* Entry points for GNU code. */
207 #ifdef ZOS_USS
209 /* For ZOS USS we must define btowc */
211 wchar_t
212 btowc (int c)
213 {
214 wchar_t wtmp[2];
215 char tmp[2];
217 tmp[0] = c;
218 tmp[1] = 0;
220 mbtowc (wtmp, tmp, 1);
221 return wtmp[0];
222 }
223 #endif
225 /* re_compile_pattern is the GNU regular expression compiler: it
226 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
227 Returns 0 if the pattern was valid, otherwise an error string.
229 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
230 are set in BUFP on entry. */
232 const char *
233 re_compile_pattern (pattern, length, bufp)
234 const char *pattern;
235 size_t length;
236 struct re_pattern_buffer *bufp;
237 {
238 reg_errcode_t ret;
240 /* And GNU code determines whether or not to get register information
241 by passing null for the REGS argument to re_match, etc., not by
242 setting no_sub, unless RE_NO_SUB is set. */
243 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
245 /* Match anchors at newline. */
246 bufp->newline_anchor = 1;
248 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
250 if (!ret)
251 return NULL;
252 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
253 }
254 #ifdef _LIBC
255 weak_alias (__re_compile_pattern, re_compile_pattern)
256 #endif
258 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
259 also be assigned to arbitrarily: each pattern buffer stores its own
260 syntax, so it can be changed between regex compilations. */
261 /* This has no initializer because initialized variables in Emacs
262 become read-only after dumping. */
263 reg_syntax_t re_syntax_options;
266 /* Specify the precise syntax of regexps for compilation. This provides
267 for compatibility for various utilities which historically have
268 different, incompatible syntaxes.
270 The argument SYNTAX is a bit mask comprised of the various bits
271 defined in regex.h. We return the old syntax. */
273 reg_syntax_t
274 re_set_syntax (syntax)
275 reg_syntax_t syntax;
276 {
277 reg_syntax_t ret = re_syntax_options;
279 re_syntax_options = syntax;
280 return ret;
281 }
282 #ifdef _LIBC
283 weak_alias (__re_set_syntax, re_set_syntax)
284 #endif
286 int
287 re_compile_fastmap (bufp)
288 struct re_pattern_buffer *bufp;
289 {
290 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
291 char *fastmap = bufp->fastmap;
293 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
294 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
295 if (dfa->init_state != dfa->init_state_word)
296 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
297 if (dfa->init_state != dfa->init_state_nl)
298 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
299 if (dfa->init_state != dfa->init_state_begbuf)
300 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
301 bufp->fastmap_accurate = 1;
302 return 0;
303 }
304 #ifdef _LIBC
305 weak_alias (__re_compile_fastmap, re_compile_fastmap)
306 #endif
308 static inline void
309 __attribute ((always_inline))
310 re_set_fastmap (char *fastmap, int icase, int ch)
311 {
312 fastmap[ch] = 1;
313 if (icase)
314 fastmap[tolower (ch)] = 1;
315 }
317 /* Helper function for re_compile_fastmap.
318 Compile fastmap for the initial_state INIT_STATE. */
320 static void
321 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
322 char *fastmap)
323 {
324 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
325 int node_cnt;
326 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
327 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
328 {
329 int node = init_state->nodes.elems[node_cnt];
330 re_token_type_t type = dfa->nodes[node].type;
332 if (type == CHARACTER)
333 {
334 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
335 #ifdef RE_ENABLE_I18N
336 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
337 {
338 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
339 wchar_t wc;
340 mbstate_t state;
342 p = buf;
343 *p++ = dfa->nodes[node].opr.c;
344 while (++node < dfa->nodes_len
345 && dfa->nodes[node].type == CHARACTER
346 && dfa->nodes[node].mb_partial)
347 *p++ = dfa->nodes[node].opr.c;
348 memset (&state, '\0', sizeof (state));
349 if (__mbrtowc (&wc, (const char *) buf, p - buf,
350 &state) == p - buf
351 && (__wcrtomb ((char *) buf, towlower (wc), &state)
352 != (size_t) -1))
353 re_set_fastmap (fastmap, 0, buf[0]);
354 re_free (buf);
355 }
356 #endif
357 }
358 else if (type == SIMPLE_BRACKET)
359 {
360 int i, ch;
361 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
362 {
363 int j;
364 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
365 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
366 if (w & ((bitset_word_t) 1 << j))
367 re_set_fastmap (fastmap, icase, ch);
368 }
369 }
370 #ifdef RE_ENABLE_I18N
371 else if (type == COMPLEX_BRACKET)
372 {
373 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
374 int i;
376 # ifdef _LIBC
377 /* See if we have to try all bytes which start multiple collation
378 elements.
379 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
380 collation element, and don't catch 'b' since 'b' is
381 the only collation element which starts from 'b' (and
382 it is caught by SIMPLE_BRACKET). */
383 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
384 && (cset->ncoll_syms || cset->nranges))
385 {
386 const int32_t *table = (const int32_t *)
387 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
388 for (i = 0; i < SBC_MAX; ++i)
389 if (table[i] < 0)
390 re_set_fastmap (fastmap, icase, i);
391 }
392 # endif /* _LIBC */
394 /* See if we have to start the match at all multibyte characters,
395 i.e. where we would not find an invalid sequence. This only
396 applies to multibyte character sets; for single byte character
397 sets, the SIMPLE_BRACKET again suffices. */
398 if (dfa->mb_cur_max > 1
399 && (cset->nchar_classes || cset->non_match || cset->nranges
400 # ifdef _LIBC
401 || cset->nequiv_classes
402 # endif /* _LIBC */
403 ))
404 {
405 unsigned char c = 0;
406 do
407 {
408 mbstate_t mbs;
409 memset (&mbs, 0, sizeof (mbs));
410 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
411 re_set_fastmap (fastmap, false, (int) c);
412 }
413 while (++c != 0);
414 }
416 else
417 {
418 /* ... Else catch all bytes which can start the mbchars. */
419 for (i = 0; i < cset->nmbchars; ++i)
420 {
421 char buf[256];
422 mbstate_t state;
423 memset (&state, '\0', sizeof (state));
424 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
425 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
426 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
427 {
428 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
429 != (size_t) -1)
430 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
431 }
432 }
433 }
434 }
435 #endif /* RE_ENABLE_I18N */
436 else if (type == OP_PERIOD
437 #ifdef RE_ENABLE_I18N
438 || type == OP_UTF8_PERIOD
439 #endif /* RE_ENABLE_I18N */
440 || type == END_OF_RE)
441 {
442 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
443 if (type == END_OF_RE)
444 bufp->can_be_null = 1;
445 return;
446 }
447 }
448 }
449 \f
450 /* Entry point for POSIX code. */
451 /* regcomp takes a regular expression as a string and compiles it.
453 PREG is a regex_t *. We do not expect any fields to be initialized,
454 since POSIX says we shouldn't. Thus, we set
456 `buffer' to the compiled pattern;
457 `used' to the length of the compiled pattern;
458 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
459 REG_EXTENDED bit in CFLAGS is set; otherwise, to
460 RE_SYNTAX_POSIX_BASIC;
461 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
462 `fastmap' to an allocated space for the fastmap;
463 `fastmap_accurate' to zero;
464 `re_nsub' to the number of subexpressions in PATTERN.
466 PATTERN is the address of the pattern string.
468 CFLAGS is a series of bits which affect compilation.
470 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
471 use POSIX basic syntax.
473 If REG_NEWLINE is set, then . and [^...] don't match newline.
474 Also, regexec will try a match beginning after every newline.
476 If REG_ICASE is set, then we considers upper- and lowercase
477 versions of letters to be equivalent when matching.
479 If REG_NOSUB is set, then when PREG is passed to regexec, that
480 routine will report only success or failure, and nothing about the
481 registers.
483 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
484 the return codes and their meanings.) */
486 int
487 regcomp (preg, pattern, cflags)
488 regex_t *__restrict preg;
489 const char *__restrict pattern;
490 int cflags;
491 {
492 reg_errcode_t ret;
493 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
494 : RE_SYNTAX_POSIX_BASIC);
496 preg->buffer = NULL;
497 preg->allocated = 0;
498 preg->used = 0;
500 /* Try to allocate space for the fastmap. */
501 preg->fastmap = re_malloc (char, SBC_MAX);
502 if (BE (preg->fastmap == NULL, 0))
503 return REG_ESPACE;
505 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
507 /* If REG_NEWLINE is set, newlines are treated differently. */
508 if (cflags & REG_NEWLINE)
509 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
510 syntax &= ~RE_DOT_NEWLINE;
511 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
512 /* It also changes the matching behavior. */
513 preg->newline_anchor = 1;
514 }
515 else
516 preg->newline_anchor = 0;
517 preg->no_sub = !!(cflags & REG_NOSUB);
518 preg->translate = NULL;
520 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
522 /* POSIX doesn't distinguish between an unmatched open-group and an
523 unmatched close-group: both are REG_EPAREN. */
524 if (ret == REG_ERPAREN)
525 ret = REG_EPAREN;
527 /* We have already checked preg->fastmap != NULL. */
528 if (BE (ret == REG_NOERROR, 1))
529 /* Compute the fastmap now, since regexec cannot modify the pattern
530 buffer. This function never fails in this implementation. */
531 (void) re_compile_fastmap (preg);
532 else
533 {
534 /* Some error occurred while compiling the expression. */
535 re_free (preg->fastmap);
536 preg->fastmap = NULL;
537 }
539 return (int) ret;
540 }
541 #ifdef _LIBC
542 weak_alias (__regcomp, regcomp)
543 #endif
545 /* Returns a message corresponding to an error code, ERRCODE, returned
546 from either regcomp or regexec. We don't use PREG here. */
548 size_t
549 regerror(int errcode, const regex_t *__restrict preg,
550 char *__restrict errbuf, size_t errbuf_size)
551 {
552 const char *msg;
553 size_t msg_size;
555 if (BE (errcode < 0
556 || errcode >= (int) (sizeof (__re_error_msgid_idx)
557 / sizeof (__re_error_msgid_idx[0])), 0))
558 /* Only error codes returned by the rest of the code should be passed
559 to this routine. If we are given anything else, or if other regex
560 code generates an invalid error code, then the program has a bug.
561 Dump core so we can fix it. */
562 abort ();
564 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
566 msg_size = strlen (msg) + 1; /* Includes the null. */
568 if (BE (errbuf_size != 0, 1))
569 {
570 if (BE (msg_size > errbuf_size, 0))
571 {
572 memcpy (errbuf, msg, errbuf_size - 1);
573 errbuf[errbuf_size - 1] = 0;
574 }
575 else
576 memcpy (errbuf, msg, msg_size);
577 }
579 return msg_size;
580 }
581 #ifdef _LIBC
582 weak_alias (__regerror, regerror)
583 #endif
586 #ifdef RE_ENABLE_I18N
587 /* This static array is used for the map to single-byte characters when
588 UTF-8 is used. Otherwise we would allocate memory just to initialize
589 it the same all the time. UTF-8 is the preferred encoding so this is
590 a worthwhile optimization. */
591 #if __GNUC__ >= 3
592 static const bitset_t utf8_sb_map = {
593 /* Set the first 128 bits. */
594 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
595 };
596 #else /* ! (__GNUC__ >= 3) */
597 static bitset_t utf8_sb_map;
598 #endif /* __GNUC__ >= 3 */
599 #endif /* RE_ENABLE_I18N */
602 static void
603 free_dfa_content (re_dfa_t *dfa)
604 {
605 int i, j;
607 if (dfa->nodes)
608 for (i = 0; i < dfa->nodes_len; ++i)
609 free_token (dfa->nodes + i);
610 re_free (dfa->nexts);
611 for (i = 0; i < dfa->nodes_len; ++i)
612 {
613 if (dfa->eclosures != NULL)
614 re_node_set_free (dfa->eclosures + i);
615 if (dfa->inveclosures != NULL)
616 re_node_set_free (dfa->inveclosures + i);
617 if (dfa->edests != NULL)
618 re_node_set_free (dfa->edests + i);
619 }
620 re_free (dfa->edests);
621 re_free (dfa->eclosures);
622 re_free (dfa->inveclosures);
623 re_free (dfa->nodes);
625 if (dfa->state_table)
626 for (i = 0; i <= dfa->state_hash_mask; ++i)
627 {
628 struct re_state_table_entry *entry = dfa->state_table + i;
629 for (j = 0; j < entry->num; ++j)
630 {
631 re_dfastate_t *state = entry->array[j];
632 free_state (state);
633 }
634 re_free (entry->array);
635 }
636 re_free (dfa->state_table);
637 #ifdef RE_ENABLE_I18N
638 if (dfa->sb_char != utf8_sb_map)
639 re_free (dfa->sb_char);
640 #endif
641 re_free (dfa->subexp_map);
642 #ifdef DEBUG
643 re_free (dfa->re_str);
644 #endif
646 re_free (dfa);
647 }
650 /* Free dynamically allocated space used by PREG. */
652 void
653 regfree (preg)
654 regex_t *preg;
655 {
656 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
657 if (BE (dfa != NULL, 1))
658 free_dfa_content (dfa);
659 preg->buffer = NULL;
660 preg->allocated = 0;
662 re_free (preg->fastmap);
663 preg->fastmap = NULL;
665 re_free (preg->translate);
666 preg->translate = NULL;
667 }
668 #ifdef _LIBC
669 weak_alias (__regfree, regfree)
670 #endif
671 \f
672 /* Entry points compatible with 4.2 BSD regex library. We don't define
673 them unless specifically requested. */
675 #if defined _REGEX_RE_COMP || defined _LIBC
677 /* BSD has one and only one pattern buffer. */
678 static struct re_pattern_buffer re_comp_buf;
680 char *
681 # ifdef _LIBC
682 /* Make these definitions weak in libc, so POSIX programs can redefine
683 these names if they don't use our functions, and still use
684 regcomp/regexec above without link errors. */
685 weak_function
686 # endif
687 re_comp (s)
688 const char *s;
689 {
690 reg_errcode_t ret;
691 char *fastmap;
693 if (!s)
694 {
695 if (!re_comp_buf.buffer)
696 return gettext ("No previous regular expression");
697 return 0;
698 }
700 if (re_comp_buf.buffer)
701 {
702 fastmap = re_comp_buf.fastmap;
703 re_comp_buf.fastmap = NULL;
704 __regfree (&re_comp_buf);
705 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
706 re_comp_buf.fastmap = fastmap;
707 }
709 if (re_comp_buf.fastmap == NULL)
710 {
711 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
712 if (re_comp_buf.fastmap == NULL)
713 return (char *) gettext (__re_error_msgid
714 + __re_error_msgid_idx[(int) REG_ESPACE]);
715 }
717 /* Since `re_exec' always passes NULL for the `regs' argument, we
718 don't need to initialize the pattern buffer fields which affect it. */
720 /* Match anchors at newlines. */
721 re_comp_buf.newline_anchor = 1;
723 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
725 if (!ret)
726 return NULL;
728 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
729 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
730 }
732 #ifdef _LIBC
733 libc_freeres_fn (free_mem)
734 {
735 __regfree (&re_comp_buf);
736 }
737 #endif
739 #endif /* _REGEX_RE_COMP */
740 \f
741 /* Internal entry point.
742 Compile the regular expression PATTERN, whose length is LENGTH.
743 SYNTAX indicate regular expression's syntax. */
745 static reg_errcode_t
746 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
747 reg_syntax_t syntax)
748 {
749 reg_errcode_t err = REG_NOERROR;
750 re_dfa_t *dfa;
751 re_string_t regexp;
753 /* Initialize the pattern buffer. */
754 preg->fastmap_accurate = 0;
755 preg->syntax = syntax;
756 preg->not_bol = preg->not_eol = 0;
757 preg->used = 0;
758 preg->re_nsub = 0;
759 preg->can_be_null = 0;
760 preg->regs_allocated = REGS_UNALLOCATED;
762 /* Initialize the dfa. */
763 dfa = (re_dfa_t *) preg->buffer;
764 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
765 {
766 /* If zero allocated, but buffer is non-null, try to realloc
767 enough space. This loses if buffer's address is bogus, but
768 that is the user's responsibility. If ->buffer is NULL this
769 is a simple allocation. */
770 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
771 if (dfa == NULL)
772 return REG_ESPACE;
773 preg->allocated = sizeof (re_dfa_t);
774 preg->buffer = (unsigned char *) dfa;
775 }
776 preg->used = sizeof (re_dfa_t);
778 err = init_dfa (dfa, length);
779 if (BE (err != REG_NOERROR, 0))
780 {
781 free_dfa_content (dfa);
782 preg->buffer = NULL;
783 preg->allocated = 0;
784 return err;
785 }
786 #ifdef DEBUG
787 /* Note: length+1 will not overflow since it is checked in init_dfa. */
788 dfa->re_str = re_malloc (char, length + 1);
789 strncpy (dfa->re_str, pattern, length + 1);
790 #endif
792 __libc_lock_init (dfa->lock);
794 err = re_string_construct (®exp, pattern, length, preg->translate,
795 syntax & RE_ICASE, dfa);
796 if (BE (err != REG_NOERROR, 0))
797 {
798 re_compile_internal_free_return:
799 free_workarea_compile (preg);
800 re_string_destruct (®exp);
801 free_dfa_content (dfa);
802 preg->buffer = NULL;
803 preg->allocated = 0;
804 return err;
805 }
807 /* Parse the regular expression, and build a structure tree. */
808 preg->re_nsub = 0;
809 dfa->str_tree = parse (®exp, preg, syntax, &err);
810 if (BE (dfa->str_tree == NULL, 0))
811 goto re_compile_internal_free_return;
813 /* Analyze the tree and create the nfa. */
814 err = analyze (preg);
815 if (BE (err != REG_NOERROR, 0))
816 goto re_compile_internal_free_return;
818 #ifdef RE_ENABLE_I18N
819 /* If possible, do searching in single byte encoding to speed things up. */
820 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
821 optimize_utf8 (dfa);
822 #endif
824 /* Then create the initial state of the dfa. */
825 err = create_initial_state (dfa);
827 /* Release work areas. */
828 free_workarea_compile (preg);
829 re_string_destruct (®exp);
831 if (BE (err != REG_NOERROR, 0))
832 {
833 free_dfa_content (dfa);
834 preg->buffer = NULL;
835 preg->allocated = 0;
836 }
838 return err;
839 }
841 /* Initialize DFA. We use the length of the regular expression PAT_LEN
842 as the initial length of some arrays. */
844 static reg_errcode_t
845 init_dfa (re_dfa_t *dfa, size_t pat_len)
846 {
847 unsigned int table_size;
848 #ifndef _LIBC
849 char *codeset_name;
850 #endif
852 memset (dfa, '\0', sizeof (re_dfa_t));
854 /* Force allocation of str_tree_storage the first time. */
855 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
857 /* Avoid overflows. */
858 if (pat_len == SIZE_MAX)
859 return REG_ESPACE;
861 dfa->nodes_alloc = pat_len + 1;
862 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
864 /* table_size = 2 ^ ceil(log pat_len) */
865 for (table_size = 1; ; table_size <<= 1)
866 if (table_size > pat_len)
867 break;
869 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
870 dfa->state_hash_mask = table_size - 1;
872 dfa->mb_cur_max = MB_CUR_MAX;
873 #ifdef _LIBC
874 if (dfa->mb_cur_max == 6
875 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
876 dfa->is_utf8 = 1;
877 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
878 != 0);
879 #else
880 # ifdef HAVE_LANGINFO_CODESET
881 codeset_name = nl_langinfo (CODESET);
882 # else
883 codeset_name = getenv ("LC_ALL");
884 if (codeset_name == NULL || codeset_name[0] == '\0')
885 codeset_name = getenv ("LC_CTYPE");
886 if (codeset_name == NULL || codeset_name[0] == '\0')
887 codeset_name = getenv ("LANG");
888 if (codeset_name == NULL)
889 codeset_name = "";
890 else if (strchr (codeset_name, '.') != NULL)
891 codeset_name = strchr (codeset_name, '.') + 1;
892 # endif
894 /* strcasecmp isn't a standard interface. brute force check */
895 #if 0
896 if (strcasecmp (codeset_name, "UTF-8") == 0
897 || strcasecmp (codeset_name, "UTF8") == 0)
898 dfa->is_utf8 = 1;
899 #else
900 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
901 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
902 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
903 && (codeset_name[3] == '-'
904 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
905 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
906 dfa->is_utf8 = 1;
907 #endif
909 /* We check exhaustively in the loop below if this charset is a
910 superset of ASCII. */
911 dfa->map_notascii = 0;
912 #endif
914 #ifdef RE_ENABLE_I18N
915 if (dfa->mb_cur_max > 1)
916 {
917 if (dfa->is_utf8)
918 {
919 #if !defined(__GNUC__) || __GNUC__ < 3
920 static short utf8_sb_map_inited = 0;
922 if (! utf8_sb_map_inited)
923 {
924 int i;
926 utf8_sb_map_inited = 0;
927 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
928 utf8_sb_map[i] = BITSET_WORD_MAX;
929 }
930 #endif
931 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
932 }
933 else
934 {
935 int i, j, ch;
937 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
938 if (BE (dfa->sb_char == NULL, 0))
939 return REG_ESPACE;
941 /* Set the bits corresponding to single byte chars. */
942 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
943 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
944 {
945 wint_t wch = __btowc (ch);
946 if (wch != WEOF)
947 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
948 # ifndef _LIBC
949 if (isascii (ch) && wch != ch)
950 dfa->map_notascii = 1;
951 # endif
952 }
953 }
954 }
955 #endif
957 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
958 return REG_ESPACE;
959 return REG_NOERROR;
960 }
962 /* Initialize WORD_CHAR table, which indicate which character is
963 "word". In this case "word" means that it is the word construction
964 character used by some operators like "\<", "\>", etc. */
966 static void
967 internal_function
968 init_word_char (re_dfa_t *dfa)
969 {
970 int i, j, ch;
971 dfa->word_ops_used = 1;
972 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
973 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
974 if (isalnum (ch) || ch == '_')
975 dfa->word_char[i] |= (bitset_word_t) 1 << j;
976 }
978 /* Free the work area which are only used while compiling. */
980 static void
981 free_workarea_compile (regex_t *preg)
982 {
983 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
984 bin_tree_storage_t *storage, *next;
985 for (storage = dfa->str_tree_storage; storage; storage = next)
986 {
987 next = storage->next;
988 re_free (storage);
989 }
990 dfa->str_tree_storage = NULL;
991 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
992 dfa->str_tree = NULL;
993 re_free (dfa->org_indices);
994 dfa->org_indices = NULL;
995 }
997 /* Create initial states for all contexts. */
999 static reg_errcode_t
1000 create_initial_state (re_dfa_t *dfa)
1001 {
1002 int first, i;
1003 reg_errcode_t err;
1004 re_node_set init_nodes;
1006 /* Initial states have the epsilon closure of the node which is
1007 the first node of the regular expression. */
1008 first = dfa->str_tree->first->node_idx;
1009 dfa->init_node = first;
1010 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1011 if (BE (err != REG_NOERROR, 0))
1012 return err;
1014 /* The back-references which are in initial states can epsilon transit,
1015 since in this case all of the subexpressions can be null.
1016 Then we add epsilon closures of the nodes which are the next nodes of
1017 the back-references. */
1018 if (dfa->nbackref > 0)
1019 for (i = 0; i < init_nodes.nelem; ++i)
1020 {
1021 int node_idx = init_nodes.elems[i];
1022 re_token_type_t type = dfa->nodes[node_idx].type;
1024 int clexp_idx;
1025 if (type != OP_BACK_REF)
1026 continue;
1027 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1028 {
1029 re_token_t *clexp_node;
1030 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1031 if (clexp_node->type == OP_CLOSE_SUBEXP
1032 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1033 break;
1034 }
1035 if (clexp_idx == init_nodes.nelem)
1036 continue;
1038 if (type == OP_BACK_REF)
1039 {
1040 int dest_idx = dfa->edests[node_idx].elems[0];
1041 if (!re_node_set_contains (&init_nodes, dest_idx))
1042 {
1043 reg_errcode_t err = re_node_set_merge (&init_nodes,
1044 dfa->eclosures
1045 + dest_idx);
1046 if (err != REG_NOERROR)
1047 return err;
1048 i = 0;
1049 }
1050 }
1051 }
1053 /* It must be the first time to invoke acquire_state. */
1054 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1055 /* We don't check ERR here, since the initial state must not be NULL. */
1056 if (BE (dfa->init_state == NULL, 0))
1057 return err;
1058 if (dfa->init_state->has_constraint)
1059 {
1060 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1061 CONTEXT_WORD);
1062 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1063 CONTEXT_NEWLINE);
1064 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1065 &init_nodes,
1066 CONTEXT_NEWLINE
1067 | CONTEXT_BEGBUF);
1068 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1069 || dfa->init_state_begbuf == NULL, 0))
1070 return err;
1071 }
1072 else
1073 dfa->init_state_word = dfa->init_state_nl
1074 = dfa->init_state_begbuf = dfa->init_state;
1076 re_node_set_free (&init_nodes);
1077 return REG_NOERROR;
1078 }
1079 \f
1080 #ifdef RE_ENABLE_I18N
1081 /* If it is possible to do searching in single byte encoding instead of UTF-8
1082 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1083 DFA nodes where needed. */
1085 static void
1086 optimize_utf8 (re_dfa_t *dfa)
1087 {
1088 int node, i, mb_chars = 0, has_period = 0;
1090 for (node = 0; node < dfa->nodes_len; ++node)
1091 switch (dfa->nodes[node].type)
1092 {
1093 case CHARACTER:
1094 if (dfa->nodes[node].opr.c >= 0x80)
1095 mb_chars = 1;
1096 break;
1097 case ANCHOR:
1098 switch (dfa->nodes[node].opr.ctx_type)
1099 {
1100 case LINE_FIRST:
1101 case LINE_LAST:
1102 case BUF_FIRST:
1103 case BUF_LAST:
1104 break;
1105 default:
1106 /* Word anchors etc. cannot be handled. It's okay to test
1107 opr.ctx_type since constraints (for all DFA nodes) are
1108 created by ORing one or more opr.ctx_type values. */
1109 return;
1110 }
1111 break;
1112 case OP_PERIOD:
1113 has_period = 1;
1114 break;
1115 case OP_BACK_REF:
1116 case OP_ALT:
1117 case END_OF_RE:
1118 case OP_DUP_ASTERISK:
1119 case OP_OPEN_SUBEXP:
1120 case OP_CLOSE_SUBEXP:
1121 break;
1122 case COMPLEX_BRACKET:
1123 return;
1124 case SIMPLE_BRACKET:
1125 /* Just double check. The non-ASCII range starts at 0x80. */
1126 assert (0x80 % BITSET_WORD_BITS == 0);
1127 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1128 if (dfa->nodes[node].opr.sbcset[i])
1129 return;
1130 break;
1131 default:
1132 abort ();
1133 }
1135 if (mb_chars || has_period)
1136 for (node = 0; node < dfa->nodes_len; ++node)
1137 {
1138 if (dfa->nodes[node].type == CHARACTER
1139 && dfa->nodes[node].opr.c >= 0x80)
1140 dfa->nodes[node].mb_partial = 0;
1141 else if (dfa->nodes[node].type == OP_PERIOD)
1142 dfa->nodes[node].type = OP_UTF8_PERIOD;
1143 }
1145 /* The search can be in single byte locale. */
1146 dfa->mb_cur_max = 1;
1147 dfa->is_utf8 = 0;
1148 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1149 }
1150 #endif
1151 \f
1152 /* Analyze the structure tree, and calculate "first", "next", "edest",
1153 "eclosure", and "inveclosure". */
1155 static reg_errcode_t
1156 analyze (regex_t *preg)
1157 {
1158 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1159 reg_errcode_t ret;
1161 /* Allocate arrays. */
1162 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1163 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1164 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1165 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1166 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1167 || dfa->eclosures == NULL, 0))
1168 return REG_ESPACE;
1170 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1171 if (dfa->subexp_map != NULL)
1172 {
1173 int i;
1174 for (i = 0; i < preg->re_nsub; i++)
1175 dfa->subexp_map[i] = i;
1176 preorder (dfa->str_tree, optimize_subexps, dfa);
1177 for (i = 0; i < preg->re_nsub; i++)
1178 if (dfa->subexp_map[i] != i)
1179 break;
1180 if (i == preg->re_nsub)
1181 {
1182 free (dfa->subexp_map);
1183 dfa->subexp_map = NULL;
1184 }
1185 }
1187 ret = postorder (dfa->str_tree, lower_subexps, preg);
1188 if (BE (ret != REG_NOERROR, 0))
1189 return ret;
1190 ret = postorder (dfa->str_tree, calc_first, dfa);
1191 if (BE (ret != REG_NOERROR, 0))
1192 return ret;
1193 preorder (dfa->str_tree, calc_next, dfa);
1194 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1195 if (BE (ret != REG_NOERROR, 0))
1196 return ret;
1197 ret = calc_eclosure (dfa);
1198 if (BE (ret != REG_NOERROR, 0))
1199 return ret;
1201 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1202 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1203 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1204 || dfa->nbackref)
1205 {
1206 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1207 if (BE (dfa->inveclosures == NULL, 0))
1208 return REG_ESPACE;
1209 ret = calc_inveclosure (dfa);
1210 }
1212 return ret;
1213 }
1215 /* Our parse trees are very unbalanced, so we cannot use a stack to
1216 implement parse tree visits. Instead, we use parent pointers and
1217 some hairy code in these two functions. */
1218 static reg_errcode_t
1219 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1220 void *extra)
1221 {
1222 bin_tree_t *node, *prev;
1224 for (node = root; ; )
1225 {
1226 /* Descend down the tree, preferably to the left (or to the right
1227 if that's the only child). */
1228 while (node->left || node->right)
1229 if (node->left)
1230 node = node->left;
1231 else
1232 node = node->right;
1234 do
1235 {
1236 reg_errcode_t err = fn (extra, node);
1237 if (BE (err != REG_NOERROR, 0))
1238 return err;
1239 if (node->parent == NULL)
1240 return REG_NOERROR;
1241 prev = node;
1242 node = node->parent;
1243 }
1244 /* Go up while we have a node that is reached from the right. */
1245 while (node->right == prev || node->right == NULL);
1246 node = node->right;
1247 }
1248 }
1250 static reg_errcode_t
1251 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1252 void *extra)
1253 {
1254 bin_tree_t *node;
1256 for (node = root; ; )
1257 {
1258 reg_errcode_t err = fn (extra, node);
1259 if (BE (err != REG_NOERROR, 0))
1260 return err;
1262 /* Go to the left node, or up and to the right. */
1263 if (node->left)
1264 node = node->left;
1265 else
1266 {
1267 bin_tree_t *prev = NULL;
1268 while (node->right == prev || node->right == NULL)
1269 {
1270 prev = node;
1271 node = node->parent;
1272 if (!node)
1273 return REG_NOERROR;
1274 }
1275 node = node->right;
1276 }
1277 }
1278 }
1280 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1281 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1282 backreferences as well. Requires a preorder visit. */
1283 static reg_errcode_t
1284 optimize_subexps (void *extra, bin_tree_t *node)
1285 {
1286 re_dfa_t *dfa = (re_dfa_t *) extra;
1288 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1289 {
1290 int idx = node->token.opr.idx;
1291 node->token.opr.idx = dfa->subexp_map[idx];
1292 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1293 }
1295 else if (node->token.type == SUBEXP
1296 && node->left && node->left->token.type == SUBEXP)
1297 {
1298 int other_idx = node->left->token.opr.idx;
1300 node->left = node->left->left;
1301 if (node->left)
1302 node->left->parent = node;
1304 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1305 if (other_idx < BITSET_WORD_BITS)
1306 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1307 }
1309 return REG_NOERROR;
1310 }
1312 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1313 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1314 static reg_errcode_t
1315 lower_subexps (void *extra, bin_tree_t *node)
1316 {
1317 regex_t *preg = (regex_t *) extra;
1318 reg_errcode_t err = REG_NOERROR;
1320 if (node->left && node->left->token.type == SUBEXP)
1321 {
1322 node->left = lower_subexp (&err, preg, node->left);
1323 if (node->left)
1324 node->left->parent = node;
1325 }
1326 if (node->right && node->right->token.type == SUBEXP)
1327 {
1328 node->right = lower_subexp (&err, preg, node->right);
1329 if (node->right)
1330 node->right->parent = node;
1331 }
1333 return err;
1334 }
1336 static bin_tree_t *
1337 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1338 {
1339 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1340 bin_tree_t *body = node->left;
1341 bin_tree_t *op, *cls, *tree1, *tree;
1343 if (preg->no_sub
1344 /* We do not optimize empty subexpressions, because otherwise we may
1345 have bad CONCAT nodes with NULL children. This is obviously not
1346 very common, so we do not lose much. An example that triggers
1347 this case is the sed "script" /\(\)/x. */
1348 && node->left != NULL
1349 && (node->token.opr.idx >= BITSET_WORD_BITS
1350 || !(dfa->used_bkref_map
1351 & ((bitset_word_t) 1 << node->token.opr.idx))))
1352 return node->left;
1354 /* Convert the SUBEXP node to the concatenation of an
1355 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1356 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1357 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1358 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1359 tree = create_tree (dfa, op, tree1, CONCAT);
1360 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1361 {
1362 *err = REG_ESPACE;
1363 return NULL;
1364 }
1366 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1367 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1368 return tree;
1369 }
1371 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1372 nodes. Requires a postorder visit. */
1373 static reg_errcode_t
1374 calc_first (void *extra, bin_tree_t *node)
1375 {
1376 re_dfa_t *dfa = (re_dfa_t *) extra;
1377 if (node->token.type == CONCAT)
1378 {
1379 node->first = node->left->first;
1380 node->node_idx = node->left->node_idx;
1381 }
1382 else
1383 {
1384 node->first = node;
1385 node->node_idx = re_dfa_add_node (dfa, node->token);
1386 if (BE (node->node_idx == -1, 0))
1387 return REG_ESPACE;
1388 if (node->token.type == ANCHOR)
1389 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1390 }
1391 return REG_NOERROR;
1392 }
1394 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1395 static reg_errcode_t
1396 calc_next (void *extra, bin_tree_t *node)
1397 {
1398 switch (node->token.type)
1399 {
1400 case OP_DUP_ASTERISK:
1401 node->left->next = node;
1402 break;
1403 case CONCAT:
1404 node->left->next = node->right->first;
1405 node->right->next = node->next;
1406 break;
1407 default:
1408 if (node->left)
1409 node->left->next = node->next;
1410 if (node->right)
1411 node->right->next = node->next;
1412 break;
1413 }
1414 return REG_NOERROR;
1415 }
1417 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1418 static reg_errcode_t
1419 link_nfa_nodes (void *extra, bin_tree_t *node)
1420 {
1421 re_dfa_t *dfa = (re_dfa_t *) extra;
1422 int idx = node->node_idx;
1423 reg_errcode_t err = REG_NOERROR;
1425 switch (node->token.type)
1426 {
1427 case CONCAT:
1428 break;
1430 case END_OF_RE:
1431 assert (node->next == NULL);
1432 break;
1434 case OP_DUP_ASTERISK:
1435 case OP_ALT:
1436 {
1437 int left, right;
1438 dfa->has_plural_match = 1;
1439 if (node->left != NULL)
1440 left = node->left->first->node_idx;
1441 else
1442 left = node->next->node_idx;
1443 if (node->right != NULL)
1444 right = node->right->first->node_idx;
1445 else
1446 right = node->next->node_idx;
1447 assert (left > -1);
1448 assert (right > -1);
1449 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1450 }
1451 break;
1453 case ANCHOR:
1454 case OP_OPEN_SUBEXP:
1455 case OP_CLOSE_SUBEXP:
1456 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1457 break;
1459 case OP_BACK_REF:
1460 dfa->nexts[idx] = node->next->node_idx;
1461 if (node->token.type == OP_BACK_REF)
1462 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1463 break;
1465 default:
1466 assert (!IS_EPSILON_NODE (node->token.type));
1467 dfa->nexts[idx] = node->next->node_idx;
1468 break;
1469 }
1471 return err;
1472 }
1474 /* Duplicate the epsilon closure of the node ROOT_NODE.
1475 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1476 to their own constraint. */
1478 static reg_errcode_t
1479 internal_function
1480 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1481 int root_node, unsigned int init_constraint)
1482 {
1483 int org_node, clone_node, ret;
1484 unsigned int constraint = init_constraint;
1485 for (org_node = top_org_node, clone_node = top_clone_node;;)
1486 {
1487 int org_dest, clone_dest;
1488 if (dfa->nodes[org_node].type == OP_BACK_REF)
1489 {
1490 /* If the back reference epsilon-transit, its destination must
1491 also have the constraint. Then duplicate the epsilon closure
1492 of the destination of the back reference, and store it in
1493 edests of the back reference. */
1494 org_dest = dfa->nexts[org_node];
1495 re_node_set_empty (dfa->edests + clone_node);
1496 clone_dest = duplicate_node (dfa, org_dest, constraint);
1497 if (BE (clone_dest == -1, 0))
1498 return REG_ESPACE;
1499 dfa->nexts[clone_node] = dfa->nexts[org_node];
1500 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1501 if (BE (ret < 0, 0))
1502 return REG_ESPACE;
1503 }
1504 else if (dfa->edests[org_node].nelem == 0)
1505 {
1506 /* In case of the node can't epsilon-transit, don't duplicate the
1507 destination and store the original destination as the
1508 destination of the node. */
1509 dfa->nexts[clone_node] = dfa->nexts[org_node];
1510 break;
1511 }
1512 else if (dfa->edests[org_node].nelem == 1)
1513 {
1514 /* In case of the node can epsilon-transit, and it has only one
1515 destination. */
1516 org_dest = dfa->edests[org_node].elems[0];
1517 re_node_set_empty (dfa->edests + clone_node);
1518 /* If the node is root_node itself, it means the epsilon clsoure
1519 has a loop. Then tie it to the destination of the root_node. */
1520 if (org_node == root_node && clone_node != org_node)
1521 {
1522 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1523 if (BE (ret < 0, 0))
1524 return REG_ESPACE;
1525 break;
1526 }
1527 /* In case of the node has another constraint, add it. */
1528 constraint |= dfa->nodes[org_node].constraint;
1529 clone_dest = duplicate_node (dfa, org_dest, constraint);
1530 if (BE (clone_dest == -1, 0))
1531 return REG_ESPACE;
1532 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1533 if (BE (ret < 0, 0))
1534 return REG_ESPACE;
1535 }
1536 else /* dfa->edests[org_node].nelem == 2 */
1537 {
1538 /* In case of the node can epsilon-transit, and it has two
1539 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1540 org_dest = dfa->edests[org_node].elems[0];
1541 re_node_set_empty (dfa->edests + clone_node);
1542 /* Search for a duplicated node which satisfies the constraint. */
1543 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1544 if (clone_dest == -1)
1545 {
1546 /* There is no such duplicated node, create a new one. */
1547 reg_errcode_t err;
1548 clone_dest = duplicate_node (dfa, org_dest, constraint);
1549 if (BE (clone_dest == -1, 0))
1550 return REG_ESPACE;
1551 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1552 if (BE (ret < 0, 0))
1553 return REG_ESPACE;
1554 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1555 root_node, constraint);
1556 if (BE (err != REG_NOERROR, 0))
1557 return err;
1558 }
1559 else
1560 {
1561 /* There is a duplicated node which satisfies the constraint,
1562 use it to avoid infinite loop. */
1563 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1564 if (BE (ret < 0, 0))
1565 return REG_ESPACE;
1566 }
1568 org_dest = dfa->edests[org_node].elems[1];
1569 clone_dest = duplicate_node (dfa, org_dest, constraint);
1570 if (BE (clone_dest == -1, 0))
1571 return REG_ESPACE;
1572 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1573 if (BE (ret < 0, 0))
1574 return REG_ESPACE;
1575 }
1576 org_node = org_dest;
1577 clone_node = clone_dest;
1578 }
1579 return REG_NOERROR;
1580 }
1582 /* Search for a node which is duplicated from the node ORG_NODE, and
1583 satisfies the constraint CONSTRAINT. */
1585 static int
1586 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1587 unsigned int constraint)
1588 {
1589 int idx;
1590 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1591 {
1592 if (org_node == dfa->org_indices[idx]
1593 && constraint == dfa->nodes[idx].constraint)
1594 return idx; /* Found. */
1595 }
1596 return -1; /* Not found. */
1597 }
1599 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1600 Return the index of the new node, or -1 if insufficient storage is
1601 available. */
1603 static int
1604 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1605 {
1606 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1607 if (BE (dup_idx != -1, 1))
1608 {
1609 dfa->nodes[dup_idx].constraint = constraint;
1610 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1611 dfa->nodes[dup_idx].duplicated = 1;
1613 /* Store the index of the original node. */
1614 dfa->org_indices[dup_idx] = org_idx;
1615 }
1616 return dup_idx;
1617 }
1619 static reg_errcode_t
1620 calc_inveclosure (re_dfa_t *dfa)
1621 {
1622 int src, idx, ret;
1623 for (idx = 0; idx < dfa->nodes_len; ++idx)
1624 re_node_set_init_empty (dfa->inveclosures + idx);
1626 for (src = 0; src < dfa->nodes_len; ++src)
1627 {
1628 int *elems = dfa->eclosures[src].elems;
1629 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1630 {
1631 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1632 if (BE (ret == -1, 0))
1633 return REG_ESPACE;
1634 }
1635 }
1637 return REG_NOERROR;
1638 }
1640 /* Calculate "eclosure" for all the node in DFA. */
1642 static reg_errcode_t
1643 calc_eclosure (re_dfa_t *dfa)
1644 {
1645 int node_idx, incomplete;
1646 #ifdef DEBUG
1647 assert (dfa->nodes_len > 0);
1648 #endif
1649 incomplete = 0;
1650 /* For each nodes, calculate epsilon closure. */
1651 for (node_idx = 0; ; ++node_idx)
1652 {
1653 reg_errcode_t err;
1654 re_node_set eclosure_elem;
1655 if (node_idx == dfa->nodes_len)
1656 {
1657 if (!incomplete)
1658 break;
1659 incomplete = 0;
1660 node_idx = 0;
1661 }
1663 #ifdef DEBUG
1664 assert (dfa->eclosures[node_idx].nelem != -1);
1665 #endif
1667 /* If we have already calculated, skip it. */
1668 if (dfa->eclosures[node_idx].nelem != 0)
1669 continue;
1670 /* Calculate epsilon closure of `node_idx'. */
1671 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1672 if (BE (err != REG_NOERROR, 0))
1673 return err;
1675 if (dfa->eclosures[node_idx].nelem == 0)
1676 {
1677 incomplete = 1;
1678 re_node_set_free (&eclosure_elem);
1679 }
1680 }
1681 return REG_NOERROR;
1682 }
1684 /* Calculate epsilon closure of NODE. */
1686 static reg_errcode_t
1687 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1688 {
1689 reg_errcode_t err;
1690 int i;
1691 re_node_set eclosure;
1692 int ret;
1693 int incomplete = 0;
1694 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1695 if (BE (err != REG_NOERROR, 0))
1696 return err;
1698 /* This indicates that we are calculating this node now.
1699 We reference this value to avoid infinite loop. */
1700 dfa->eclosures[node].nelem = -1;
1702 /* If the current node has constraints, duplicate all nodes
1703 since they must inherit the constraints. */
1704 if (dfa->nodes[node].constraint
1705 && dfa->edests[node].nelem
1706 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1707 {
1708 err = duplicate_node_closure (dfa, node, node, node,
1709 dfa->nodes[node].constraint);
1710 if (BE (err != REG_NOERROR, 0))
1711 return err;
1712 }
1714 /* Expand each epsilon destination nodes. */
1715 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1716 for (i = 0; i < dfa->edests[node].nelem; ++i)
1717 {
1718 re_node_set eclosure_elem;
1719 int edest = dfa->edests[node].elems[i];
1720 /* If calculating the epsilon closure of `edest' is in progress,
1721 return intermediate result. */
1722 if (dfa->eclosures[edest].nelem == -1)
1723 {
1724 incomplete = 1;
1725 continue;
1726 }
1727 /* If we haven't calculated the epsilon closure of `edest' yet,
1728 calculate now. Otherwise use calculated epsilon closure. */
1729 if (dfa->eclosures[edest].nelem == 0)
1730 {
1731 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1732 if (BE (err != REG_NOERROR, 0))
1733 return err;
1734 }
1735 else
1736 eclosure_elem = dfa->eclosures[edest];
1737 /* Merge the epsilon closure of `edest'. */
1738 err = re_node_set_merge (&eclosure, &eclosure_elem);
1739 if (BE (err != REG_NOERROR, 0))
1740 return err;
1741 /* If the epsilon closure of `edest' is incomplete,
1742 the epsilon closure of this node is also incomplete. */
1743 if (dfa->eclosures[edest].nelem == 0)
1744 {
1745 incomplete = 1;
1746 re_node_set_free (&eclosure_elem);
1747 }
1748 }
1750 /* An epsilon closure includes itself. */
1751 ret = re_node_set_insert (&eclosure, node);
1752 if (BE (ret < 0, 0))
1753 return REG_ESPACE;
1754 if (incomplete && !root)
1755 dfa->eclosures[node].nelem = 0;
1756 else
1757 dfa->eclosures[node] = eclosure;
1758 *new_set = eclosure;
1759 return REG_NOERROR;
1760 }
1761 \f
1762 /* Functions for token which are used in the parser. */
1764 /* Fetch a token from INPUT.
1765 We must not use this function inside bracket expressions. */
1767 static void
1768 internal_function
1769 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1770 {
1771 re_string_skip_bytes (input, peek_token (result, input, syntax));
1772 }
1774 /* Peek a token from INPUT, and return the length of the token.
1775 We must not use this function inside bracket expressions. */
1777 static int
1778 internal_function
1779 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1780 {
1781 unsigned char c;
1783 if (re_string_eoi (input))
1784 {
1785 token->type = END_OF_RE;
1786 return 0;
1787 }
1789 c = re_string_peek_byte (input, 0);
1790 token->opr.c = c;
1792 token->word_char = 0;
1793 #ifdef RE_ENABLE_I18N
1794 token->mb_partial = 0;
1795 if (input->mb_cur_max > 1 &&
1796 !re_string_first_byte (input, re_string_cur_idx (input)))
1797 {
1798 token->type = CHARACTER;
1799 token->mb_partial = 1;
1800 return 1;
1801 }
1802 #endif
1803 if (c == '\\')
1804 {
1805 unsigned char c2;
1806 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1807 {
1808 token->type = BACK_SLASH;
1809 return 1;
1810 }
1812 c2 = re_string_peek_byte_case (input, 1);
1813 token->opr.c = c2;
1814 token->type = CHARACTER;
1815 #ifdef RE_ENABLE_I18N
1816 if (input->mb_cur_max > 1)
1817 {
1818 wint_t wc = re_string_wchar_at (input,
1819 re_string_cur_idx (input) + 1);
1820 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1821 }
1822 else
1823 #endif
1824 token->word_char = IS_WORD_CHAR (c2) != 0;
1826 switch (c2)
1827 {
1828 case '|':
1829 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1830 token->type = OP_ALT;
1831 break;
1832 case '1': case '2': case '3': case '4': case '5':
1833 case '6': case '7': case '8': case '9':
1834 if (!(syntax & RE_NO_BK_REFS))
1835 {
1836 token->type = OP_BACK_REF;
1837 token->opr.idx = c2 - '1';
1838 }
1839 break;
1840 case '<':
1841 if (!(syntax & RE_NO_GNU_OPS))
1842 {
1843 token->type = ANCHOR;
1844 token->opr.ctx_type = WORD_FIRST;
1845 }
1846 break;
1847 case '>':
1848 if (!(syntax & RE_NO_GNU_OPS))
1849 {
1850 token->type = ANCHOR;
1851 token->opr.ctx_type = WORD_LAST;
1852 }
1853 break;
1854 case 'b':
1855 if (!(syntax & RE_NO_GNU_OPS))
1856 {
1857 token->type = ANCHOR;
1858 token->opr.ctx_type = WORD_DELIM;
1859 }
1860 break;
1861 case 'B':
1862 if (!(syntax & RE_NO_GNU_OPS))
1863 {
1864 token->type = ANCHOR;
1865 token->opr.ctx_type = NOT_WORD_DELIM;
1866 }
1867 break;
1868 case 'w':
1869 if (!(syntax & RE_NO_GNU_OPS))
1870 token->type = OP_WORD;
1871 break;
1872 case 'W':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = OP_NOTWORD;
1875 break;
1876 case 's':
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 token->type = OP_SPACE;
1879 break;
1880 case 'S':
1881 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = OP_NOTSPACE;
1883 break;
1884 case '`':
1885 if (!(syntax & RE_NO_GNU_OPS))
1886 {
1887 token->type = ANCHOR;
1888 token->opr.ctx_type = BUF_FIRST;
1889 }
1890 break;
1891 case '\'':
1892 if (!(syntax & RE_NO_GNU_OPS))
1893 {
1894 token->type = ANCHOR;
1895 token->opr.ctx_type = BUF_LAST;
1896 }
1897 break;
1898 case '(':
1899 if (!(syntax & RE_NO_BK_PARENS))
1900 token->type = OP_OPEN_SUBEXP;
1901 break;
1902 case ')':
1903 if (!(syntax & RE_NO_BK_PARENS))
1904 token->type = OP_CLOSE_SUBEXP;
1905 break;
1906 case '+':
1907 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1908 token->type = OP_DUP_PLUS;
1909 break;
1910 case '?':
1911 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1912 token->type = OP_DUP_QUESTION;
1913 break;
1914 case '{':
1915 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1916 token->type = OP_OPEN_DUP_NUM;
1917 break;
1918 case '}':
1919 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1920 token->type = OP_CLOSE_DUP_NUM;
1921 break;
1922 default:
1923 break;
1924 }
1925 return 2;
1926 }
1928 token->type = CHARACTER;
1929 #ifdef RE_ENABLE_I18N
1930 if (input->mb_cur_max > 1)
1931 {
1932 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1933 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1934 }
1935 else
1936 #endif
1937 token->word_char = IS_WORD_CHAR (token->opr.c);
1939 switch (c)
1940 {
1941 case '\n':
1942 if (syntax & RE_NEWLINE_ALT)
1943 token->type = OP_ALT;
1944 break;
1945 case '|':
1946 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1947 token->type = OP_ALT;
1948 break;
1949 case '*':
1950 token->type = OP_DUP_ASTERISK;
1951 break;
1952 case '+':
1953 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1954 token->type = OP_DUP_PLUS;
1955 break;
1956 case '?':
1957 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1958 token->type = OP_DUP_QUESTION;
1959 break;
1960 case '{':
1961 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1962 token->type = OP_OPEN_DUP_NUM;
1963 break;
1964 case '}':
1965 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1966 token->type = OP_CLOSE_DUP_NUM;
1967 break;
1968 case '(':
1969 if (syntax & RE_NO_BK_PARENS)
1970 token->type = OP_OPEN_SUBEXP;
1971 break;
1972 case ')':
1973 if (syntax & RE_NO_BK_PARENS)
1974 token->type = OP_CLOSE_SUBEXP;
1975 break;
1976 case '[':
1977 token->type = OP_OPEN_BRACKET;
1978 break;
1979 case '.':
1980 token->type = OP_PERIOD;
1981 break;
1982 case '^':
1983 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1984 re_string_cur_idx (input) != 0)
1985 {
1986 char prev = re_string_peek_byte (input, -1);
1987 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1988 break;
1989 }
1990 token->type = ANCHOR;
1991 token->opr.ctx_type = LINE_FIRST;
1992 break;
1993 case '$':
1994 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1995 re_string_cur_idx (input) + 1 != re_string_length (input))
1996 {
1997 re_token_t next;
1998 re_string_skip_bytes (input, 1);
1999 peek_token (&next, input, syntax);
2000 re_string_skip_bytes (input, -1);
2001 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2002 break;
2003 }
2004 token->type = ANCHOR;
2005 token->opr.ctx_type = LINE_LAST;
2006 break;
2007 default:
2008 break;
2009 }
2010 return 1;
2011 }
2013 /* Peek a token from INPUT, and return the length of the token.
2014 We must not use this function out of bracket expressions. */
2016 static int
2017 internal_function
2018 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2019 {
2020 unsigned char c;
2021 if (re_string_eoi (input))
2022 {
2023 token->type = END_OF_RE;
2024 return 0;
2025 }
2026 c = re_string_peek_byte (input, 0);
2027 token->opr.c = c;
2029 #ifdef RE_ENABLE_I18N
2030 if (input->mb_cur_max > 1 &&
2031 !re_string_first_byte (input, re_string_cur_idx (input)))
2032 {
2033 token->type = CHARACTER;
2034 return 1;
2035 }
2036 #endif /* RE_ENABLE_I18N */
2038 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2039 && re_string_cur_idx (input) + 1 < re_string_length (input))
2040 {
2041 /* In this case, '\' escape a character. */
2042 unsigned char c2;
2043 re_string_skip_bytes (input, 1);
2044 c2 = re_string_peek_byte (input, 0);
2045 token->opr.c = c2;
2046 token->type = CHARACTER;
2047 return 1;
2048 }
2049 if (c == '[') /* '[' is a special char in a bracket exps. */
2050 {
2051 unsigned char c2;
2052 int token_len;
2053 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2054 c2 = re_string_peek_byte (input, 1);
2055 else
2056 c2 = 0;
2057 token->opr.c = c2;
2058 token_len = 2;
2059 switch (c2)
2060 {
2061 case '.':
2062 token->type = OP_OPEN_COLL_ELEM;
2063 break;
2064 case '=':
2065 token->type = OP_OPEN_EQUIV_CLASS;
2066 break;
2067 case ':':
2068 if (syntax & RE_CHAR_CLASSES)
2069 {
2070 token->type = OP_OPEN_CHAR_CLASS;
2071 break;
2072 }
2073 /* else fall through. */
2074 default:
2075 token->type = CHARACTER;
2076 token->opr.c = c;
2077 token_len = 1;
2078 break;
2079 }
2080 return token_len;
2081 }
2082 switch (c)
2083 {
2084 case '-':
2085 token->type = OP_CHARSET_RANGE;
2086 break;
2087 case ']':
2088 token->type = OP_CLOSE_BRACKET;
2089 break;
2090 case '^':
2091 token->type = OP_NON_MATCH_LIST;
2092 break;
2093 default:
2094 token->type = CHARACTER;
2095 }
2096 return 1;
2097 }
2098 \f
2099 /* Functions for parser. */
2101 /* Entry point of the parser.
2102 Parse the regular expression REGEXP and return the structure tree.
2103 If an error is occured, ERR is set by error code, and return NULL.
2104 This function build the following tree, from regular expression <reg_exp>:
2105 CAT
2106 / \
2107 / \
2108 <reg_exp> EOR
2110 CAT means concatenation.
2111 EOR means end of regular expression. */
2113 static bin_tree_t *
2114 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2115 reg_errcode_t *err)
2116 {
2117 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2118 bin_tree_t *tree, *eor, *root;
2119 re_token_t current_token;
2120 dfa->syntax = syntax;
2121 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2122 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2123 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2124 return NULL;
2125 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2126 if (tree != NULL)
2127 root = create_tree (dfa, tree, eor, CONCAT);
2128 else
2129 root = eor;
2130 if (BE (eor == NULL || root == NULL, 0))
2131 {
2132 *err = REG_ESPACE;
2133 return NULL;
2134 }
2135 return root;
2136 }
2138 /* This function build the following tree, from regular expression
2139 <branch1>|<branch2>:
2140 ALT
2141 / \
2142 / \
2143 <branch1> <branch2>
2145 ALT means alternative, which represents the operator `|'. */
2147 static bin_tree_t *
2148 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2149 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2150 {
2151 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2152 bin_tree_t *tree, *branch = NULL;
2153 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2154 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2155 return NULL;
2157 while (token->type == OP_ALT)
2158 {
2159 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2160 if (token->type != OP_ALT && token->type != END_OF_RE
2161 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2162 {
2163 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2164 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2165 return NULL;
2166 }
2167 else
2168 branch = NULL;
2169 tree = create_tree (dfa, tree, branch, OP_ALT);
2170 if (BE (tree == NULL, 0))
2171 {
2172 *err = REG_ESPACE;
2173 return NULL;
2174 }
2175 }
2176 return tree;
2177 }
2179 /* This function build the following tree, from regular expression
2180 <exp1><exp2>:
2181 CAT
2182 / \
2183 / \
2184 <exp1> <exp2>
2186 CAT means concatenation. */
2188 static bin_tree_t *
2189 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2190 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2191 {
2192 bin_tree_t *tree, *exp;
2193 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2194 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2195 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2196 return NULL;
2198 while (token->type != OP_ALT && token->type != END_OF_RE
2199 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2200 {
2201 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2202 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2203 {
2204 return NULL;
2205 }
2206 if (tree != NULL && exp != NULL)
2207 {
2208 tree = create_tree (dfa, tree, exp, CONCAT);
2209 if (tree == NULL)
2210 {
2211 *err = REG_ESPACE;
2212 return NULL;
2213 }
2214 }
2215 else if (tree == NULL)
2216 tree = exp;
2217 /* Otherwise exp == NULL, we don't need to create new tree. */
2218 }
2219 return tree;
2220 }
2222 /* This function build the following tree, from regular expression a*:
2223 *
2224 |
2225 a
2226 */
2228 static bin_tree_t *
2229 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2230 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2231 {
2232 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2233 bin_tree_t *tree;
2234 switch (token->type)
2235 {
2236 case CHARACTER:
2237 tree = create_token_tree (dfa, NULL, NULL, token);
2238 if (BE (tree == NULL, 0))
2239 {
2240 *err = REG_ESPACE;
2241 return NULL;
2242 }
2243 #ifdef RE_ENABLE_I18N
2244 if (dfa->mb_cur_max > 1)
2245 {
2246 while (!re_string_eoi (regexp)
2247 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2248 {
2249 bin_tree_t *mbc_remain;
2250 fetch_token (token, regexp, syntax);
2251 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2252 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2253 if (BE (mbc_remain == NULL || tree == NULL, 0))
2254 {
2255 *err = REG_ESPACE;
2256 return NULL;
2257 }
2258 }
2259 }
2260 #endif
2261 break;
2262 case OP_OPEN_SUBEXP:
2263 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2264 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265 return NULL;
2266 break;
2267 case OP_OPEN_BRACKET:
2268 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2269 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2270 return NULL;
2271 break;
2272 case OP_BACK_REF:
2273 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2274 {
2275 *err = REG_ESUBREG;
2276 return NULL;
2277 }
2278 dfa->used_bkref_map |= 1 << token->opr.idx;
2279 tree = create_token_tree (dfa, NULL, NULL, token);
2280 if (BE (tree == NULL, 0))
2281 {
2282 *err = REG_ESPACE;
2283 return NULL;
2284 }
2285 ++dfa->nbackref;
2286 dfa->has_mb_node = 1;
2287 break;
2288 case OP_OPEN_DUP_NUM:
2289 if (syntax & RE_CONTEXT_INVALID_DUP)
2290 {
2291 *err = REG_BADRPT;
2292 return NULL;
2293 }
2294 /* FALLTHROUGH */
2295 case OP_DUP_ASTERISK:
2296 case OP_DUP_PLUS:
2297 case OP_DUP_QUESTION:
2298 if (syntax & RE_CONTEXT_INVALID_OPS)
2299 {
2300 *err = REG_BADRPT;
2301 return NULL;
2302 }
2303 else if (syntax & RE_CONTEXT_INDEP_OPS)
2304 {
2305 fetch_token (token, regexp, syntax);
2306 return parse_expression (regexp, preg, token, syntax, nest, err);
2307 }
2308 /* else fall through */
2309 case OP_CLOSE_SUBEXP:
2310 if ((token->type == OP_CLOSE_SUBEXP) &&
2311 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2312 {
2313 *err = REG_ERPAREN;
2314 return NULL;
2315 }
2316 /* else fall through */
2317 case OP_CLOSE_DUP_NUM:
2318 /* We treat it as a normal character. */
2320 /* Then we can these characters as normal characters. */
2321 token->type = CHARACTER;
2322 /* mb_partial and word_char bits should be initialized already
2323 by peek_token. */
2324 tree = create_token_tree (dfa, NULL, NULL, token);
2325 if (BE (tree == NULL, 0))
2326 {
2327 *err = REG_ESPACE;
2328 return NULL;
2329 }
2330 break;
2331 case ANCHOR:
2332 if ((token->opr.ctx_type
2333 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2334 && dfa->word_ops_used == 0)
2335 init_word_char (dfa);
2336 if (token->opr.ctx_type == WORD_DELIM
2337 || token->opr.ctx_type == NOT_WORD_DELIM)
2338 {
2339 bin_tree_t *tree_first, *tree_last;
2340 if (token->opr.ctx_type == WORD_DELIM)
2341 {
2342 token->opr.ctx_type = WORD_FIRST;
2343 tree_first = create_token_tree (dfa, NULL, NULL, token);
2344 token->opr.ctx_type = WORD_LAST;
2345 }
2346 else
2347 {
2348 token->opr.ctx_type = INSIDE_WORD;
2349 tree_first = create_token_tree (dfa, NULL, NULL, token);
2350 token->opr.ctx_type = INSIDE_NOTWORD;
2351 }
2352 tree_last = create_token_tree (dfa, NULL, NULL, token);
2353 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2354 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2355 {
2356 *err = REG_ESPACE;
2357 return NULL;
2358 }
2359 }
2360 else
2361 {
2362 tree = create_token_tree (dfa, NULL, NULL, token);
2363 if (BE (tree == NULL, 0))
2364 {
2365 *err = REG_ESPACE;
2366 return NULL;
2367 }
2368 }
2369 /* We must return here, since ANCHORs can't be followed
2370 by repetition operators.
2371 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2372 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2373 fetch_token (token, regexp, syntax);
2374 return tree;
2375 case OP_PERIOD:
2376 tree = create_token_tree (dfa, NULL, NULL, token);
2377 if (BE (tree == NULL, 0))
2378 {
2379 *err = REG_ESPACE;
2380 return NULL;
2381 }
2382 if (dfa->mb_cur_max > 1)
2383 dfa->has_mb_node = 1;
2384 break;
2385 case OP_WORD:
2386 case OP_NOTWORD:
2387 tree = build_charclass_op (dfa, regexp->trans,
2388 "alnum",
2389 "_",
2390 token->type == OP_NOTWORD, err);
2391 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2392 return NULL;
2393 break;
2394 case OP_SPACE:
2395 case OP_NOTSPACE:
2396 tree = build_charclass_op (dfa, regexp->trans,
2397 "space",
2398 "",
2399 token->type == OP_NOTSPACE, err);
2400 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2401 return NULL;
2402 break;
2403 case OP_ALT:
2404 case END_OF_RE:
2405 return NULL;
2406 case BACK_SLASH:
2407 *err = REG_EESCAPE;
2408 return NULL;
2409 default:
2410 /* Must not happen? */
2411 #ifdef DEBUG
2412 assert (0);
2413 #endif
2414 return NULL;
2415 }
2416 fetch_token (token, regexp, syntax);
2418 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2419 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2420 {
2421 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2422 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2423 return NULL;
2424 /* In BRE consecutive duplications are not allowed. */
2425 if ((syntax & RE_CONTEXT_INVALID_DUP)
2426 && (token->type == OP_DUP_ASTERISK
2427 || token->type == OP_OPEN_DUP_NUM))
2428 {
2429 *err = REG_BADRPT;
2430 return NULL;
2431 }
2432 }
2434 return tree;
2435 }
2437 /* This function build the following tree, from regular expression
2438 (<reg_exp>):
2439 SUBEXP
2440 |
2441 <reg_exp>
2442 */
2444 static bin_tree_t *
2445 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2446 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2447 {
2448 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2449 bin_tree_t *tree;
2450 size_t cur_nsub;
2451 cur_nsub = preg->re_nsub++;
2453 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2455 /* The subexpression may be a null string. */
2456 if (token->type == OP_CLOSE_SUBEXP)
2457 tree = NULL;
2458 else
2459 {
2460 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2461 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2462 *err = REG_EPAREN;
2463 if (BE (*err != REG_NOERROR, 0))
2464 return NULL;
2465 }
2467 if (cur_nsub <= '9' - '1')
2468 dfa->completed_bkref_map |= 1 << cur_nsub;
2470 tree = create_tree (dfa, tree, NULL, SUBEXP);
2471 if (BE (tree == NULL, 0))
2472 {
2473 *err = REG_ESPACE;
2474 return NULL;
2475 }
2476 tree->token.opr.idx = cur_nsub;
2477 return tree;
2478 }
2480 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2482 static bin_tree_t *
2483 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2484 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2485 {
2486 bin_tree_t *tree = NULL, *old_tree = NULL;
2487 int i, start, end, start_idx = re_string_cur_idx (regexp);
2488 #ifndef RE_TOKEN_INIT_BUG
2489 re_token_t start_token = *token;
2490 #else
2491 re_token_t start_token;
2493 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2494 #endif
2496 if (token->type == OP_OPEN_DUP_NUM)
2497 {
2498 end = 0;
2499 start = fetch_number (regexp, token, syntax);
2500 if (start == -1)
2501 {
2502 if (token->type == CHARACTER && token->opr.c == ',')
2503 start = 0; /* We treat "{,m}" as "{0,m}". */
2504 else
2505 {
2506 *err = REG_BADBR; /* <re>{} is invalid. */
2507 return NULL;
2508 }
2509 }
2510 if (BE (start != -2, 1))
2511 {
2512 /* We treat "{n}" as "{n,n}". */
2513 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2514 : ((token->type == CHARACTER && token->opr.c == ',')
2515 ? fetch_number (regexp, token, syntax) : -2));
2516 }
2517 if (BE (start == -2 || end == -2, 0))
2518 {
2519 /* Invalid sequence. */
2520 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2521 {
2522 if (token->type == END_OF_RE)
2523 *err = REG_EBRACE;
2524 else
2525 *err = REG_BADBR;
2527 return NULL;
2528 }
2530 /* If the syntax bit is set, rollback. */
2531 re_string_set_index (regexp, start_idx);
2532 *token = start_token;
2533 token->type = CHARACTER;
2534 /* mb_partial and word_char bits should be already initialized by
2535 peek_token. */
2536 return elem;
2537 }
2539 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2540 {
2541 /* First number greater than second. */
2542 *err = REG_BADBR;
2543 return NULL;
2544 }
2545 }
2546 else
2547 {
2548 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2549 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2550 }
2552 fetch_token (token, regexp, syntax);
2554 if (BE (elem == NULL, 0))
2555 return NULL;
2556 if (BE (start == 0 && end == 0, 0))
2557 {
2558 postorder (elem, free_tree, NULL);
2559 return NULL;
2560 }
2562 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2563 if (BE (start > 0, 0))
2564 {
2565 tree = elem;
2566 for (i = 2; i <= start; ++i)
2567 {
2568 elem = duplicate_tree (elem, dfa);
2569 tree = create_tree (dfa, tree, elem, CONCAT);
2570 if (BE (elem == NULL || tree == NULL, 0))
2571 goto parse_dup_op_espace;
2572 }
2574 if (start == end)
2575 return tree;
2577 /* Duplicate ELEM before it is marked optional. */
2578 elem = duplicate_tree (elem, dfa);
2579 old_tree = tree;
2580 }
2581 else
2582 old_tree = NULL;
2584 if (elem->token.type == SUBEXP)
2585 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2587 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2588 if (BE (tree == NULL, 0))
2589 goto parse_dup_op_espace;
2591 /* This loop is actually executed only when end != -1,
2592 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2593 already created the start+1-th copy. */
2594 for (i = start + 2; i <= end; ++i)
2595 {
2596 elem = duplicate_tree (elem, dfa);
2597 tree = create_tree (dfa, tree, elem, CONCAT);
2598 if (BE (elem == NULL || tree == NULL, 0))
2599 goto parse_dup_op_espace;
2601 tree = create_tree (dfa, tree, NULL, OP_ALT);
2602 if (BE (tree == NULL, 0))
2603 goto parse_dup_op_espace;
2604 }
2606 if (old_tree)
2607 tree = create_tree (dfa, old_tree, tree, CONCAT);
2609 return tree;
2611 parse_dup_op_espace:
2612 *err = REG_ESPACE;
2613 return NULL;
2614 }
2616 /* Size of the names for collating symbol/equivalence_class/character_class.
2617 I'm not sure, but maybe enough. */
2618 #define BRACKET_NAME_BUF_SIZE 32
2620 #ifndef _LIBC
2621 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2622 Build the range expression which starts from START_ELEM, and ends
2623 at END_ELEM. The result are written to MBCSET and SBCSET.
2624 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2625 mbcset->range_ends, is a pointer argument sinse we may
2626 update it. */
2628 static reg_errcode_t
2629 internal_function
2630 # ifdef RE_ENABLE_I18N
2631 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2632 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2633 # else /* not RE_ENABLE_I18N */
2634 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2635 bracket_elem_t *end_elem)
2636 # endif /* not RE_ENABLE_I18N */
2637 {
2638 unsigned int start_ch, end_ch;
2639 /* Equivalence Classes and Character Classes can't be a range start/end. */
2640 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2641 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2642 0))
2643 return REG_ERANGE;
2645 /* We can handle no multi character collating elements without libc
2646 support. */
2647 if (BE ((start_elem->type == COLL_SYM
2648 && strlen ((char *) start_elem->opr.name) > 1)
2649 || (end_elem->type == COLL_SYM
2650 && strlen ((char *) end_elem->opr.name) > 1), 0))
2651 return REG_ECOLLATE;
2653 # ifdef RE_ENABLE_I18N
2654 {
2655 wchar_t wc;
2656 wint_t start_wc;
2657 wint_t end_wc;
2658 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2660 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2661 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2662 : 0));
2663 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2664 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2665 : 0));
2666 #ifdef GAWK
2667 /*
2668 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2669 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2670 * unsigned, so we don't have sign extension problems.
2671 */
2672 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2673 ? start_ch : start_elem->opr.wch);
2674 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2675 ? end_ch : end_elem->opr.wch);
2676 #else
2677 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2678 ? __btowc (start_ch) : start_elem->opr.wch);
2679 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2680 ? __btowc (end_ch) : end_elem->opr.wch);
2681 #endif
2682 if (start_wc == WEOF || end_wc == WEOF)
2683 return REG_ECOLLATE;
2684 cmp_buf[0] = start_wc;
2685 cmp_buf[4] = end_wc;
2686 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2687 return REG_ERANGE;
2689 /* Got valid collation sequence values, add them as a new entry.
2690 However, for !_LIBC we have no collation elements: if the
2691 character set is single byte, the single byte character set
2692 that we build below suffices. parse_bracket_exp passes
2693 no MBCSET if dfa->mb_cur_max == 1. */
2694 if (mbcset)
2695 {
2696 /* Check the space of the arrays. */
2697 if (BE (*range_alloc == mbcset->nranges, 0))
2698 {
2699 /* There is not enough space, need realloc. */
2700 wchar_t *new_array_start, *new_array_end;
2701 int new_nranges;
2703 /* +1 in case of mbcset->nranges is 0. */
2704 new_nranges = 2 * mbcset->nranges + 1;
2705 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2706 are NULL if *range_alloc == 0. */
2707 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2708 new_nranges);
2709 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2710 new_nranges);
2712 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2713 return REG_ESPACE;
2715 mbcset->range_starts = new_array_start;
2716 mbcset->range_ends = new_array_end;
2717 *range_alloc = new_nranges;
2718 }
2720 mbcset->range_starts[mbcset->nranges] = start_wc;
2721 mbcset->range_ends[mbcset->nranges++] = end_wc;
2722 }
2724 /* Build the table for single byte characters. */
2725 for (wc = 0; wc < SBC_MAX; ++wc)
2726 {
2727 cmp_buf[2] = wc;
2728 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2729 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2730 bitset_set (sbcset, wc);
2731 }
2732 }
2733 # else /* not RE_ENABLE_I18N */
2734 {
2735 unsigned int ch;
2736 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2737 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2738 : 0));
2739 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2740 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2741 : 0));
2742 if (start_ch > end_ch)
2743 return REG_ERANGE;
2744 /* Build the table for single byte characters. */
2745 for (ch = 0; ch < SBC_MAX; ++ch)
2746 if (start_ch <= ch && ch <= end_ch)
2747 bitset_set (sbcset, ch);
2748 }
2749 # endif /* not RE_ENABLE_I18N */
2750 return REG_NOERROR;
2751 }
2752 #endif /* not _LIBC */
2754 #ifndef _LIBC
2755 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2756 Build the collating element which is represented by NAME.
2757 The result are written to MBCSET and SBCSET.
2758 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2759 pointer argument since we may update it. */
2761 static reg_errcode_t
2762 internal_function
2763 # ifdef RE_ENABLE_I18N
2764 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2765 int *coll_sym_alloc, const unsigned char *name)
2766 # else /* not RE_ENABLE_I18N */
2767 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2768 # endif /* not RE_ENABLE_I18N */
2769 {
2770 size_t name_len = strlen ((const char *) name);
2771 if (BE (name_len != 1, 0))
2772 return REG_ECOLLATE;
2773 else
2774 {
2775 bitset_set (sbcset, name[0]);
2776 return REG_NOERROR;
2777 }
2778 }
2779 #endif /* not _LIBC */
2781 /* This function parse bracket expression like "[abc]", "[a-c]",
2782 "[[.a-a.]]" etc. */
2784 static bin_tree_t *
2785 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2786 reg_syntax_t syntax, reg_errcode_t *err)
2787 {
2788 #ifdef _LIBC
2789 const unsigned char *collseqmb;
2790 const char *collseqwc;
2791 uint32_t nrules;
2792 int32_t table_size;
2793 const int32_t *symb_table;
2794 const unsigned char *extra;
2796 /* Local function for parse_bracket_exp used in _LIBC environement.
2797 Seek the collating symbol entry correspondings to NAME.
2798 Return the index of the symbol in the SYMB_TABLE. */
2800 auto inline int32_t
2801 __attribute ((always_inline))
2802 seek_collating_symbol_entry (name, name_len)
2803 const unsigned char *name;
2804 size_t name_len;
2805 {
2806 int32_t hash = elem_hash ((const char *) name, name_len);
2807 int32_t elem = hash % table_size;
2808 if (symb_table[2 * elem] != 0)
2809 {
2810 int32_t second = hash % (table_size - 2) + 1;
2812 do
2813 {
2814 /* First compare the hashing value. */
2815 if (symb_table[2 * elem] == hash
2816 /* Compare the length of the name. */
2817 && name_len == extra[symb_table[2 * elem + 1]]
2818 /* Compare the name. */
2819 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2820 name_len) == 0)
2821 {
2822 /* Yep, this is the entry. */
2823 break;
2824 }
2826 /* Next entry. */
2827 elem += second;
2828 }
2829 while (symb_table[2 * elem] != 0);
2830 }
2831 return elem;
2832 }
2834 /* Local function for parse_bracket_exp used in _LIBC environment.
2835 Look up the collation sequence value of BR_ELEM.
2836 Return the value if succeeded, UINT_MAX otherwise. */
2838 auto inline unsigned int
2839 __attribute ((always_inline))
2840 lookup_collation_sequence_value (br_elem)
2841 bracket_elem_t *br_elem;
2842 {
2843 if (br_elem->type == SB_CHAR)
2844 {
2845 /*
2846 if (MB_CUR_MAX == 1)
2847 */
2848 if (nrules == 0)
2849 return collseqmb[br_elem->opr.ch];
2850 else
2851 {
2852 wint_t wc = __btowc (br_elem->opr.ch);
2853 return __collseq_table_lookup (collseqwc, wc);
2854 }
2855 }
2856 else if (br_elem->type == MB_CHAR)
2857 {
2858 if (nrules != 0)
2859 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2860 }
2861 else if (br_elem->type == COLL_SYM)
2862 {
2863 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2864 if (nrules != 0)
2865 {
2866 int32_t elem, idx;
2867 elem = seek_collating_symbol_entry (br_elem->opr.name,
2868 sym_name_len);
2869 if (symb_table[2 * elem] != 0)
2870 {
2871 /* We found the entry. */
2872 idx = symb_table[2 * elem + 1];
2873 /* Skip the name of collating element name. */
2874 idx += 1 + extra[idx];
2875 /* Skip the byte sequence of the collating element. */
2876 idx += 1 + extra[idx];
2877 /* Adjust for the alignment. */
2878 idx = (idx + 3) & ~3;
2879 /* Skip the multibyte collation sequence value. */
2880 idx += sizeof (unsigned int);
2881 /* Skip the wide char sequence of the collating element. */
2882 idx += sizeof (unsigned int) *
2883 (1 + *(unsigned int *) (extra + idx));
2884 /* Return the collation sequence value. */
2885 return *(unsigned int *) (extra + idx);
2886 }
2887 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2888 {
2889 /* No valid character. Match it as a single byte
2890 character. */
2891 return collseqmb[br_elem->opr.name[0]];
2892 }
2893 }
2894 else if (sym_name_len == 1)
2895 return collseqmb[br_elem->opr.name[0]];
2896 }
2897 return UINT_MAX;
2898 }
2900 /* Local function for parse_bracket_exp used in _LIBC environement.
2901 Build the range expression which starts from START_ELEM, and ends
2902 at END_ELEM. The result are written to MBCSET and SBCSET.
2903 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2904 mbcset->range_ends, is a pointer argument sinse we may
2905 update it. */
2907 auto inline reg_errcode_t
2908 __attribute ((always_inline))
2909 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2910 re_charset_t *mbcset;
2911 int *range_alloc;
2912 bitset_t sbcset;
2913 bracket_elem_t *start_elem, *end_elem;
2914 {
2915 unsigned int ch;
2916 uint32_t start_collseq;
2917 uint32_t end_collseq;
2919 /* Equivalence Classes and Character Classes can't be a range
2920 start/end. */
2921 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2922 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2923 0))
2924 return REG_ERANGE;
2926 start_collseq = lookup_collation_sequence_value (start_elem);
2927 end_collseq = lookup_collation_sequence_value (end_elem);
2928 /* Check start/end collation sequence values. */
2929 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2930 return REG_ECOLLATE;
2931 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2932 return REG_ERANGE;
2934 /* Got valid collation sequence values, add them as a new entry.
2935 However, if we have no collation elements, and the character set
2936 is single byte, the single byte character set that we
2937 build below suffices. */
2938 if (nrules > 0 || dfa->mb_cur_max > 1)
2939 {
2940 /* Check the space of the arrays. */
2941 if (BE (*range_alloc == mbcset->nranges, 0))
2942 {
2943 /* There is not enough space, need realloc. */
2944 uint32_t *new_array_start;
2945 uint32_t *new_array_end;
2946 int new_nranges;
2948 /* +1 in case of mbcset->nranges is 0. */
2949 new_nranges = 2 * mbcset->nranges + 1;
2950 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2951 new_nranges);
2952 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2953 new_nranges);
2955 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2956 return REG_ESPACE;
2958 mbcset->range_starts = new_array_start;
2959 mbcset->range_ends = new_array_end;
2960 *range_alloc = new_nranges;
2961 }
2963 mbcset->range_starts[mbcset->nranges] = start_collseq;
2964 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2965 }
2967 /* Build the table for single byte characters. */
2968 for (ch = 0; ch < SBC_MAX; ch++)
2969 {
2970 uint32_t ch_collseq;
2971 /*
2972 if (MB_CUR_MAX == 1)
2973 */
2974 if (nrules == 0)
2975 ch_collseq = collseqmb[ch];
2976 else
2977 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2978 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2979 bitset_set (sbcset, ch);
2980 }
2981 return REG_NOERROR;
2982 }
2984 /* Local function for parse_bracket_exp used in _LIBC environement.
2985 Build the collating element which is represented by NAME.
2986 The result are written to MBCSET and SBCSET.
2987 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2988 pointer argument sinse we may update it. */
2990 auto inline reg_errcode_t
2991 __attribute ((always_inline))
2992 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2993 re_charset_t *mbcset;
2994 int *coll_sym_alloc;
2995 bitset_t sbcset;
2996 const unsigned char *name;
2997 {
2998 int32_t elem, idx;
2999 size_t name_len = strlen ((const char *) name);
3000 if (nrules != 0)
3001 {
3002 elem = seek_collating_symbol_entry (name, name_len);
3003 if (symb_table[2 * elem] != 0)
3004 {
3005 /* We found the entry. */
3006 idx = symb_table[2 * elem + 1];
3007 /* Skip the name of collating element name. */
3008 idx += 1 + extra[idx];
3009 }
3010 else if (symb_table[2 * elem] == 0 && name_len == 1)
3011 {
3012 /* No valid character, treat it as a normal
3013 character. */
3014 bitset_set (sbcset, name[0]);
3015 return REG_NOERROR;
3016 }
3017 else
3018 return REG_ECOLLATE;
3020 /* Got valid collation sequence, add it as a new entry. */
3021 /* Check the space of the arrays. */
3022 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3023 {
3024 /* Not enough, realloc it. */
3025 /* +1 in case of mbcset->ncoll_syms is 0. */
3026 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3027 /* Use realloc since mbcset->coll_syms is NULL
3028 if *alloc == 0. */
3029 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3030 new_coll_sym_alloc);
3031 if (BE (new_coll_syms == NULL, 0))
3032 return REG_ESPACE;
3033 mbcset->coll_syms = new_coll_syms;
3034 *coll_sym_alloc = new_coll_sym_alloc;
3035 }
3036 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3037 return REG_NOERROR;
3038 }
3039 else
3040 {
3041 if (BE (name_len != 1, 0))
3042 return REG_ECOLLATE;
3043 else
3044 {
3045 bitset_set (sbcset, name[0]);
3046 return REG_NOERROR;
3047 }
3048 }
3049 }
3050 #endif
3052 re_token_t br_token;
3053 re_bitset_ptr_t sbcset;
3054 #ifdef RE_ENABLE_I18N
3055 re_charset_t *mbcset;
3056 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3057 int equiv_class_alloc = 0, char_class_alloc = 0;
3058 #endif /* not RE_ENABLE_I18N */
3059 int non_match = 0;
3060 bin_tree_t *work_tree;
3061 int token_len;
3062 int first_round = 1;
3063 #ifdef _LIBC
3064 collseqmb = (const unsigned char *)
3065 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3066 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3067 if (nrules)
3068 {
3069 /*
3070 if (MB_CUR_MAX > 1)
3071 */
3072 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3073 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3074 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3075 _NL_COLLATE_SYMB_TABLEMB);
3076 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3077 _NL_COLLATE_SYMB_EXTRAMB);
3078 }
3079 #endif
3080 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3081 #ifdef RE_ENABLE_I18N
3082 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3083 #endif /* RE_ENABLE_I18N */
3084 #ifdef RE_ENABLE_I18N
3085 if (BE (sbcset == NULL || mbcset == NULL, 0))
3086 #else
3087 if (BE (sbcset == NULL, 0))
3088 #endif /* RE_ENABLE_I18N */
3089 {
3090 *err = REG_ESPACE;
3091 return NULL;
3092 }
3094 token_len = peek_token_bracket (token, regexp, syntax);
3095 if (BE (token->type == END_OF_RE, 0))
3096 {
3097 *err = REG_BADPAT;
3098 goto parse_bracket_exp_free_return;
3099 }
3100 if (token->type == OP_NON_MATCH_LIST)
3101 {
3102 #ifdef RE_ENABLE_I18N
3103 mbcset->non_match = 1;
3104 #endif /* not RE_ENABLE_I18N */
3105 non_match = 1;
3106 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3107 bitset_set (sbcset, '\n');
3108 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3109 token_len = peek_token_bracket (token, regexp, syntax);
3110 if (BE (token->type == END_OF_RE, 0))
3111 {
3112 *err = REG_BADPAT;
3113 goto parse_bracket_exp_free_return;
3114 }
3115 }
3117 /* We treat the first ']' as a normal character. */
3118 if (token->type == OP_CLOSE_BRACKET)
3119 token->type = CHARACTER;
3121 while (1)
3122 {
3123 bracket_elem_t start_elem, end_elem;
3124 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3125 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3126 reg_errcode_t ret;
3127 int token_len2 = 0, is_range_exp = 0;
3128 re_token_t token2;
3130 start_elem.opr.name = start_name_buf;
3131 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3132 syntax, first_round);
3133 if (BE (ret != REG_NOERROR, 0))
3134 {
3135 *err = ret;
3136 goto parse_bracket_exp_free_return;
3137 }
3138 first_round = 0;
3140 /* Get information about the next token. We need it in any case. */
3141 token_len = peek_token_bracket (token, regexp, syntax);
3143 /* Do not check for ranges if we know they are not allowed. */
3144 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3145 {
3146 if (BE (token->type == END_OF_RE, 0))
3147 {
3148 *err = REG_EBRACK;
3149 goto parse_bracket_exp_free_return;
3150 }
3151 if (token->type == OP_CHARSET_RANGE)
3152 {
3153 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3154 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3155 if (BE (token2.type == END_OF_RE, 0))
3156 {
3157 *err = REG_EBRACK;
3158 goto parse_bracket_exp_free_return;
3159 }
3160 if (token2.type == OP_CLOSE_BRACKET)
3161 {
3162 /* We treat the last '-' as a normal character. */
3163 re_string_skip_bytes (regexp, -token_len);
3164 token->type = CHARACTER;
3165 }
3166 else
3167 is_range_exp = 1;
3168 }
3169 }
3171 if (is_range_exp == 1)
3172 {
3173 end_elem.opr.name = end_name_buf;
3174 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3175 dfa, syntax, 1);
3176 if (BE (ret != REG_NOERROR, 0))
3177 {
3178 *err = ret;
3179 goto parse_bracket_exp_free_return;
3180 }
3182 token_len = peek_token_bracket (token, regexp, syntax);
3184 #ifdef _LIBC
3185 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3186 &start_elem, &end_elem);
3187 #else
3188 # ifdef RE_ENABLE_I18N
3189 *err = build_range_exp (sbcset,
3190 dfa->mb_cur_max > 1 ? mbcset : NULL,
3191 &range_alloc, &start_elem, &end_elem);
3192 # else
3193 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3194 # endif
3195 #endif /* RE_ENABLE_I18N */
3196 if (BE (*err != REG_NOERROR, 0))
3197 goto parse_bracket_exp_free_return;
3198 }
3199 else
3200 {
3201 switch (start_elem.type)
3202 {
3203 case SB_CHAR:
3204 bitset_set (sbcset, start_elem.opr.ch);
3205 break;
3206 #ifdef RE_ENABLE_I18N
3207 case MB_CHAR:
3208 /* Check whether the array has enough space. */
3209 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3210 {
3211 wchar_t *new_mbchars;
3212 /* Not enough, realloc it. */
3213 /* +1 in case of mbcset->nmbchars is 0. */
3214 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3215 /* Use realloc since array is NULL if *alloc == 0. */
3216 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3217 mbchar_alloc);
3218 if (BE (new_mbchars == NULL, 0))
3219 goto parse_bracket_exp_espace;
3220 mbcset->mbchars = new_mbchars;
3221 }
3222 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3223 break;
3224 #endif /* RE_ENABLE_I18N */
3225 case EQUIV_CLASS:
3226 *err = build_equiv_class (sbcset,
3227 #ifdef RE_ENABLE_I18N
3228 mbcset, &equiv_class_alloc,
3229 #endif /* RE_ENABLE_I18N */
3230 start_elem.opr.name);
3231 if (BE (*err != REG_NOERROR, 0))
3232 goto parse_bracket_exp_free_return;
3233 break;
3234 case COLL_SYM:
3235 *err = build_collating_symbol (sbcset,
3236 #ifdef RE_ENABLE_I18N
3237 mbcset, &coll_sym_alloc,
3238 #endif /* RE_ENABLE_I18N */
3239 start_elem.opr.name);
3240 if (BE (*err != REG_NOERROR, 0))
3241 goto parse_bracket_exp_free_return;
3242 break;
3243 case CHAR_CLASS:
3244 *err = build_charclass (regexp->trans, sbcset,
3245 #ifdef RE_ENABLE_I18N
3246 mbcset, &char_class_alloc,
3247 #endif /* RE_ENABLE_I18N */
3248 (const char *) start_elem.opr.name, syntax);
3249 if (BE (*err != REG_NOERROR, 0))
3250 goto parse_bracket_exp_free_return;
3251 break;
3252 default:
3253 assert (0);
3254 break;
3255 }
3256 }
3257 if (BE (token->type == END_OF_RE, 0))
3258 {
3259 *err = REG_EBRACK;
3260 goto parse_bracket_exp_free_return;
3261 }
3262 if (token->type == OP_CLOSE_BRACKET)
3263 break;
3264 }
3266 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3268 /* If it is non-matching list. */
3269 if (non_match)
3270 bitset_not (sbcset);
3272 #ifdef RE_ENABLE_I18N
3273 /* Ensure only single byte characters are set. */
3274 if (dfa->mb_cur_max > 1)
3275 bitset_mask (sbcset, dfa->sb_char);
3277 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3278 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3279 || mbcset->non_match)))
3280 {
3281 bin_tree_t *mbc_tree;
3282 int sbc_idx;
3283 /* Build a tree for complex bracket. */
3284 dfa->has_mb_node = 1;
3285 br_token.type = COMPLEX_BRACKET;
3286 br_token.opr.mbcset = mbcset;
3287 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3288 if (BE (mbc_tree == NULL, 0))
3289 goto parse_bracket_exp_espace;
3290 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3291 if (sbcset[sbc_idx])
3292 break;
3293 /* If there are no bits set in sbcset, there is no point
3294 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3295 if (sbc_idx < BITSET_WORDS)
3296 {
3297 /* Build a tree for simple bracket. */
3298 br_token.type = SIMPLE_BRACKET;
3299 br_token.opr.sbcset = sbcset;
3300 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3301 if (BE (work_tree == NULL, 0))
3302 goto parse_bracket_exp_espace;
3304 /* Then join them by ALT node. */
3305 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3306 if (BE (work_tree == NULL, 0))
3307 goto parse_bracket_exp_espace;
3308 }
3309 else
3310 {
3311 re_free (sbcset);
3312 work_tree = mbc_tree;
3313 }
3314 }
3315 else
3316 #endif /* not RE_ENABLE_I18N */
3317 {
3318 #ifdef RE_ENABLE_I18N
3319 free_charset (mbcset);
3320 #endif
3321 /* Build a tree for simple bracket. */
3322 br_token.type = SIMPLE_BRACKET;
3323 br_token.opr.sbcset = sbcset;
3324 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3325 if (BE (work_tree == NULL, 0))
3326 goto parse_bracket_exp_espace;
3327 }
3328 return work_tree;
3330 parse_bracket_exp_espace:
3331 *err = REG_ESPACE;
3332 parse_bracket_exp_free_return:
3333 re_free (sbcset);
3334 #ifdef RE_ENABLE_I18N
3335 free_charset (mbcset);
3336 #endif /* RE_ENABLE_I18N */
3337 return NULL;
3338 }
3340 /* Parse an element in the bracket expression. */
3342 static reg_errcode_t
3343 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3344 re_token_t *token, int token_len, re_dfa_t *dfa,
3345 reg_syntax_t syntax, int accept_hyphen)
3346 {
3347 #ifdef RE_ENABLE_I18N
3348 int cur_char_size;
3349 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3350 if (cur_char_size > 1)
3351 {
3352 elem->type = MB_CHAR;
3353 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3354 re_string_skip_bytes (regexp, cur_char_size);
3355 return REG_NOERROR;
3356 }
3357 #endif /* RE_ENABLE_I18N */
3358 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3359 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3360 || token->type == OP_OPEN_EQUIV_CLASS)
3361 return parse_bracket_symbol (elem, regexp, token);
3362 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3363 {
3364 /* A '-' must only appear as anything but a range indicator before
3365 the closing bracket. Everything else is an error. */
3366 re_token_t token2;
3367 (void) peek_token_bracket (&token2, regexp, syntax);
3368 if (token2.type != OP_CLOSE_BRACKET)
3369 /* The actual error value is not standardized since this whole
3370 case is undefined. But ERANGE makes good sense. */
3371 return REG_ERANGE;
3372 }
3373 elem->type = SB_CHAR;
3374 elem->opr.ch = token->opr.c;
3375 return REG_NOERROR;
3376 }
3378 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3379 such as [:<character_class>:], [.<collating_element>.], and
3380 [=<equivalent_class>=]. */
3382 static reg_errcode_t
3383 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3384 re_token_t *token)
3385 {
3386 unsigned char ch, delim = token->opr.c;
3387 int i = 0;
3388 if (re_string_eoi(regexp))
3389 return REG_EBRACK;
3390 for (;; ++i)
3391 {
3392 if (i >= BRACKET_NAME_BUF_SIZE)
3393 return REG_EBRACK;
3394 if (token->type == OP_OPEN_CHAR_CLASS)
3395 ch = re_string_fetch_byte_case (regexp);
3396 else
3397 ch = re_string_fetch_byte (regexp);
3398 if (re_string_eoi(regexp))
3399 return REG_EBRACK;
3400 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3401 break;
3402 elem->opr.name[i] = ch;
3403 }
3404 re_string_skip_bytes (regexp, 1);
3405 elem->opr.name[i] = '\0';
3406 switch (token->type)
3407 {
3408 case OP_OPEN_COLL_ELEM:
3409 elem->type = COLL_SYM;
3410 break;
3411 case OP_OPEN_EQUIV_CLASS:
3412 elem->type = EQUIV_CLASS;
3413 break;
3414 case OP_OPEN_CHAR_CLASS:
3415 elem->type = CHAR_CLASS;
3416 break;
3417 default:
3418 break;
3419 }
3420 return REG_NOERROR;
3421 }
3423 /* Helper function for parse_bracket_exp.
3424 Build the equivalence class which is represented by NAME.
3425 The result are written to MBCSET and SBCSET.
3426 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3427 is a pointer argument sinse we may update it. */
3429 static reg_errcode_t
3430 #ifdef RE_ENABLE_I18N
3431 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3432 int *equiv_class_alloc, const unsigned char *name)
3433 #else /* not RE_ENABLE_I18N */
3434 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3435 #endif /* not RE_ENABLE_I18N */
3436 {
3437 #ifdef _LIBC
3438 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3439 if (nrules != 0)
3440 {
3441 const int32_t *table, *indirect;
3442 const unsigned char *weights, *extra, *cp;
3443 unsigned char char_buf[2];
3444 int32_t idx1, idx2;
3445 unsigned int ch;
3446 size_t len;
3447 /* This #include defines a local function! */
3448 # include <locale/weight.h>
3449 /* Calculate the index for equivalence class. */
3450 cp = name;
3451 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3452 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3453 _NL_COLLATE_WEIGHTMB);
3454 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3455 _NL_COLLATE_EXTRAMB);
3456 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3457 _NL_COLLATE_INDIRECTMB);
3458 idx1 = findidx (&cp);
3459 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3460 /* This isn't a valid character. */
3461 return REG_ECOLLATE;
3463 /* Build single byte matcing table for this equivalence class. */
3464 char_buf[1] = (unsigned char) '\0';
3465 len = weights[idx1 & 0xffffff];
3466 for (ch = 0; ch < SBC_MAX; ++ch)
3467 {
3468 char_buf[0] = ch;
3469 cp = char_buf;
3470 idx2 = findidx (&cp);
3471 /*
3472 idx2 = table[ch];
3473 */
3474 if (idx2 == 0)
3475 /* This isn't a valid character. */
3476 continue;
3477 /* Compare only if the length matches and the collation rule
3478 index is the same. */
3479 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3480 {
3481 int cnt = 0;
3483 while (cnt <= len &&
3484 weights[(idx1 & 0xffffff) + 1 + cnt]
3485 == weights[(idx2 & 0xffffff) + 1 + cnt])
3486 ++cnt;
3488 if (cnt > len)
3489 bitset_set (sbcset, ch);
3490 }
3491 }
3492 /* Check whether the array has enough space. */
3493 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3494 {
3495 /* Not enough, realloc it. */
3496 /* +1 in case of mbcset->nequiv_classes is 0. */
3497 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3498 /* Use realloc since the array is NULL if *alloc == 0. */
3499 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3500 int32_t,
3501 new_equiv_class_alloc);
3502 if (BE (new_equiv_classes == NULL, 0))
3503 return REG_ESPACE;
3504 mbcset->equiv_classes = new_equiv_classes;
3505 *equiv_class_alloc = new_equiv_class_alloc;
3506 }
3507 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3508 }
3509 else
3510 #endif /* _LIBC */
3511 {
3512 if (BE (strlen ((const char *) name) != 1, 0))
3513 return REG_ECOLLATE;
3514 bitset_set (sbcset, *name);
3515 }
3516 return REG_NOERROR;
3517 }
3519 /* Helper function for parse_bracket_exp.
3520 Build the character class which is represented by NAME.
3521 The result are written to MBCSET and SBCSET.
3522 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3523 is a pointer argument sinse we may update it. */
3525 static reg_errcode_t
3526 #ifdef RE_ENABLE_I18N
3527 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3528 re_charset_t *mbcset, int *char_class_alloc,
3529 const char *class_name, reg_syntax_t syntax)
3530 #else /* not RE_ENABLE_I18N */
3531 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3532 const char *class_name, reg_syntax_t syntax)
3533 #endif /* not RE_ENABLE_I18N */
3534 {
3535 int i;
3537 /* In case of REG_ICASE "upper" and "lower" match the both of
3538 upper and lower cases. */
3539 if ((syntax & RE_ICASE)
3540 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3541 class_name = "alpha";
3543 #ifdef RE_ENABLE_I18N
3544 /* Check the space of the arrays. */
3545 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3546 {
3547 /* Not enough, realloc it. */
3548 /* +1 in case of mbcset->nchar_classes is 0. */
3549 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3550 /* Use realloc since array is NULL if *alloc == 0. */
3551 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3552 new_char_class_alloc);
3553 if (BE (new_char_classes == NULL, 0))
3554 return REG_ESPACE;
3555 mbcset->char_classes = new_char_classes;
3556 *char_class_alloc = new_char_class_alloc;
3557 }
3558 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3559 #endif /* RE_ENABLE_I18N */
3561 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3562 do { \
3563 if (BE (trans != NULL, 0)) \
3564 { \
3565 for (i = 0; i < SBC_MAX; ++i) \
3566 if (ctype_func (i)) \
3567 bitset_set (sbcset, trans[i]); \
3568 } \
3569 else \
3570 { \
3571 for (i = 0; i < SBC_MAX; ++i) \
3572 if (ctype_func (i)) \
3573 bitset_set (sbcset, i); \
3574 } \
3575 } while (0)
3577 if (strcmp (class_name, "alnum") == 0)
3578 BUILD_CHARCLASS_LOOP (isalnum);
3579 else if (strcmp (class_name, "cntrl") == 0)
3580 BUILD_CHARCLASS_LOOP (iscntrl);
3581 else if (strcmp (class_name, "lower") == 0)
3582 BUILD_CHARCLASS_LOOP (islower);
3583 else if (strcmp (class_name, "space") == 0)
3584 BUILD_CHARCLASS_LOOP (isspace);
3585 else if (strcmp (class_name, "alpha") == 0)
3586 BUILD_CHARCLASS_LOOP (isalpha);
3587 else if (strcmp (class_name, "digit") == 0)
3588 BUILD_CHARCLASS_LOOP (isdigit);
3589 else if (strcmp (class_name, "print") == 0)
3590 BUILD_CHARCLASS_LOOP (isprint);
3591 else if (strcmp (class_name, "upper") == 0)
3592 BUILD_CHARCLASS_LOOP (isupper);
3593 else if (strcmp (class_name, "blank") == 0)
3594 #ifndef GAWK
3595 BUILD_CHARCLASS_LOOP (isblank);
3596 #else
3597 /* see comments above */
3598 BUILD_CHARCLASS_LOOP (is_blank);
3599 #endif
3600 else if (strcmp (class_name, "graph") == 0)
3601 BUILD_CHARCLASS_LOOP (isgraph);
3602 else if (strcmp (class_name, "punct") == 0)
3603 BUILD_CHARCLASS_LOOP (ispunct);
3604 else if (strcmp (class_name, "xdigit") == 0)
3605 BUILD_CHARCLASS_LOOP (isxdigit);
3606 else
3607 return REG_ECTYPE;
3609 return REG_NOERROR;
3610 }
3612 static bin_tree_t *
3613 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3614 const char *class_name,
3615 const char *extra, int non_match,
3616 reg_errcode_t *err)
3617 {
3618 re_bitset_ptr_t sbcset;
3619 #ifdef RE_ENABLE_I18N
3620 re_charset_t *mbcset;
3621 int alloc = 0;
3622 #endif /* not RE_ENABLE_I18N */
3623 reg_errcode_t ret;
3624 re_token_t br_token;
3625 bin_tree_t *tree;
3627 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3628 #ifdef RE_ENABLE_I18N
3629 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3630 #endif /* RE_ENABLE_I18N */
3632 #ifdef RE_ENABLE_I18N
3633 if (BE (sbcset == NULL || mbcset == NULL, 0))
3634 #else /* not RE_ENABLE_I18N */
3635 if (BE (sbcset == NULL, 0))
3636 #endif /* not RE_ENABLE_I18N */
3637 {
3638 *err = REG_ESPACE;
3639 return NULL;
3640 }
3642 if (non_match)
3643 {
3644 #ifdef RE_ENABLE_I18N
3645 mbcset->non_match = 1;
3646 #endif /* not RE_ENABLE_I18N */
3647 }
3649 /* We don't care the syntax in this case. */
3650 ret = build_charclass (trans, sbcset,
3651 #ifdef RE_ENABLE_I18N
3652 mbcset, &alloc,
3653 #endif /* RE_ENABLE_I18N */
3654 class_name, 0);
3656 if (BE (ret != REG_NOERROR, 0))
3657 {
3658 re_free (sbcset);
3659 #ifdef RE_ENABLE_I18N
3660 free_charset (mbcset);
3661 #endif /* RE_ENABLE_I18N */
3662 *err = ret;
3663 return NULL;
3664 }
3665 /* \w match '_' also. */
3666 for (; *extra; extra++)
3667 bitset_set (sbcset, *extra);
3669 /* If it is non-matching list. */
3670 if (non_match)
3671 bitset_not (sbcset);
3673 #ifdef RE_ENABLE_I18N
3674 /* Ensure only single byte characters are set. */
3675 if (dfa->mb_cur_max > 1)
3676 bitset_mask (sbcset, dfa->sb_char);
3677 #endif
3679 /* Build a tree for simple bracket. */
3680 br_token.type = SIMPLE_BRACKET;
3681 br_token.opr.sbcset = sbcset;
3682 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3683 if (BE (tree == NULL, 0))
3684 goto build_word_op_espace;
3686 #ifdef RE_ENABLE_I18N
3687 if (dfa->mb_cur_max > 1)
3688 {
3689 bin_tree_t *mbc_tree;
3690 /* Build a tree for complex bracket. */
3691 br_token.type = COMPLEX_BRACKET;
3692 br_token.opr.mbcset = mbcset;
3693 dfa->has_mb_node = 1;
3694 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3695 if (BE (mbc_tree == NULL, 0))
3696 goto build_word_op_espace;
3697 /* Then join them by ALT node. */
3698 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3699 if (BE (mbc_tree != NULL, 1))
3700 return tree;
3701 }
3702 else
3703 {
3704 free_charset (mbcset);
3705 return tree;
3706 }
3707 #else /* not RE_ENABLE_I18N */
3708 return tree;
3709 #endif /* not RE_ENABLE_I18N */
3711 build_word_op_espace:
3712 re_free (sbcset);
3713 #ifdef RE_ENABLE_I18N
3714 free_charset (mbcset);
3715 #endif /* RE_ENABLE_I18N */
3716 *err = REG_ESPACE;
3717 return NULL;
3718 }
3720 /* This is intended for the expressions like "a{1,3}".
3721 Fetch a number from `input', and return the number.
3722 Return -1, if the number field is empty like "{,1}".
3723 Return -2, If an error is occured. */
3725 static int
3726 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3727 {
3728 int num = -1;
3729 unsigned char c;
3730 while (1)
3731 {
3732 fetch_token (token, input, syntax);
3733 c = token->opr.c;
3734 if (BE (token->type == END_OF_RE, 0))
3735 return -2;
3736 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3737 break;
3738 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3739 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3740 num = (num > RE_DUP_MAX) ? -2 : num;
3741 }
3742 return num;
3743 }
3744 \f
3745 #ifdef RE_ENABLE_I18N
3746 static void
3747 free_charset (re_charset_t *cset)
3748 {
3749 re_free (cset->mbchars);
3750 # ifdef _LIBC
3751 re_free (cset->coll_syms);
3752 re_free (cset->equiv_classes);
3753 re_free (cset->range_starts);
3754 re_free (cset->range_ends);
3755 # endif
3756 re_free (cset->char_classes);
3757 re_free (cset);
3758 }
3759 #endif /* RE_ENABLE_I18N */
3760 \f
3761 /* Functions for binary tree operation. */
3763 /* Create a tree node. */
3765 static bin_tree_t *
3766 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3767 re_token_type_t type)
3768 {
3769 re_token_t t;
3770 t.type = type;
3771 return create_token_tree (dfa, left, right, &t);
3772 }
3774 static bin_tree_t *
3775 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3776 const re_token_t *token)
3777 {
3778 bin_tree_t *tree;
3779 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3780 {
3781 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3783 if (storage == NULL)
3784 return NULL;
3785 storage->next = dfa->str_tree_storage;
3786 dfa->str_tree_storage = storage;
3787 dfa->str_tree_storage_idx = 0;
3788 }
3789 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3791 tree->parent = NULL;
3792 tree->left = left;
3793 tree->right = right;
3794 tree->token = *token;
3795 tree->token.duplicated = 0;
3796 tree->token.opt_subexp = 0;
3797 tree->first = NULL;
3798 tree->next = NULL;
3799 tree->node_idx = -1;
3801 if (left != NULL)
3802 left->parent = tree;
3803 if (right != NULL)
3804 right->parent = tree;
3805 return tree;
3806 }
3808 /* Mark the tree SRC as an optional subexpression.
3809 To be called from preorder or postorder. */
3811 static reg_errcode_t
3812 mark_opt_subexp (void *extra, bin_tree_t *node)
3813 {
3814 int idx = (int) (long) extra;
3815 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3816 node->token.opt_subexp = 1;
3818 return REG_NOERROR;
3819 }
3821 /* Free the allocated memory inside NODE. */
3823 static void
3824 free_token (re_token_t *node)
3825 {
3826 #ifdef RE_ENABLE_I18N
3827 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3828 free_charset (node->opr.mbcset);
3829 else
3830 #endif /* RE_ENABLE_I18N */
3831 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3832 re_free (node->opr.sbcset);
3833 }
3835 /* Worker function for tree walking. Free the allocated memory inside NODE
3836 and its children. */
3838 static reg_errcode_t
3839 free_tree (void *extra, bin_tree_t *node)
3840 {
3841 free_token (&node->token);
3842 return REG_NOERROR;
3843 }
3846 /* Duplicate the node SRC, and return new node. This is a preorder
3847 visit similar to the one implemented by the generic visitor, but
3848 we need more infrastructure to maintain two parallel trees --- so,
3849 it's easier to duplicate. */
3851 static bin_tree_t *
3852 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3853 {
3854 const bin_tree_t *node;
3855 bin_tree_t *dup_root;
3856 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3858 for (node = root; ; )
3859 {
3860 /* Create a new tree and link it back to the current parent. */
3861 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3862 if (*p_new == NULL)
3863 return NULL;
3864 (*p_new)->parent = dup_node;
3865 (*p_new)->token.duplicated = 1;
3866 dup_node = *p_new;
3868 /* Go to the left node, or up and to the right. */
3869 if (node->left)
3870 {
3871 node = node->left;
3872 p_new = &dup_node->left;
3873 }
3874 else
3875 {
3876 const bin_tree_t *prev = NULL;
3877 while (node->right == prev || node->right == NULL)
3878 {
3879 prev = node;
3880 node = node->parent;
3881 dup_node = dup_node->parent;
3882 if (!node)
3883 return dup_root;
3884 }
3885 node = node->right;
3886 p_new = &dup_node->right;
3887 }
3888 }
3889 }