1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
24 char *fastmap);
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26 #ifdef RE_ENABLE_I18N
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 #ifdef RE_ENABLE_I18N
32 static void optimize_utf8 (re_dfa_t *dfa);
33 #endif
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 Idx node, bool root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 bool accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88 #ifdef RE_ENABLE_I18N
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 re_charset_t *mbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94 bitset_t sbcset,
95 re_charset_t *mbcset,
96 Idx *char_class_alloc,
97 const unsigned char *class_name,
98 reg_syntax_t syntax);
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 bitset_t sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122 \f
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 static const char __re_error_msgid[] =
129 {
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
132 "\0"
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
135 "\0"
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 "\0"
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 "\0"
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 "\0"
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 "\0"
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 "\0"
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 "\0"
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 "\0"
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 "\0"
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 "\0"
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 "\0"
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 "\0"
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 "\0"
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 "\0"
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 "\0"
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180 };
182 static const size_t __re_error_msgid_idx[] =
183 {
184 REG_NOERROR_IDX,
185 REG_NOMATCH_IDX,
186 REG_BADPAT_IDX,
187 REG_ECOLLATE_IDX,
188 REG_ECTYPE_IDX,
189 REG_EESCAPE_IDX,
190 REG_ESUBREG_IDX,
191 REG_EBRACK_IDX,
192 REG_EPAREN_IDX,
193 REG_EBRACE_IDX,
194 REG_BADBR_IDX,
195 REG_ERANGE_IDX,
196 REG_ESPACE_IDX,
197 REG_BADRPT_IDX,
198 REG_EEND_IDX,
199 REG_ESIZE_IDX,
200 REG_ERPAREN_IDX
201 };
202 \f
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210 are set in BUFP on entry. */
212 #ifdef _LIBC
213 const char *
214 re_compile_pattern (pattern, length, bufp)
215 const char *pattern;
216 size_t length;
217 struct re_pattern_buffer *bufp;
218 #else /* size_t might promote */
219 const char *
220 re_compile_pattern (const char *pattern, size_t length,
221 struct re_pattern_buffer *bufp)
222 #endif
223 {
224 reg_errcode_t ret;
226 /* And GNU code determines whether or not to get register information
227 by passing null for the REGS argument to re_match, etc., not by
228 setting no_sub, unless RE_NO_SUB is set. */
229 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
231 /* Match anchors at newline. */
232 bufp->newline_anchor = 1;
234 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
236 if (!ret)
237 return NULL;
238 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
239 }
240 #ifdef _LIBC
241 weak_alias (__re_compile_pattern, re_compile_pattern)
242 #endif
244 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
245 also be assigned to arbitrarily: each pattern buffer stores its own
246 syntax, so it can be changed between regex compilations. */
247 /* This has no initializer because initialized variables in Emacs
248 become read-only after dumping. */
249 reg_syntax_t re_syntax_options;
252 /* Specify the precise syntax of regexps for compilation. This provides
253 for compatibility for various utilities which historically have
254 different, incompatible syntaxes.
256 The argument SYNTAX is a bit mask comprised of the various bits
257 defined in regex.h. We return the old syntax. */
259 reg_syntax_t
260 re_set_syntax (syntax)
261 reg_syntax_t syntax;
262 {
263 reg_syntax_t ret = re_syntax_options;
265 re_syntax_options = syntax;
266 return ret;
267 }
268 #ifdef _LIBC
269 weak_alias (__re_set_syntax, re_set_syntax)
270 #endif
272 int
273 re_compile_fastmap (bufp)
274 struct re_pattern_buffer *bufp;
275 {
276 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
277 char *fastmap = bufp->fastmap;
279 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281 if (dfa->init_state != dfa->init_state_word)
282 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283 if (dfa->init_state != dfa->init_state_nl)
284 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285 if (dfa->init_state != dfa->init_state_begbuf)
286 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287 bufp->fastmap_accurate = 1;
288 return 0;
289 }
290 #ifdef _LIBC
291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
292 #endif
294 static inline void
295 __attribute ((always_inline))
296 re_set_fastmap (char *fastmap, bool icase, int ch)
297 {
298 fastmap[ch] = 1;
299 if (icase)
300 fastmap[tolower (ch)] = 1;
301 }
303 /* Helper function for re_compile_fastmap.
304 Compile fastmap for the initial_state INIT_STATE. */
306 static void
307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
308 char *fastmap)
309 {
310 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
311 Idx node_cnt;
312 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314 {
315 Idx node = init_state->nodes.elems[node_cnt];
316 re_token_type_t type = dfa->nodes[node].type;
318 if (type == CHARACTER)
319 {
320 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321 #ifdef RE_ENABLE_I18N
322 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
323 {
324 unsigned char buf[MB_LEN_MAX];
325 unsigned char *p;
326 wchar_t wc;
327 mbstate_t state;
329 p = buf;
330 *p++ = dfa->nodes[node].opr.c;
331 while (++node < dfa->nodes_len
332 && dfa->nodes[node].type == CHARACTER
333 && dfa->nodes[node].mb_partial)
334 *p++ = dfa->nodes[node].opr.c;
335 memset (&state, '\0', sizeof (state));
336 if (mbrtowc (&wc, (const char *) buf, p - buf,
337 &state) == p - buf
338 && (__wcrtomb ((char *) buf, towlower (wc), &state)
339 != (size_t) -1))
340 re_set_fastmap (fastmap, false, buf[0]);
341 }
342 #endif
343 }
344 else if (type == SIMPLE_BRACKET)
345 {
346 int i, ch;
347 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
348 {
349 int j;
350 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352 if (w & ((bitset_word_t) 1 << j))
353 re_set_fastmap (fastmap, icase, ch);
354 }
355 }
356 #ifdef RE_ENABLE_I18N
357 else if (type == COMPLEX_BRACKET)
358 {
359 Idx i;
360 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
361 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
362 || cset->nranges || cset->nchar_classes)
363 {
364 # ifdef _LIBC
365 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
366 {
367 /* In this case we want to catch the bytes which are
368 the first byte of any collation elements.
369 e.g. In da_DK, we want to catch 'a' since "aa"
370 is a valid collation element, and don't catch
371 'b' since 'b' is the only collation element
372 which starts from 'b'. */
373 const int32_t *table = (const int32_t *)
374 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375 for (i = 0; i < SBC_MAX; ++i)
376 if (table[i] < 0)
377 re_set_fastmap (fastmap, icase, i);
378 }
379 # else
380 if (dfa->mb_cur_max > 1)
381 for (i = 0; i < SBC_MAX; ++i)
382 if (__btowc (i) == WEOF)
383 re_set_fastmap (fastmap, icase, i);
384 # endif /* not _LIBC */
385 }
386 for (i = 0; i < cset->nmbchars; ++i)
387 {
388 char buf[256];
389 mbstate_t state;
390 memset (&state, '\0', sizeof (state));
391 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
392 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
393 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
394 {
395 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
396 != (size_t) -1)
397 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
398 }
399 }
400 }
401 #endif /* RE_ENABLE_I18N */
402 else if (type == OP_PERIOD
403 #ifdef RE_ENABLE_I18N
404 || type == OP_UTF8_PERIOD
405 #endif /* RE_ENABLE_I18N */
406 || type == END_OF_RE)
407 {
408 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
409 if (type == END_OF_RE)
410 bufp->can_be_null = 1;
411 return;
412 }
413 }
414 }
415 \f
416 /* Entry point for POSIX code. */
417 /* regcomp takes a regular expression as a string and compiles it.
419 PREG is a regex_t *. We do not expect any fields to be initialized,
420 since POSIX says we shouldn't. Thus, we set
422 `buffer' to the compiled pattern;
423 `used' to the length of the compiled pattern;
424 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
425 REG_EXTENDED bit in CFLAGS is set; otherwise, to
426 RE_SYNTAX_POSIX_BASIC;
427 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
428 `fastmap' to an allocated space for the fastmap;
429 `fastmap_accurate' to zero;
430 `re_nsub' to the number of subexpressions in PATTERN.
432 PATTERN is the address of the pattern string.
434 CFLAGS is a series of bits which affect compilation.
436 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
437 use POSIX basic syntax.
439 If REG_NEWLINE is set, then . and [^...] don't match newline.
440 Also, regexec will try a match beginning after every newline.
442 If REG_ICASE is set, then we considers upper- and lowercase
443 versions of letters to be equivalent when matching.
445 If REG_NOSUB is set, then when PREG is passed to regexec, that
446 routine will report only success or failure, and nothing about the
447 registers.
449 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
450 the return codes and their meanings.) */
452 int
453 regcomp (preg, pattern, cflags)
454 regex_t *_Restrict_ preg;
455 const char *_Restrict_ pattern;
456 int cflags;
457 {
458 reg_errcode_t ret;
459 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
460 : RE_SYNTAX_POSIX_BASIC);
462 preg->buffer = NULL;
463 preg->allocated = 0;
464 preg->used = 0;
466 /* Try to allocate space for the fastmap. */
467 preg->fastmap = re_malloc (char, SBC_MAX);
468 if (BE (preg->fastmap == NULL, 0))
469 return REG_ESPACE;
471 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
473 /* If REG_NEWLINE is set, newlines are treated differently. */
474 if (cflags & REG_NEWLINE)
475 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
476 syntax &= ~RE_DOT_NEWLINE;
477 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
478 /* It also changes the matching behavior. */
479 preg->newline_anchor = 1;
480 }
481 else
482 preg->newline_anchor = 0;
483 preg->no_sub = !!(cflags & REG_NOSUB);
484 preg->translate = NULL;
486 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
488 /* POSIX doesn't distinguish between an unmatched open-group and an
489 unmatched close-group: both are REG_EPAREN. */
490 if (ret == REG_ERPAREN)
491 ret = REG_EPAREN;
493 /* We have already checked preg->fastmap != NULL. */
494 if (BE (ret == REG_NOERROR, 1))
495 /* Compute the fastmap now, since regexec cannot modify the pattern
496 buffer. This function never fails in this implementation. */
497 (void) re_compile_fastmap (preg);
498 else
499 {
500 /* Some error occurred while compiling the expression. */
501 re_free (preg->fastmap);
502 preg->fastmap = NULL;
503 }
505 return (int) ret;
506 }
507 #ifdef _LIBC
508 weak_alias (__regcomp, regcomp)
509 #endif
511 /* Returns a message corresponding to an error code, ERRCODE, returned
512 from either regcomp or regexec. We don't use PREG here. */
514 #ifdef _LIBC
515 size_t
516 regerror (errcode, preg, errbuf, errbuf_size)
517 int errcode;
518 const regex_t *_Restrict_ preg;
519 char *_Restrict_ errbuf;
520 size_t errbuf_size;
521 #else /* size_t might promote */
522 size_t
523 regerror (int errcode, const regex_t *_Restrict_ preg,
524 char *_Restrict_ errbuf, size_t errbuf_size)
525 #endif
526 {
527 const char *msg;
528 size_t msg_size;
530 if (BE (errcode < 0
531 || errcode >= (int) (sizeof (__re_error_msgid_idx)
532 / sizeof (__re_error_msgid_idx[0])), 0))
533 /* Only error codes returned by the rest of the code should be passed
534 to this routine. If we are given anything else, or if other regex
535 code generates an invalid error code, then the program has a bug.
536 Dump core so we can fix it. */
537 abort ();
539 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
541 msg_size = strlen (msg) + 1; /* Includes the null. */
543 if (BE (errbuf_size != 0, 1))
544 {
545 size_t cpy_size = msg_size;
546 if (BE (msg_size > errbuf_size, 0))
547 {
548 cpy_size = errbuf_size - 1;
549 errbuf[cpy_size] = '\0';
550 }
551 memcpy (errbuf, msg, cpy_size);
552 }
554 return msg_size;
555 }
556 #ifdef _LIBC
557 weak_alias (__regerror, regerror)
558 #endif
561 #ifdef RE_ENABLE_I18N
562 /* This static array is used for the map to single-byte characters when
563 UTF-8 is used. Otherwise we would allocate memory just to initialize
564 it the same all the time. UTF-8 is the preferred encoding so this is
565 a worthwhile optimization. */
566 static const bitset_t utf8_sb_map =
567 {
568 /* Set the first 128 bits. */
569 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
570 # error "bitset_word_t is narrower than 32 bits"
571 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
572 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
573 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
574 BITSET_WORD_MAX, BITSET_WORD_MAX,
575 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
576 BITSET_WORD_MAX,
577 # endif
578 (BITSET_WORD_MAX
579 >> (SBC_MAX % BITSET_WORD_BITS == 0
580 ? 0
581 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
582 };
583 #endif
586 static void
587 free_dfa_content (re_dfa_t *dfa)
588 {
589 Idx i, j;
591 if (dfa->nodes)
592 for (i = 0; i < dfa->nodes_len; ++i)
593 free_token (dfa->nodes + i);
594 re_free (dfa->nexts);
595 for (i = 0; i < dfa->nodes_len; ++i)
596 {
597 if (dfa->eclosures != NULL)
598 re_node_set_free (dfa->eclosures + i);
599 if (dfa->inveclosures != NULL)
600 re_node_set_free (dfa->inveclosures + i);
601 if (dfa->edests != NULL)
602 re_node_set_free (dfa->edests + i);
603 }
604 re_free (dfa->edests);
605 re_free (dfa->eclosures);
606 re_free (dfa->inveclosures);
607 re_free (dfa->nodes);
609 if (dfa->state_table)
610 for (i = 0; i <= dfa->state_hash_mask; ++i)
611 {
612 struct re_state_table_entry *entry = dfa->state_table + i;
613 for (j = 0; j < entry->num; ++j)
614 {
615 re_dfastate_t *state = entry->array[j];
616 free_state (state);
617 }
618 re_free (entry->array);
619 }
620 re_free (dfa->state_table);
621 #ifdef RE_ENABLE_I18N
622 if (dfa->sb_char != utf8_sb_map)
623 re_free (dfa->sb_char);
624 #endif
625 re_free (dfa->subexp_map);
626 #ifdef DEBUG
627 re_free (dfa->re_str);
628 #endif
630 re_free (dfa);
631 }
634 /* Free dynamically allocated space used by PREG. */
636 void
637 regfree (preg)
638 regex_t *preg;
639 {
640 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
641 if (BE (dfa != NULL, 1))
642 free_dfa_content (dfa);
643 preg->buffer = NULL;
644 preg->allocated = 0;
646 re_free (preg->fastmap);
647 preg->fastmap = NULL;
649 re_free (preg->translate);
650 preg->translate = NULL;
651 }
652 #ifdef _LIBC
653 weak_alias (__regfree, regfree)
654 #endif
655 \f
656 /* Entry points compatible with 4.2 BSD regex library. We don't define
657 them unless specifically requested. */
659 #if defined _REGEX_RE_COMP || defined _LIBC
661 /* BSD has one and only one pattern buffer. */
662 static struct re_pattern_buffer re_comp_buf;
664 char *
665 # ifdef _LIBC
666 /* Make these definitions weak in libc, so POSIX programs can redefine
667 these names if they don't use our functions, and still use
668 regcomp/regexec above without link errors. */
669 weak_function
670 # endif
671 re_comp (s)
672 const char *s;
673 {
674 reg_errcode_t ret;
675 char *fastmap;
677 if (!s)
678 {
679 if (!re_comp_buf.buffer)
680 return gettext ("No previous regular expression");
681 return 0;
682 }
684 if (re_comp_buf.buffer)
685 {
686 fastmap = re_comp_buf.fastmap;
687 re_comp_buf.fastmap = NULL;
688 __regfree (&re_comp_buf);
689 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
690 re_comp_buf.fastmap = fastmap;
691 }
693 if (re_comp_buf.fastmap == NULL)
694 {
695 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
696 if (re_comp_buf.fastmap == NULL)
697 return (char *) gettext (__re_error_msgid
698 + __re_error_msgid_idx[(int) REG_ESPACE]);
699 }
701 /* Since `re_exec' always passes NULL for the `regs' argument, we
702 don't need to initialize the pattern buffer fields which affect it. */
704 /* Match anchors at newlines. */
705 re_comp_buf.newline_anchor = 1;
707 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
709 if (!ret)
710 return NULL;
712 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
713 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
714 }
716 #ifdef _LIBC
717 libc_freeres_fn (free_mem)
718 {
719 __regfree (&re_comp_buf);
720 }
721 #endif
723 #endif /* _REGEX_RE_COMP */
724 \f
725 /* Internal entry point.
726 Compile the regular expression PATTERN, whose length is LENGTH.
727 SYNTAX indicate regular expression's syntax. */
729 static reg_errcode_t
730 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
731 reg_syntax_t syntax)
732 {
733 reg_errcode_t err = REG_NOERROR;
734 re_dfa_t *dfa;
735 re_string_t regexp;
737 /* Initialize the pattern buffer. */
738 preg->fastmap_accurate = 0;
739 preg->syntax = syntax;
740 preg->not_bol = preg->not_eol = 0;
741 preg->used = 0;
742 preg->re_nsub = 0;
743 preg->can_be_null = 0;
744 preg->regs_allocated = REGS_UNALLOCATED;
746 /* Initialize the dfa. */
747 dfa = (re_dfa_t *) preg->buffer;
748 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
749 {
750 /* If zero allocated, but buffer is non-null, try to realloc
751 enough space. This loses if buffer's address is bogus, but
752 that is the user's responsibility. If ->buffer is NULL this
753 is a simple allocation. */
754 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
755 if (dfa == NULL)
756 return REG_ESPACE;
757 preg->allocated = sizeof (re_dfa_t);
758 preg->buffer = (unsigned char *) dfa;
759 }
760 preg->used = sizeof (re_dfa_t);
762 err = init_dfa (dfa, length);
763 if (BE (err != REG_NOERROR, 0))
764 {
765 free_dfa_content (dfa);
766 preg->buffer = NULL;
767 preg->allocated = 0;
768 return err;
769 }
770 #ifdef DEBUG
771 /* Note: length+1 will not overflow since it is checked in init_dfa. */
772 dfa->re_str = re_malloc (char, length + 1);
773 strncpy (dfa->re_str, pattern, length + 1);
774 #endif
776 __libc_lock_init (dfa->lock);
778 err = re_string_construct (®exp, pattern, length, preg->translate,
779 syntax & RE_ICASE, dfa);
780 if (BE (err != REG_NOERROR, 0))
781 {
782 re_compile_internal_free_return:
783 free_workarea_compile (preg);
784 re_string_destruct (®exp);
785 free_dfa_content (dfa);
786 preg->buffer = NULL;
787 preg->allocated = 0;
788 return err;
789 }
791 /* Parse the regular expression, and build a structure tree. */
792 preg->re_nsub = 0;
793 dfa->str_tree = parse (®exp, preg, syntax, &err);
794 if (BE (dfa->str_tree == NULL, 0))
795 goto re_compile_internal_free_return;
797 /* Analyze the tree and create the nfa. */
798 err = analyze (preg);
799 if (BE (err != REG_NOERROR, 0))
800 goto re_compile_internal_free_return;
802 #ifdef RE_ENABLE_I18N
803 /* If possible, do searching in single byte encoding to speed things up. */
804 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
805 optimize_utf8 (dfa);
806 #endif
808 /* Then create the initial state of the dfa. */
809 err = create_initial_state (dfa);
811 /* Release work areas. */
812 free_workarea_compile (preg);
813 re_string_destruct (®exp);
815 if (BE (err != REG_NOERROR, 0))
816 {
817 free_dfa_content (dfa);
818 preg->buffer = NULL;
819 preg->allocated = 0;
820 }
822 return err;
823 }
825 /* Initialize DFA. We use the length of the regular expression PAT_LEN
826 as the initial length of some arrays. */
828 static reg_errcode_t
829 init_dfa (re_dfa_t *dfa, size_t pat_len)
830 {
831 __re_size_t table_size;
832 #ifdef RE_ENABLE_I18N
833 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
834 #else
835 size_t max_i18n_object_size = 0;
836 #endif
837 size_t max_object_size =
838 MAX (sizeof (struct re_state_table_entry),
839 MAX (sizeof (re_token_t),
840 MAX (sizeof (re_node_set),
841 MAX (sizeof (regmatch_t),
842 max_i18n_object_size))));
844 memset (dfa, '\0', sizeof (re_dfa_t));
846 /* Force allocation of str_tree_storage the first time. */
847 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
849 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
850 calculation below, and for similar doubling calculations
851 elsewhere. And it's <= rather than <, because some of the
852 doubling calculations add 1 afterwards. */
853 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
854 return REG_ESPACE;
856 dfa->nodes_alloc = pat_len + 1;
857 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
859 /* table_size = 2 ^ ceil(log pat_len) */
860 for (table_size = 1; ; table_size <<= 1)
861 if (table_size > pat_len)
862 break;
864 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
865 dfa->state_hash_mask = table_size - 1;
867 dfa->mb_cur_max = MB_CUR_MAX;
868 #ifdef _LIBC
869 if (dfa->mb_cur_max == 6
870 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
871 dfa->is_utf8 = 1;
872 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
873 != 0);
874 #else
875 if (strcmp (locale_charset (), "UTF-8") == 0)
876 dfa->is_utf8 = 1;
878 /* We check exhaustively in the loop below if this charset is a
879 superset of ASCII. */
880 dfa->map_notascii = 0;
881 #endif
883 #ifdef RE_ENABLE_I18N
884 if (dfa->mb_cur_max > 1)
885 {
886 if (dfa->is_utf8)
887 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
888 else
889 {
890 int i, j, ch;
892 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
893 if (BE (dfa->sb_char == NULL, 0))
894 return REG_ESPACE;
896 /* Set the bits corresponding to single byte chars. */
897 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
898 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
899 {
900 wint_t wch = __btowc (ch);
901 if (wch != WEOF)
902 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
903 # ifndef _LIBC
904 if (isascii (ch) && wch != ch)
905 dfa->map_notascii = 1;
906 # endif
907 }
908 }
909 }
910 #endif
912 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
913 return REG_ESPACE;
914 return REG_NOERROR;
915 }
917 /* Initialize WORD_CHAR table, which indicate which character is
918 "word". In this case "word" means that it is the word construction
919 character used by some operators like "\<", "\>", etc. */
921 static void
922 internal_function
923 init_word_char (re_dfa_t *dfa)
924 {
925 int i, j, ch;
926 dfa->word_ops_used = 1;
927 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
928 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
929 if (isalnum (ch) || ch == '_')
930 dfa->word_char[i] |= (bitset_word_t) 1 << j;
931 }
933 /* Free the work area which are only used while compiling. */
935 static void
936 free_workarea_compile (regex_t *preg)
937 {
938 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
939 bin_tree_storage_t *storage, *next;
940 for (storage = dfa->str_tree_storage; storage; storage = next)
941 {
942 next = storage->next;
943 re_free (storage);
944 }
945 dfa->str_tree_storage = NULL;
946 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
947 dfa->str_tree = NULL;
948 re_free (dfa->org_indices);
949 dfa->org_indices = NULL;
950 }
952 /* Create initial states for all contexts. */
954 static reg_errcode_t
955 create_initial_state (re_dfa_t *dfa)
956 {
957 Idx first, i;
958 reg_errcode_t err;
959 re_node_set init_nodes;
961 /* Initial states have the epsilon closure of the node which is
962 the first node of the regular expression. */
963 first = dfa->str_tree->first->node_idx;
964 dfa->init_node = first;
965 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
966 if (BE (err != REG_NOERROR, 0))
967 return err;
969 /* The back-references which are in initial states can epsilon transit,
970 since in this case all of the subexpressions can be null.
971 Then we add epsilon closures of the nodes which are the next nodes of
972 the back-references. */
973 if (dfa->nbackref > 0)
974 for (i = 0; i < init_nodes.nelem; ++i)
975 {
976 Idx node_idx = init_nodes.elems[i];
977 re_token_type_t type = dfa->nodes[node_idx].type;
979 Idx clexp_idx;
980 if (type != OP_BACK_REF)
981 continue;
982 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
983 {
984 re_token_t *clexp_node;
985 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
986 if (clexp_node->type == OP_CLOSE_SUBEXP
987 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
988 break;
989 }
990 if (clexp_idx == init_nodes.nelem)
991 continue;
993 if (type == OP_BACK_REF)
994 {
995 Idx dest_idx = dfa->edests[node_idx].elems[0];
996 if (!re_node_set_contains (&init_nodes, dest_idx))
997 {
998 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
999 i = 0;
1000 }
1001 }
1002 }
1004 /* It must be the first time to invoke acquire_state. */
1005 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1006 /* We don't check ERR here, since the initial state must not be NULL. */
1007 if (BE (dfa->init_state == NULL, 0))
1008 return err;
1009 if (dfa->init_state->has_constraint)
1010 {
1011 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1012 CONTEXT_WORD);
1013 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1014 CONTEXT_NEWLINE);
1015 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1016 &init_nodes,
1017 CONTEXT_NEWLINE
1018 | CONTEXT_BEGBUF);
1019 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1020 || dfa->init_state_begbuf == NULL, 0))
1021 return err;
1022 }
1023 else
1024 dfa->init_state_word = dfa->init_state_nl
1025 = dfa->init_state_begbuf = dfa->init_state;
1027 re_node_set_free (&init_nodes);
1028 return REG_NOERROR;
1029 }
1030 \f
1031 #ifdef RE_ENABLE_I18N
1032 /* If it is possible to do searching in single byte encoding instead of UTF-8
1033 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1034 DFA nodes where needed. */
1036 static void
1037 optimize_utf8 (re_dfa_t *dfa)
1038 {
1039 Idx node;
1040 int i;
1041 bool mb_chars = false;
1042 bool has_period = false;
1044 for (node = 0; node < dfa->nodes_len; ++node)
1045 switch (dfa->nodes[node].type)
1046 {
1047 case CHARACTER:
1048 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1049 mb_chars = true;
1050 break;
1051 case ANCHOR:
1052 switch (dfa->nodes[node].opr.ctx_type)
1053 {
1054 case LINE_FIRST:
1055 case LINE_LAST:
1056 case BUF_FIRST:
1057 case BUF_LAST:
1058 break;
1059 default:
1060 /* Word anchors etc. cannot be handled. */
1061 return;
1062 }
1063 break;
1064 case OP_PERIOD:
1065 has_period = true;
1066 break;
1067 case OP_BACK_REF:
1068 case OP_ALT:
1069 case END_OF_RE:
1070 case OP_DUP_ASTERISK:
1071 case OP_OPEN_SUBEXP:
1072 case OP_CLOSE_SUBEXP:
1073 break;
1074 case COMPLEX_BRACKET:
1075 return;
1076 case SIMPLE_BRACKET:
1077 /* Just double check. */
1078 {
1079 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1080 ? 0
1081 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1082 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1083 {
1084 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1085 return;
1086 rshift = 0;
1087 }
1088 }
1089 break;
1090 default:
1091 abort ();
1092 }
1094 if (mb_chars || has_period)
1095 for (node = 0; node < dfa->nodes_len; ++node)
1096 {
1097 if (dfa->nodes[node].type == CHARACTER
1098 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1099 dfa->nodes[node].mb_partial = 0;
1100 else if (dfa->nodes[node].type == OP_PERIOD)
1101 dfa->nodes[node].type = OP_UTF8_PERIOD;
1102 }
1104 /* The search can be in single byte locale. */
1105 dfa->mb_cur_max = 1;
1106 dfa->is_utf8 = 0;
1107 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1108 }
1109 #endif
1110 \f
1111 /* Analyze the structure tree, and calculate "first", "next", "edest",
1112 "eclosure", and "inveclosure". */
1114 static reg_errcode_t
1115 analyze (regex_t *preg)
1116 {
1117 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1118 reg_errcode_t ret;
1120 /* Allocate arrays. */
1121 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1122 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1123 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1124 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1125 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1126 || dfa->eclosures == NULL, 0))
1127 return REG_ESPACE;
1129 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1130 if (dfa->subexp_map != NULL)
1131 {
1132 Idx i;
1133 for (i = 0; i < preg->re_nsub; i++)
1134 dfa->subexp_map[i] = i;
1135 preorder (dfa->str_tree, optimize_subexps, dfa);
1136 for (i = 0; i < preg->re_nsub; i++)
1137 if (dfa->subexp_map[i] != i)
1138 break;
1139 if (i == preg->re_nsub)
1140 {
1141 free (dfa->subexp_map);
1142 dfa->subexp_map = NULL;
1143 }
1144 }
1146 ret = postorder (dfa->str_tree, lower_subexps, preg);
1147 if (BE (ret != REG_NOERROR, 0))
1148 return ret;
1149 ret = postorder (dfa->str_tree, calc_first, dfa);
1150 if (BE (ret != REG_NOERROR, 0))
1151 return ret;
1152 preorder (dfa->str_tree, calc_next, dfa);
1153 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1154 if (BE (ret != REG_NOERROR, 0))
1155 return ret;
1156 ret = calc_eclosure (dfa);
1157 if (BE (ret != REG_NOERROR, 0))
1158 return ret;
1160 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1161 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1162 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1163 || dfa->nbackref)
1164 {
1165 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1166 if (BE (dfa->inveclosures == NULL, 0))
1167 return REG_ESPACE;
1168 ret = calc_inveclosure (dfa);
1169 }
1171 return ret;
1172 }
1174 /* Our parse trees are very unbalanced, so we cannot use a stack to
1175 implement parse tree visits. Instead, we use parent pointers and
1176 some hairy code in these two functions. */
1177 static reg_errcode_t
1178 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1179 void *extra)
1180 {
1181 bin_tree_t *node, *prev;
1183 for (node = root; ; )
1184 {
1185 /* Descend down the tree, preferably to the left (or to the right
1186 if that's the only child). */
1187 while (node->left || node->right)
1188 if (node->left)
1189 node = node->left;
1190 else
1191 node = node->right;
1193 do
1194 {
1195 reg_errcode_t err = fn (extra, node);
1196 if (BE (err != REG_NOERROR, 0))
1197 return err;
1198 if (node->parent == NULL)
1199 return REG_NOERROR;
1200 prev = node;
1201 node = node->parent;
1202 }
1203 /* Go up while we have a node that is reached from the right. */
1204 while (node->right == prev || node->right == NULL);
1205 node = node->right;
1206 }
1207 }
1209 static reg_errcode_t
1210 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1211 void *extra)
1212 {
1213 bin_tree_t *node;
1215 for (node = root; ; )
1216 {
1217 reg_errcode_t err = fn (extra, node);
1218 if (BE (err != REG_NOERROR, 0))
1219 return err;
1221 /* Go to the left node, or up and to the right. */
1222 if (node->left)
1223 node = node->left;
1224 else
1225 {
1226 bin_tree_t *prev = NULL;
1227 while (node->right == prev || node->right == NULL)
1228 {
1229 prev = node;
1230 node = node->parent;
1231 if (!node)
1232 return REG_NOERROR;
1233 }
1234 node = node->right;
1235 }
1236 }
1237 }
1239 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1240 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1241 backreferences as well. Requires a preorder visit. */
1242 static reg_errcode_t
1243 optimize_subexps (void *extra, bin_tree_t *node)
1244 {
1245 re_dfa_t *dfa = (re_dfa_t *) extra;
1247 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1248 {
1249 int idx = node->token.opr.idx;
1250 node->token.opr.idx = dfa->subexp_map[idx];
1251 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1252 }
1254 else if (node->token.type == SUBEXP
1255 && node->left && node->left->token.type == SUBEXP)
1256 {
1257 Idx other_idx = node->left->token.opr.idx;
1259 node->left = node->left->left;
1260 if (node->left)
1261 node->left->parent = node;
1263 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1264 if (other_idx < BITSET_WORD_BITS)
1265 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1266 }
1268 return REG_NOERROR;
1269 }
1271 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1272 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1273 static reg_errcode_t
1274 lower_subexps (void *extra, bin_tree_t *node)
1275 {
1276 regex_t *preg = (regex_t *) extra;
1277 reg_errcode_t err = REG_NOERROR;
1279 if (node->left && node->left->token.type == SUBEXP)
1280 {
1281 node->left = lower_subexp (&err, preg, node->left);
1282 if (node->left)
1283 node->left->parent = node;
1284 }
1285 if (node->right && node->right->token.type == SUBEXP)
1286 {
1287 node->right = lower_subexp (&err, preg, node->right);
1288 if (node->right)
1289 node->right->parent = node;
1290 }
1292 return err;
1293 }
1295 static bin_tree_t *
1296 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1297 {
1298 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1299 bin_tree_t *body = node->left;
1300 bin_tree_t *op, *cls, *tree1, *tree;
1302 if (preg->no_sub
1303 /* We do not optimize empty subexpressions, because otherwise we may
1304 have bad CONCAT nodes with NULL children. This is obviously not
1305 very common, so we do not lose much. An example that triggers
1306 this case is the sed "script" /\(\)/x. */
1307 && node->left != NULL
1308 && (node->token.opr.idx >= BITSET_WORD_BITS
1309 || !(dfa->used_bkref_map
1310 & ((bitset_word_t) 1 << node->token.opr.idx))))
1311 return node->left;
1313 /* Convert the SUBEXP node to the concatenation of an
1314 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1315 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1316 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1317 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1318 tree = create_tree (dfa, op, tree1, CONCAT);
1319 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1320 {
1321 *err = REG_ESPACE;
1322 return NULL;
1323 }
1325 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1326 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1327 return tree;
1328 }
1330 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1331 nodes. Requires a postorder visit. */
1332 static reg_errcode_t
1333 calc_first (void *extra, bin_tree_t *node)
1334 {
1335 re_dfa_t *dfa = (re_dfa_t *) extra;
1336 if (node->token.type == CONCAT)
1337 {
1338 node->first = node->left->first;
1339 node->node_idx = node->left->node_idx;
1340 }
1341 else
1342 {
1343 node->first = node;
1344 node->node_idx = re_dfa_add_node (dfa, node->token);
1345 if (BE (node->node_idx == REG_MISSING, 0))
1346 return REG_ESPACE;
1347 }
1348 return REG_NOERROR;
1349 }
1351 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1352 static reg_errcode_t
1353 calc_next (void *extra, bin_tree_t *node)
1354 {
1355 switch (node->token.type)
1356 {
1357 case OP_DUP_ASTERISK:
1358 node->left->next = node;
1359 break;
1360 case CONCAT:
1361 node->left->next = node->right->first;
1362 node->right->next = node->next;
1363 break;
1364 default:
1365 if (node->left)
1366 node->left->next = node->next;
1367 if (node->right)
1368 node->right->next = node->next;
1369 break;
1370 }
1371 return REG_NOERROR;
1372 }
1374 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1375 static reg_errcode_t
1376 link_nfa_nodes (void *extra, bin_tree_t *node)
1377 {
1378 re_dfa_t *dfa = (re_dfa_t *) extra;
1379 Idx idx = node->node_idx;
1380 reg_errcode_t err = REG_NOERROR;
1382 switch (node->token.type)
1383 {
1384 case CONCAT:
1385 break;
1387 case END_OF_RE:
1388 assert (node->next == NULL);
1389 break;
1391 case OP_DUP_ASTERISK:
1392 case OP_ALT:
1393 {
1394 Idx left, right;
1395 dfa->has_plural_match = 1;
1396 if (node->left != NULL)
1397 left = node->left->first->node_idx;
1398 else
1399 left = node->next->node_idx;
1400 if (node->right != NULL)
1401 right = node->right->first->node_idx;
1402 else
1403 right = node->next->node_idx;
1404 assert (REG_VALID_INDEX (left));
1405 assert (REG_VALID_INDEX (right));
1406 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1407 }
1408 break;
1410 case ANCHOR:
1411 case OP_OPEN_SUBEXP:
1412 case OP_CLOSE_SUBEXP:
1413 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1414 break;
1416 case OP_BACK_REF:
1417 dfa->nexts[idx] = node->next->node_idx;
1418 if (node->token.type == OP_BACK_REF)
1419 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1420 break;
1422 default:
1423 assert (!IS_EPSILON_NODE (node->token.type));
1424 dfa->nexts[idx] = node->next->node_idx;
1425 break;
1426 }
1428 return err;
1429 }
1431 /* Duplicate the epsilon closure of the node ROOT_NODE.
1432 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1433 to their own constraint. */
1435 static reg_errcode_t
1436 internal_function
1437 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1438 Idx root_node, unsigned int init_constraint)
1439 {
1440 Idx org_node, clone_node;
1441 bool ok;
1442 unsigned int constraint = init_constraint;
1443 for (org_node = top_org_node, clone_node = top_clone_node;;)
1444 {
1445 Idx org_dest, clone_dest;
1446 if (dfa->nodes[org_node].type == OP_BACK_REF)
1447 {
1448 /* If the back reference epsilon-transit, its destination must
1449 also have the constraint. Then duplicate the epsilon closure
1450 of the destination of the back reference, and store it in
1451 edests of the back reference. */
1452 org_dest = dfa->nexts[org_node];
1453 re_node_set_empty (dfa->edests + clone_node);
1454 clone_dest = duplicate_node (dfa, org_dest, constraint);
1455 if (BE (clone_dest == REG_MISSING, 0))
1456 return REG_ESPACE;
1457 dfa->nexts[clone_node] = dfa->nexts[org_node];
1458 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1459 if (BE (! ok, 0))
1460 return REG_ESPACE;
1461 }
1462 else if (dfa->edests[org_node].nelem == 0)
1463 {
1464 /* In case of the node can't epsilon-transit, don't duplicate the
1465 destination and store the original destination as the
1466 destination of the node. */
1467 dfa->nexts[clone_node] = dfa->nexts[org_node];
1468 break;
1469 }
1470 else if (dfa->edests[org_node].nelem == 1)
1471 {
1472 /* In case of the node can epsilon-transit, and it has only one
1473 destination. */
1474 org_dest = dfa->edests[org_node].elems[0];
1475 re_node_set_empty (dfa->edests + clone_node);
1476 if (dfa->nodes[org_node].type == ANCHOR)
1477 {
1478 /* In case of the node has another constraint, append it. */
1479 if (org_node == root_node && clone_node != org_node)
1480 {
1481 /* ...but if the node is root_node itself, it means the
1482 epsilon closure have a loop, then tie it to the
1483 destination of the root_node. */
1484 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1485 if (BE (! ok, 0))
1486 return REG_ESPACE;
1487 break;
1488 }
1489 constraint |= dfa->nodes[org_node].opr.ctx_type;
1490 }
1491 clone_dest = duplicate_node (dfa, org_dest, constraint);
1492 if (BE (clone_dest == REG_MISSING, 0))
1493 return REG_ESPACE;
1494 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1495 if (BE (! ok, 0))
1496 return REG_ESPACE;
1497 }
1498 else /* dfa->edests[org_node].nelem == 2 */
1499 {
1500 /* In case of the node can epsilon-transit, and it has two
1501 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1502 org_dest = dfa->edests[org_node].elems[0];
1503 re_node_set_empty (dfa->edests + clone_node);
1504 /* Search for a duplicated node which satisfies the constraint. */
1505 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1506 if (clone_dest == REG_MISSING)
1507 {
1508 /* There are no such a duplicated node, create a new one. */
1509 reg_errcode_t err;
1510 clone_dest = duplicate_node (dfa, org_dest, constraint);
1511 if (BE (clone_dest == REG_MISSING, 0))
1512 return REG_ESPACE;
1513 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1514 if (BE (! ok, 0))
1515 return REG_ESPACE;
1516 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1517 root_node, constraint);
1518 if (BE (err != REG_NOERROR, 0))
1519 return err;
1520 }
1521 else
1522 {
1523 /* There are a duplicated node which satisfy the constraint,
1524 use it to avoid infinite loop. */
1525 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1526 if (BE (! ok, 0))
1527 return REG_ESPACE;
1528 }
1530 org_dest = dfa->edests[org_node].elems[1];
1531 clone_dest = duplicate_node (dfa, org_dest, constraint);
1532 if (BE (clone_dest == REG_MISSING, 0))
1533 return REG_ESPACE;
1534 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1535 if (BE (! ok, 0))
1536 return REG_ESPACE;
1537 }
1538 org_node = org_dest;
1539 clone_node = clone_dest;
1540 }
1541 return REG_NOERROR;
1542 }
1544 /* Search for a node which is duplicated from the node ORG_NODE, and
1545 satisfies the constraint CONSTRAINT. */
1547 static Idx
1548 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1549 unsigned int constraint)
1550 {
1551 Idx idx;
1552 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1553 {
1554 if (org_node == dfa->org_indices[idx]
1555 && constraint == dfa->nodes[idx].constraint)
1556 return idx; /* Found. */
1557 }
1558 return REG_MISSING; /* Not found. */
1559 }
1561 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1562 Return the index of the new node, or REG_MISSING if insufficient storage is
1563 available. */
1565 static Idx
1566 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1567 {
1568 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1569 if (BE (dup_idx != REG_MISSING, 1))
1570 {
1571 dfa->nodes[dup_idx].constraint = constraint;
1572 if (dfa->nodes[org_idx].type == ANCHOR)
1573 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1574 dfa->nodes[dup_idx].duplicated = 1;
1576 /* Store the index of the original node. */
1577 dfa->org_indices[dup_idx] = org_idx;
1578 }
1579 return dup_idx;
1580 }
1582 static reg_errcode_t
1583 calc_inveclosure (re_dfa_t *dfa)
1584 {
1585 Idx src, idx;
1586 bool ok;
1587 for (idx = 0; idx < dfa->nodes_len; ++idx)
1588 re_node_set_init_empty (dfa->inveclosures + idx);
1590 for (src = 0; src < dfa->nodes_len; ++src)
1591 {
1592 Idx *elems = dfa->eclosures[src].elems;
1593 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1594 {
1595 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1596 if (BE (! ok, 0))
1597 return REG_ESPACE;
1598 }
1599 }
1601 return REG_NOERROR;
1602 }
1604 /* Calculate "eclosure" for all the node in DFA. */
1606 static reg_errcode_t
1607 calc_eclosure (re_dfa_t *dfa)
1608 {
1609 Idx node_idx;
1610 bool incomplete;
1611 #ifdef DEBUG
1612 assert (dfa->nodes_len > 0);
1613 #endif
1614 incomplete = false;
1615 /* For each nodes, calculate epsilon closure. */
1616 for (node_idx = 0; ; ++node_idx)
1617 {
1618 reg_errcode_t err;
1619 re_node_set eclosure_elem;
1620 if (node_idx == dfa->nodes_len)
1621 {
1622 if (!incomplete)
1623 break;
1624 incomplete = false;
1625 node_idx = 0;
1626 }
1628 #ifdef DEBUG
1629 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1630 #endif
1632 /* If we have already calculated, skip it. */
1633 if (dfa->eclosures[node_idx].nelem != 0)
1634 continue;
1635 /* Calculate epsilon closure of `node_idx'. */
1636 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1637 if (BE (err != REG_NOERROR, 0))
1638 return err;
1640 if (dfa->eclosures[node_idx].nelem == 0)
1641 {
1642 incomplete = true;
1643 re_node_set_free (&eclosure_elem);
1644 }
1645 }
1646 return REG_NOERROR;
1647 }
1649 /* Calculate epsilon closure of NODE. */
1651 static reg_errcode_t
1652 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1653 {
1654 reg_errcode_t err;
1655 unsigned int constraint;
1656 Idx i;
1657 bool incomplete;
1658 bool ok;
1659 re_node_set eclosure;
1660 incomplete = false;
1661 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1662 if (BE (err != REG_NOERROR, 0))
1663 return err;
1665 /* This indicates that we are calculating this node now.
1666 We reference this value to avoid infinite loop. */
1667 dfa->eclosures[node].nelem = REG_MISSING;
1669 constraint = ((dfa->nodes[node].type == ANCHOR)
1670 ? dfa->nodes[node].opr.ctx_type : 0);
1671 /* If the current node has constraints, duplicate all nodes.
1672 Since they must inherit the constraints. */
1673 if (constraint
1674 && dfa->edests[node].nelem
1675 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1676 {
1677 err = duplicate_node_closure (dfa, node, node, node, constraint);
1678 if (BE (err != REG_NOERROR, 0))
1679 return err;
1680 }
1682 /* Expand each epsilon destination nodes. */
1683 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1684 for (i = 0; i < dfa->edests[node].nelem; ++i)
1685 {
1686 re_node_set eclosure_elem;
1687 Idx edest = dfa->edests[node].elems[i];
1688 /* If calculating the epsilon closure of `edest' is in progress,
1689 return intermediate result. */
1690 if (dfa->eclosures[edest].nelem == REG_MISSING)
1691 {
1692 incomplete = true;
1693 continue;
1694 }
1695 /* If we haven't calculated the epsilon closure of `edest' yet,
1696 calculate now. Otherwise use calculated epsilon closure. */
1697 if (dfa->eclosures[edest].nelem == 0)
1698 {
1699 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1700 if (BE (err != REG_NOERROR, 0))
1701 return err;
1702 }
1703 else
1704 eclosure_elem = dfa->eclosures[edest];
1705 /* Merge the epsilon closure of `edest'. */
1706 re_node_set_merge (&eclosure, &eclosure_elem);
1707 /* If the epsilon closure of `edest' is incomplete,
1708 the epsilon closure of this node is also incomplete. */
1709 if (dfa->eclosures[edest].nelem == 0)
1710 {
1711 incomplete = true;
1712 re_node_set_free (&eclosure_elem);
1713 }
1714 }
1716 /* Epsilon closures include itself. */
1717 ok = re_node_set_insert (&eclosure, node);
1718 if (BE (! ok, 0))
1719 return REG_ESPACE;
1720 if (incomplete && !root)
1721 dfa->eclosures[node].nelem = 0;
1722 else
1723 dfa->eclosures[node] = eclosure;
1724 *new_set = eclosure;
1725 return REG_NOERROR;
1726 }
1727 \f
1728 /* Functions for token which are used in the parser. */
1730 /* Fetch a token from INPUT.
1731 We must not use this function inside bracket expressions. */
1733 static void
1734 internal_function
1735 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1736 {
1737 re_string_skip_bytes (input, peek_token (result, input, syntax));
1738 }
1740 /* Peek a token from INPUT, and return the length of the token.
1741 We must not use this function inside bracket expressions. */
1743 static int
1744 internal_function
1745 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1746 {
1747 unsigned char c;
1749 if (re_string_eoi (input))
1750 {
1751 token->type = END_OF_RE;
1752 return 0;
1753 }
1755 c = re_string_peek_byte (input, 0);
1756 token->opr.c = c;
1758 token->word_char = 0;
1759 #ifdef RE_ENABLE_I18N
1760 token->mb_partial = 0;
1761 if (input->mb_cur_max > 1 &&
1762 !re_string_first_byte (input, re_string_cur_idx (input)))
1763 {
1764 token->type = CHARACTER;
1765 token->mb_partial = 1;
1766 return 1;
1767 }
1768 #endif
1769 if (c == '\\')
1770 {
1771 unsigned char c2;
1772 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1773 {
1774 token->type = BACK_SLASH;
1775 return 1;
1776 }
1778 c2 = re_string_peek_byte_case (input, 1);
1779 token->opr.c = c2;
1780 token->type = CHARACTER;
1781 #ifdef RE_ENABLE_I18N
1782 if (input->mb_cur_max > 1)
1783 {
1784 wint_t wc = re_string_wchar_at (input,
1785 re_string_cur_idx (input) + 1);
1786 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1787 }
1788 else
1789 #endif
1790 token->word_char = IS_WORD_CHAR (c2) != 0;
1792 switch (c2)
1793 {
1794 case '|':
1795 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1796 token->type = OP_ALT;
1797 break;
1798 case '1': case '2': case '3': case '4': case '5':
1799 case '6': case '7': case '8': case '9':
1800 if (!(syntax & RE_NO_BK_REFS))
1801 {
1802 token->type = OP_BACK_REF;
1803 token->opr.idx = c2 - '1';
1804 }
1805 break;
1806 case '<':
1807 if (!(syntax & RE_NO_GNU_OPS))
1808 {
1809 token->type = ANCHOR;
1810 token->opr.ctx_type = WORD_FIRST;
1811 }
1812 break;
1813 case '>':
1814 if (!(syntax & RE_NO_GNU_OPS))
1815 {
1816 token->type = ANCHOR;
1817 token->opr.ctx_type = WORD_LAST;
1818 }
1819 break;
1820 case 'b':
1821 if (!(syntax & RE_NO_GNU_OPS))
1822 {
1823 token->type = ANCHOR;
1824 token->opr.ctx_type = WORD_DELIM;
1825 }
1826 break;
1827 case 'B':
1828 if (!(syntax & RE_NO_GNU_OPS))
1829 {
1830 token->type = ANCHOR;
1831 token->opr.ctx_type = NOT_WORD_DELIM;
1832 }
1833 break;
1834 case 'w':
1835 if (!(syntax & RE_NO_GNU_OPS))
1836 token->type = OP_WORD;
1837 break;
1838 case 'W':
1839 if (!(syntax & RE_NO_GNU_OPS))
1840 token->type = OP_NOTWORD;
1841 break;
1842 case 's':
1843 if (!(syntax & RE_NO_GNU_OPS))
1844 token->type = OP_SPACE;
1845 break;
1846 case 'S':
1847 if (!(syntax & RE_NO_GNU_OPS))
1848 token->type = OP_NOTSPACE;
1849 break;
1850 case '`':
1851 if (!(syntax & RE_NO_GNU_OPS))
1852 {
1853 token->type = ANCHOR;
1854 token->opr.ctx_type = BUF_FIRST;
1855 }
1856 break;
1857 case '\'':
1858 if (!(syntax & RE_NO_GNU_OPS))
1859 {
1860 token->type = ANCHOR;
1861 token->opr.ctx_type = BUF_LAST;
1862 }
1863 break;
1864 case '(':
1865 if (!(syntax & RE_NO_BK_PARENS))
1866 token->type = OP_OPEN_SUBEXP;
1867 break;
1868 case ')':
1869 if (!(syntax & RE_NO_BK_PARENS))
1870 token->type = OP_CLOSE_SUBEXP;
1871 break;
1872 case '+':
1873 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1874 token->type = OP_DUP_PLUS;
1875 break;
1876 case '?':
1877 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1878 token->type = OP_DUP_QUESTION;
1879 break;
1880 case '{':
1881 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1882 token->type = OP_OPEN_DUP_NUM;
1883 break;
1884 case '}':
1885 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1886 token->type = OP_CLOSE_DUP_NUM;
1887 break;
1888 default:
1889 break;
1890 }
1891 return 2;
1892 }
1894 token->type = CHARACTER;
1895 #ifdef RE_ENABLE_I18N
1896 if (input->mb_cur_max > 1)
1897 {
1898 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1899 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1900 }
1901 else
1902 #endif
1903 token->word_char = IS_WORD_CHAR (token->opr.c);
1905 switch (c)
1906 {
1907 case '\n':
1908 if (syntax & RE_NEWLINE_ALT)
1909 token->type = OP_ALT;
1910 break;
1911 case '|':
1912 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1913 token->type = OP_ALT;
1914 break;
1915 case '*':
1916 token->type = OP_DUP_ASTERISK;
1917 break;
1918 case '+':
1919 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1920 token->type = OP_DUP_PLUS;
1921 break;
1922 case '?':
1923 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1924 token->type = OP_DUP_QUESTION;
1925 break;
1926 case '{':
1927 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1928 token->type = OP_OPEN_DUP_NUM;
1929 break;
1930 case '}':
1931 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1932 token->type = OP_CLOSE_DUP_NUM;
1933 break;
1934 case '(':
1935 if (syntax & RE_NO_BK_PARENS)
1936 token->type = OP_OPEN_SUBEXP;
1937 break;
1938 case ')':
1939 if (syntax & RE_NO_BK_PARENS)
1940 token->type = OP_CLOSE_SUBEXP;
1941 break;
1942 case '[':
1943 token->type = OP_OPEN_BRACKET;
1944 break;
1945 case '.':
1946 token->type = OP_PERIOD;
1947 break;
1948 case '^':
1949 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1950 re_string_cur_idx (input) != 0)
1951 {
1952 char prev = re_string_peek_byte (input, -1);
1953 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1954 break;
1955 }
1956 token->type = ANCHOR;
1957 token->opr.ctx_type = LINE_FIRST;
1958 break;
1959 case '$':
1960 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1961 re_string_cur_idx (input) + 1 != re_string_length (input))
1962 {
1963 re_token_t next;
1964 re_string_skip_bytes (input, 1);
1965 peek_token (&next, input, syntax);
1966 re_string_skip_bytes (input, -1);
1967 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1968 break;
1969 }
1970 token->type = ANCHOR;
1971 token->opr.ctx_type = LINE_LAST;
1972 break;
1973 default:
1974 break;
1975 }
1976 return 1;
1977 }
1979 /* Peek a token from INPUT, and return the length of the token.
1980 We must not use this function out of bracket expressions. */
1982 static int
1983 internal_function
1984 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1985 {
1986 unsigned char c;
1987 if (re_string_eoi (input))
1988 {
1989 token->type = END_OF_RE;
1990 return 0;
1991 }
1992 c = re_string_peek_byte (input, 0);
1993 token->opr.c = c;
1995 #ifdef RE_ENABLE_I18N
1996 if (input->mb_cur_max > 1 &&
1997 !re_string_first_byte (input, re_string_cur_idx (input)))
1998 {
1999 token->type = CHARACTER;
2000 return 1;
2001 }
2002 #endif /* RE_ENABLE_I18N */
2004 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2005 && re_string_cur_idx (input) + 1 < re_string_length (input))
2006 {
2007 /* In this case, '\' escape a character. */
2008 unsigned char c2;
2009 re_string_skip_bytes (input, 1);
2010 c2 = re_string_peek_byte (input, 0);
2011 token->opr.c = c2;
2012 token->type = CHARACTER;
2013 return 1;
2014 }
2015 if (c == '[') /* '[' is a special char in a bracket exps. */
2016 {
2017 unsigned char c2;
2018 int token_len;
2019 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2020 c2 = re_string_peek_byte (input, 1);
2021 else
2022 c2 = 0;
2023 token->opr.c = c2;
2024 token_len = 2;
2025 switch (c2)
2026 {
2027 case '.':
2028 token->type = OP_OPEN_COLL_ELEM;
2029 break;
2030 case '=':
2031 token->type = OP_OPEN_EQUIV_CLASS;
2032 break;
2033 case ':':
2034 if (syntax & RE_CHAR_CLASSES)
2035 {
2036 token->type = OP_OPEN_CHAR_CLASS;
2037 break;
2038 }
2039 /* else fall through. */
2040 default:
2041 token->type = CHARACTER;
2042 token->opr.c = c;
2043 token_len = 1;
2044 break;
2045 }
2046 return token_len;
2047 }
2048 switch (c)
2049 {
2050 case '-':
2051 token->type = OP_CHARSET_RANGE;
2052 break;
2053 case ']':
2054 token->type = OP_CLOSE_BRACKET;
2055 break;
2056 case '^':
2057 token->type = OP_NON_MATCH_LIST;
2058 break;
2059 default:
2060 token->type = CHARACTER;
2061 }
2062 return 1;
2063 }
2064 \f
2065 /* Functions for parser. */
2067 /* Entry point of the parser.
2068 Parse the regular expression REGEXP and return the structure tree.
2069 If an error is occured, ERR is set by error code, and return NULL.
2070 This function build the following tree, from regular expression <reg_exp>:
2071 CAT
2072 / \
2073 / \
2074 <reg_exp> EOR
2076 CAT means concatenation.
2077 EOR means end of regular expression. */
2079 static bin_tree_t *
2080 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2081 reg_errcode_t *err)
2082 {
2083 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2084 bin_tree_t *tree, *eor, *root;
2085 re_token_t current_token;
2086 dfa->syntax = syntax;
2087 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2088 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2089 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2090 return NULL;
2091 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2092 if (tree != NULL)
2093 root = create_tree (dfa, tree, eor, CONCAT);
2094 else
2095 root = eor;
2096 if (BE (eor == NULL || root == NULL, 0))
2097 {
2098 *err = REG_ESPACE;
2099 return NULL;
2100 }
2101 return root;
2102 }
2104 /* This function build the following tree, from regular expression
2105 <branch1>|<branch2>:
2106 ALT
2107 / \
2108 / \
2109 <branch1> <branch2>
2111 ALT means alternative, which represents the operator `|'. */
2113 static bin_tree_t *
2114 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2115 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2116 {
2117 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2118 bin_tree_t *tree, *branch = NULL;
2119 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2120 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2121 return NULL;
2123 while (token->type == OP_ALT)
2124 {
2125 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2126 if (token->type != OP_ALT && token->type != END_OF_RE
2127 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2128 {
2129 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2130 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2131 return NULL;
2132 }
2133 else
2134 branch = NULL;
2135 tree = create_tree (dfa, tree, branch, OP_ALT);
2136 if (BE (tree == NULL, 0))
2137 {
2138 *err = REG_ESPACE;
2139 return NULL;
2140 }
2141 }
2142 return tree;
2143 }
2145 /* This function build the following tree, from regular expression
2146 <exp1><exp2>:
2147 CAT
2148 / \
2149 / \
2150 <exp1> <exp2>
2152 CAT means concatenation. */
2154 static bin_tree_t *
2155 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2156 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2157 {
2158 bin_tree_t *tree, *expr;
2159 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2160 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2161 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2162 return NULL;
2164 while (token->type != OP_ALT && token->type != END_OF_RE
2165 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2166 {
2167 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2168 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2169 {
2170 return NULL;
2171 }
2172 if (tree != NULL && expr != NULL)
2173 {
2174 tree = create_tree (dfa, tree, expr, CONCAT);
2175 if (tree == NULL)
2176 {
2177 *err = REG_ESPACE;
2178 return NULL;
2179 }
2180 }
2181 else if (tree == NULL)
2182 tree = expr;
2183 /* Otherwise expr == NULL, we don't need to create new tree. */
2184 }
2185 return tree;
2186 }
2188 /* This function build the following tree, from regular expression a*:
2189 *
2190 |
2191 a
2192 */
2194 static bin_tree_t *
2195 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2196 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2197 {
2198 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2199 bin_tree_t *tree;
2200 switch (token->type)
2201 {
2202 case CHARACTER:
2203 tree = create_token_tree (dfa, NULL, NULL, token);
2204 if (BE (tree == NULL, 0))
2205 {
2206 *err = REG_ESPACE;
2207 return NULL;
2208 }
2209 #ifdef RE_ENABLE_I18N
2210 if (dfa->mb_cur_max > 1)
2211 {
2212 while (!re_string_eoi (regexp)
2213 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2214 {
2215 bin_tree_t *mbc_remain;
2216 fetch_token (token, regexp, syntax);
2217 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2218 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2219 if (BE (mbc_remain == NULL || tree == NULL, 0))
2220 {
2221 *err = REG_ESPACE;
2222 return NULL;
2223 }
2224 }
2225 }
2226 #endif
2227 break;
2228 case OP_OPEN_SUBEXP:
2229 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2230 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2231 return NULL;
2232 break;
2233 case OP_OPEN_BRACKET:
2234 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2235 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2236 return NULL;
2237 break;
2238 case OP_BACK_REF:
2239 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2240 {
2241 *err = REG_ESUBREG;
2242 return NULL;
2243 }
2244 dfa->used_bkref_map |= 1 << token->opr.idx;
2245 tree = create_token_tree (dfa, NULL, NULL, token);
2246 if (BE (tree == NULL, 0))
2247 {
2248 *err = REG_ESPACE;
2249 return NULL;
2250 }
2251 ++dfa->nbackref;
2252 dfa->has_mb_node = 1;
2253 break;
2254 case OP_OPEN_DUP_NUM:
2255 if (syntax & RE_CONTEXT_INVALID_DUP)
2256 {
2257 *err = REG_BADRPT;
2258 return NULL;
2259 }
2260 /* FALLTHROUGH */
2261 case OP_DUP_ASTERISK:
2262 case OP_DUP_PLUS:
2263 case OP_DUP_QUESTION:
2264 if (syntax & RE_CONTEXT_INVALID_OPS)
2265 {
2266 *err = REG_BADRPT;
2267 return NULL;
2268 }
2269 else if (syntax & RE_CONTEXT_INDEP_OPS)
2270 {
2271 fetch_token (token, regexp, syntax);
2272 return parse_expression (regexp, preg, token, syntax, nest, err);
2273 }
2274 /* else fall through */
2275 case OP_CLOSE_SUBEXP:
2276 if ((token->type == OP_CLOSE_SUBEXP) &&
2277 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2278 {
2279 *err = REG_ERPAREN;
2280 return NULL;
2281 }
2282 /* else fall through */
2283 case OP_CLOSE_DUP_NUM:
2284 /* We treat it as a normal character. */
2286 /* Then we can these characters as normal characters. */
2287 token->type = CHARACTER;
2288 /* mb_partial and word_char bits should be initialized already
2289 by peek_token. */
2290 tree = create_token_tree (dfa, NULL, NULL, token);
2291 if (BE (tree == NULL, 0))
2292 {
2293 *err = REG_ESPACE;
2294 return NULL;
2295 }
2296 break;
2297 case ANCHOR:
2298 if ((token->opr.ctx_type
2299 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2300 && dfa->word_ops_used == 0)
2301 init_word_char (dfa);
2302 if (token->opr.ctx_type == WORD_DELIM
2303 || token->opr.ctx_type == NOT_WORD_DELIM)
2304 {
2305 bin_tree_t *tree_first, *tree_last;
2306 if (token->opr.ctx_type == WORD_DELIM)
2307 {
2308 token->opr.ctx_type = WORD_FIRST;
2309 tree_first = create_token_tree (dfa, NULL, NULL, token);
2310 token->opr.ctx_type = WORD_LAST;
2311 }
2312 else
2313 {
2314 token->opr.ctx_type = INSIDE_WORD;
2315 tree_first = create_token_tree (dfa, NULL, NULL, token);
2316 token->opr.ctx_type = INSIDE_NOTWORD;
2317 }
2318 tree_last = create_token_tree (dfa, NULL, NULL, token);
2319 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2320 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2321 {
2322 *err = REG_ESPACE;
2323 return NULL;
2324 }
2325 }
2326 else
2327 {
2328 tree = create_token_tree (dfa, NULL, NULL, token);
2329 if (BE (tree == NULL, 0))
2330 {
2331 *err = REG_ESPACE;
2332 return NULL;
2333 }
2334 }
2335 /* We must return here, since ANCHORs can't be followed
2336 by repetition operators.
2337 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2338 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2339 fetch_token (token, regexp, syntax);
2340 return tree;
2341 case OP_PERIOD:
2342 tree = create_token_tree (dfa, NULL, NULL, token);
2343 if (BE (tree == NULL, 0))
2344 {
2345 *err = REG_ESPACE;
2346 return NULL;
2347 }
2348 if (dfa->mb_cur_max > 1)
2349 dfa->has_mb_node = 1;
2350 break;
2351 case OP_WORD:
2352 case OP_NOTWORD:
2353 tree = build_charclass_op (dfa, regexp->trans,
2354 (const unsigned char *) "alnum",
2355 (const unsigned char *) "_",
2356 token->type == OP_NOTWORD, err);
2357 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2358 return NULL;
2359 break;
2360 case OP_SPACE:
2361 case OP_NOTSPACE:
2362 tree = build_charclass_op (dfa, regexp->trans,
2363 (const unsigned char *) "space",
2364 (const unsigned char *) "",
2365 token->type == OP_NOTSPACE, err);
2366 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2367 return NULL;
2368 break;
2369 case OP_ALT:
2370 case END_OF_RE:
2371 return NULL;
2372 case BACK_SLASH:
2373 *err = REG_EESCAPE;
2374 return NULL;
2375 default:
2376 /* Must not happen? */
2377 #ifdef DEBUG
2378 assert (0);
2379 #endif
2380 return NULL;
2381 }
2382 fetch_token (token, regexp, syntax);
2384 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2385 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2386 {
2387 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2388 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389 return NULL;
2390 /* In BRE consecutive duplications are not allowed. */
2391 if ((syntax & RE_CONTEXT_INVALID_DUP)
2392 && (token->type == OP_DUP_ASTERISK
2393 || token->type == OP_OPEN_DUP_NUM))
2394 {
2395 *err = REG_BADRPT;
2396 return NULL;
2397 }
2398 }
2400 return tree;
2401 }
2403 /* This function build the following tree, from regular expression
2404 (<reg_exp>):
2405 SUBEXP
2406 |
2407 <reg_exp>
2408 */
2410 static bin_tree_t *
2411 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2412 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2413 {
2414 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2415 bin_tree_t *tree;
2416 size_t cur_nsub;
2417 cur_nsub = preg->re_nsub++;
2419 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2421 /* The subexpression may be a null string. */
2422 if (token->type == OP_CLOSE_SUBEXP)
2423 tree = NULL;
2424 else
2425 {
2426 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2427 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2428 *err = REG_EPAREN;
2429 if (BE (*err != REG_NOERROR, 0))
2430 return NULL;
2431 }
2433 if (cur_nsub <= '9' - '1')
2434 dfa->completed_bkref_map |= 1 << cur_nsub;
2436 tree = create_tree (dfa, tree, NULL, SUBEXP);
2437 if (BE (tree == NULL, 0))
2438 {
2439 *err = REG_ESPACE;
2440 return NULL;
2441 }
2442 tree->token.opr.idx = cur_nsub;
2443 return tree;
2444 }
2446 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2448 static bin_tree_t *
2449 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2450 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2451 {
2452 bin_tree_t *tree = NULL, *old_tree = NULL;
2453 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2454 re_token_t start_token = *token;
2456 if (token->type == OP_OPEN_DUP_NUM)
2457 {
2458 end = 0;
2459 start = fetch_number (regexp, token, syntax);
2460 if (start == REG_MISSING)
2461 {
2462 if (token->type == CHARACTER && token->opr.c == ',')
2463 start = 0; /* We treat "{,m}" as "{0,m}". */
2464 else
2465 {
2466 *err = REG_BADBR; /* <re>{} is invalid. */
2467 return NULL;
2468 }
2469 }
2470 if (BE (start != REG_ERROR, 1))
2471 {
2472 /* We treat "{n}" as "{n,n}". */
2473 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2474 : ((token->type == CHARACTER && token->opr.c == ',')
2475 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2476 }
2477 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2478 {
2479 /* Invalid sequence. */
2480 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2481 {
2482 if (token->type == END_OF_RE)
2483 *err = REG_EBRACE;
2484 else
2485 *err = REG_BADBR;
2487 return NULL;
2488 }
2490 /* If the syntax bit is set, rollback. */
2491 re_string_set_index (regexp, start_idx);
2492 *token = start_token;
2493 token->type = CHARACTER;
2494 /* mb_partial and word_char bits should be already initialized by
2495 peek_token. */
2496 return elem;
2497 }
2499 if (BE (end != REG_MISSING && start > end, 0))
2500 {
2501 /* First number greater than second. */
2502 *err = REG_BADBR;
2503 return NULL;
2504 }
2505 }
2506 else
2507 {
2508 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2509 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2510 }
2512 fetch_token (token, regexp, syntax);
2514 if (BE (elem == NULL, 0))
2515 return NULL;
2516 if (BE (start == 0 && end == 0, 0))
2517 {
2518 postorder (elem, free_tree, NULL);
2519 return NULL;
2520 }
2522 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2523 if (BE (start > 0, 0))
2524 {
2525 tree = elem;
2526 for (i = 2; i <= start; ++i)
2527 {
2528 elem = duplicate_tree (elem, dfa);
2529 tree = create_tree (dfa, tree, elem, CONCAT);
2530 if (BE (elem == NULL || tree == NULL, 0))
2531 goto parse_dup_op_espace;
2532 }
2534 if (start == end)
2535 return tree;
2537 /* Duplicate ELEM before it is marked optional. */
2538 elem = duplicate_tree (elem, dfa);
2539 old_tree = tree;
2540 }
2541 else
2542 old_tree = NULL;
2544 if (elem->token.type == SUBEXP)
2545 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2547 tree = create_tree (dfa, elem, NULL,
2548 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2549 if (BE (tree == NULL, 0))
2550 goto parse_dup_op_espace;
2552 /* This loop is actually executed only when end != REG_MISSING,
2553 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2554 already created the start+1-th copy. */
2555 if ((Idx) -1 < 0 || end != REG_MISSING)
2556 for (i = start + 2; i <= end; ++i)
2557 {
2558 elem = duplicate_tree (elem, dfa);
2559 tree = create_tree (dfa, tree, elem, CONCAT);
2560 if (BE (elem == NULL || tree == NULL, 0))
2561 goto parse_dup_op_espace;
2563 tree = create_tree (dfa, tree, NULL, OP_ALT);
2564 if (BE (tree == NULL, 0))
2565 goto parse_dup_op_espace;
2566 }
2568 if (old_tree)
2569 tree = create_tree (dfa, old_tree, tree, CONCAT);
2571 return tree;
2573 parse_dup_op_espace:
2574 *err = REG_ESPACE;
2575 return NULL;
2576 }
2578 /* Size of the names for collating symbol/equivalence_class/character_class.
2579 I'm not sure, but maybe enough. */
2580 #define BRACKET_NAME_BUF_SIZE 32
2582 #ifndef _LIBC
2583 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2584 Build the range expression which starts from START_ELEM, and ends
2585 at END_ELEM. The result are written to MBCSET and SBCSET.
2586 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2587 mbcset->range_ends, is a pointer argument sinse we may
2588 update it. */
2590 static reg_errcode_t
2591 internal_function
2592 # ifdef RE_ENABLE_I18N
2593 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2594 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2595 # else /* not RE_ENABLE_I18N */
2596 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2597 bracket_elem_t *end_elem)
2598 # endif /* not RE_ENABLE_I18N */
2599 {
2600 unsigned int start_ch, end_ch;
2601 /* Equivalence Classes and Character Classes can't be a range start/end. */
2602 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2603 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2604 0))
2605 return REG_ERANGE;
2607 /* We can handle no multi character collating elements without libc
2608 support. */
2609 if (BE ((start_elem->type == COLL_SYM
2610 && strlen ((char *) start_elem->opr.name) > 1)
2611 || (end_elem->type == COLL_SYM
2612 && strlen ((char *) end_elem->opr.name) > 1), 0))
2613 return REG_ECOLLATE;
2615 # ifdef RE_ENABLE_I18N
2616 {
2617 wchar_t wc;
2618 wint_t start_wc;
2619 wint_t end_wc;
2620 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2622 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2623 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2624 : 0));
2625 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2626 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2627 : 0));
2628 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2629 ? __btowc (start_ch) : start_elem->opr.wch);
2630 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2631 ? __btowc (end_ch) : end_elem->opr.wch);
2632 if (start_wc == WEOF || end_wc == WEOF)
2633 return REG_ECOLLATE;
2634 cmp_buf[0] = start_wc;
2635 cmp_buf[4] = end_wc;
2636 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2637 return REG_ERANGE;
2639 /* Got valid collation sequence values, add them as a new entry.
2640 However, for !_LIBC we have no collation elements: if the
2641 character set is single byte, the single byte character set
2642 that we build below suffices. parse_bracket_exp passes
2643 no MBCSET if dfa->mb_cur_max == 1. */
2644 if (mbcset)
2645 {
2646 /* Check the space of the arrays. */
2647 if (BE (*range_alloc == mbcset->nranges, 0))
2648 {
2649 /* There is not enough space, need realloc. */
2650 wchar_t *new_array_start, *new_array_end;
2651 Idx new_nranges;
2653 /* +1 in case of mbcset->nranges is 0. */
2654 new_nranges = 2 * mbcset->nranges + 1;
2655 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2656 are NULL if *range_alloc == 0. */
2657 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2658 new_nranges);
2659 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2660 new_nranges);
2662 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2663 return REG_ESPACE;
2665 mbcset->range_starts = new_array_start;
2666 mbcset->range_ends = new_array_end;
2667 *range_alloc = new_nranges;
2668 }
2670 mbcset->range_starts[mbcset->nranges] = start_wc;
2671 mbcset->range_ends[mbcset->nranges++] = end_wc;
2672 }
2674 /* Build the table for single byte characters. */
2675 for (wc = 0; wc < SBC_MAX; ++wc)
2676 {
2677 cmp_buf[2] = wc;
2678 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2679 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2680 bitset_set (sbcset, wc);
2681 }
2682 }
2683 # else /* not RE_ENABLE_I18N */
2684 {
2685 unsigned int ch;
2686 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2687 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2688 : 0));
2689 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2690 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2691 : 0));
2692 if (start_ch > end_ch)
2693 return REG_ERANGE;
2694 /* Build the table for single byte characters. */
2695 for (ch = 0; ch < SBC_MAX; ++ch)
2696 if (start_ch <= ch && ch <= end_ch)
2697 bitset_set (sbcset, ch);
2698 }
2699 # endif /* not RE_ENABLE_I18N */
2700 return REG_NOERROR;
2701 }
2702 #endif /* not _LIBC */
2704 #ifndef _LIBC
2705 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2706 Build the collating element which is represented by NAME.
2707 The result are written to MBCSET and SBCSET.
2708 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2709 pointer argument since we may update it. */
2711 static reg_errcode_t
2712 internal_function
2713 build_collating_symbol (bitset_t sbcset,
2714 # ifdef RE_ENABLE_I18N
2715 re_charset_t *mbcset, Idx *coll_sym_alloc,
2716 # endif
2717 const unsigned char *name)
2718 {
2719 size_t name_len = strlen ((const char *) name);
2720 if (BE (name_len != 1, 0))
2721 return REG_ECOLLATE;
2722 else
2723 {
2724 bitset_set (sbcset, name[0]);
2725 return REG_NOERROR;
2726 }
2727 }
2728 #endif /* not _LIBC */
2730 /* This function parse bracket expression like "[abc]", "[a-c]",
2731 "[[.a-a.]]" etc. */
2733 static bin_tree_t *
2734 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2735 reg_syntax_t syntax, reg_errcode_t *err)
2736 {
2737 #ifdef _LIBC
2738 const unsigned char *collseqmb;
2739 const char *collseqwc;
2740 uint32_t nrules;
2741 int32_t table_size;
2742 const int32_t *symb_table;
2743 const unsigned char *extra;
2745 /* Local function for parse_bracket_exp used in _LIBC environement.
2746 Seek the collating symbol entry correspondings to NAME.
2747 Return the index of the symbol in the SYMB_TABLE. */
2749 auto inline int32_t
2750 __attribute ((always_inline))
2751 seek_collating_symbol_entry (name, name_len)
2752 const unsigned char *name;
2753 size_t name_len;
2754 {
2755 int32_t hash = elem_hash ((const char *) name, name_len);
2756 int32_t elem = hash % table_size;
2757 if (symb_table[2 * elem] != 0)
2758 {
2759 int32_t second = hash % (table_size - 2) + 1;
2761 do
2762 {
2763 /* First compare the hashing value. */
2764 if (symb_table[2 * elem] == hash
2765 /* Compare the length of the name. */
2766 && name_len == extra[symb_table[2 * elem + 1]]
2767 /* Compare the name. */
2768 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2769 name_len) == 0)
2770 {
2771 /* Yep, this is the entry. */
2772 break;
2773 }
2775 /* Next entry. */
2776 elem += second;
2777 }
2778 while (symb_table[2 * elem] != 0);
2779 }
2780 return elem;
2781 }
2783 /* Local function for parse_bracket_exp used in _LIBC environement.
2784 Look up the collation sequence value of BR_ELEM.
2785 Return the value if succeeded, UINT_MAX otherwise. */
2787 auto inline unsigned int
2788 __attribute ((always_inline))
2789 lookup_collation_sequence_value (br_elem)
2790 bracket_elem_t *br_elem;
2791 {
2792 if (br_elem->type == SB_CHAR)
2793 {
2794 /*
2795 if (MB_CUR_MAX == 1)
2796 */
2797 if (nrules == 0)
2798 return collseqmb[br_elem->opr.ch];
2799 else
2800 {
2801 wint_t wc = __btowc (br_elem->opr.ch);
2802 return __collseq_table_lookup (collseqwc, wc);
2803 }
2804 }
2805 else if (br_elem->type == MB_CHAR)
2806 {
2807 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2808 }
2809 else if (br_elem->type == COLL_SYM)
2810 {
2811 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2812 if (nrules != 0)
2813 {
2814 int32_t elem, idx;
2815 elem = seek_collating_symbol_entry (br_elem->opr.name,
2816 sym_name_len);
2817 if (symb_table[2 * elem] != 0)
2818 {
2819 /* We found the entry. */
2820 idx = symb_table[2 * elem + 1];
2821 /* Skip the name of collating element name. */
2822 idx += 1 + extra[idx];
2823 /* Skip the byte sequence of the collating element. */
2824 idx += 1 + extra[idx];
2825 /* Adjust for the alignment. */
2826 idx = (idx + 3) & ~3;
2827 /* Skip the multibyte collation sequence value. */
2828 idx += sizeof (unsigned int);
2829 /* Skip the wide char sequence of the collating element. */
2830 idx += sizeof (unsigned int) *
2831 (1 + *(unsigned int *) (extra + idx));
2832 /* Return the collation sequence value. */
2833 return *(unsigned int *) (extra + idx);
2834 }
2835 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2836 {
2837 /* No valid character. Match it as a single byte
2838 character. */
2839 return collseqmb[br_elem->opr.name[0]];
2840 }
2841 }
2842 else if (sym_name_len == 1)
2843 return collseqmb[br_elem->opr.name[0]];
2844 }
2845 return UINT_MAX;
2846 }
2848 /* Local function for parse_bracket_exp used in _LIBC environement.
2849 Build the range expression which starts from START_ELEM, and ends
2850 at END_ELEM. The result are written to MBCSET and SBCSET.
2851 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2852 mbcset->range_ends, is a pointer argument sinse we may
2853 update it. */
2855 auto inline reg_errcode_t
2856 __attribute ((always_inline))
2857 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2858 re_charset_t *mbcset;
2859 Idx *range_alloc;
2860 bitset_t sbcset;
2861 bracket_elem_t *start_elem, *end_elem;
2862 {
2863 unsigned int ch;
2864 uint32_t start_collseq;
2865 uint32_t end_collseq;
2867 /* Equivalence Classes and Character Classes can't be a range
2868 start/end. */
2869 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2870 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2871 0))
2872 return REG_ERANGE;
2874 start_collseq = lookup_collation_sequence_value (start_elem);
2875 end_collseq = lookup_collation_sequence_value (end_elem);
2876 /* Check start/end collation sequence values. */
2877 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2878 return REG_ECOLLATE;
2879 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2880 return REG_ERANGE;
2882 /* Got valid collation sequence values, add them as a new entry.
2883 However, if we have no collation elements, and the character set
2884 is single byte, the single byte character set that we
2885 build below suffices. */
2886 if (nrules > 0 || dfa->mb_cur_max > 1)
2887 {
2888 /* Check the space of the arrays. */
2889 if (BE (*range_alloc == mbcset->nranges, 0))
2890 {
2891 /* There is not enough space, need realloc. */
2892 uint32_t *new_array_start;
2893 uint32_t *new_array_end;
2894 Idx new_nranges;
2896 /* +1 in case of mbcset->nranges is 0. */
2897 new_nranges = 2 * mbcset->nranges + 1;
2898 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2899 new_nranges);
2900 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2901 new_nranges);
2903 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2904 return REG_ESPACE;
2906 mbcset->range_starts = new_array_start;
2907 mbcset->range_ends = new_array_end;
2908 *range_alloc = new_nranges;
2909 }
2911 mbcset->range_starts[mbcset->nranges] = start_collseq;
2912 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2913 }
2915 /* Build the table for single byte characters. */
2916 for (ch = 0; ch < SBC_MAX; ch++)
2917 {
2918 uint32_t ch_collseq;
2919 /*
2920 if (MB_CUR_MAX == 1)
2921 */
2922 if (nrules == 0)
2923 ch_collseq = collseqmb[ch];
2924 else
2925 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2926 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2927 bitset_set (sbcset, ch);
2928 }
2929 return REG_NOERROR;
2930 }
2932 /* Local function for parse_bracket_exp used in _LIBC environement.
2933 Build the collating element which is represented by NAME.
2934 The result are written to MBCSET and SBCSET.
2935 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2936 pointer argument sinse we may update it. */
2938 auto inline reg_errcode_t
2939 __attribute ((always_inline))
2940 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2941 re_charset_t *mbcset;
2942 Idx *coll_sym_alloc;
2943 bitset_t sbcset;
2944 const unsigned char *name;
2945 {
2946 int32_t elem, idx;
2947 size_t name_len = strlen ((const char *) name);
2948 if (nrules != 0)
2949 {
2950 elem = seek_collating_symbol_entry (name, name_len);
2951 if (symb_table[2 * elem] != 0)
2952 {
2953 /* We found the entry. */
2954 idx = symb_table[2 * elem + 1];
2955 /* Skip the name of collating element name. */
2956 idx += 1 + extra[idx];
2957 }
2958 else if (symb_table[2 * elem] == 0 && name_len == 1)
2959 {
2960 /* No valid character, treat it as a normal
2961 character. */
2962 bitset_set (sbcset, name[0]);
2963 return REG_NOERROR;
2964 }
2965 else
2966 return REG_ECOLLATE;
2968 /* Got valid collation sequence, add it as a new entry. */
2969 /* Check the space of the arrays. */
2970 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2971 {
2972 /* Not enough, realloc it. */
2973 /* +1 in case of mbcset->ncoll_syms is 0. */
2974 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2975 /* Use realloc since mbcset->coll_syms is NULL
2976 if *alloc == 0. */
2977 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2978 new_coll_sym_alloc);
2979 if (BE (new_coll_syms == NULL, 0))
2980 return REG_ESPACE;
2981 mbcset->coll_syms = new_coll_syms;
2982 *coll_sym_alloc = new_coll_sym_alloc;
2983 }
2984 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2985 return REG_NOERROR;
2986 }
2987 else
2988 {
2989 if (BE (name_len != 1, 0))
2990 return REG_ECOLLATE;
2991 else
2992 {
2993 bitset_set (sbcset, name[0]);
2994 return REG_NOERROR;
2995 }
2996 }
2997 }
2998 #endif
3000 re_token_t br_token;
3001 re_bitset_ptr_t sbcset;
3002 #ifdef RE_ENABLE_I18N
3003 re_charset_t *mbcset;
3004 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3005 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3006 #endif /* not RE_ENABLE_I18N */
3007 bool non_match = false;
3008 bin_tree_t *work_tree;
3009 int token_len;
3010 bool first_round = true;
3011 #ifdef _LIBC
3012 collseqmb = (const unsigned char *)
3013 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3014 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3015 if (nrules)
3016 {
3017 /*
3018 if (MB_CUR_MAX > 1)
3019 */
3020 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3021 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3022 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3023 _NL_COLLATE_SYMB_TABLEMB);
3024 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3025 _NL_COLLATE_SYMB_EXTRAMB);
3026 }
3027 #endif
3028 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3029 #ifdef RE_ENABLE_I18N
3030 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3031 #endif /* RE_ENABLE_I18N */
3032 #ifdef RE_ENABLE_I18N
3033 if (BE (sbcset == NULL || mbcset == NULL, 0))
3034 #else
3035 if (BE (sbcset == NULL, 0))
3036 #endif /* RE_ENABLE_I18N */
3037 {
3038 *err = REG_ESPACE;
3039 return NULL;
3040 }
3042 token_len = peek_token_bracket (token, regexp, syntax);
3043 if (BE (token->type == END_OF_RE, 0))
3044 {
3045 *err = REG_BADPAT;
3046 goto parse_bracket_exp_free_return;
3047 }
3048 if (token->type == OP_NON_MATCH_LIST)
3049 {
3050 #ifdef RE_ENABLE_I18N
3051 mbcset->non_match = 1;
3052 #endif /* not RE_ENABLE_I18N */
3053 non_match = true;
3054 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3055 bitset_set (sbcset, '\n');
3056 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3057 token_len = peek_token_bracket (token, regexp, syntax);
3058 if (BE (token->type == END_OF_RE, 0))
3059 {
3060 *err = REG_BADPAT;
3061 goto parse_bracket_exp_free_return;
3062 }
3063 }
3065 /* We treat the first ']' as a normal character. */
3066 if (token->type == OP_CLOSE_BRACKET)
3067 token->type = CHARACTER;
3069 while (1)
3070 {
3071 bracket_elem_t start_elem, end_elem;
3072 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3073 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3074 reg_errcode_t ret;
3075 int token_len2 = 0;
3076 bool is_range_exp = false;
3077 re_token_t token2;
3079 start_elem.opr.name = start_name_buf;
3080 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3081 syntax, first_round);
3082 if (BE (ret != REG_NOERROR, 0))
3083 {
3084 *err = ret;
3085 goto parse_bracket_exp_free_return;
3086 }
3087 first_round = false;
3089 /* Get information about the next token. We need it in any case. */
3090 token_len = peek_token_bracket (token, regexp, syntax);
3092 /* Do not check for ranges if we know they are not allowed. */
3093 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3094 {
3095 if (BE (token->type == END_OF_RE, 0))
3096 {
3097 *err = REG_EBRACK;
3098 goto parse_bracket_exp_free_return;
3099 }
3100 if (token->type == OP_CHARSET_RANGE)
3101 {
3102 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3103 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3104 if (BE (token2.type == END_OF_RE, 0))
3105 {
3106 *err = REG_EBRACK;
3107 goto parse_bracket_exp_free_return;
3108 }
3109 if (token2.type == OP_CLOSE_BRACKET)
3110 {
3111 /* We treat the last '-' as a normal character. */
3112 re_string_skip_bytes (regexp, -token_len);
3113 token->type = CHARACTER;
3114 }
3115 else
3116 is_range_exp = true;
3117 }
3118 }
3120 if (is_range_exp == true)
3121 {
3122 end_elem.opr.name = end_name_buf;
3123 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3124 dfa, syntax, true);
3125 if (BE (ret != REG_NOERROR, 0))
3126 {
3127 *err = ret;
3128 goto parse_bracket_exp_free_return;
3129 }
3131 token_len = peek_token_bracket (token, regexp, syntax);
3133 #ifdef _LIBC
3134 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3135 &start_elem, &end_elem);
3136 #else
3137 # ifdef RE_ENABLE_I18N
3138 *err = build_range_exp (sbcset,
3139 dfa->mb_cur_max > 1 ? mbcset : NULL,
3140 &range_alloc, &start_elem, &end_elem);
3141 # else
3142 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3143 # endif
3144 #endif /* RE_ENABLE_I18N */
3145 if (BE (*err != REG_NOERROR, 0))
3146 goto parse_bracket_exp_free_return;
3147 }
3148 else
3149 {
3150 switch (start_elem.type)
3151 {
3152 case SB_CHAR:
3153 bitset_set (sbcset, start_elem.opr.ch);
3154 break;
3155 #ifdef RE_ENABLE_I18N
3156 case MB_CHAR:
3157 /* Check whether the array has enough space. */
3158 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3159 {
3160 wchar_t *new_mbchars;
3161 /* Not enough, realloc it. */
3162 /* +1 in case of mbcset->nmbchars is 0. */
3163 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3164 /* Use realloc since array is NULL if *alloc == 0. */
3165 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3166 mbchar_alloc);
3167 if (BE (new_mbchars == NULL, 0))
3168 goto parse_bracket_exp_espace;
3169 mbcset->mbchars = new_mbchars;
3170 }
3171 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3172 break;
3173 #endif /* RE_ENABLE_I18N */
3174 case EQUIV_CLASS:
3175 *err = build_equiv_class (sbcset,
3176 #ifdef RE_ENABLE_I18N
3177 mbcset, &equiv_class_alloc,
3178 #endif /* RE_ENABLE_I18N */
3179 start_elem.opr.name);
3180 if (BE (*err != REG_NOERROR, 0))
3181 goto parse_bracket_exp_free_return;
3182 break;
3183 case COLL_SYM:
3184 *err = build_collating_symbol (sbcset,
3185 #ifdef RE_ENABLE_I18N
3186 mbcset, &coll_sym_alloc,
3187 #endif /* RE_ENABLE_I18N */
3188 start_elem.opr.name);
3189 if (BE (*err != REG_NOERROR, 0))
3190 goto parse_bracket_exp_free_return;
3191 break;
3192 case CHAR_CLASS:
3193 *err = build_charclass (regexp->trans, sbcset,
3194 #ifdef RE_ENABLE_I18N
3195 mbcset, &char_class_alloc,
3196 #endif /* RE_ENABLE_I18N */
3197 start_elem.opr.name, syntax);
3198 if (BE (*err != REG_NOERROR, 0))
3199 goto parse_bracket_exp_free_return;
3200 break;
3201 default:
3202 assert (0);
3203 break;
3204 }
3205 }
3206 if (BE (token->type == END_OF_RE, 0))
3207 {
3208 *err = REG_EBRACK;
3209 goto parse_bracket_exp_free_return;
3210 }
3211 if (token->type == OP_CLOSE_BRACKET)
3212 break;
3213 }
3215 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3217 /* If it is non-matching list. */
3218 if (non_match)
3219 bitset_not (sbcset);
3221 #ifdef RE_ENABLE_I18N
3222 /* Ensure only single byte characters are set. */
3223 if (dfa->mb_cur_max > 1)
3224 bitset_mask (sbcset, dfa->sb_char);
3226 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3227 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3228 || mbcset->non_match)))
3229 {
3230 bin_tree_t *mbc_tree;
3231 int sbc_idx;
3232 /* Build a tree for complex bracket. */
3233 dfa->has_mb_node = 1;
3234 br_token.type = COMPLEX_BRACKET;
3235 br_token.opr.mbcset = mbcset;
3236 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3237 if (BE (mbc_tree == NULL, 0))
3238 goto parse_bracket_exp_espace;
3239 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3240 if (sbcset[sbc_idx])
3241 break;
3242 /* If there are no bits set in sbcset, there is no point
3243 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3244 if (sbc_idx < BITSET_WORDS)
3245 {
3246 /* Build a tree for simple bracket. */
3247 br_token.type = SIMPLE_BRACKET;
3248 br_token.opr.sbcset = sbcset;
3249 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3250 if (BE (work_tree == NULL, 0))
3251 goto parse_bracket_exp_espace;
3253 /* Then join them by ALT node. */
3254 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3255 if (BE (work_tree == NULL, 0))
3256 goto parse_bracket_exp_espace;
3257 }
3258 else
3259 {
3260 re_free (sbcset);
3261 work_tree = mbc_tree;
3262 }
3263 }
3264 else
3265 #endif /* not RE_ENABLE_I18N */
3266 {
3267 #ifdef RE_ENABLE_I18N
3268 free_charset (mbcset);
3269 #endif
3270 /* Build a tree for simple bracket. */
3271 br_token.type = SIMPLE_BRACKET;
3272 br_token.opr.sbcset = sbcset;
3273 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3274 if (BE (work_tree == NULL, 0))
3275 goto parse_bracket_exp_espace;
3276 }
3277 return work_tree;
3279 parse_bracket_exp_espace:
3280 *err = REG_ESPACE;
3281 parse_bracket_exp_free_return:
3282 re_free (sbcset);
3283 #ifdef RE_ENABLE_I18N
3284 free_charset (mbcset);
3285 #endif /* RE_ENABLE_I18N */
3286 return NULL;
3287 }
3289 /* Parse an element in the bracket expression. */
3291 static reg_errcode_t
3292 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3293 re_token_t *token, int token_len, re_dfa_t *dfa,
3294 reg_syntax_t syntax, bool accept_hyphen)
3295 {
3296 #ifdef RE_ENABLE_I18N
3297 int cur_char_size;
3298 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3299 if (cur_char_size > 1)
3300 {
3301 elem->type = MB_CHAR;
3302 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3303 re_string_skip_bytes (regexp, cur_char_size);
3304 return REG_NOERROR;
3305 }
3306 #endif /* RE_ENABLE_I18N */
3307 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3308 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3309 || token->type == OP_OPEN_EQUIV_CLASS)
3310 return parse_bracket_symbol (elem, regexp, token);
3311 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3312 {
3313 /* A '-' must only appear as anything but a range indicator before
3314 the closing bracket. Everything else is an error. */
3315 re_token_t token2;
3316 (void) peek_token_bracket (&token2, regexp, syntax);
3317 if (token2.type != OP_CLOSE_BRACKET)
3318 /* The actual error value is not standardized since this whole
3319 case is undefined. But ERANGE makes good sense. */
3320 return REG_ERANGE;
3321 }
3322 elem->type = SB_CHAR;
3323 elem->opr.ch = token->opr.c;
3324 return REG_NOERROR;
3325 }
3327 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3328 such as [:<character_class>:], [.<collating_element>.], and
3329 [=<equivalent_class>=]. */
3331 static reg_errcode_t
3332 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3333 re_token_t *token)
3334 {
3335 unsigned char ch, delim = token->opr.c;
3336 int i = 0;
3337 if (re_string_eoi(regexp))
3338 return REG_EBRACK;
3339 for (;; ++i)
3340 {
3341 if (i >= BRACKET_NAME_BUF_SIZE)
3342 return REG_EBRACK;
3343 if (token->type == OP_OPEN_CHAR_CLASS)
3344 ch = re_string_fetch_byte_case (regexp);
3345 else
3346 ch = re_string_fetch_byte (regexp);
3347 if (re_string_eoi(regexp))
3348 return REG_EBRACK;
3349 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3350 break;
3351 elem->opr.name[i] = ch;
3352 }
3353 re_string_skip_bytes (regexp, 1);
3354 elem->opr.name[i] = '\0';
3355 switch (token->type)
3356 {
3357 case OP_OPEN_COLL_ELEM:
3358 elem->type = COLL_SYM;
3359 break;
3360 case OP_OPEN_EQUIV_CLASS:
3361 elem->type = EQUIV_CLASS;
3362 break;
3363 case OP_OPEN_CHAR_CLASS:
3364 elem->type = CHAR_CLASS;
3365 break;
3366 default:
3367 break;
3368 }
3369 return REG_NOERROR;
3370 }
3372 /* Helper function for parse_bracket_exp.
3373 Build the equivalence class which is represented by NAME.
3374 The result are written to MBCSET and SBCSET.
3375 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3376 is a pointer argument sinse we may update it. */
3378 static reg_errcode_t
3379 #ifdef RE_ENABLE_I18N
3380 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3381 Idx *equiv_class_alloc, const unsigned char *name)
3382 #else /* not RE_ENABLE_I18N */
3383 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3384 #endif /* not RE_ENABLE_I18N */
3385 {
3386 #ifdef _LIBC
3387 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3388 if (nrules != 0)
3389 {
3390 const int32_t *table, *indirect;
3391 const unsigned char *weights, *extra, *cp;
3392 unsigned char char_buf[2];
3393 int32_t idx1, idx2;
3394 unsigned int ch;
3395 size_t len;
3396 /* This #include defines a local function! */
3397 # include <locale/weight.h>
3398 /* Calculate the index for equivalence class. */
3399 cp = name;
3400 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3401 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3402 _NL_COLLATE_WEIGHTMB);
3403 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3404 _NL_COLLATE_EXTRAMB);
3405 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3406 _NL_COLLATE_INDIRECTMB);
3407 idx1 = findidx (&cp);
3408 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3409 /* This isn't a valid character. */
3410 return REG_ECOLLATE;
3412 /* Build single byte matcing table for this equivalence class. */
3413 char_buf[1] = (unsigned char) '\0';
3414 len = weights[idx1];
3415 for (ch = 0; ch < SBC_MAX; ++ch)
3416 {
3417 char_buf[0] = ch;
3418 cp = char_buf;
3419 idx2 = findidx (&cp);
3420 /*
3421 idx2 = table[ch];
3422 */
3423 if (idx2 == 0)
3424 /* This isn't a valid character. */
3425 continue;
3426 if (len == weights[idx2])
3427 {
3428 int cnt = 0;
3429 while (cnt <= len &&
3430 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3431 ++cnt;
3433 if (cnt > len)
3434 bitset_set (sbcset, ch);
3435 }
3436 }
3437 /* Check whether the array has enough space. */
3438 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3439 {
3440 /* Not enough, realloc it. */
3441 /* +1 in case of mbcset->nequiv_classes is 0. */
3442 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3443 /* Use realloc since the array is NULL if *alloc == 0. */
3444 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3445 int32_t,
3446 new_equiv_class_alloc);
3447 if (BE (new_equiv_classes == NULL, 0))
3448 return REG_ESPACE;
3449 mbcset->equiv_classes = new_equiv_classes;
3450 *equiv_class_alloc = new_equiv_class_alloc;
3451 }
3452 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3453 }
3454 else
3455 #endif /* _LIBC */
3456 {
3457 if (BE (strlen ((const char *) name) != 1, 0))
3458 return REG_ECOLLATE;
3459 bitset_set (sbcset, *name);
3460 }
3461 return REG_NOERROR;
3462 }
3464 /* Helper function for parse_bracket_exp.
3465 Build the character class which is represented by NAME.
3466 The result are written to MBCSET and SBCSET.
3467 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3468 is a pointer argument sinse we may update it. */
3470 static reg_errcode_t
3471 #ifdef RE_ENABLE_I18N
3472 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3473 re_charset_t *mbcset, Idx *char_class_alloc,
3474 const unsigned char *class_name, reg_syntax_t syntax)
3475 #else /* not RE_ENABLE_I18N */
3476 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3477 const unsigned char *class_name, reg_syntax_t syntax)
3478 #endif /* not RE_ENABLE_I18N */
3479 {
3480 int i;
3481 const char *name = (const char *) class_name;
3483 /* In case of REG_ICASE "upper" and "lower" match the both of
3484 upper and lower cases. */
3485 if ((syntax & RE_ICASE)
3486 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3487 name = "alpha";
3489 #ifdef RE_ENABLE_I18N
3490 /* Check the space of the arrays. */
3491 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3492 {
3493 /* Not enough, realloc it. */
3494 /* +1 in case of mbcset->nchar_classes is 0. */
3495 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3496 /* Use realloc since array is NULL if *alloc == 0. */
3497 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3498 new_char_class_alloc);
3499 if (BE (new_char_classes == NULL, 0))
3500 return REG_ESPACE;
3501 mbcset->char_classes = new_char_classes;
3502 *char_class_alloc = new_char_class_alloc;
3503 }
3504 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3505 #endif /* RE_ENABLE_I18N */
3507 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3508 do { \
3509 if (BE (trans != NULL, 0)) \
3510 { \
3511 for (i = 0; i < SBC_MAX; ++i) \
3512 if (ctype_func (i)) \
3513 bitset_set (sbcset, trans[i]); \
3514 } \
3515 else \
3516 { \
3517 for (i = 0; i < SBC_MAX; ++i) \
3518 if (ctype_func (i)) \
3519 bitset_set (sbcset, i); \
3520 } \
3521 } while (0)
3523 if (strcmp (name, "alnum") == 0)
3524 BUILD_CHARCLASS_LOOP (isalnum);
3525 else if (strcmp (name, "cntrl") == 0)
3526 BUILD_CHARCLASS_LOOP (iscntrl);
3527 else if (strcmp (name, "lower") == 0)
3528 BUILD_CHARCLASS_LOOP (islower);
3529 else if (strcmp (name, "space") == 0)
3530 BUILD_CHARCLASS_LOOP (isspace);
3531 else if (strcmp (name, "alpha") == 0)
3532 BUILD_CHARCLASS_LOOP (isalpha);
3533 else if (strcmp (name, "digit") == 0)
3534 BUILD_CHARCLASS_LOOP (isdigit);
3535 else if (strcmp (name, "print") == 0)
3536 BUILD_CHARCLASS_LOOP (isprint);
3537 else if (strcmp (name, "upper") == 0)
3538 BUILD_CHARCLASS_LOOP (isupper);
3539 else if (strcmp (name, "blank") == 0)
3540 BUILD_CHARCLASS_LOOP (isblank);
3541 else if (strcmp (name, "graph") == 0)
3542 BUILD_CHARCLASS_LOOP (isgraph);
3543 else if (strcmp (name, "punct") == 0)
3544 BUILD_CHARCLASS_LOOP (ispunct);
3545 else if (strcmp (name, "xdigit") == 0)
3546 BUILD_CHARCLASS_LOOP (isxdigit);
3547 else
3548 return REG_ECTYPE;
3550 return REG_NOERROR;
3551 }
3553 static bin_tree_t *
3554 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3555 const unsigned char *class_name,
3556 const unsigned char *extra, bool non_match,
3557 reg_errcode_t *err)
3558 {
3559 re_bitset_ptr_t sbcset;
3560 #ifdef RE_ENABLE_I18N
3561 re_charset_t *mbcset;
3562 Idx alloc = 0;
3563 #endif /* not RE_ENABLE_I18N */
3564 reg_errcode_t ret;
3565 re_token_t br_token;
3566 bin_tree_t *tree;
3568 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3569 #ifdef RE_ENABLE_I18N
3570 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3571 #endif /* RE_ENABLE_I18N */
3573 #ifdef RE_ENABLE_I18N
3574 if (BE (sbcset == NULL || mbcset == NULL, 0))
3575 #else /* not RE_ENABLE_I18N */
3576 if (BE (sbcset == NULL, 0))
3577 #endif /* not RE_ENABLE_I18N */
3578 {
3579 *err = REG_ESPACE;
3580 return NULL;
3581 }
3583 if (non_match)
3584 {
3585 #ifdef RE_ENABLE_I18N
3586 mbcset->non_match = 1;
3587 #endif /* not RE_ENABLE_I18N */
3588 }
3590 /* We don't care the syntax in this case. */
3591 ret = build_charclass (trans, sbcset,
3592 #ifdef RE_ENABLE_I18N
3593 mbcset, &alloc,
3594 #endif /* RE_ENABLE_I18N */
3595 class_name, 0);
3597 if (BE (ret != REG_NOERROR, 0))
3598 {
3599 re_free (sbcset);
3600 #ifdef RE_ENABLE_I18N
3601 free_charset (mbcset);
3602 #endif /* RE_ENABLE_I18N */
3603 *err = ret;
3604 return NULL;
3605 }
3606 /* \w match '_' also. */
3607 for (; *extra; extra++)
3608 bitset_set (sbcset, *extra);
3610 /* If it is non-matching list. */
3611 if (non_match)
3612 bitset_not (sbcset);
3614 #ifdef RE_ENABLE_I18N
3615 /* Ensure only single byte characters are set. */
3616 if (dfa->mb_cur_max > 1)
3617 bitset_mask (sbcset, dfa->sb_char);
3618 #endif
3620 /* Build a tree for simple bracket. */
3621 br_token.type = SIMPLE_BRACKET;
3622 br_token.opr.sbcset = sbcset;
3623 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3624 if (BE (tree == NULL, 0))
3625 goto build_word_op_espace;
3627 #ifdef RE_ENABLE_I18N
3628 if (dfa->mb_cur_max > 1)
3629 {
3630 bin_tree_t *mbc_tree;
3631 /* Build a tree for complex bracket. */
3632 br_token.type = COMPLEX_BRACKET;
3633 br_token.opr.mbcset = mbcset;
3634 dfa->has_mb_node = 1;
3635 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3636 if (BE (mbc_tree == NULL, 0))
3637 goto build_word_op_espace;
3638 /* Then join them by ALT node. */
3639 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3640 if (BE (mbc_tree != NULL, 1))
3641 return tree;
3642 }
3643 else
3644 {
3645 free_charset (mbcset);
3646 return tree;
3647 }
3648 #else /* not RE_ENABLE_I18N */
3649 return tree;
3650 #endif /* not RE_ENABLE_I18N */
3652 build_word_op_espace:
3653 re_free (sbcset);
3654 #ifdef RE_ENABLE_I18N
3655 free_charset (mbcset);
3656 #endif /* RE_ENABLE_I18N */
3657 *err = REG_ESPACE;
3658 return NULL;
3659 }
3661 /* This is intended for the expressions like "a{1,3}".
3662 Fetch a number from `input', and return the number.
3663 Return REG_MISSING if the number field is empty like "{,1}".
3664 Return REG_ERROR if an error occurred. */
3666 static Idx
3667 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3668 {
3669 Idx num = REG_MISSING;
3670 unsigned char c;
3671 while (1)
3672 {
3673 fetch_token (token, input, syntax);
3674 c = token->opr.c;
3675 if (BE (token->type == END_OF_RE, 0))
3676 return REG_ERROR;
3677 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3678 break;
3679 num = ((token->type != CHARACTER || c < '0' || '9' < c
3680 || num == REG_ERROR)
3681 ? REG_ERROR
3682 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3683 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3684 }
3685 return num;
3686 }
3687 \f
3688 #ifdef RE_ENABLE_I18N
3689 static void
3690 free_charset (re_charset_t *cset)
3691 {
3692 re_free (cset->mbchars);
3693 # ifdef _LIBC
3694 re_free (cset->coll_syms);
3695 re_free (cset->equiv_classes);
3696 re_free (cset->range_starts);
3697 re_free (cset->range_ends);
3698 # endif
3699 re_free (cset->char_classes);
3700 re_free (cset);
3701 }
3702 #endif /* RE_ENABLE_I18N */
3703 \f
3704 /* Functions for binary tree operation. */
3706 /* Create a tree node. */
3708 static bin_tree_t *
3709 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3710 re_token_type_t type)
3711 {
3712 re_token_t t;
3713 t.type = type;
3714 return create_token_tree (dfa, left, right, &t);
3715 }
3717 static bin_tree_t *
3718 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3719 const re_token_t *token)
3720 {
3721 bin_tree_t *tree;
3722 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3723 {
3724 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3726 if (storage == NULL)
3727 return NULL;
3728 storage->next = dfa->str_tree_storage;
3729 dfa->str_tree_storage = storage;
3730 dfa->str_tree_storage_idx = 0;
3731 }
3732 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3734 tree->parent = NULL;
3735 tree->left = left;
3736 tree->right = right;
3737 tree->token = *token;
3738 tree->token.duplicated = 0;
3739 tree->token.opt_subexp = 0;
3740 tree->first = NULL;
3741 tree->next = NULL;
3742 tree->node_idx = REG_MISSING;
3744 if (left != NULL)
3745 left->parent = tree;
3746 if (right != NULL)
3747 right->parent = tree;
3748 return tree;
3749 }
3751 /* Mark the tree SRC as an optional subexpression.
3752 To be called from preorder or postorder. */
3754 static reg_errcode_t
3755 mark_opt_subexp (void *extra, bin_tree_t *node)
3756 {
3757 Idx idx = (Idx) (long) extra;
3758 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3759 node->token.opt_subexp = 1;
3761 return REG_NOERROR;
3762 }
3764 /* Free the allocated memory inside NODE. */
3766 static void
3767 free_token (re_token_t *node)
3768 {
3769 #ifdef RE_ENABLE_I18N
3770 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3771 free_charset (node->opr.mbcset);
3772 else
3773 #endif /* RE_ENABLE_I18N */
3774 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3775 re_free (node->opr.sbcset);
3776 }
3778 /* Worker function for tree walking. Free the allocated memory inside NODE
3779 and its children. */
3781 static reg_errcode_t
3782 free_tree (void *extra, bin_tree_t *node)
3783 {
3784 free_token (&node->token);
3785 return REG_NOERROR;
3786 }
3789 /* Duplicate the node SRC, and return new node. This is a preorder
3790 visit similar to the one implemented by the generic visitor, but
3791 we need more infrastructure to maintain two parallel trees --- so,
3792 it's easier to duplicate. */
3794 static bin_tree_t *
3795 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3796 {
3797 const bin_tree_t *node;
3798 bin_tree_t *dup_root;
3799 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3801 for (node = root; ; )
3802 {
3803 /* Create a new tree and link it back to the current parent. */
3804 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3805 if (*p_new == NULL)
3806 return NULL;
3807 (*p_new)->parent = dup_node;
3808 (*p_new)->token.duplicated = 1;
3809 dup_node = *p_new;
3811 /* Go to the left node, or up and to the right. */
3812 if (node->left)
3813 {
3814 node = node->left;
3815 p_new = &dup_node->left;
3816 }
3817 else
3818 {
3819 const bin_tree_t *prev = NULL;
3820 while (node->right == prev || node->right == NULL)
3821 {
3822 prev = node;
3823 node = node->parent;
3824 dup_node = dup_node->parent;
3825 if (!node)
3826 return dup_root;
3827 }
3828 node = node->right;
3829 p_new = &dup_node->right;
3830 }
3831 }
3832 }