summaryrefslogtreecommitdiffstats
path: root/gl/regcomp.c
diff options
context:
space:
mode:
Diffstat (limited to 'gl/regcomp.c')
-rw-r--r--gl/regcomp.c3858
1 files changed, 3858 insertions, 0 deletions
diff --git a/gl/regcomp.c b/gl/regcomp.c
new file mode 100644
index 0000000..8df6bb8
--- /dev/null
+++ b/gl/regcomp.c
@@ -0,0 +1,3858 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
24 char *fastmap);
25static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26#ifdef RE_ENABLE_I18N
27static void free_charset (re_charset_t *cset);
28#endif /* RE_ENABLE_I18N */
29static void free_workarea_compile (regex_t *preg);
30static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31#ifdef RE_ENABLE_I18N
32static void optimize_utf8 (re_dfa_t *dfa);
33#endif
34static reg_errcode_t analyze (regex_t *preg);
35static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 Idx node, bool root);
54static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55static Idx fetch_number (re_string_t *input, re_token_t *token,
56 reg_syntax_t syntax);
57static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 bool accept_hyphen);
85static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88#ifdef RE_ENABLE_I18N
89static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 re_charset_t *mbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94 bitset_t sbcset,
95 re_charset_t *mbcset,
96 Idx *char_class_alloc,
97 const unsigned char *class_name,
98 reg_syntax_t syntax);
99#else /* not RE_ENABLE_I18N */
100static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 bitset_t sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106#endif /* not RE_ENABLE_I18N */
107static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 bool non_match, reg_errcode_t *err);
112static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119static void free_token (re_token_t *node);
120static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122
123/* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
127
128static const char __re_error_msgid[] =
129 {
130#define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
132 "\0"
133#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
135 "\0"
136#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 "\0"
139#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 "\0"
142#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 "\0"
145#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 "\0"
148#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 "\0"
151#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 "\0"
154#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 "\0"
157#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 "\0"
160#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 "\0"
163#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 "\0"
166#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 "\0"
169#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 "\0"
172#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 "\0"
175#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 "\0"
178#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180 };
181
182static const size_t __re_error_msgid_idx[] =
183 {
184 REG_NOERROR_IDX,
185 REG_NOMATCH_IDX,
186 REG_BADPAT_IDX,
187 REG_ECOLLATE_IDX,
188 REG_ECTYPE_IDX,
189 REG_EESCAPE_IDX,
190 REG_ESUBREG_IDX,
191 REG_EBRACK_IDX,
192 REG_EPAREN_IDX,
193 REG_EBRACE_IDX,
194 REG_BADBR_IDX,
195 REG_ERANGE_IDX,
196 REG_ESPACE_IDX,
197 REG_BADRPT_IDX,
198 REG_EEND_IDX,
199 REG_ESIZE_IDX,
200 REG_ERPAREN_IDX
201 };
202
203/* Entry points for GNU code. */
204
205/* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
208
209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210 are set in BUFP on entry. */
211
212#ifdef _LIBC
213const char *
214re_compile_pattern (pattern, length, bufp)
215 const char *pattern;
216 size_t length;
217 struct re_pattern_buffer *bufp;
218#else /* size_t might promote */
219const char *
220re_compile_pattern (const char *pattern, size_t length,
221 struct re_pattern_buffer *bufp)
222#endif
223{
224 reg_errcode_t ret;
225
226 /* And GNU code determines whether or not to get register information
227 by passing null for the REGS argument to re_match, etc., not by
228 setting no_sub, unless RE_NO_SUB is set. */
229 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
230
231 /* Match anchors at newline. */
232 bufp->newline_anchor = 1;
233
234 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
235
236 if (!ret)
237 return NULL;
238 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
239}
240#ifdef _LIBC
241weak_alias (__re_compile_pattern, re_compile_pattern)
242#endif
243
244/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
245 also be assigned to arbitrarily: each pattern buffer stores its own
246 syntax, so it can be changed between regex compilations. */
247/* This has no initializer because initialized variables in Emacs
248 become read-only after dumping. */
249reg_syntax_t re_syntax_options;
250
251
252/* Specify the precise syntax of regexps for compilation. This provides
253 for compatibility for various utilities which historically have
254 different, incompatible syntaxes.
255
256 The argument SYNTAX is a bit mask comprised of the various bits
257 defined in regex.h. We return the old syntax. */
258
259reg_syntax_t
260re_set_syntax (syntax)
261 reg_syntax_t syntax;
262{
263 reg_syntax_t ret = re_syntax_options;
264
265 re_syntax_options = syntax;
266 return ret;
267}
268#ifdef _LIBC
269weak_alias (__re_set_syntax, re_set_syntax)
270#endif
271
272int
273re_compile_fastmap (bufp)
274 struct re_pattern_buffer *bufp;
275{
276 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
277 char *fastmap = bufp->fastmap;
278
279 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281 if (dfa->init_state != dfa->init_state_word)
282 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283 if (dfa->init_state != dfa->init_state_nl)
284 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285 if (dfa->init_state != dfa->init_state_begbuf)
286 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287 bufp->fastmap_accurate = 1;
288 return 0;
289}
290#ifdef _LIBC
291weak_alias (__re_compile_fastmap, re_compile_fastmap)
292#endif
293
294static inline void
295__attribute ((always_inline))
296re_set_fastmap (char *fastmap, bool icase, int ch)
297{
298 fastmap[ch] = 1;
299 if (icase)
300 fastmap[tolower (ch)] = 1;
301}
302
303/* Helper function for re_compile_fastmap.
304 Compile fastmap for the initial_state INIT_STATE. */
305
306static void
307re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
308 char *fastmap)
309{
310 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
311 Idx node_cnt;
312 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314 {
315 Idx node = init_state->nodes.elems[node_cnt];
316 re_token_type_t type = dfa->nodes[node].type;
317
318 if (type == CHARACTER)
319 {
320 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321#ifdef RE_ENABLE_I18N
322 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
323 {
324 unsigned char buf[MB_LEN_MAX];
325 unsigned char *p;
326 wchar_t wc;
327 mbstate_t state;
328
329 p = buf;
330 *p++ = dfa->nodes[node].opr.c;
331 while (++node < dfa->nodes_len
332 && dfa->nodes[node].type == CHARACTER
333 && dfa->nodes[node].mb_partial)
334 *p++ = dfa->nodes[node].opr.c;
335 memset (&state, '\0', sizeof (state));
336 if (mbrtowc (&wc, (const char *) buf, p - buf,
337 &state) == p - buf
338 && (__wcrtomb ((char *) buf, towlower (wc), &state)
339 != (size_t) -1))
340 re_set_fastmap (fastmap, false, buf[0]);
341 }
342#endif
343 }
344 else if (type == SIMPLE_BRACKET)
345 {
346 int i, ch;
347 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
348 {
349 int j;
350 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352 if (w & ((bitset_word_t) 1 << j))
353 re_set_fastmap (fastmap, icase, ch);
354 }
355 }
356#ifdef RE_ENABLE_I18N
357 else if (type == COMPLEX_BRACKET)
358 {
359 Idx i;
360 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
361 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
362 || cset->nranges || cset->nchar_classes)
363 {
364# ifdef _LIBC
365 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
366 {
367 /* In this case we want to catch the bytes which are
368 the first byte of any collation elements.
369 e.g. In da_DK, we want to catch 'a' since "aa"
370 is a valid collation element, and don't catch
371 'b' since 'b' is the only collation element
372 which starts from 'b'. */
373 const int32_t *table = (const int32_t *)
374 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375 for (i = 0; i < SBC_MAX; ++i)
376 if (table[i] < 0)
377 re_set_fastmap (fastmap, icase, i);
378 }
379# else
380 if (dfa->mb_cur_max > 1)
381 for (i = 0; i < SBC_MAX; ++i)
382 if (__btowc (i) == WEOF)
383 re_set_fastmap (fastmap, icase, i);
384# endif /* not _LIBC */
385 }
386 for (i = 0; i < cset->nmbchars; ++i)
387 {
388 char buf[256];
389 mbstate_t state;
390 memset (&state, '\0', sizeof (state));
391 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
392 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
393 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
394 {
395 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
396 != (size_t) -1)
397 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
398 }
399 }
400 }
401#endif /* RE_ENABLE_I18N */
402 else if (type == OP_PERIOD
403#ifdef RE_ENABLE_I18N
404 || type == OP_UTF8_PERIOD
405#endif /* RE_ENABLE_I18N */
406 || type == END_OF_RE)
407 {
408 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
409 if (type == END_OF_RE)
410 bufp->can_be_null = 1;
411 return;
412 }
413 }
414}
415
416/* Entry point for POSIX code. */
417/* regcomp takes a regular expression as a string and compiles it.
418
419 PREG is a regex_t *. We do not expect any fields to be initialized,
420 since POSIX says we shouldn't. Thus, we set
421
422 `buffer' to the compiled pattern;
423 `used' to the length of the compiled pattern;
424 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
425 REG_EXTENDED bit in CFLAGS is set; otherwise, to
426 RE_SYNTAX_POSIX_BASIC;
427 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
428 `fastmap' to an allocated space for the fastmap;
429 `fastmap_accurate' to zero;
430 `re_nsub' to the number of subexpressions in PATTERN.
431
432 PATTERN is the address of the pattern string.
433
434 CFLAGS is a series of bits which affect compilation.
435
436 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
437 use POSIX basic syntax.
438
439 If REG_NEWLINE is set, then . and [^...] don't match newline.
440 Also, regexec will try a match beginning after every newline.
441
442 If REG_ICASE is set, then we considers upper- and lowercase
443 versions of letters to be equivalent when matching.
444
445 If REG_NOSUB is set, then when PREG is passed to regexec, that
446 routine will report only success or failure, and nothing about the
447 registers.
448
449 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
450 the return codes and their meanings.) */
451
452int
453regcomp (preg, pattern, cflags)
454 regex_t *__restrict preg;
455 const char *__restrict pattern;
456 int cflags;
457{
458 reg_errcode_t ret;
459 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
460 : RE_SYNTAX_POSIX_BASIC);
461
462 preg->buffer = NULL;
463 preg->allocated = 0;
464 preg->used = 0;
465
466 /* Try to allocate space for the fastmap. */
467 preg->fastmap = re_malloc (char, SBC_MAX);
468 if (BE (preg->fastmap == NULL, 0))
469 return REG_ESPACE;
470
471 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
472
473 /* If REG_NEWLINE is set, newlines are treated differently. */
474 if (cflags & REG_NEWLINE)
475 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
476 syntax &= ~RE_DOT_NEWLINE;
477 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
478 /* It also changes the matching behavior. */
479 preg->newline_anchor = 1;
480 }
481 else
482 preg->newline_anchor = 0;
483 preg->no_sub = !!(cflags & REG_NOSUB);
484 preg->translate = NULL;
485
486 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
487
488 /* POSIX doesn't distinguish between an unmatched open-group and an
489 unmatched close-group: both are REG_EPAREN. */
490 if (ret == REG_ERPAREN)
491 ret = REG_EPAREN;
492
493 /* We have already checked preg->fastmap != NULL. */
494 if (BE (ret == REG_NOERROR, 1))
495 /* Compute the fastmap now, since regexec cannot modify the pattern
496 buffer. This function never fails in this implementation. */
497 (void) re_compile_fastmap (preg);
498 else
499 {
500 /* Some error occurred while compiling the expression. */
501 re_free (preg->fastmap);
502 preg->fastmap = NULL;
503 }
504
505 return (int) ret;
506}
507#ifdef _LIBC
508weak_alias (__regcomp, regcomp)
509#endif
510
511/* Returns a message corresponding to an error code, ERRCODE, returned
512 from either regcomp or regexec. We don't use PREG here. */
513
514#ifdef _LIBC
515size_t
516regerror (errcode, preg, errbuf, errbuf_size)
517 int errcode;
518 const regex_t *__restrict preg;
519 char *__restrict errbuf;
520 size_t errbuf_size;
521#else /* size_t might promote */
522size_t
523regerror (int errcode, const regex_t *__restrict preg,
524 char *__restrict errbuf, size_t errbuf_size)
525#endif
526{
527 const char *msg;
528 size_t msg_size;
529
530 if (BE (errcode < 0
531 || errcode >= (int) (sizeof (__re_error_msgid_idx)
532 / sizeof (__re_error_msgid_idx[0])), 0))
533 /* Only error codes returned by the rest of the code should be passed
534 to this routine. If we are given anything else, or if other regex
535 code generates an invalid error code, then the program has a bug.
536 Dump core so we can fix it. */
537 abort ();
538
539 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
540
541 msg_size = strlen (msg) + 1; /* Includes the null. */
542
543 if (BE (errbuf_size != 0, 1))
544 {
545 if (BE (msg_size > errbuf_size, 0))
546 {
547#if defined HAVE_MEMPCPY || defined _LIBC
548 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
549#else
550 memcpy (errbuf, msg, errbuf_size - 1);
551 errbuf[errbuf_size - 1] = 0;
552#endif
553 }
554 else
555 memcpy (errbuf, msg, msg_size);
556 }
557
558 return msg_size;
559}
560#ifdef _LIBC
561weak_alias (__regerror, regerror)
562#endif
563
564
565#ifdef RE_ENABLE_I18N
566/* This static array is used for the map to single-byte characters when
567 UTF-8 is used. Otherwise we would allocate memory just to initialize
568 it the same all the time. UTF-8 is the preferred encoding so this is
569 a worthwhile optimization. */
570static const bitset_t utf8_sb_map =
571{
572 /* Set the first 128 bits. */
573# if 4 * BITSET_WORD_BITS < ASCII_CHARS
574# error "bitset_word_t is narrower than 32 bits"
575# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
576 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
577# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
578 BITSET_WORD_MAX, BITSET_WORD_MAX,
579# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
580 BITSET_WORD_MAX,
581# endif
582 (BITSET_WORD_MAX
583 >> (SBC_MAX % BITSET_WORD_BITS == 0
584 ? 0
585 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
586};
587#endif
588
589
590static void
591free_dfa_content (re_dfa_t *dfa)
592{
593 Idx i, j;
594
595 if (dfa->nodes)
596 for (i = 0; i < dfa->nodes_len; ++i)
597 free_token (dfa->nodes + i);
598 re_free (dfa->nexts);
599 for (i = 0; i < dfa->nodes_len; ++i)
600 {
601 if (dfa->eclosures != NULL)
602 re_node_set_free (dfa->eclosures + i);
603 if (dfa->inveclosures != NULL)
604 re_node_set_free (dfa->inveclosures + i);
605 if (dfa->edests != NULL)
606 re_node_set_free (dfa->edests + i);
607 }
608 re_free (dfa->edests);
609 re_free (dfa->eclosures);
610 re_free (dfa->inveclosures);
611 re_free (dfa->nodes);
612
613 if (dfa->state_table)
614 for (i = 0; i <= dfa->state_hash_mask; ++i)
615 {
616 struct re_state_table_entry *entry = dfa->state_table + i;
617 for (j = 0; j < entry->num; ++j)
618 {
619 re_dfastate_t *state = entry->array[j];
620 free_state (state);
621 }
622 re_free (entry->array);
623 }
624 re_free (dfa->state_table);
625#ifdef RE_ENABLE_I18N
626 if (dfa->sb_char != utf8_sb_map)
627 re_free (dfa->sb_char);
628#endif
629 re_free (dfa->subexp_map);
630#ifdef DEBUG
631 re_free (dfa->re_str);
632#endif
633
634 re_free (dfa);
635}
636
637
638/* Free dynamically allocated space used by PREG. */
639
640void
641regfree (preg)
642 regex_t *preg;
643{
644 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
645 if (BE (dfa != NULL, 1))
646 free_dfa_content (dfa);
647 preg->buffer = NULL;
648 preg->allocated = 0;
649
650 re_free (preg->fastmap);
651 preg->fastmap = NULL;
652
653 re_free (preg->translate);
654 preg->translate = NULL;
655}
656#ifdef _LIBC
657weak_alias (__regfree, regfree)
658#endif
659
660/* Entry points compatible with 4.2 BSD regex library. We don't define
661 them unless specifically requested. */
662
663#if defined _REGEX_RE_COMP || defined _LIBC
664
665/* BSD has one and only one pattern buffer. */
666static struct re_pattern_buffer re_comp_buf;
667
668char *
669# ifdef _LIBC
670/* Make these definitions weak in libc, so POSIX programs can redefine
671 these names if they don't use our functions, and still use
672 regcomp/regexec above without link errors. */
673weak_function
674# endif
675re_comp (s)
676 const char *s;
677{
678 reg_errcode_t ret;
679 char *fastmap;
680
681 if (!s)
682 {
683 if (!re_comp_buf.buffer)
684 return gettext ("No previous regular expression");
685 return 0;
686 }
687
688 if (re_comp_buf.buffer)
689 {
690 fastmap = re_comp_buf.fastmap;
691 re_comp_buf.fastmap = NULL;
692 __regfree (&re_comp_buf);
693 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
694 re_comp_buf.fastmap = fastmap;
695 }
696
697 if (re_comp_buf.fastmap == NULL)
698 {
699 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
700 if (re_comp_buf.fastmap == NULL)
701 return (char *) gettext (__re_error_msgid
702 + __re_error_msgid_idx[(int) REG_ESPACE]);
703 }
704
705 /* Since `re_exec' always passes NULL for the `regs' argument, we
706 don't need to initialize the pattern buffer fields which affect it. */
707
708 /* Match anchors at newlines. */
709 re_comp_buf.newline_anchor = 1;
710
711 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
712
713 if (!ret)
714 return NULL;
715
716 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
717 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
718}
719
720#ifdef _LIBC
721libc_freeres_fn (free_mem)
722{
723 __regfree (&re_comp_buf);
724}
725#endif
726
727#endif /* _REGEX_RE_COMP */
728
729/* Internal entry point.
730 Compile the regular expression PATTERN, whose length is LENGTH.
731 SYNTAX indicate regular expression's syntax. */
732
733static reg_errcode_t
734re_compile_internal (regex_t *preg, const char * pattern, size_t length,
735 reg_syntax_t syntax)
736{
737 reg_errcode_t err = REG_NOERROR;
738 re_dfa_t *dfa;
739 re_string_t regexp;
740
741 /* Initialize the pattern buffer. */
742 preg->fastmap_accurate = 0;
743 preg->syntax = syntax;
744 preg->not_bol = preg->not_eol = 0;
745 preg->used = 0;
746 preg->re_nsub = 0;
747 preg->can_be_null = 0;
748 preg->regs_allocated = REGS_UNALLOCATED;
749
750 /* Initialize the dfa. */
751 dfa = (re_dfa_t *) preg->buffer;
752 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
753 {
754 /* If zero allocated, but buffer is non-null, try to realloc
755 enough space. This loses if buffer's address is bogus, but
756 that is the user's responsibility. If ->buffer is NULL this
757 is a simple allocation. */
758 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
759 if (dfa == NULL)
760 return REG_ESPACE;
761 preg->allocated = sizeof (re_dfa_t);
762 preg->buffer = (unsigned char *) dfa;
763 }
764 preg->used = sizeof (re_dfa_t);
765
766 err = init_dfa (dfa, length);
767 if (BE (err != REG_NOERROR, 0))
768 {
769 free_dfa_content (dfa);
770 preg->buffer = NULL;
771 preg->allocated = 0;
772 return err;
773 }
774#ifdef DEBUG
775 /* Note: length+1 will not overflow since it is checked in init_dfa. */
776 dfa->re_str = re_malloc (char, length + 1);
777 strncpy (dfa->re_str, pattern, length + 1);
778#endif
779
780 __libc_lock_init (dfa->lock);
781
782 err = re_string_construct (&regexp, pattern, length, preg->translate,
783 syntax & RE_ICASE, dfa);
784 if (BE (err != REG_NOERROR, 0))
785 {
786 re_compile_internal_free_return:
787 free_workarea_compile (preg);
788 re_string_destruct (&regexp);
789 free_dfa_content (dfa);
790 preg->buffer = NULL;
791 preg->allocated = 0;
792 return err;
793 }
794
795 /* Parse the regular expression, and build a structure tree. */
796 preg->re_nsub = 0;
797 dfa->str_tree = parse (&regexp, preg, syntax, &err);
798 if (BE (dfa->str_tree == NULL, 0))
799 goto re_compile_internal_free_return;
800
801 /* Analyze the tree and create the nfa. */
802 err = analyze (preg);
803 if (BE (err != REG_NOERROR, 0))
804 goto re_compile_internal_free_return;
805
806#ifdef RE_ENABLE_I18N
807 /* If possible, do searching in single byte encoding to speed things up. */
808 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
809 optimize_utf8 (dfa);
810#endif
811
812 /* Then create the initial state of the dfa. */
813 err = create_initial_state (dfa);
814
815 /* Release work areas. */
816 free_workarea_compile (preg);
817 re_string_destruct (&regexp);
818
819 if (BE (err != REG_NOERROR, 0))
820 {
821 free_dfa_content (dfa);
822 preg->buffer = NULL;
823 preg->allocated = 0;
824 }
825
826 return err;
827}
828
829/* Initialize DFA. We use the length of the regular expression PAT_LEN
830 as the initial length of some arrays. */
831
832static reg_errcode_t
833init_dfa (re_dfa_t *dfa, size_t pat_len)
834{
835 __re_size_t table_size;
836#ifndef _LIBC
837 char *codeset_name;
838#endif
839#ifdef RE_ENABLE_I18N
840 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
841#else
842 size_t max_i18n_object_size = 0;
843#endif
844 size_t max_object_size =
845 MAX (sizeof (struct re_state_table_entry),
846 MAX (sizeof (re_token_t),
847 MAX (sizeof (re_node_set),
848 MAX (sizeof (regmatch_t),
849 max_i18n_object_size))));
850
851 memset (dfa, '\0', sizeof (re_dfa_t));
852
853 /* Force allocation of str_tree_storage the first time. */
854 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
855
856 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
857 calculation below, and for similar doubling calculations
858 elsewhere. And it's <= rather than <, because some of the
859 doubling calculations add 1 afterwards. */
860 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
861 return REG_ESPACE;
862
863 dfa->nodes_alloc = pat_len + 1;
864 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
865
866 /* table_size = 2 ^ ceil(log pat_len) */
867 for (table_size = 1; ; table_size <<= 1)
868 if (table_size > pat_len)
869 break;
870
871 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
872 dfa->state_hash_mask = table_size - 1;
873
874 dfa->mb_cur_max = MB_CUR_MAX;
875#ifdef _LIBC
876 if (dfa->mb_cur_max == 6
877 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
878 dfa->is_utf8 = 1;
879 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
880 != 0);
881#else
882# ifdef HAVE_LANGINFO_CODESET
883 codeset_name = nl_langinfo (CODESET);
884# else
885 codeset_name = getenv ("LC_ALL");
886 if (codeset_name == NULL || codeset_name[0] == '\0')
887 codeset_name = getenv ("LC_CTYPE");
888 if (codeset_name == NULL || codeset_name[0] == '\0')
889 codeset_name = getenv ("LANG");
890 if (codeset_name == NULL)
891 codeset_name = "";
892 else if (strchr (codeset_name, '.') != NULL)
893 codeset_name = strchr (codeset_name, '.') + 1;
894# endif
895
896 if (strcasecmp (codeset_name, "UTF-8") == 0
897 || strcasecmp (codeset_name, "UTF8") == 0)
898 dfa->is_utf8 = 1;
899
900 /* We check exhaustively in the loop below if this charset is a
901 superset of ASCII. */
902 dfa->map_notascii = 0;
903#endif
904
905#ifdef RE_ENABLE_I18N
906 if (dfa->mb_cur_max > 1)
907 {
908 if (dfa->is_utf8)
909 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
910 else
911 {
912 int i, j, ch;
913
914 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
915 if (BE (dfa->sb_char == NULL, 0))
916 return REG_ESPACE;
917
918 /* Set the bits corresponding to single byte chars. */
919 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
920 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
921 {
922 wint_t wch = __btowc (ch);
923 if (wch != WEOF)
924 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
925# ifndef _LIBC
926 if (isascii (ch) && wch != ch)
927 dfa->map_notascii = 1;
928# endif
929 }
930 }
931 }
932#endif
933
934 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
935 return REG_ESPACE;
936 return REG_NOERROR;
937}
938
939/* Initialize WORD_CHAR table, which indicate which character is
940 "word". In this case "word" means that it is the word construction
941 character used by some operators like "\<", "\>", etc. */
942
943static void
944internal_function
945init_word_char (re_dfa_t *dfa)
946{
947 int i, j, ch;
948 dfa->word_ops_used = 1;
949 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
950 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
951 if (isalnum (ch) || ch == '_')
952 dfa->word_char[i] |= (bitset_word_t) 1 << j;
953}
954
955/* Free the work area which are only used while compiling. */
956
957static void
958free_workarea_compile (regex_t *preg)
959{
960 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
961 bin_tree_storage_t *storage, *next;
962 for (storage = dfa->str_tree_storage; storage; storage = next)
963 {
964 next = storage->next;
965 re_free (storage);
966 }
967 dfa->str_tree_storage = NULL;
968 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
969 dfa->str_tree = NULL;
970 re_free (dfa->org_indices);
971 dfa->org_indices = NULL;
972}
973
974/* Create initial states for all contexts. */
975
976static reg_errcode_t
977create_initial_state (re_dfa_t *dfa)
978{
979 Idx first, i;
980 reg_errcode_t err;
981 re_node_set init_nodes;
982
983 /* Initial states have the epsilon closure of the node which is
984 the first node of the regular expression. */
985 first = dfa->str_tree->first->node_idx;
986 dfa->init_node = first;
987 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
988 if (BE (err != REG_NOERROR, 0))
989 return err;
990
991 /* The back-references which are in initial states can epsilon transit,
992 since in this case all of the subexpressions can be null.
993 Then we add epsilon closures of the nodes which are the next nodes of
994 the back-references. */
995 if (dfa->nbackref > 0)
996 for (i = 0; i < init_nodes.nelem; ++i)
997 {
998 Idx node_idx = init_nodes.elems[i];
999 re_token_type_t type = dfa->nodes[node_idx].type;
1000
1001 Idx clexp_idx;
1002 if (type != OP_BACK_REF)
1003 continue;
1004 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1005 {
1006 re_token_t *clexp_node;
1007 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1008 if (clexp_node->type == OP_CLOSE_SUBEXP
1009 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1010 break;
1011 }
1012 if (clexp_idx == init_nodes.nelem)
1013 continue;
1014
1015 if (type == OP_BACK_REF)
1016 {
1017 Idx dest_idx = dfa->edests[node_idx].elems[0];
1018 if (!re_node_set_contains (&init_nodes, dest_idx))
1019 {
1020 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1021 i = 0;
1022 }
1023 }
1024 }
1025
1026 /* It must be the first time to invoke acquire_state. */
1027 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1028 /* We don't check ERR here, since the initial state must not be NULL. */
1029 if (BE (dfa->init_state == NULL, 0))
1030 return err;
1031 if (dfa->init_state->has_constraint)
1032 {
1033 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1034 CONTEXT_WORD);
1035 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1036 CONTEXT_NEWLINE);
1037 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1038 &init_nodes,
1039 CONTEXT_NEWLINE
1040 | CONTEXT_BEGBUF);
1041 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1042 || dfa->init_state_begbuf == NULL, 0))
1043 return err;
1044 }
1045 else
1046 dfa->init_state_word = dfa->init_state_nl
1047 = dfa->init_state_begbuf = dfa->init_state;
1048
1049 re_node_set_free (&init_nodes);
1050 return REG_NOERROR;
1051}
1052
1053#ifdef RE_ENABLE_I18N
1054/* If it is possible to do searching in single byte encoding instead of UTF-8
1055 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1056 DFA nodes where needed. */
1057
1058static void
1059optimize_utf8 (re_dfa_t *dfa)
1060{
1061 Idx node;
1062 int i;
1063 bool mb_chars = false;
1064 bool has_period = false;
1065
1066 for (node = 0; node < dfa->nodes_len; ++node)
1067 switch (dfa->nodes[node].type)
1068 {
1069 case CHARACTER:
1070 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1071 mb_chars = true;
1072 break;
1073 case ANCHOR:
1074 switch (dfa->nodes[node].opr.idx)
1075 {
1076 case LINE_FIRST:
1077 case LINE_LAST:
1078 case BUF_FIRST:
1079 case BUF_LAST:
1080 break;
1081 default:
1082 /* Word anchors etc. cannot be handled. */
1083 return;
1084 }
1085 break;
1086 case OP_PERIOD:
1087 has_period = true;
1088 break;
1089 case OP_BACK_REF:
1090 case OP_ALT:
1091 case END_OF_RE:
1092 case OP_DUP_ASTERISK:
1093 case OP_OPEN_SUBEXP:
1094 case OP_CLOSE_SUBEXP:
1095 break;
1096 case COMPLEX_BRACKET:
1097 return;
1098 case SIMPLE_BRACKET:
1099 /* Just double check. */
1100 {
1101 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1102 ? 0
1103 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1104 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1105 {
1106 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1107 return;
1108 rshift = 0;
1109 }
1110 }
1111 break;
1112 default:
1113 abort ();
1114 }
1115
1116 if (mb_chars || has_period)
1117 for (node = 0; node < dfa->nodes_len; ++node)
1118 {
1119 if (dfa->nodes[node].type == CHARACTER
1120 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1121 dfa->nodes[node].mb_partial = 0;
1122 else if (dfa->nodes[node].type == OP_PERIOD)
1123 dfa->nodes[node].type = OP_UTF8_PERIOD;
1124 }
1125
1126 /* The search can be in single byte locale. */
1127 dfa->mb_cur_max = 1;
1128 dfa->is_utf8 = 0;
1129 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1130}
1131#endif
1132
1133/* Analyze the structure tree, and calculate "first", "next", "edest",
1134 "eclosure", and "inveclosure". */
1135
1136static reg_errcode_t
1137analyze (regex_t *preg)
1138{
1139 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1140 reg_errcode_t ret;
1141
1142 /* Allocate arrays. */
1143 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1144 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1145 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1146 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1147 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1148 || dfa->eclosures == NULL, 0))
1149 return REG_ESPACE;
1150
1151 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1152 if (dfa->subexp_map != NULL)
1153 {
1154 Idx i;
1155 for (i = 0; i < preg->re_nsub; i++)
1156 dfa->subexp_map[i] = i;
1157 preorder (dfa->str_tree, optimize_subexps, dfa);
1158 for (i = 0; i < preg->re_nsub; i++)
1159 if (dfa->subexp_map[i] != i)
1160 break;
1161 if (i == preg->re_nsub)
1162 {
1163 free (dfa->subexp_map);
1164 dfa->subexp_map = NULL;
1165 }
1166 }
1167
1168 ret = postorder (dfa->str_tree, lower_subexps, preg);
1169 if (BE (ret != REG_NOERROR, 0))
1170 return ret;
1171 ret = postorder (dfa->str_tree, calc_first, dfa);
1172 if (BE (ret != REG_NOERROR, 0))
1173 return ret;
1174 preorder (dfa->str_tree, calc_next, dfa);
1175 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1176 if (BE (ret != REG_NOERROR, 0))
1177 return ret;
1178 ret = calc_eclosure (dfa);
1179 if (BE (ret != REG_NOERROR, 0))
1180 return ret;
1181
1182 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1183 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1184 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1185 || dfa->nbackref)
1186 {
1187 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1188 if (BE (dfa->inveclosures == NULL, 0))
1189 return REG_ESPACE;
1190 ret = calc_inveclosure (dfa);
1191 }
1192
1193 return ret;
1194}
1195
1196/* Our parse trees are very unbalanced, so we cannot use a stack to
1197 implement parse tree visits. Instead, we use parent pointers and
1198 some hairy code in these two functions. */
1199static reg_errcode_t
1200postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1201 void *extra)
1202{
1203 bin_tree_t *node, *prev;
1204
1205 for (node = root; ; )
1206 {
1207 /* Descend down the tree, preferably to the left (or to the right
1208 if that's the only child). */
1209 while (node->left || node->right)
1210 if (node->left)
1211 node = node->left;
1212 else
1213 node = node->right;
1214
1215 do
1216 {
1217 reg_errcode_t err = fn (extra, node);
1218 if (BE (err != REG_NOERROR, 0))
1219 return err;
1220 if (node->parent == NULL)
1221 return REG_NOERROR;
1222 prev = node;
1223 node = node->parent;
1224 }
1225 /* Go up while we have a node that is reached from the right. */
1226 while (node->right == prev || node->right == NULL);
1227 node = node->right;
1228 }
1229}
1230
1231static reg_errcode_t
1232preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1233 void *extra)
1234{
1235 bin_tree_t *node;
1236
1237 for (node = root; ; )
1238 {
1239 reg_errcode_t err = fn (extra, node);
1240 if (BE (err != REG_NOERROR, 0))
1241 return err;
1242
1243 /* Go to the left node, or up and to the right. */
1244 if (node->left)
1245 node = node->left;
1246 else
1247 {
1248 bin_tree_t *prev = NULL;
1249 while (node->right == prev || node->right == NULL)
1250 {
1251 prev = node;
1252 node = node->parent;
1253 if (!node)
1254 return REG_NOERROR;
1255 }
1256 node = node->right;
1257 }
1258 }
1259}
1260
1261/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1262 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1263 backreferences as well. Requires a preorder visit. */
1264static reg_errcode_t
1265optimize_subexps (void *extra, bin_tree_t *node)
1266{
1267 re_dfa_t *dfa = (re_dfa_t *) extra;
1268
1269 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1270 {
1271 int idx = node->token.opr.idx;
1272 node->token.opr.idx = dfa->subexp_map[idx];
1273 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1274 }
1275
1276 else if (node->token.type == SUBEXP
1277 && node->left && node->left->token.type == SUBEXP)
1278 {
1279 Idx other_idx = node->left->token.opr.idx;
1280
1281 node->left = node->left->left;
1282 if (node->left)
1283 node->left->parent = node;
1284
1285 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1286 if (other_idx < BITSET_WORD_BITS)
1287 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1288 }
1289
1290 return REG_NOERROR;
1291}
1292
1293/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1294 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1295static reg_errcode_t
1296lower_subexps (void *extra, bin_tree_t *node)
1297{
1298 regex_t *preg = (regex_t *) extra;
1299 reg_errcode_t err = REG_NOERROR;
1300
1301 if (node->left && node->left->token.type == SUBEXP)
1302 {
1303 node->left = lower_subexp (&err, preg, node->left);
1304 if (node->left)
1305 node->left->parent = node;
1306 }
1307 if (node->right && node->right->token.type == SUBEXP)
1308 {
1309 node->right = lower_subexp (&err, preg, node->right);
1310 if (node->right)
1311 node->right->parent = node;
1312 }
1313
1314 return err;
1315}
1316
1317static bin_tree_t *
1318lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1319{
1320 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1321 bin_tree_t *body = node->left;
1322 bin_tree_t *op, *cls, *tree1, *tree;
1323
1324 if (preg->no_sub
1325 /* We do not optimize empty subexpressions, because otherwise we may
1326 have bad CONCAT nodes with NULL children. This is obviously not
1327 very common, so we do not lose much. An example that triggers
1328 this case is the sed "script" /\(\)/x. */
1329 && node->left != NULL
1330 && (node->token.opr.idx >= BITSET_WORD_BITS
1331 || !(dfa->used_bkref_map
1332 & ((bitset_word_t) 1 << node->token.opr.idx))))
1333 return node->left;
1334
1335 /* Convert the SUBEXP node to the concatenation of an
1336 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1337 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1338 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1339 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1340 tree = create_tree (dfa, op, tree1, CONCAT);
1341 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1342 {
1343 *err = REG_ESPACE;
1344 return NULL;
1345 }
1346
1347 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1348 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1349 return tree;
1350}
1351
1352/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1353 nodes. Requires a postorder visit. */
1354static reg_errcode_t
1355calc_first (void *extra, bin_tree_t *node)
1356{
1357 re_dfa_t *dfa = (re_dfa_t *) extra;
1358 if (node->token.type == CONCAT)
1359 {
1360 node->first = node->left->first;
1361 node->node_idx = node->left->node_idx;
1362 }
1363 else
1364 {
1365 node->first = node;
1366 node->node_idx = re_dfa_add_node (dfa, node->token);
1367 if (BE (node->node_idx == REG_MISSING, 0))
1368 return REG_ESPACE;
1369 }
1370 return REG_NOERROR;
1371}
1372
1373/* Pass 2: compute NEXT on the tree. Preorder visit. */
1374static reg_errcode_t
1375calc_next (void *extra, bin_tree_t *node)
1376{
1377 switch (node->token.type)
1378 {
1379 case OP_DUP_ASTERISK:
1380 node->left->next = node;
1381 break;
1382 case CONCAT:
1383 node->left->next = node->right->first;
1384 node->right->next = node->next;
1385 break;
1386 default:
1387 if (node->left)
1388 node->left->next = node->next;
1389 if (node->right)
1390 node->right->next = node->next;
1391 break;
1392 }
1393 return REG_NOERROR;
1394}
1395
1396/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1397static reg_errcode_t
1398link_nfa_nodes (void *extra, bin_tree_t *node)
1399{
1400 re_dfa_t *dfa = (re_dfa_t *) extra;
1401 Idx idx = node->node_idx;
1402 reg_errcode_t err = REG_NOERROR;
1403
1404 switch (node->token.type)
1405 {
1406 case CONCAT:
1407 break;
1408
1409 case END_OF_RE:
1410 assert (node->next == NULL);
1411 break;
1412
1413 case OP_DUP_ASTERISK:
1414 case OP_ALT:
1415 {
1416 Idx left, right;
1417 dfa->has_plural_match = 1;
1418 if (node->left != NULL)
1419 left = node->left->first->node_idx;
1420 else
1421 left = node->next->node_idx;
1422 if (node->right != NULL)
1423 right = node->right->first->node_idx;
1424 else
1425 right = node->next->node_idx;
1426 assert (REG_VALID_INDEX (left));
1427 assert (REG_VALID_INDEX (right));
1428 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1429 }
1430 break;
1431
1432 case ANCHOR:
1433 case OP_OPEN_SUBEXP:
1434 case OP_CLOSE_SUBEXP:
1435 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1436 break;
1437
1438 case OP_BACK_REF:
1439 dfa->nexts[idx] = node->next->node_idx;
1440 if (node->token.type == OP_BACK_REF)
1441 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1442 break;
1443
1444 default:
1445 assert (!IS_EPSILON_NODE (node->token.type));
1446 dfa->nexts[idx] = node->next->node_idx;
1447 break;
1448 }
1449
1450 return err;
1451}
1452
1453/* Duplicate the epsilon closure of the node ROOT_NODE.
1454 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1455 to their own constraint. */
1456
1457static reg_errcode_t
1458internal_function
1459duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1460 Idx root_node, unsigned int init_constraint)
1461{
1462 Idx org_node, clone_node;
1463 bool ok;
1464 unsigned int constraint = init_constraint;
1465 for (org_node = top_org_node, clone_node = top_clone_node;;)
1466 {
1467 Idx org_dest, clone_dest;
1468 if (dfa->nodes[org_node].type == OP_BACK_REF)
1469 {
1470 /* If the back reference epsilon-transit, its destination must
1471 also have the constraint. Then duplicate the epsilon closure
1472 of the destination of the back reference, and store it in
1473 edests of the back reference. */
1474 org_dest = dfa->nexts[org_node];
1475 re_node_set_empty (dfa->edests + clone_node);
1476 clone_dest = duplicate_node (dfa, org_dest, constraint);
1477 if (BE (clone_dest == REG_MISSING, 0))
1478 return REG_ESPACE;
1479 dfa->nexts[clone_node] = dfa->nexts[org_node];
1480 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1481 if (BE (! ok, 0))
1482 return REG_ESPACE;
1483 }
1484 else if (dfa->edests[org_node].nelem == 0)
1485 {
1486 /* In case of the node can't epsilon-transit, don't duplicate the
1487 destination and store the original destination as the
1488 destination of the node. */
1489 dfa->nexts[clone_node] = dfa->nexts[org_node];
1490 break;
1491 }
1492 else if (dfa->edests[org_node].nelem == 1)
1493 {
1494 /* In case of the node can epsilon-transit, and it has only one
1495 destination. */
1496 org_dest = dfa->edests[org_node].elems[0];
1497 re_node_set_empty (dfa->edests + clone_node);
1498 if (dfa->nodes[org_node].type == ANCHOR)
1499 {
1500 /* In case of the node has another constraint, append it. */
1501 if (org_node == root_node && clone_node != org_node)
1502 {
1503 /* ...but if the node is root_node itself, it means the
1504 epsilon closure have a loop, then tie it to the
1505 destination of the root_node. */
1506 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1507 if (BE (! ok, 0))
1508 return REG_ESPACE;
1509 break;
1510 }
1511 constraint |= dfa->nodes[org_node].opr.ctx_type;
1512 }
1513 clone_dest = duplicate_node (dfa, org_dest, constraint);
1514 if (BE (clone_dest == REG_MISSING, 0))
1515 return REG_ESPACE;
1516 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1517 if (BE (! ok, 0))
1518 return REG_ESPACE;
1519 }
1520 else /* dfa->edests[org_node].nelem == 2 */
1521 {
1522 /* In case of the node can epsilon-transit, and it has two
1523 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1524 org_dest = dfa->edests[org_node].elems[0];
1525 re_node_set_empty (dfa->edests + clone_node);
1526 /* Search for a duplicated node which satisfies the constraint. */
1527 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1528 if (clone_dest == REG_MISSING)
1529 {
1530 /* There are no such a duplicated node, create a new one. */
1531 reg_errcode_t err;
1532 clone_dest = duplicate_node (dfa, org_dest, constraint);
1533 if (BE (clone_dest == REG_MISSING, 0))
1534 return REG_ESPACE;
1535 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536 if (BE (! ok, 0))
1537 return REG_ESPACE;
1538 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1539 root_node, constraint);
1540 if (BE (err != REG_NOERROR, 0))
1541 return err;
1542 }
1543 else
1544 {
1545 /* There are a duplicated node which satisfy the constraint,
1546 use it to avoid infinite loop. */
1547 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548 if (BE (! ok, 0))
1549 return REG_ESPACE;
1550 }
1551
1552 org_dest = dfa->edests[org_node].elems[1];
1553 clone_dest = duplicate_node (dfa, org_dest, constraint);
1554 if (BE (clone_dest == REG_MISSING, 0))
1555 return REG_ESPACE;
1556 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557 if (BE (! ok, 0))
1558 return REG_ESPACE;
1559 }
1560 org_node = org_dest;
1561 clone_node = clone_dest;
1562 }
1563 return REG_NOERROR;
1564}
1565
1566/* Search for a node which is duplicated from the node ORG_NODE, and
1567 satisfies the constraint CONSTRAINT. */
1568
1569static Idx
1570search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1571 unsigned int constraint)
1572{
1573 Idx idx;
1574 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1575 {
1576 if (org_node == dfa->org_indices[idx]
1577 && constraint == dfa->nodes[idx].constraint)
1578 return idx; /* Found. */
1579 }
1580 return REG_MISSING; /* Not found. */
1581}
1582
1583/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1584 Return the index of the new node, or REG_MISSING if insufficient storage is
1585 available. */
1586
1587static Idx
1588duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1589{
1590 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1591 if (BE (dup_idx != REG_MISSING, 1))
1592 {
1593 dfa->nodes[dup_idx].constraint = constraint;
1594 if (dfa->nodes[org_idx].type == ANCHOR)
1595 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1596 dfa->nodes[dup_idx].duplicated = 1;
1597
1598 /* Store the index of the original node. */
1599 dfa->org_indices[dup_idx] = org_idx;
1600 }
1601 return dup_idx;
1602}
1603
1604static reg_errcode_t
1605calc_inveclosure (re_dfa_t *dfa)
1606{
1607 Idx src, idx;
1608 bool ok;
1609 for (idx = 0; idx < dfa->nodes_len; ++idx)
1610 re_node_set_init_empty (dfa->inveclosures + idx);
1611
1612 for (src = 0; src < dfa->nodes_len; ++src)
1613 {
1614 Idx *elems = dfa->eclosures[src].elems;
1615 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1616 {
1617 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1618 if (BE (! ok, 0))
1619 return REG_ESPACE;
1620 }
1621 }
1622
1623 return REG_NOERROR;
1624}
1625
1626/* Calculate "eclosure" for all the node in DFA. */
1627
1628static reg_errcode_t
1629calc_eclosure (re_dfa_t *dfa)
1630{
1631 Idx node_idx;
1632 bool incomplete;
1633#ifdef DEBUG
1634 assert (dfa->nodes_len > 0);
1635#endif
1636 incomplete = false;
1637 /* For each nodes, calculate epsilon closure. */
1638 for (node_idx = 0; ; ++node_idx)
1639 {
1640 reg_errcode_t err;
1641 re_node_set eclosure_elem;
1642 if (node_idx == dfa->nodes_len)
1643 {
1644 if (!incomplete)
1645 break;
1646 incomplete = false;
1647 node_idx = 0;
1648 }
1649
1650#ifdef DEBUG
1651 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1652#endif
1653
1654 /* If we have already calculated, skip it. */
1655 if (dfa->eclosures[node_idx].nelem != 0)
1656 continue;
1657 /* Calculate epsilon closure of `node_idx'. */
1658 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1659 if (BE (err != REG_NOERROR, 0))
1660 return err;
1661
1662 if (dfa->eclosures[node_idx].nelem == 0)
1663 {
1664 incomplete = true;
1665 re_node_set_free (&eclosure_elem);
1666 }
1667 }
1668 return REG_NOERROR;
1669}
1670
1671/* Calculate epsilon closure of NODE. */
1672
1673static reg_errcode_t
1674calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1675{
1676 reg_errcode_t err;
1677 unsigned int constraint;
1678 Idx i;
1679 bool incomplete;
1680 bool ok;
1681 re_node_set eclosure;
1682 incomplete = false;
1683 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1684 if (BE (err != REG_NOERROR, 0))
1685 return err;
1686
1687 /* This indicates that we are calculating this node now.
1688 We reference this value to avoid infinite loop. */
1689 dfa->eclosures[node].nelem = REG_MISSING;
1690
1691 constraint = ((dfa->nodes[node].type == ANCHOR)
1692 ? dfa->nodes[node].opr.ctx_type : 0);
1693 /* If the current node has constraints, duplicate all nodes.
1694 Since they must inherit the constraints. */
1695 if (constraint
1696 && dfa->edests[node].nelem
1697 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1698 {
1699 err = duplicate_node_closure (dfa, node, node, node, constraint);
1700 if (BE (err != REG_NOERROR, 0))
1701 return err;
1702 }
1703
1704 /* Expand each epsilon destination nodes. */
1705 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1706 for (i = 0; i < dfa->edests[node].nelem; ++i)
1707 {
1708 re_node_set eclosure_elem;
1709 Idx edest = dfa->edests[node].elems[i];
1710 /* If calculating the epsilon closure of `edest' is in progress,
1711 return intermediate result. */
1712 if (dfa->eclosures[edest].nelem == REG_MISSING)
1713 {
1714 incomplete = true;
1715 continue;
1716 }
1717 /* If we haven't calculated the epsilon closure of `edest' yet,
1718 calculate now. Otherwise use calculated epsilon closure. */
1719 if (dfa->eclosures[edest].nelem == 0)
1720 {
1721 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1722 if (BE (err != REG_NOERROR, 0))
1723 return err;
1724 }
1725 else
1726 eclosure_elem = dfa->eclosures[edest];
1727 /* Merge the epsilon closure of `edest'. */
1728 re_node_set_merge (&eclosure, &eclosure_elem);
1729 /* If the epsilon closure of `edest' is incomplete,
1730 the epsilon closure of this node is also incomplete. */
1731 if (dfa->eclosures[edest].nelem == 0)
1732 {
1733 incomplete = true;
1734 re_node_set_free (&eclosure_elem);
1735 }
1736 }
1737
1738 /* Epsilon closures include itself. */
1739 ok = re_node_set_insert (&eclosure, node);
1740 if (BE (! ok, 0))
1741 return REG_ESPACE;
1742 if (incomplete && !root)
1743 dfa->eclosures[node].nelem = 0;
1744 else
1745 dfa->eclosures[node] = eclosure;
1746 *new_set = eclosure;
1747 return REG_NOERROR;
1748}
1749
1750/* Functions for token which are used in the parser. */
1751
1752/* Fetch a token from INPUT.
1753 We must not use this function inside bracket expressions. */
1754
1755static void
1756internal_function
1757fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1758{
1759 re_string_skip_bytes (input, peek_token (result, input, syntax));
1760}
1761
1762/* Peek a token from INPUT, and return the length of the token.
1763 We must not use this function inside bracket expressions. */
1764
1765static int
1766internal_function
1767peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1768{
1769 unsigned char c;
1770
1771 if (re_string_eoi (input))
1772 {
1773 token->type = END_OF_RE;
1774 return 0;
1775 }
1776
1777 c = re_string_peek_byte (input, 0);
1778 token->opr.c = c;
1779
1780 token->word_char = 0;
1781#ifdef RE_ENABLE_I18N
1782 token->mb_partial = 0;
1783 if (input->mb_cur_max > 1 &&
1784 !re_string_first_byte (input, re_string_cur_idx (input)))
1785 {
1786 token->type = CHARACTER;
1787 token->mb_partial = 1;
1788 return 1;
1789 }
1790#endif
1791 if (c == '\\')
1792 {
1793 unsigned char c2;
1794 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1795 {
1796 token->type = BACK_SLASH;
1797 return 1;
1798 }
1799
1800 c2 = re_string_peek_byte_case (input, 1);
1801 token->opr.c = c2;
1802 token->type = CHARACTER;
1803#ifdef RE_ENABLE_I18N
1804 if (input->mb_cur_max > 1)
1805 {
1806 wint_t wc = re_string_wchar_at (input,
1807 re_string_cur_idx (input) + 1);
1808 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1809 }
1810 else
1811#endif
1812 token->word_char = IS_WORD_CHAR (c2) != 0;
1813
1814 switch (c2)
1815 {
1816 case '|':
1817 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1818 token->type = OP_ALT;
1819 break;
1820 case '1': case '2': case '3': case '4': case '5':
1821 case '6': case '7': case '8': case '9':
1822 if (!(syntax & RE_NO_BK_REFS))
1823 {
1824 token->type = OP_BACK_REF;
1825 token->opr.idx = c2 - '1';
1826 }
1827 break;
1828 case '<':
1829 if (!(syntax & RE_NO_GNU_OPS))
1830 {
1831 token->type = ANCHOR;
1832 token->opr.ctx_type = WORD_FIRST;
1833 }
1834 break;
1835 case '>':
1836 if (!(syntax & RE_NO_GNU_OPS))
1837 {
1838 token->type = ANCHOR;
1839 token->opr.ctx_type = WORD_LAST;
1840 }
1841 break;
1842 case 'b':
1843 if (!(syntax & RE_NO_GNU_OPS))
1844 {
1845 token->type = ANCHOR;
1846 token->opr.ctx_type = WORD_DELIM;
1847 }
1848 break;
1849 case 'B':
1850 if (!(syntax & RE_NO_GNU_OPS))
1851 {
1852 token->type = ANCHOR;
1853 token->opr.ctx_type = NOT_WORD_DELIM;
1854 }
1855 break;
1856 case 'w':
1857 if (!(syntax & RE_NO_GNU_OPS))
1858 token->type = OP_WORD;
1859 break;
1860 case 'W':
1861 if (!(syntax & RE_NO_GNU_OPS))
1862 token->type = OP_NOTWORD;
1863 break;
1864 case 's':
1865 if (!(syntax & RE_NO_GNU_OPS))
1866 token->type = OP_SPACE;
1867 break;
1868 case 'S':
1869 if (!(syntax & RE_NO_GNU_OPS))
1870 token->type = OP_NOTSPACE;
1871 break;
1872 case '`':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 {
1875 token->type = ANCHOR;
1876 token->opr.ctx_type = BUF_FIRST;
1877 }
1878 break;
1879 case '\'':
1880 if (!(syntax & RE_NO_GNU_OPS))
1881 {
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = BUF_LAST;
1884 }
1885 break;
1886 case '(':
1887 if (!(syntax & RE_NO_BK_PARENS))
1888 token->type = OP_OPEN_SUBEXP;
1889 break;
1890 case ')':
1891 if (!(syntax & RE_NO_BK_PARENS))
1892 token->type = OP_CLOSE_SUBEXP;
1893 break;
1894 case '+':
1895 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1896 token->type = OP_DUP_PLUS;
1897 break;
1898 case '?':
1899 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1900 token->type = OP_DUP_QUESTION;
1901 break;
1902 case '{':
1903 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1904 token->type = OP_OPEN_DUP_NUM;
1905 break;
1906 case '}':
1907 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1908 token->type = OP_CLOSE_DUP_NUM;
1909 break;
1910 default:
1911 break;
1912 }
1913 return 2;
1914 }
1915
1916 token->type = CHARACTER;
1917#ifdef RE_ENABLE_I18N
1918 if (input->mb_cur_max > 1)
1919 {
1920 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1921 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1922 }
1923 else
1924#endif
1925 token->word_char = IS_WORD_CHAR (token->opr.c);
1926
1927 switch (c)
1928 {
1929 case '\n':
1930 if (syntax & RE_NEWLINE_ALT)
1931 token->type = OP_ALT;
1932 break;
1933 case '|':
1934 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1935 token->type = OP_ALT;
1936 break;
1937 case '*':
1938 token->type = OP_DUP_ASTERISK;
1939 break;
1940 case '+':
1941 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1942 token->type = OP_DUP_PLUS;
1943 break;
1944 case '?':
1945 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1946 token->type = OP_DUP_QUESTION;
1947 break;
1948 case '{':
1949 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1950 token->type = OP_OPEN_DUP_NUM;
1951 break;
1952 case '}':
1953 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1954 token->type = OP_CLOSE_DUP_NUM;
1955 break;
1956 case '(':
1957 if (syntax & RE_NO_BK_PARENS)
1958 token->type = OP_OPEN_SUBEXP;
1959 break;
1960 case ')':
1961 if (syntax & RE_NO_BK_PARENS)
1962 token->type = OP_CLOSE_SUBEXP;
1963 break;
1964 case '[':
1965 token->type = OP_OPEN_BRACKET;
1966 break;
1967 case '.':
1968 token->type = OP_PERIOD;
1969 break;
1970 case '^':
1971 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1972 re_string_cur_idx (input) != 0)
1973 {
1974 char prev = re_string_peek_byte (input, -1);
1975 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1976 break;
1977 }
1978 token->type = ANCHOR;
1979 token->opr.ctx_type = LINE_FIRST;
1980 break;
1981 case '$':
1982 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1983 re_string_cur_idx (input) + 1 != re_string_length (input))
1984 {
1985 re_token_t next;
1986 re_string_skip_bytes (input, 1);
1987 peek_token (&next, input, syntax);
1988 re_string_skip_bytes (input, -1);
1989 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1990 break;
1991 }
1992 token->type = ANCHOR;
1993 token->opr.ctx_type = LINE_LAST;
1994 break;
1995 default:
1996 break;
1997 }
1998 return 1;
1999}
2000
2001/* Peek a token from INPUT, and return the length of the token.
2002 We must not use this function out of bracket expressions. */
2003
2004static int
2005internal_function
2006peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2007{
2008 unsigned char c;
2009 if (re_string_eoi (input))
2010 {
2011 token->type = END_OF_RE;
2012 return 0;
2013 }
2014 c = re_string_peek_byte (input, 0);
2015 token->opr.c = c;
2016
2017#ifdef RE_ENABLE_I18N
2018 if (input->mb_cur_max > 1 &&
2019 !re_string_first_byte (input, re_string_cur_idx (input)))
2020 {
2021 token->type = CHARACTER;
2022 return 1;
2023 }
2024#endif /* RE_ENABLE_I18N */
2025
2026 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2027 && re_string_cur_idx (input) + 1 < re_string_length (input))
2028 {
2029 /* In this case, '\' escape a character. */
2030 unsigned char c2;
2031 re_string_skip_bytes (input, 1);
2032 c2 = re_string_peek_byte (input, 0);
2033 token->opr.c = c2;
2034 token->type = CHARACTER;
2035 return 1;
2036 }
2037 if (c == '[') /* '[' is a special char in a bracket exps. */
2038 {
2039 unsigned char c2;
2040 int token_len;
2041 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2042 c2 = re_string_peek_byte (input, 1);
2043 else
2044 c2 = 0;
2045 token->opr.c = c2;
2046 token_len = 2;
2047 switch (c2)
2048 {
2049 case '.':
2050 token->type = OP_OPEN_COLL_ELEM;
2051 break;
2052 case '=':
2053 token->type = OP_OPEN_EQUIV_CLASS;
2054 break;
2055 case ':':
2056 if (syntax & RE_CHAR_CLASSES)
2057 {
2058 token->type = OP_OPEN_CHAR_CLASS;
2059 break;
2060 }
2061 /* else fall through. */
2062 default:
2063 token->type = CHARACTER;
2064 token->opr.c = c;
2065 token_len = 1;
2066 break;
2067 }
2068 return token_len;
2069 }
2070 switch (c)
2071 {
2072 case '-':
2073 token->type = OP_CHARSET_RANGE;
2074 break;
2075 case ']':
2076 token->type = OP_CLOSE_BRACKET;
2077 break;
2078 case '^':
2079 token->type = OP_NON_MATCH_LIST;
2080 break;
2081 default:
2082 token->type = CHARACTER;
2083 }
2084 return 1;
2085}
2086
2087/* Functions for parser. */
2088
2089/* Entry point of the parser.
2090 Parse the regular expression REGEXP and return the structure tree.
2091 If an error is occured, ERR is set by error code, and return NULL.
2092 This function build the following tree, from regular expression <reg_exp>:
2093 CAT
2094 / \
2095 / \
2096 <reg_exp> EOR
2097
2098 CAT means concatenation.
2099 EOR means end of regular expression. */
2100
2101static bin_tree_t *
2102parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2103 reg_errcode_t *err)
2104{
2105 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2106 bin_tree_t *tree, *eor, *root;
2107 re_token_t current_token;
2108 dfa->syntax = syntax;
2109 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2110 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2111 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2112 return NULL;
2113 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2114 if (tree != NULL)
2115 root = create_tree (dfa, tree, eor, CONCAT);
2116 else
2117 root = eor;
2118 if (BE (eor == NULL || root == NULL, 0))
2119 {
2120 *err = REG_ESPACE;
2121 return NULL;
2122 }
2123 return root;
2124}
2125
2126/* This function build the following tree, from regular expression
2127 <branch1>|<branch2>:
2128 ALT
2129 / \
2130 / \
2131 <branch1> <branch2>
2132
2133 ALT means alternative, which represents the operator `|'. */
2134
2135static bin_tree_t *
2136parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2137 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2138{
2139 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2140 bin_tree_t *tree, *branch = NULL;
2141 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2142 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2143 return NULL;
2144
2145 while (token->type == OP_ALT)
2146 {
2147 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2148 if (token->type != OP_ALT && token->type != END_OF_RE
2149 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2150 {
2151 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2152 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2153 return NULL;
2154 }
2155 else
2156 branch = NULL;
2157 tree = create_tree (dfa, tree, branch, OP_ALT);
2158 if (BE (tree == NULL, 0))
2159 {
2160 *err = REG_ESPACE;
2161 return NULL;
2162 }
2163 }
2164 return tree;
2165}
2166
2167/* This function build the following tree, from regular expression
2168 <exp1><exp2>:
2169 CAT
2170 / \
2171 / \
2172 <exp1> <exp2>
2173
2174 CAT means concatenation. */
2175
2176static bin_tree_t *
2177parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2178 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2179{
2180 bin_tree_t *tree, *expr;
2181 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2182 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2183 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2184 return NULL;
2185
2186 while (token->type != OP_ALT && token->type != END_OF_RE
2187 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2188 {
2189 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2190 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2191 {
2192 return NULL;
2193 }
2194 if (tree != NULL && expr != NULL)
2195 {
2196 tree = create_tree (dfa, tree, expr, CONCAT);
2197 if (tree == NULL)
2198 {
2199 *err = REG_ESPACE;
2200 return NULL;
2201 }
2202 }
2203 else if (tree == NULL)
2204 tree = expr;
2205 /* Otherwise expr == NULL, we don't need to create new tree. */
2206 }
2207 return tree;
2208}
2209
2210/* This function build the following tree, from regular expression a*:
2211 *
2212 |
2213 a
2214*/
2215
2216static bin_tree_t *
2217parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2218 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2219{
2220 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2221 bin_tree_t *tree;
2222 switch (token->type)
2223 {
2224 case CHARACTER:
2225 tree = create_token_tree (dfa, NULL, NULL, token);
2226 if (BE (tree == NULL, 0))
2227 {
2228 *err = REG_ESPACE;
2229 return NULL;
2230 }
2231#ifdef RE_ENABLE_I18N
2232 if (dfa->mb_cur_max > 1)
2233 {
2234 while (!re_string_eoi (regexp)
2235 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2236 {
2237 bin_tree_t *mbc_remain;
2238 fetch_token (token, regexp, syntax);
2239 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2240 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2241 if (BE (mbc_remain == NULL || tree == NULL, 0))
2242 {
2243 *err = REG_ESPACE;
2244 return NULL;
2245 }
2246 }
2247 }
2248#endif
2249 break;
2250 case OP_OPEN_SUBEXP:
2251 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2252 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2253 return NULL;
2254 break;
2255 case OP_OPEN_BRACKET:
2256 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2257 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2258 return NULL;
2259 break;
2260 case OP_BACK_REF:
2261 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2262 {
2263 *err = REG_ESUBREG;
2264 return NULL;
2265 }
2266 dfa->used_bkref_map |= 1 << token->opr.idx;
2267 tree = create_token_tree (dfa, NULL, NULL, token);
2268 if (BE (tree == NULL, 0))
2269 {
2270 *err = REG_ESPACE;
2271 return NULL;
2272 }
2273 ++dfa->nbackref;
2274 dfa->has_mb_node = 1;
2275 break;
2276 case OP_OPEN_DUP_NUM:
2277 if (syntax & RE_CONTEXT_INVALID_DUP)
2278 {
2279 *err = REG_BADRPT;
2280 return NULL;
2281 }
2282 /* FALLTHROUGH */
2283 case OP_DUP_ASTERISK:
2284 case OP_DUP_PLUS:
2285 case OP_DUP_QUESTION:
2286 if (syntax & RE_CONTEXT_INVALID_OPS)
2287 {
2288 *err = REG_BADRPT;
2289 return NULL;
2290 }
2291 else if (syntax & RE_CONTEXT_INDEP_OPS)
2292 {
2293 fetch_token (token, regexp, syntax);
2294 return parse_expression (regexp, preg, token, syntax, nest, err);
2295 }
2296 /* else fall through */
2297 case OP_CLOSE_SUBEXP:
2298 if ((token->type == OP_CLOSE_SUBEXP) &&
2299 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2300 {
2301 *err = REG_ERPAREN;
2302 return NULL;
2303 }
2304 /* else fall through */
2305 case OP_CLOSE_DUP_NUM:
2306 /* We treat it as a normal character. */
2307
2308 /* Then we can these characters as normal characters. */
2309 token->type = CHARACTER;
2310 /* mb_partial and word_char bits should be initialized already
2311 by peek_token. */
2312 tree = create_token_tree (dfa, NULL, NULL, token);
2313 if (BE (tree == NULL, 0))
2314 {
2315 *err = REG_ESPACE;
2316 return NULL;
2317 }
2318 break;
2319 case ANCHOR:
2320 if ((token->opr.ctx_type
2321 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2322 && dfa->word_ops_used == 0)
2323 init_word_char (dfa);
2324 if (token->opr.ctx_type == WORD_DELIM
2325 || token->opr.ctx_type == NOT_WORD_DELIM)
2326 {
2327 bin_tree_t *tree_first, *tree_last;
2328 if (token->opr.ctx_type == WORD_DELIM)
2329 {
2330 token->opr.ctx_type = WORD_FIRST;
2331 tree_first = create_token_tree (dfa, NULL, NULL, token);
2332 token->opr.ctx_type = WORD_LAST;
2333 }
2334 else
2335 {
2336 token->opr.ctx_type = INSIDE_WORD;
2337 tree_first = create_token_tree (dfa, NULL, NULL, token);
2338 token->opr.ctx_type = INSIDE_NOTWORD;
2339 }
2340 tree_last = create_token_tree (dfa, NULL, NULL, token);
2341 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2342 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2343 {
2344 *err = REG_ESPACE;
2345 return NULL;
2346 }
2347 }
2348 else
2349 {
2350 tree = create_token_tree (dfa, NULL, NULL, token);
2351 if (BE (tree == NULL, 0))
2352 {
2353 *err = REG_ESPACE;
2354 return NULL;
2355 }
2356 }
2357 /* We must return here, since ANCHORs can't be followed
2358 by repetition operators.
2359 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2360 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2361 fetch_token (token, regexp, syntax);
2362 return tree;
2363 case OP_PERIOD:
2364 tree = create_token_tree (dfa, NULL, NULL, token);
2365 if (BE (tree == NULL, 0))
2366 {
2367 *err = REG_ESPACE;
2368 return NULL;
2369 }
2370 if (dfa->mb_cur_max > 1)
2371 dfa->has_mb_node = 1;
2372 break;
2373 case OP_WORD:
2374 case OP_NOTWORD:
2375 tree = build_charclass_op (dfa, regexp->trans,
2376 (const unsigned char *) "alnum",
2377 (const unsigned char *) "_",
2378 token->type == OP_NOTWORD, err);
2379 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2380 return NULL;
2381 break;
2382 case OP_SPACE:
2383 case OP_NOTSPACE:
2384 tree = build_charclass_op (dfa, regexp->trans,
2385 (const unsigned char *) "space",
2386 (const unsigned char *) "",
2387 token->type == OP_NOTSPACE, err);
2388 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389 return NULL;
2390 break;
2391 case OP_ALT:
2392 case END_OF_RE:
2393 return NULL;
2394 case BACK_SLASH:
2395 *err = REG_EESCAPE;
2396 return NULL;
2397 default:
2398 /* Must not happen? */
2399#ifdef DEBUG
2400 assert (0);
2401#endif
2402 return NULL;
2403 }
2404 fetch_token (token, regexp, syntax);
2405
2406 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2407 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2408 {
2409 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2410 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2411 return NULL;
2412 /* In BRE consecutive duplications are not allowed. */
2413 if ((syntax & RE_CONTEXT_INVALID_DUP)
2414 && (token->type == OP_DUP_ASTERISK
2415 || token->type == OP_OPEN_DUP_NUM))
2416 {
2417 *err = REG_BADRPT;
2418 return NULL;
2419 }
2420 }
2421
2422 return tree;
2423}
2424
2425/* This function build the following tree, from regular expression
2426 (<reg_exp>):
2427 SUBEXP
2428 |
2429 <reg_exp>
2430*/
2431
2432static bin_tree_t *
2433parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2434 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2435{
2436 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2437 bin_tree_t *tree;
2438 size_t cur_nsub;
2439 cur_nsub = preg->re_nsub++;
2440
2441 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2442
2443 /* The subexpression may be a null string. */
2444 if (token->type == OP_CLOSE_SUBEXP)
2445 tree = NULL;
2446 else
2447 {
2448 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2449 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2450 *err = REG_EPAREN;
2451 if (BE (*err != REG_NOERROR, 0))
2452 return NULL;
2453 }
2454
2455 if (cur_nsub <= '9' - '1')
2456 dfa->completed_bkref_map |= 1 << cur_nsub;
2457
2458 tree = create_tree (dfa, tree, NULL, SUBEXP);
2459 if (BE (tree == NULL, 0))
2460 {
2461 *err = REG_ESPACE;
2462 return NULL;
2463 }
2464 tree->token.opr.idx = cur_nsub;
2465 return tree;
2466}
2467
2468/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2469
2470static bin_tree_t *
2471parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2472 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2473{
2474 bin_tree_t *tree = NULL, *old_tree = NULL;
2475 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2476 re_token_t start_token = *token;
2477
2478 if (token->type == OP_OPEN_DUP_NUM)
2479 {
2480 end = 0;
2481 start = fetch_number (regexp, token, syntax);
2482 if (start == REG_MISSING)
2483 {
2484 if (token->type == CHARACTER && token->opr.c == ',')
2485 start = 0; /* We treat "{,m}" as "{0,m}". */
2486 else
2487 {
2488 *err = REG_BADBR; /* <re>{} is invalid. */
2489 return NULL;
2490 }
2491 }
2492 if (BE (start != REG_ERROR, 1))
2493 {
2494 /* We treat "{n}" as "{n,n}". */
2495 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2496 : ((token->type == CHARACTER && token->opr.c == ',')
2497 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2498 }
2499 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2500 {
2501 /* Invalid sequence. */
2502 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2503 {
2504 if (token->type == END_OF_RE)
2505 *err = REG_EBRACE;
2506 else
2507 *err = REG_BADBR;
2508
2509 return NULL;
2510 }
2511
2512 /* If the syntax bit is set, rollback. */
2513 re_string_set_index (regexp, start_idx);
2514 *token = start_token;
2515 token->type = CHARACTER;
2516 /* mb_partial and word_char bits should be already initialized by
2517 peek_token. */
2518 return elem;
2519 }
2520
2521 if (BE (end != REG_MISSING && start > end, 0))
2522 {
2523 /* First number greater than second. */
2524 *err = REG_BADBR;
2525 return NULL;
2526 }
2527 }
2528 else
2529 {
2530 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2531 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2532 }
2533
2534 fetch_token (token, regexp, syntax);
2535
2536 if (BE (elem == NULL, 0))
2537 return NULL;
2538 if (BE (start == 0 && end == 0, 0))
2539 {
2540 postorder (elem, free_tree, NULL);
2541 return NULL;
2542 }
2543
2544 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2545 if (BE (start > 0, 0))
2546 {
2547 tree = elem;
2548 for (i = 2; i <= start; ++i)
2549 {
2550 elem = duplicate_tree (elem, dfa);
2551 tree = create_tree (dfa, tree, elem, CONCAT);
2552 if (BE (elem == NULL || tree == NULL, 0))
2553 goto parse_dup_op_espace;
2554 }
2555
2556 if (start == end)
2557 return tree;
2558
2559 /* Duplicate ELEM before it is marked optional. */
2560 elem = duplicate_tree (elem, dfa);
2561 old_tree = tree;
2562 }
2563 else
2564 old_tree = NULL;
2565
2566 if (elem->token.type == SUBEXP)
2567 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2568
2569 tree = create_tree (dfa, elem, NULL,
2570 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2571 if (BE (tree == NULL, 0))
2572 goto parse_dup_op_espace;
2573
2574 /* This loop is actually executed only when end != REG_MISSING,
2575 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2576 already created the start+1-th copy. */
2577 if ((Idx) -1 < 0 || end != REG_MISSING)
2578 for (i = start + 2; i <= end; ++i)
2579 {
2580 elem = duplicate_tree (elem, dfa);
2581 tree = create_tree (dfa, tree, elem, CONCAT);
2582 if (BE (elem == NULL || tree == NULL, 0))
2583 goto parse_dup_op_espace;
2584
2585 tree = create_tree (dfa, tree, NULL, OP_ALT);
2586 if (BE (tree == NULL, 0))
2587 goto parse_dup_op_espace;
2588 }
2589
2590 if (old_tree)
2591 tree = create_tree (dfa, old_tree, tree, CONCAT);
2592
2593 return tree;
2594
2595 parse_dup_op_espace:
2596 *err = REG_ESPACE;
2597 return NULL;
2598}
2599
2600/* Size of the names for collating symbol/equivalence_class/character_class.
2601 I'm not sure, but maybe enough. */
2602#define BRACKET_NAME_BUF_SIZE 32
2603
2604#ifndef _LIBC
2605 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2606 Build the range expression which starts from START_ELEM, and ends
2607 at END_ELEM. The result are written to MBCSET and SBCSET.
2608 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2609 mbcset->range_ends, is a pointer argument sinse we may
2610 update it. */
2611
2612static reg_errcode_t
2613internal_function
2614# ifdef RE_ENABLE_I18N
2615build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2616 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2617# else /* not RE_ENABLE_I18N */
2618build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2619 bracket_elem_t *end_elem)
2620# endif /* not RE_ENABLE_I18N */
2621{
2622 unsigned int start_ch, end_ch;
2623 /* Equivalence Classes and Character Classes can't be a range start/end. */
2624 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2625 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2626 0))
2627 return REG_ERANGE;
2628
2629 /* We can handle no multi character collating elements without libc
2630 support. */
2631 if (BE ((start_elem->type == COLL_SYM
2632 && strlen ((char *) start_elem->opr.name) > 1)
2633 || (end_elem->type == COLL_SYM
2634 && strlen ((char *) end_elem->opr.name) > 1), 0))
2635 return REG_ECOLLATE;
2636
2637# ifdef RE_ENABLE_I18N
2638 {
2639 wchar_t wc;
2640 wint_t start_wc;
2641 wint_t end_wc;
2642 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2643
2644 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2645 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2646 : 0));
2647 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2648 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2649 : 0));
2650 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2651 ? __btowc (start_ch) : start_elem->opr.wch);
2652 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2653 ? __btowc (end_ch) : end_elem->opr.wch);
2654 if (start_wc == WEOF || end_wc == WEOF)
2655 return REG_ECOLLATE;
2656 cmp_buf[0] = start_wc;
2657 cmp_buf[4] = end_wc;
2658 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2659 return REG_ERANGE;
2660
2661 /* Got valid collation sequence values, add them as a new entry.
2662 However, for !_LIBC we have no collation elements: if the
2663 character set is single byte, the single byte character set
2664 that we build below suffices. parse_bracket_exp passes
2665 no MBCSET if dfa->mb_cur_max == 1. */
2666 if (mbcset)
2667 {
2668 /* Check the space of the arrays. */
2669 if (BE (*range_alloc == mbcset->nranges, 0))
2670 {
2671 /* There is not enough space, need realloc. */
2672 wchar_t *new_array_start, *new_array_end;
2673 Idx new_nranges;
2674
2675 /* +1 in case of mbcset->nranges is 0. */
2676 new_nranges = 2 * mbcset->nranges + 1;
2677 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2678 are NULL if *range_alloc == 0. */
2679 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2680 new_nranges);
2681 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2682 new_nranges);
2683
2684 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2685 return REG_ESPACE;
2686
2687 mbcset->range_starts = new_array_start;
2688 mbcset->range_ends = new_array_end;
2689 *range_alloc = new_nranges;
2690 }
2691
2692 mbcset->range_starts[mbcset->nranges] = start_wc;
2693 mbcset->range_ends[mbcset->nranges++] = end_wc;
2694 }
2695
2696 /* Build the table for single byte characters. */
2697 for (wc = 0; wc < SBC_MAX; ++wc)
2698 {
2699 cmp_buf[2] = wc;
2700 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2701 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2702 bitset_set (sbcset, wc);
2703 }
2704 }
2705# else /* not RE_ENABLE_I18N */
2706 {
2707 unsigned int ch;
2708 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2709 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2710 : 0));
2711 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2712 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2713 : 0));
2714 if (start_ch > end_ch)
2715 return REG_ERANGE;
2716 /* Build the table for single byte characters. */
2717 for (ch = 0; ch < SBC_MAX; ++ch)
2718 if (start_ch <= ch && ch <= end_ch)
2719 bitset_set (sbcset, ch);
2720 }
2721# endif /* not RE_ENABLE_I18N */
2722 return REG_NOERROR;
2723}
2724#endif /* not _LIBC */
2725
2726#ifndef _LIBC
2727/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2728 Build the collating element which is represented by NAME.
2729 The result are written to MBCSET and SBCSET.
2730 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2731 pointer argument since we may update it. */
2732
2733static reg_errcode_t
2734internal_function
2735build_collating_symbol (bitset_t sbcset,
2736# ifdef RE_ENABLE_I18N
2737 re_charset_t *mbcset, Idx *coll_sym_alloc,
2738# endif
2739 const unsigned char *name)
2740{
2741 size_t name_len = strlen ((const char *) name);
2742 if (BE (name_len != 1, 0))
2743 return REG_ECOLLATE;
2744 else
2745 {
2746 bitset_set (sbcset, name[0]);
2747 return REG_NOERROR;
2748 }
2749}
2750#endif /* not _LIBC */
2751
2752/* This function parse bracket expression like "[abc]", "[a-c]",
2753 "[[.a-a.]]" etc. */
2754
2755static bin_tree_t *
2756parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2757 reg_syntax_t syntax, reg_errcode_t *err)
2758{
2759#ifdef _LIBC
2760 const unsigned char *collseqmb;
2761 const char *collseqwc;
2762 uint32_t nrules;
2763 int32_t table_size;
2764 const int32_t *symb_table;
2765 const unsigned char *extra;
2766
2767 /* Local function for parse_bracket_exp used in _LIBC environement.
2768 Seek the collating symbol entry correspondings to NAME.
2769 Return the index of the symbol in the SYMB_TABLE. */
2770
2771 auto inline int32_t
2772 __attribute ((always_inline))
2773 seek_collating_symbol_entry (name, name_len)
2774 const unsigned char *name;
2775 size_t name_len;
2776 {
2777 int32_t hash = elem_hash ((const char *) name, name_len);
2778 int32_t elem = hash % table_size;
2779 if (symb_table[2 * elem] != 0)
2780 {
2781 int32_t second = hash % (table_size - 2) + 1;
2782
2783 do
2784 {
2785 /* First compare the hashing value. */
2786 if (symb_table[2 * elem] == hash
2787 /* Compare the length of the name. */
2788 && name_len == extra[symb_table[2 * elem + 1]]
2789 /* Compare the name. */
2790 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2791 name_len) == 0)
2792 {
2793 /* Yep, this is the entry. */
2794 break;
2795 }
2796
2797 /* Next entry. */
2798 elem += second;
2799 }
2800 while (symb_table[2 * elem] != 0);
2801 }
2802 return elem;
2803 }
2804
2805 /* Local function for parse_bracket_exp used in _LIBC environement.
2806 Look up the collation sequence value of BR_ELEM.
2807 Return the value if succeeded, UINT_MAX otherwise. */
2808
2809 auto inline unsigned int
2810 __attribute ((always_inline))
2811 lookup_collation_sequence_value (br_elem)
2812 bracket_elem_t *br_elem;
2813 {
2814 if (br_elem->type == SB_CHAR)
2815 {
2816 /*
2817 if (MB_CUR_MAX == 1)
2818 */
2819 if (nrules == 0)
2820 return collseqmb[br_elem->opr.ch];
2821 else
2822 {
2823 wint_t wc = __btowc (br_elem->opr.ch);
2824 return __collseq_table_lookup (collseqwc, wc);
2825 }
2826 }
2827 else if (br_elem->type == MB_CHAR)
2828 {
2829 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2830 }
2831 else if (br_elem->type == COLL_SYM)
2832 {
2833 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2834 if (nrules != 0)
2835 {
2836 int32_t elem, idx;
2837 elem = seek_collating_symbol_entry (br_elem->opr.name,
2838 sym_name_len);
2839 if (symb_table[2 * elem] != 0)
2840 {
2841 /* We found the entry. */
2842 idx = symb_table[2 * elem + 1];
2843 /* Skip the name of collating element name. */
2844 idx += 1 + extra[idx];
2845 /* Skip the byte sequence of the collating element. */
2846 idx += 1 + extra[idx];
2847 /* Adjust for the alignment. */
2848 idx = (idx + 3) & ~3;
2849 /* Skip the multibyte collation sequence value. */
2850 idx += sizeof (unsigned int);
2851 /* Skip the wide char sequence of the collating element. */
2852 idx += sizeof (unsigned int) *
2853 (1 + *(unsigned int *) (extra + idx));
2854 /* Return the collation sequence value. */
2855 return *(unsigned int *) (extra + idx);
2856 }
2857 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2858 {
2859 /* No valid character. Match it as a single byte
2860 character. */
2861 return collseqmb[br_elem->opr.name[0]];
2862 }
2863 }
2864 else if (sym_name_len == 1)
2865 return collseqmb[br_elem->opr.name[0]];
2866 }
2867 return UINT_MAX;
2868 }
2869
2870 /* Local function for parse_bracket_exp used in _LIBC environement.
2871 Build the range expression which starts from START_ELEM, and ends
2872 at END_ELEM. The result are written to MBCSET and SBCSET.
2873 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2874 mbcset->range_ends, is a pointer argument sinse we may
2875 update it. */
2876
2877 auto inline reg_errcode_t
2878 __attribute ((always_inline))
2879 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2880 re_charset_t *mbcset;
2881 Idx *range_alloc;
2882 bitset_t sbcset;
2883 bracket_elem_t *start_elem, *end_elem;
2884 {
2885 unsigned int ch;
2886 uint32_t start_collseq;
2887 uint32_t end_collseq;
2888
2889 /* Equivalence Classes and Character Classes can't be a range
2890 start/end. */
2891 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2892 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2893 0))
2894 return REG_ERANGE;
2895
2896 start_collseq = lookup_collation_sequence_value (start_elem);
2897 end_collseq = lookup_collation_sequence_value (end_elem);
2898 /* Check start/end collation sequence values. */
2899 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2900 return REG_ECOLLATE;
2901 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2902 return REG_ERANGE;
2903
2904 /* Got valid collation sequence values, add them as a new entry.
2905 However, if we have no collation elements, and the character set
2906 is single byte, the single byte character set that we
2907 build below suffices. */
2908 if (nrules > 0 || dfa->mb_cur_max > 1)
2909 {
2910 /* Check the space of the arrays. */
2911 if (BE (*range_alloc == mbcset->nranges, 0))
2912 {
2913 /* There is not enough space, need realloc. */
2914 uint32_t *new_array_start;
2915 uint32_t *new_array_end;
2916 Idx new_nranges;
2917
2918 /* +1 in case of mbcset->nranges is 0. */
2919 new_nranges = 2 * mbcset->nranges + 1;
2920 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2921 new_nranges);
2922 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2923 new_nranges);
2924
2925 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2926 return REG_ESPACE;
2927
2928 mbcset->range_starts = new_array_start;
2929 mbcset->range_ends = new_array_end;
2930 *range_alloc = new_nranges;
2931 }
2932
2933 mbcset->range_starts[mbcset->nranges] = start_collseq;
2934 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2935 }
2936
2937 /* Build the table for single byte characters. */
2938 for (ch = 0; ch < SBC_MAX; ch++)
2939 {
2940 uint32_t ch_collseq;
2941 /*
2942 if (MB_CUR_MAX == 1)
2943 */
2944 if (nrules == 0)
2945 ch_collseq = collseqmb[ch];
2946 else
2947 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2948 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2949 bitset_set (sbcset, ch);
2950 }
2951 return REG_NOERROR;
2952 }
2953
2954 /* Local function for parse_bracket_exp used in _LIBC environement.
2955 Build the collating element which is represented by NAME.
2956 The result are written to MBCSET and SBCSET.
2957 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2958 pointer argument sinse we may update it. */
2959
2960 auto inline reg_errcode_t
2961 __attribute ((always_inline))
2962 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2963 re_charset_t *mbcset;
2964 Idx *coll_sym_alloc;
2965 bitset_t sbcset;
2966 const unsigned char *name;
2967 {
2968 int32_t elem, idx;
2969 size_t name_len = strlen ((const char *) name);
2970 if (nrules != 0)
2971 {
2972 elem = seek_collating_symbol_entry (name, name_len);
2973 if (symb_table[2 * elem] != 0)
2974 {
2975 /* We found the entry. */
2976 idx = symb_table[2 * elem + 1];
2977 /* Skip the name of collating element name. */
2978 idx += 1 + extra[idx];
2979 }
2980 else if (symb_table[2 * elem] == 0 && name_len == 1)
2981 {
2982 /* No valid character, treat it as a normal
2983 character. */
2984 bitset_set (sbcset, name[0]);
2985 return REG_NOERROR;
2986 }
2987 else
2988 return REG_ECOLLATE;
2989
2990 /* Got valid collation sequence, add it as a new entry. */
2991 /* Check the space of the arrays. */
2992 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2993 {
2994 /* Not enough, realloc it. */
2995 /* +1 in case of mbcset->ncoll_syms is 0. */
2996 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2997 /* Use realloc since mbcset->coll_syms is NULL
2998 if *alloc == 0. */
2999 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3000 new_coll_sym_alloc);
3001 if (BE (new_coll_syms == NULL, 0))
3002 return REG_ESPACE;
3003 mbcset->coll_syms = new_coll_syms;
3004 *coll_sym_alloc = new_coll_sym_alloc;
3005 }
3006 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3007 return REG_NOERROR;
3008 }
3009 else
3010 {
3011 if (BE (name_len != 1, 0))
3012 return REG_ECOLLATE;
3013 else
3014 {
3015 bitset_set (sbcset, name[0]);
3016 return REG_NOERROR;
3017 }
3018 }
3019 }
3020#endif
3021
3022 re_token_t br_token;
3023 re_bitset_ptr_t sbcset;
3024#ifdef RE_ENABLE_I18N
3025 re_charset_t *mbcset;
3026 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3027 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3028#endif /* not RE_ENABLE_I18N */
3029 bool non_match = false;
3030 bin_tree_t *work_tree;
3031 int token_len;
3032 bool first_round = true;
3033#ifdef _LIBC
3034 collseqmb = (const unsigned char *)
3035 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3036 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3037 if (nrules)
3038 {
3039 /*
3040 if (MB_CUR_MAX > 1)
3041 */
3042 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3043 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3044 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3045 _NL_COLLATE_SYMB_TABLEMB);
3046 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3047 _NL_COLLATE_SYMB_EXTRAMB);
3048 }
3049#endif
3050 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3051#ifdef RE_ENABLE_I18N
3052 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3053#endif /* RE_ENABLE_I18N */
3054#ifdef RE_ENABLE_I18N
3055 if (BE (sbcset == NULL || mbcset == NULL, 0))
3056#else
3057 if (BE (sbcset == NULL, 0))
3058#endif /* RE_ENABLE_I18N */
3059 {
3060 *err = REG_ESPACE;
3061 return NULL;
3062 }
3063
3064 token_len = peek_token_bracket (token, regexp, syntax);
3065 if (BE (token->type == END_OF_RE, 0))
3066 {
3067 *err = REG_BADPAT;
3068 goto parse_bracket_exp_free_return;
3069 }
3070 if (token->type == OP_NON_MATCH_LIST)
3071 {
3072#ifdef RE_ENABLE_I18N
3073 mbcset->non_match = 1;
3074#endif /* not RE_ENABLE_I18N */
3075 non_match = true;
3076 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3077 bitset_set (sbcset, '\0');
3078 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3079 token_len = peek_token_bracket (token, regexp, syntax);
3080 if (BE (token->type == END_OF_RE, 0))
3081 {
3082 *err = REG_BADPAT;
3083 goto parse_bracket_exp_free_return;
3084 }
3085 }
3086
3087 /* We treat the first ']' as a normal character. */
3088 if (token->type == OP_CLOSE_BRACKET)
3089 token->type = CHARACTER;
3090
3091 while (1)
3092 {
3093 bracket_elem_t start_elem, end_elem;
3094 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3095 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3096 reg_errcode_t ret;
3097 int token_len2 = 0;
3098 bool is_range_exp = false;
3099 re_token_t token2;
3100
3101 start_elem.opr.name = start_name_buf;
3102 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3103 syntax, first_round);
3104 if (BE (ret != REG_NOERROR, 0))
3105 {
3106 *err = ret;
3107 goto parse_bracket_exp_free_return;
3108 }
3109 first_round = false;
3110
3111 /* Get information about the next token. We need it in any case. */
3112 token_len = peek_token_bracket (token, regexp, syntax);
3113
3114 /* Do not check for ranges if we know they are not allowed. */
3115 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3116 {
3117 if (BE (token->type == END_OF_RE, 0))
3118 {
3119 *err = REG_EBRACK;
3120 goto parse_bracket_exp_free_return;
3121 }
3122 if (token->type == OP_CHARSET_RANGE)
3123 {
3124 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3125 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3126 if (BE (token2.type == END_OF_RE, 0))
3127 {
3128 *err = REG_EBRACK;
3129 goto parse_bracket_exp_free_return;
3130 }
3131 if (token2.type == OP_CLOSE_BRACKET)
3132 {
3133 /* We treat the last '-' as a normal character. */
3134 re_string_skip_bytes (regexp, -token_len);
3135 token->type = CHARACTER;
3136 }
3137 else
3138 is_range_exp = true;
3139 }
3140 }
3141
3142 if (is_range_exp == true)
3143 {
3144 end_elem.opr.name = end_name_buf;
3145 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3146 dfa, syntax, true);
3147 if (BE (ret != REG_NOERROR, 0))
3148 {
3149 *err = ret;
3150 goto parse_bracket_exp_free_return;
3151 }
3152
3153 token_len = peek_token_bracket (token, regexp, syntax);
3154
3155#ifdef _LIBC
3156 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3157 &start_elem, &end_elem);
3158#else
3159# ifdef RE_ENABLE_I18N
3160 *err = build_range_exp (sbcset,
3161 dfa->mb_cur_max > 1 ? mbcset : NULL,
3162 &range_alloc, &start_elem, &end_elem);
3163# else
3164 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3165# endif
3166#endif /* RE_ENABLE_I18N */
3167 if (BE (*err != REG_NOERROR, 0))
3168 goto parse_bracket_exp_free_return;
3169 }
3170 else
3171 {
3172 switch (start_elem.type)
3173 {
3174 case SB_CHAR:
3175 bitset_set (sbcset, start_elem.opr.ch);
3176 break;
3177#ifdef RE_ENABLE_I18N
3178 case MB_CHAR:
3179 /* Check whether the array has enough space. */
3180 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3181 {
3182 wchar_t *new_mbchars;
3183 /* Not enough, realloc it. */
3184 /* +1 in case of mbcset->nmbchars is 0. */
3185 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3186 /* Use realloc since array is NULL if *alloc == 0. */
3187 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3188 mbchar_alloc);
3189 if (BE (new_mbchars == NULL, 0))
3190 goto parse_bracket_exp_espace;
3191 mbcset->mbchars = new_mbchars;
3192 }
3193 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3194 break;
3195#endif /* RE_ENABLE_I18N */
3196 case EQUIV_CLASS:
3197 *err = build_equiv_class (sbcset,
3198#ifdef RE_ENABLE_I18N
3199 mbcset, &equiv_class_alloc,
3200#endif /* RE_ENABLE_I18N */
3201 start_elem.opr.name);
3202 if (BE (*err != REG_NOERROR, 0))
3203 goto parse_bracket_exp_free_return;
3204 break;
3205 case COLL_SYM:
3206 *err = build_collating_symbol (sbcset,
3207#ifdef RE_ENABLE_I18N
3208 mbcset, &coll_sym_alloc,
3209#endif /* RE_ENABLE_I18N */
3210 start_elem.opr.name);
3211 if (BE (*err != REG_NOERROR, 0))
3212 goto parse_bracket_exp_free_return;
3213 break;
3214 case CHAR_CLASS:
3215 *err = build_charclass (regexp->trans, sbcset,
3216#ifdef RE_ENABLE_I18N
3217 mbcset, &char_class_alloc,
3218#endif /* RE_ENABLE_I18N */
3219 start_elem.opr.name, syntax);
3220 if (BE (*err != REG_NOERROR, 0))
3221 goto parse_bracket_exp_free_return;
3222 break;
3223 default:
3224 assert (0);
3225 break;
3226 }
3227 }
3228 if (BE (token->type == END_OF_RE, 0))
3229 {
3230 *err = REG_EBRACK;
3231 goto parse_bracket_exp_free_return;
3232 }
3233 if (token->type == OP_CLOSE_BRACKET)
3234 break;
3235 }
3236
3237 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3238
3239 /* If it is non-matching list. */
3240 if (non_match)
3241 bitset_not (sbcset);
3242
3243#ifdef RE_ENABLE_I18N
3244 /* Ensure only single byte characters are set. */
3245 if (dfa->mb_cur_max > 1)
3246 bitset_mask (sbcset, dfa->sb_char);
3247
3248 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3249 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3250 || mbcset->non_match)))
3251 {
3252 bin_tree_t *mbc_tree;
3253 int sbc_idx;
3254 /* Build a tree for complex bracket. */
3255 dfa->has_mb_node = 1;
3256 br_token.type = COMPLEX_BRACKET;
3257 br_token.opr.mbcset = mbcset;
3258 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3259 if (BE (mbc_tree == NULL, 0))
3260 goto parse_bracket_exp_espace;
3261 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3262 if (sbcset[sbc_idx])
3263 break;
3264 /* If there are no bits set in sbcset, there is no point
3265 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3266 if (sbc_idx < BITSET_WORDS)
3267 {
3268 /* Build a tree for simple bracket. */
3269 br_token.type = SIMPLE_BRACKET;
3270 br_token.opr.sbcset = sbcset;
3271 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3272 if (BE (work_tree == NULL, 0))
3273 goto parse_bracket_exp_espace;
3274
3275 /* Then join them by ALT node. */
3276 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3277 if (BE (work_tree == NULL, 0))
3278 goto parse_bracket_exp_espace;
3279 }
3280 else
3281 {
3282 re_free (sbcset);
3283 work_tree = mbc_tree;
3284 }
3285 }
3286 else
3287#endif /* not RE_ENABLE_I18N */
3288 {
3289#ifdef RE_ENABLE_I18N
3290 free_charset (mbcset);
3291#endif
3292 /* Build a tree for simple bracket. */
3293 br_token.type = SIMPLE_BRACKET;
3294 br_token.opr.sbcset = sbcset;
3295 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3296 if (BE (work_tree == NULL, 0))
3297 goto parse_bracket_exp_espace;
3298 }
3299 return work_tree;
3300
3301 parse_bracket_exp_espace:
3302 *err = REG_ESPACE;
3303 parse_bracket_exp_free_return:
3304 re_free (sbcset);
3305#ifdef RE_ENABLE_I18N
3306 free_charset (mbcset);
3307#endif /* RE_ENABLE_I18N */
3308 return NULL;
3309}
3310
3311/* Parse an element in the bracket expression. */
3312
3313static reg_errcode_t
3314parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3315 re_token_t *token, int token_len, re_dfa_t *dfa,
3316 reg_syntax_t syntax, bool accept_hyphen)
3317{
3318#ifdef RE_ENABLE_I18N
3319 int cur_char_size;
3320 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3321 if (cur_char_size > 1)
3322 {
3323 elem->type = MB_CHAR;
3324 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3325 re_string_skip_bytes (regexp, cur_char_size);
3326 return REG_NOERROR;
3327 }
3328#endif /* RE_ENABLE_I18N */
3329 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3330 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3331 || token->type == OP_OPEN_EQUIV_CLASS)
3332 return parse_bracket_symbol (elem, regexp, token);
3333 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3334 {
3335 /* A '-' must only appear as anything but a range indicator before
3336 the closing bracket. Everything else is an error. */
3337 re_token_t token2;
3338 (void) peek_token_bracket (&token2, regexp, syntax);
3339 if (token2.type != OP_CLOSE_BRACKET)
3340 /* The actual error value is not standardized since this whole
3341 case is undefined. But ERANGE makes good sense. */
3342 return REG_ERANGE;
3343 }
3344 elem->type = SB_CHAR;
3345 elem->opr.ch = token->opr.c;
3346 return REG_NOERROR;
3347}
3348
3349/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3350 such as [:<character_class>:], [.<collating_element>.], and
3351 [=<equivalent_class>=]. */
3352
3353static reg_errcode_t
3354parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3355 re_token_t *token)
3356{
3357 unsigned char ch, delim = token->opr.c;
3358 int i = 0;
3359 if (re_string_eoi(regexp))
3360 return REG_EBRACK;
3361 for (;; ++i)
3362 {
3363 if (i >= BRACKET_NAME_BUF_SIZE)
3364 return REG_EBRACK;
3365 if (token->type == OP_OPEN_CHAR_CLASS)
3366 ch = re_string_fetch_byte_case (regexp);
3367 else
3368 ch = re_string_fetch_byte (regexp);
3369 if (re_string_eoi(regexp))
3370 return REG_EBRACK;
3371 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3372 break;
3373 elem->opr.name[i] = ch;
3374 }
3375 re_string_skip_bytes (regexp, 1);
3376 elem->opr.name[i] = '\0';
3377 switch (token->type)
3378 {
3379 case OP_OPEN_COLL_ELEM:
3380 elem->type = COLL_SYM;
3381 break;
3382 case OP_OPEN_EQUIV_CLASS:
3383 elem->type = EQUIV_CLASS;
3384 break;
3385 case OP_OPEN_CHAR_CLASS:
3386 elem->type = CHAR_CLASS;
3387 break;
3388 default:
3389 break;
3390 }
3391 return REG_NOERROR;
3392}
3393
3394 /* Helper function for parse_bracket_exp.
3395 Build the equivalence class which is represented by NAME.
3396 The result are written to MBCSET and SBCSET.
3397 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3398 is a pointer argument sinse we may update it. */
3399
3400static reg_errcode_t
3401#ifdef RE_ENABLE_I18N
3402build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3403 Idx *equiv_class_alloc, const unsigned char *name)
3404#else /* not RE_ENABLE_I18N */
3405build_equiv_class (bitset_t sbcset, const unsigned char *name)
3406#endif /* not RE_ENABLE_I18N */
3407{
3408#ifdef _LIBC
3409 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3410 if (nrules != 0)
3411 {
3412 const int32_t *table, *indirect;
3413 const unsigned char *weights, *extra, *cp;
3414 unsigned char char_buf[2];
3415 int32_t idx1, idx2;
3416 unsigned int ch;
3417 size_t len;
3418 /* This #include defines a local function! */
3419# include <locale/weight.h>
3420 /* Calculate the index for equivalence class. */
3421 cp = name;
3422 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3423 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3424 _NL_COLLATE_WEIGHTMB);
3425 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3426 _NL_COLLATE_EXTRAMB);
3427 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3428 _NL_COLLATE_INDIRECTMB);
3429 idx1 = findidx (&cp);
3430 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3431 /* This isn't a valid character. */
3432 return REG_ECOLLATE;
3433
3434 /* Build single byte matcing table for this equivalence class. */
3435 char_buf[1] = (unsigned char) '\0';
3436 len = weights[idx1];
3437 for (ch = 0; ch < SBC_MAX; ++ch)
3438 {
3439 char_buf[0] = ch;
3440 cp = char_buf;
3441 idx2 = findidx (&cp);
3442/*
3443 idx2 = table[ch];
3444*/
3445 if (idx2 == 0)
3446 /* This isn't a valid character. */
3447 continue;
3448 if (len == weights[idx2])
3449 {
3450 int cnt = 0;
3451 while (cnt <= len &&
3452 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3453 ++cnt;
3454
3455 if (cnt > len)
3456 bitset_set (sbcset, ch);
3457 }
3458 }
3459 /* Check whether the array has enough space. */
3460 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3461 {
3462 /* Not enough, realloc it. */
3463 /* +1 in case of mbcset->nequiv_classes is 0. */
3464 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3465 /* Use realloc since the array is NULL if *alloc == 0. */
3466 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3467 int32_t,
3468 new_equiv_class_alloc);
3469 if (BE (new_equiv_classes == NULL, 0))
3470 return REG_ESPACE;
3471 mbcset->equiv_classes = new_equiv_classes;
3472 *equiv_class_alloc = new_equiv_class_alloc;
3473 }
3474 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3475 }
3476 else
3477#endif /* _LIBC */
3478 {
3479 if (BE (strlen ((const char *) name) != 1, 0))
3480 return REG_ECOLLATE;
3481 bitset_set (sbcset, *name);
3482 }
3483 return REG_NOERROR;
3484}
3485
3486 /* Helper function for parse_bracket_exp.
3487 Build the character class which is represented by NAME.
3488 The result are written to MBCSET and SBCSET.
3489 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3490 is a pointer argument sinse we may update it. */
3491
3492static reg_errcode_t
3493#ifdef RE_ENABLE_I18N
3494build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3495 re_charset_t *mbcset, Idx *char_class_alloc,
3496 const unsigned char *class_name, reg_syntax_t syntax)
3497#else /* not RE_ENABLE_I18N */
3498build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3499 const unsigned char *class_name, reg_syntax_t syntax)
3500#endif /* not RE_ENABLE_I18N */
3501{
3502 int i;
3503 const char *name = (const char *) class_name;
3504
3505 /* In case of REG_ICASE "upper" and "lower" match the both of
3506 upper and lower cases. */
3507 if ((syntax & RE_ICASE)
3508 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3509 name = "alpha";
3510
3511#ifdef RE_ENABLE_I18N
3512 /* Check the space of the arrays. */
3513 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3514 {
3515 /* Not enough, realloc it. */
3516 /* +1 in case of mbcset->nchar_classes is 0. */
3517 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3518 /* Use realloc since array is NULL if *alloc == 0. */
3519 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3520 new_char_class_alloc);
3521 if (BE (new_char_classes == NULL, 0))
3522 return REG_ESPACE;
3523 mbcset->char_classes = new_char_classes;
3524 *char_class_alloc = new_char_class_alloc;
3525 }
3526 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3527#endif /* RE_ENABLE_I18N */
3528
3529#define BUILD_CHARCLASS_LOOP(ctype_func) \
3530 do { \
3531 if (BE (trans != NULL, 0)) \
3532 { \
3533 for (i = 0; i < SBC_MAX; ++i) \
3534 if (ctype_func (i)) \
3535 bitset_set (sbcset, trans[i]); \
3536 } \
3537 else \
3538 { \
3539 for (i = 0; i < SBC_MAX; ++i) \
3540 if (ctype_func (i)) \
3541 bitset_set (sbcset, i); \
3542 } \
3543 } while (0)
3544
3545 if (strcmp (name, "alnum") == 0)
3546 BUILD_CHARCLASS_LOOP (isalnum);
3547 else if (strcmp (name, "cntrl") == 0)
3548 BUILD_CHARCLASS_LOOP (iscntrl);
3549 else if (strcmp (name, "lower") == 0)
3550 BUILD_CHARCLASS_LOOP (islower);
3551 else if (strcmp (name, "space") == 0)
3552 BUILD_CHARCLASS_LOOP (isspace);
3553 else if (strcmp (name, "alpha") == 0)
3554 BUILD_CHARCLASS_LOOP (isalpha);
3555 else if (strcmp (name, "digit") == 0)
3556 BUILD_CHARCLASS_LOOP (isdigit);
3557 else if (strcmp (name, "print") == 0)
3558 BUILD_CHARCLASS_LOOP (isprint);
3559 else if (strcmp (name, "upper") == 0)
3560 BUILD_CHARCLASS_LOOP (isupper);
3561 else if (strcmp (name, "blank") == 0)
3562 BUILD_CHARCLASS_LOOP (isblank);
3563 else if (strcmp (name, "graph") == 0)
3564 BUILD_CHARCLASS_LOOP (isgraph);
3565 else if (strcmp (name, "punct") == 0)
3566 BUILD_CHARCLASS_LOOP (ispunct);
3567 else if (strcmp (name, "xdigit") == 0)
3568 BUILD_CHARCLASS_LOOP (isxdigit);
3569 else
3570 return REG_ECTYPE;
3571
3572 return REG_NOERROR;
3573}
3574
3575static bin_tree_t *
3576build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3577 const unsigned char *class_name,
3578 const unsigned char *extra, bool non_match,
3579 reg_errcode_t *err)
3580{
3581 re_bitset_ptr_t sbcset;
3582#ifdef RE_ENABLE_I18N
3583 re_charset_t *mbcset;
3584 Idx alloc = 0;
3585#endif /* not RE_ENABLE_I18N */
3586 reg_errcode_t ret;
3587 re_token_t br_token;
3588 bin_tree_t *tree;
3589
3590 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3591#ifdef RE_ENABLE_I18N
3592 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3593#endif /* RE_ENABLE_I18N */
3594
3595#ifdef RE_ENABLE_I18N
3596 if (BE (sbcset == NULL || mbcset == NULL, 0))
3597#else /* not RE_ENABLE_I18N */
3598 if (BE (sbcset == NULL, 0))
3599#endif /* not RE_ENABLE_I18N */
3600 {
3601 *err = REG_ESPACE;
3602 return NULL;
3603 }
3604
3605 if (non_match)
3606 {
3607#ifdef RE_ENABLE_I18N
3608 /*
3609 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3610 bitset_set(cset->sbcset, '\0');
3611 */
3612 mbcset->non_match = 1;
3613#endif /* not RE_ENABLE_I18N */
3614 }
3615
3616 /* We don't care the syntax in this case. */
3617 ret = build_charclass (trans, sbcset,
3618#ifdef RE_ENABLE_I18N
3619 mbcset, &alloc,
3620#endif /* RE_ENABLE_I18N */
3621 class_name, 0);
3622
3623 if (BE (ret != REG_NOERROR, 0))
3624 {
3625 re_free (sbcset);
3626#ifdef RE_ENABLE_I18N
3627 free_charset (mbcset);
3628#endif /* RE_ENABLE_I18N */
3629 *err = ret;
3630 return NULL;
3631 }
3632 /* \w match '_' also. */
3633 for (; *extra; extra++)
3634 bitset_set (sbcset, *extra);
3635
3636 /* If it is non-matching list. */
3637 if (non_match)
3638 bitset_not (sbcset);
3639
3640#ifdef RE_ENABLE_I18N
3641 /* Ensure only single byte characters are set. */
3642 if (dfa->mb_cur_max > 1)
3643 bitset_mask (sbcset, dfa->sb_char);
3644#endif
3645
3646 /* Build a tree for simple bracket. */
3647 br_token.type = SIMPLE_BRACKET;
3648 br_token.opr.sbcset = sbcset;
3649 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3650 if (BE (tree == NULL, 0))
3651 goto build_word_op_espace;
3652
3653#ifdef RE_ENABLE_I18N
3654 if (dfa->mb_cur_max > 1)
3655 {
3656 bin_tree_t *mbc_tree;
3657 /* Build a tree for complex bracket. */
3658 br_token.type = COMPLEX_BRACKET;
3659 br_token.opr.mbcset = mbcset;
3660 dfa->has_mb_node = 1;
3661 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3662 if (BE (mbc_tree == NULL, 0))
3663 goto build_word_op_espace;
3664 /* Then join them by ALT node. */
3665 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3666 if (BE (mbc_tree != NULL, 1))
3667 return tree;
3668 }
3669 else
3670 {
3671 free_charset (mbcset);
3672 return tree;
3673 }
3674#else /* not RE_ENABLE_I18N */
3675 return tree;
3676#endif /* not RE_ENABLE_I18N */
3677
3678 build_word_op_espace:
3679 re_free (sbcset);
3680#ifdef RE_ENABLE_I18N
3681 free_charset (mbcset);
3682#endif /* RE_ENABLE_I18N */
3683 *err = REG_ESPACE;
3684 return NULL;
3685}
3686
3687/* This is intended for the expressions like "a{1,3}".
3688 Fetch a number from `input', and return the number.
3689 Return REG_MISSING if the number field is empty like "{,1}".
3690 Return REG_ERROR if an error occurred. */
3691
3692static Idx
3693fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3694{
3695 Idx num = REG_MISSING;
3696 unsigned char c;
3697 while (1)
3698 {
3699 fetch_token (token, input, syntax);
3700 c = token->opr.c;
3701 if (BE (token->type == END_OF_RE, 0))
3702 return REG_ERROR;
3703 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3704 break;
3705 num = ((token->type != CHARACTER || c < '0' || '9' < c
3706 || num == REG_ERROR)
3707 ? REG_ERROR
3708 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3709 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3710 }
3711 return num;
3712}
3713
3714#ifdef RE_ENABLE_I18N
3715static void
3716free_charset (re_charset_t *cset)
3717{
3718 re_free (cset->mbchars);
3719# ifdef _LIBC
3720 re_free (cset->coll_syms);
3721 re_free (cset->equiv_classes);
3722 re_free (cset->range_starts);
3723 re_free (cset->range_ends);
3724# endif
3725 re_free (cset->char_classes);
3726 re_free (cset);
3727}
3728#endif /* RE_ENABLE_I18N */
3729
3730/* Functions for binary tree operation. */
3731
3732/* Create a tree node. */
3733
3734static bin_tree_t *
3735create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3736 re_token_type_t type)
3737{
3738 re_token_t t;
3739 t.type = type;
3740 return create_token_tree (dfa, left, right, &t);
3741}
3742
3743static bin_tree_t *
3744create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3745 const re_token_t *token)
3746{
3747 bin_tree_t *tree;
3748 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3749 {
3750 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3751
3752 if (storage == NULL)
3753 return NULL;
3754 storage->next = dfa->str_tree_storage;
3755 dfa->str_tree_storage = storage;
3756 dfa->str_tree_storage_idx = 0;
3757 }
3758 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3759
3760 tree->parent = NULL;
3761 tree->left = left;
3762 tree->right = right;
3763 tree->token = *token;
3764 tree->token.duplicated = 0;
3765 tree->token.opt_subexp = 0;
3766 tree->first = NULL;
3767 tree->next = NULL;
3768 tree->node_idx = REG_MISSING;
3769
3770 if (left != NULL)
3771 left->parent = tree;
3772 if (right != NULL)
3773 right->parent = tree;
3774 return tree;
3775}
3776
3777/* Mark the tree SRC as an optional subexpression.
3778 To be called from preorder or postorder. */
3779
3780static reg_errcode_t
3781mark_opt_subexp (void *extra, bin_tree_t *node)
3782{
3783 Idx idx = (Idx) (long) extra;
3784 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3785 node->token.opt_subexp = 1;
3786
3787 return REG_NOERROR;
3788}
3789
3790/* Free the allocated memory inside NODE. */
3791
3792static void
3793free_token (re_token_t *node)
3794{
3795#ifdef RE_ENABLE_I18N
3796 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3797 free_charset (node->opr.mbcset);
3798 else
3799#endif /* RE_ENABLE_I18N */
3800 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3801 re_free (node->opr.sbcset);
3802}
3803
3804/* Worker function for tree walking. Free the allocated memory inside NODE
3805 and its children. */
3806
3807static reg_errcode_t
3808free_tree (void *extra, bin_tree_t *node)
3809{
3810 free_token (&node->token);
3811 return REG_NOERROR;
3812}
3813
3814
3815/* Duplicate the node SRC, and return new node. This is a preorder
3816 visit similar to the one implemented by the generic visitor, but
3817 we need more infrastructure to maintain two parallel trees --- so,
3818 it's easier to duplicate. */
3819
3820static bin_tree_t *
3821duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3822{
3823 const bin_tree_t *node;
3824 bin_tree_t *dup_root;
3825 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3826
3827 for (node = root; ; )
3828 {
3829 /* Create a new tree and link it back to the current parent. */
3830 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3831 if (*p_new == NULL)
3832 return NULL;
3833 (*p_new)->parent = dup_node;
3834 (*p_new)->token.duplicated = 1;
3835 dup_node = *p_new;
3836
3837 /* Go to the left node, or up and to the right. */
3838 if (node->left)
3839 {
3840 node = node->left;
3841 p_new = &dup_node->left;
3842 }
3843 else
3844 {
3845 const bin_tree_t *prev = NULL;
3846 while (node->right == prev || node->right == NULL)
3847 {
3848 prev = node;
3849 node = node->parent;
3850 dup_node = dup_node->parent;
3851 if (!node)
3852 return dup_root;
3853 }
3854 node = node->right;
3855 p_new = &dup_node->right;
3856 }
3857 }
3858}