summaryrefslogtreecommitdiffstats
path: root/gl/regcomp.c
diff options
context:
space:
mode:
Diffstat (limited to 'gl/regcomp.c')
-rw-r--r--gl/regcomp.c359
1 files changed, 202 insertions, 157 deletions
diff --git a/gl/regcomp.c b/gl/regcomp.c
index 86ca02b..f0b2e52 100644
--- a/gl/regcomp.c
+++ b/gl/regcomp.c
@@ -1,22 +1,21 @@
1/* Extended regular expression matching and search library. 1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free 2 Copyright (C) 2002-2013 Free Software Foundation, Inc.
3 Software Foundation, Inc.
4 This file is part of the GNU C Library. 3 This file is part of the GNU C Library.
5 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 5
7 This program is free software; you can redistribute it and/or modify 6 The GNU C Library is free software; you can redistribute it and/or
8 it under the terms of the GNU General Public License as published by 7 modify it under the terms of the GNU General Public
9 the Free Software Foundation; either version 3, or (at your option) 8 License as published by the Free Software Foundation; either
10 any later version. 9 version 3 of the License, or (at your option) any later version.
11 10
12 This program is distributed in the hope that it will be useful, 11 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 GNU General Public License for more details. 14 General Public License for more details.
16 15
17 You should have received a copy of the GNU General Public License along 16 You should have received a copy of the GNU General Public
18 with this program; if not, write to the Free Software Foundation, 17 License along with the GNU C Library; if not, see
19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 18 <http://www.gnu.org/licenses/>. */
20 19
21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 20static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax); 21 size_t length, reg_syntax_t syntax);
@@ -95,20 +94,20 @@ static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset, 94 bitset_t sbcset,
96 re_charset_t *mbcset, 95 re_charset_t *mbcset,
97 Idx *char_class_alloc, 96 Idx *char_class_alloc,
98 const unsigned char *class_name, 97 const char *class_name,
99 reg_syntax_t syntax); 98 reg_syntax_t syntax);
100#else /* not RE_ENABLE_I18N */ 99#else /* not RE_ENABLE_I18N */
101static reg_errcode_t build_equiv_class (bitset_t sbcset, 100static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name); 101 const unsigned char *name);
103static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 102static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset, 103 bitset_t sbcset,
105 const unsigned char *class_name, 104 const char *class_name,
106 reg_syntax_t syntax); 105 reg_syntax_t syntax);
107#endif /* not RE_ENABLE_I18N */ 106#endif /* not RE_ENABLE_I18N */
108static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 107static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans, 108 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name, 109 const char *class_name,
111 const unsigned char *extra, 110 const char *extra,
112 bool non_match, reg_errcode_t *err); 111 bool non_match, reg_errcode_t *err);
113static bin_tree_t *create_tree (re_dfa_t *dfa, 112static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right, 113 bin_tree_t *left, bin_tree_t *right,
@@ -207,7 +206,7 @@ static const size_t __re_error_msgid_idx[] =
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string. 207 Returns 0 if the pattern was valid, otherwise an error string.
209 208
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 209 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
211 are set in BUFP on entry. */ 210 are set in BUFP on entry. */
212 211
213#ifdef _LIBC 212#ifdef _LIBC
@@ -242,7 +241,7 @@ re_compile_pattern (const char *pattern, size_t length,
242weak_alias (__re_compile_pattern, re_compile_pattern) 241weak_alias (__re_compile_pattern, re_compile_pattern)
243#endif 242#endif
244 243
245/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 244/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
246 also be assigned to arbitrarily: each pattern buffer stores its own 245 also be assigned to arbitrarily: each pattern buffer stores its own
247 syntax, so it can be changed between regex compilations. */ 246 syntax, so it can be changed between regex compilations. */
248/* This has no initializer because initialized variables in Emacs 247/* This has no initializer because initialized variables in Emacs
@@ -274,7 +273,7 @@ int
274re_compile_fastmap (bufp) 273re_compile_fastmap (bufp)
275 struct re_pattern_buffer *bufp; 274 struct re_pattern_buffer *bufp;
276{ 275{
277 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 276 re_dfa_t *dfa = bufp->buffer;
278 char *fastmap = bufp->fastmap; 277 char *fastmap = bufp->fastmap;
279 278
280 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 279 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
@@ -293,7 +292,7 @@ weak_alias (__re_compile_fastmap, re_compile_fastmap)
293#endif 292#endif
294 293
295static inline void 294static inline void
296__attribute ((always_inline)) 295__attribute__ ((always_inline))
297re_set_fastmap (char *fastmap, bool icase, int ch) 296re_set_fastmap (char *fastmap, bool icase, int ch)
298{ 297{
299 fastmap[ch] = 1; 298 fastmap[ch] = 1;
@@ -308,7 +307,7 @@ static void
308re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 307re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
309 char *fastmap) 308 char *fastmap)
310{ 309{
311 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 310 re_dfa_t *dfa = bufp->buffer;
312 Idx node_cnt; 311 Idx node_cnt;
313 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 312 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
314 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 313 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
@@ -440,15 +439,15 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
440 PREG is a regex_t *. We do not expect any fields to be initialized, 439 PREG is a regex_t *. We do not expect any fields to be initialized,
441 since POSIX says we shouldn't. Thus, we set 440 since POSIX says we shouldn't. Thus, we set
442 441
443 `buffer' to the compiled pattern; 442 'buffer' to the compiled pattern;
444 `used' to the length of the compiled pattern; 443 'used' to the length of the compiled pattern;
445 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 444 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
446 REG_EXTENDED bit in CFLAGS is set; otherwise, to 445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
447 RE_SYNTAX_POSIX_BASIC; 446 RE_SYNTAX_POSIX_BASIC;
448 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 447 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
449 `fastmap' to an allocated space for the fastmap; 448 'fastmap' to an allocated space for the fastmap;
450 `fastmap_accurate' to zero; 449 'fastmap_accurate' to zero;
451 `re_nsub' to the number of subexpressions in PATTERN. 450 're_nsub' to the number of subexpressions in PATTERN.
452 451
453 PATTERN is the address of the pattern string. 452 PATTERN is the address of the pattern string.
454 453
@@ -587,19 +586,23 @@ weak_alias (__regerror, regerror)
587static const bitset_t utf8_sb_map = 586static const bitset_t utf8_sb_map =
588{ 587{
589 /* Set the first 128 bits. */ 588 /* Set the first 128 bits. */
590# if 4 * BITSET_WORD_BITS < ASCII_CHARS 589# if defined __GNUC__ && !defined __STRICT_ANSI__
591# error "bitset_word_t is narrower than 32 bits" 590 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
592# elif 3 * BITSET_WORD_BITS < ASCII_CHARS 591# else
592# if 4 * BITSET_WORD_BITS < ASCII_CHARS
593# error "bitset_word_t is narrower than 32 bits"
594# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
593 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, 595 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
594# elif 2 * BITSET_WORD_BITS < ASCII_CHARS 596# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
595 BITSET_WORD_MAX, BITSET_WORD_MAX, 597 BITSET_WORD_MAX, BITSET_WORD_MAX,
596# elif 1 * BITSET_WORD_BITS < ASCII_CHARS 598# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
597 BITSET_WORD_MAX, 599 BITSET_WORD_MAX,
598# endif 600# endif
599 (BITSET_WORD_MAX 601 (BITSET_WORD_MAX
600 >> (SBC_MAX % BITSET_WORD_BITS == 0 602 >> (SBC_MAX % BITSET_WORD_BITS == 0
601 ? 0 603 ? 0
602 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) 604 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
605# endif
603}; 606};
604#endif 607#endif
605 608
@@ -658,9 +661,12 @@ void
658regfree (preg) 661regfree (preg)
659 regex_t *preg; 662 regex_t *preg;
660{ 663{
661 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 664 re_dfa_t *dfa = preg->buffer;
662 if (BE (dfa != NULL, 1)) 665 if (BE (dfa != NULL, 1))
663 free_dfa_content (dfa); 666 {
667 lock_fini (dfa->lock);
668 free_dfa_content (dfa);
669 }
664 preg->buffer = NULL; 670 preg->buffer = NULL;
665 preg->allocated = 0; 671 preg->allocated = 0;
666 672
@@ -719,7 +725,7 @@ re_comp (s)
719 + __re_error_msgid_idx[(int) REG_ESPACE]); 725 + __re_error_msgid_idx[(int) REG_ESPACE]);
720 } 726 }
721 727
722 /* Since `re_exec' always passes NULL for the `regs' argument, we 728 /* Since 're_exec' always passes NULL for the 'regs' argument, we
723 don't need to initialize the pattern buffer fields which affect it. */ 729 don't need to initialize the pattern buffer fields which affect it. */
724 730
725 /* Match anchors at newlines. */ 731 /* Match anchors at newlines. */
@@ -730,7 +736,7 @@ re_comp (s)
730 if (!ret) 736 if (!ret)
731 return NULL; 737 return NULL;
732 738
733 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 739 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
734 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 740 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
735} 741}
736 742
@@ -765,7 +771,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length,
765 preg->regs_allocated = REGS_UNALLOCATED; 771 preg->regs_allocated = REGS_UNALLOCATED;
766 772
767 /* Initialize the dfa. */ 773 /* Initialize the dfa. */
768 dfa = (re_dfa_t *) preg->buffer; 774 dfa = preg->buffer;
769 if (BE (preg->allocated < sizeof (re_dfa_t), 0)) 775 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
770 { 776 {
771 /* If zero allocated, but buffer is non-null, try to realloc 777 /* If zero allocated, but buffer is non-null, try to realloc
@@ -776,11 +782,13 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length,
776 if (dfa == NULL) 782 if (dfa == NULL)
777 return REG_ESPACE; 783 return REG_ESPACE;
778 preg->allocated = sizeof (re_dfa_t); 784 preg->allocated = sizeof (re_dfa_t);
779 preg->buffer = (unsigned char *) dfa; 785 preg->buffer = dfa;
780 } 786 }
781 preg->used = sizeof (re_dfa_t); 787 preg->used = sizeof (re_dfa_t);
782 788
783 err = init_dfa (dfa, length); 789 err = init_dfa (dfa, length);
790 if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
791 err = REG_ESPACE;
784 if (BE (err != REG_NOERROR, 0)) 792 if (BE (err != REG_NOERROR, 0))
785 { 793 {
786 free_dfa_content (dfa); 794 free_dfa_content (dfa);
@@ -794,8 +802,6 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length,
794 strncpy (dfa->re_str, pattern, length + 1); 802 strncpy (dfa->re_str, pattern, length + 1);
795#endif 803#endif
796 804
797 __libc_lock_init (dfa->lock);
798
799 err = re_string_construct (&regexp, pattern, length, preg->translate, 805 err = re_string_construct (&regexp, pattern, length, preg->translate,
800 (syntax & RE_ICASE) != 0, dfa); 806 (syntax & RE_ICASE) != 0, dfa);
801 if (BE (err != REG_NOERROR, 0)) 807 if (BE (err != REG_NOERROR, 0))
@@ -803,6 +809,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length,
803 re_compile_internal_free_return: 809 re_compile_internal_free_return:
804 free_workarea_compile (preg); 810 free_workarea_compile (preg);
805 re_string_destruct (&regexp); 811 re_string_destruct (&regexp);
812 lock_fini (dfa->lock);
806 free_dfa_content (dfa); 813 free_dfa_content (dfa);
807 preg->buffer = NULL; 814 preg->buffer = NULL;
808 preg->allocated = 0; 815 preg->allocated = 0;
@@ -835,6 +842,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length,
835 842
836 if (BE (err != REG_NOERROR, 0)) 843 if (BE (err != REG_NOERROR, 0))
837 { 844 {
845 lock_fini (dfa->lock);
838 free_dfa_content (dfa); 846 free_dfa_content (dfa);
839 preg->buffer = NULL; 847 preg->buffer = NULL;
840 preg->allocated = 0; 848 preg->allocated = 0;
@@ -851,7 +859,7 @@ init_dfa (re_dfa_t *dfa, size_t pat_len)
851{ 859{
852 __re_size_t table_size; 860 __re_size_t table_size;
853#ifndef _LIBC 861#ifndef _LIBC
854 char *codeset_name; 862 const char *codeset_name;
855#endif 863#endif
856#ifdef RE_ENABLE_I18N 864#ifdef RE_ENABLE_I18N
857 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); 865 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
@@ -874,7 +882,7 @@ init_dfa (re_dfa_t *dfa, size_t pat_len)
874 calculation below, and for similar doubling calculations 882 calculation below, and for similar doubling calculations
875 elsewhere. And it's <= rather than <, because some of the 883 elsewhere. And it's <= rather than <, because some of the
876 doubling calculations add 1 afterwards. */ 884 doubling calculations add 1 afterwards. */
877 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0)) 885 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
878 return REG_ESPACE; 886 return REG_ESPACE;
879 887
880 dfa->nodes_alloc = pat_len + 1; 888 dfa->nodes_alloc = pat_len + 1;
@@ -897,8 +905,10 @@ init_dfa (re_dfa_t *dfa, size_t pat_len)
897 != 0); 905 != 0);
898#else 906#else
899 codeset_name = nl_langinfo (CODESET); 907 codeset_name = nl_langinfo (CODESET);
900 if (strcasecmp (codeset_name, "UTF-8") == 0 908 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
901 || strcasecmp (codeset_name, "UTF8") == 0) 909 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
910 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
911 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
902 dfa->is_utf8 = 1; 912 dfa->is_utf8 = 1;
903 913
904 /* We check exhaustively in the loop below if this charset is a 914 /* We check exhaustively in the loop below if this charset is a
@@ -948,9 +958,43 @@ static void
948internal_function 958internal_function
949init_word_char (re_dfa_t *dfa) 959init_word_char (re_dfa_t *dfa)
950{ 960{
951 int i, j, ch; 961 int i = 0;
962 int j;
963 int ch = 0;
952 dfa->word_ops_used = 1; 964 dfa->word_ops_used = 1;
953 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 965 if (BE (dfa->map_notascii == 0, 1))
966 {
967 bitset_word_t bits0 = 0x00000000;
968 bitset_word_t bits1 = 0x03ff0000;
969 bitset_word_t bits2 = 0x87fffffe;
970 bitset_word_t bits3 = 0x07fffffe;
971 if (BITSET_WORD_BITS == 64)
972 {
973 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
974 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
975 i = 2;
976 }
977 else if (BITSET_WORD_BITS == 32)
978 {
979 dfa->word_char[0] = bits0;
980 dfa->word_char[1] = bits1;
981 dfa->word_char[2] = bits2;
982 dfa->word_char[3] = bits3;
983 i = 4;
984 }
985 else
986 goto general_case;
987 ch = 128;
988
989 if (BE (dfa->is_utf8, 1))
990 {
991 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
992 return;
993 }
994 }
995
996 general_case:
997 for (; i < BITSET_WORDS; ++i)
954 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 998 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
955 if (isalnum (ch) || ch == '_') 999 if (isalnum (ch) || ch == '_')
956 dfa->word_char[i] |= (bitset_word_t) 1 << j; 1000 dfa->word_char[i] |= (bitset_word_t) 1 << j;
@@ -961,7 +1005,7 @@ init_word_char (re_dfa_t *dfa)
961static void 1005static void
962free_workarea_compile (regex_t *preg) 1006free_workarea_compile (regex_t *preg)
963{ 1007{
964 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1008 re_dfa_t *dfa = preg->buffer;
965 bin_tree_storage_t *storage, *next; 1009 bin_tree_storage_t *storage, *next;
966 for (storage = dfa->str_tree_storage; storage; storage = next) 1010 for (storage = dfa->str_tree_storage; storage; storage = next)
967 { 1011 {
@@ -1145,7 +1189,7 @@ optimize_utf8 (re_dfa_t *dfa)
1145static reg_errcode_t 1189static reg_errcode_t
1146analyze (regex_t *preg) 1190analyze (regex_t *preg)
1147{ 1191{
1148 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1192 re_dfa_t *dfa = preg->buffer;
1149 reg_errcode_t ret; 1193 reg_errcode_t ret;
1150 1194
1151 /* Allocate arrays. */ 1195 /* Allocate arrays. */
@@ -1326,7 +1370,7 @@ lower_subexps (void *extra, bin_tree_t *node)
1326static bin_tree_t * 1370static bin_tree_t *
1327lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1371lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1328{ 1372{
1329 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1373 re_dfa_t *dfa = preg->buffer;
1330 bin_tree_t *body = node->left; 1374 bin_tree_t *body = node->left;
1331 bin_tree_t *op, *cls, *tree1, *tree; 1375 bin_tree_t *op, *cls, *tree1, *tree;
1332 1376
@@ -1660,7 +1704,7 @@ calc_eclosure (re_dfa_t *dfa)
1660 /* If we have already calculated, skip it. */ 1704 /* If we have already calculated, skip it. */
1661 if (dfa->eclosures[node_idx].nelem != 0) 1705 if (dfa->eclosures[node_idx].nelem != 0)
1662 continue; 1706 continue;
1663 /* Calculate epsilon closure of `node_idx'. */ 1707 /* Calculate epsilon closure of 'node_idx'. */
1664 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); 1708 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1665 if (BE (err != REG_NOERROR, 0)) 1709 if (BE (err != REG_NOERROR, 0))
1666 return err; 1710 return err;
@@ -1710,14 +1754,14 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1710 { 1754 {
1711 re_node_set eclosure_elem; 1755 re_node_set eclosure_elem;
1712 Idx edest = dfa->edests[node].elems[i]; 1756 Idx edest = dfa->edests[node].elems[i];
1713 /* If calculating the epsilon closure of `edest' is in progress, 1757 /* If calculating the epsilon closure of 'edest' is in progress,
1714 return intermediate result. */ 1758 return intermediate result. */
1715 if (dfa->eclosures[edest].nelem == REG_MISSING) 1759 if (dfa->eclosures[edest].nelem == REG_MISSING)
1716 { 1760 {
1717 incomplete = true; 1761 incomplete = true;
1718 continue; 1762 continue;
1719 } 1763 }
1720 /* If we haven't calculated the epsilon closure of `edest' yet, 1764 /* If we haven't calculated the epsilon closure of 'edest' yet,
1721 calculate now. Otherwise use calculated epsilon closure. */ 1765 calculate now. Otherwise use calculated epsilon closure. */
1722 if (dfa->eclosures[edest].nelem == 0) 1766 if (dfa->eclosures[edest].nelem == 0)
1723 { 1767 {
@@ -1727,11 +1771,11 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1727 } 1771 }
1728 else 1772 else
1729 eclosure_elem = dfa->eclosures[edest]; 1773 eclosure_elem = dfa->eclosures[edest];
1730 /* Merge the epsilon closure of `edest'. */ 1774 /* Merge the epsilon closure of 'edest'. */
1731 err = re_node_set_merge (&eclosure, &eclosure_elem); 1775 err = re_node_set_merge (&eclosure, &eclosure_elem);
1732 if (BE (err != REG_NOERROR, 0)) 1776 if (BE (err != REG_NOERROR, 0))
1733 return err; 1777 return err;
1734 /* If the epsilon closure of `edest' is incomplete, 1778 /* If the epsilon closure of 'edest' is incomplete,
1735 the epsilon closure of this node is also incomplete. */ 1779 the epsilon closure of this node is also incomplete. */
1736 if (dfa->eclosures[edest].nelem == 0) 1780 if (dfa->eclosures[edest].nelem == 0)
1737 { 1781 {
@@ -2093,7 +2137,7 @@ peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2093 2137
2094/* Entry point of the parser. 2138/* Entry point of the parser.
2095 Parse the regular expression REGEXP and return the structure tree. 2139 Parse the regular expression REGEXP and return the structure tree.
2096 If an error is occured, ERR is set by error code, and return NULL. 2140 If an error occurs, ERR is set by error code, and return NULL.
2097 This function build the following tree, from regular expression <reg_exp>: 2141 This function build the following tree, from regular expression <reg_exp>:
2098 CAT 2142 CAT
2099 / \ 2143 / \
@@ -2107,7 +2151,7 @@ static bin_tree_t *
2107parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2151parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2108 reg_errcode_t *err) 2152 reg_errcode_t *err)
2109{ 2153{
2110 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2154 re_dfa_t *dfa = preg->buffer;
2111 bin_tree_t *tree, *eor, *root; 2155 bin_tree_t *tree, *eor, *root;
2112 re_token_t current_token; 2156 re_token_t current_token;
2113 dfa->syntax = syntax; 2157 dfa->syntax = syntax;
@@ -2135,13 +2179,13 @@ parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2135 / \ 2179 / \
2136 <branch1> <branch2> 2180 <branch1> <branch2>
2137 2181
2138 ALT means alternative, which represents the operator `|'. */ 2182 ALT means alternative, which represents the operator '|'. */
2139 2183
2140static bin_tree_t * 2184static bin_tree_t *
2141parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2185parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2142 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2186 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2143{ 2187{
2144 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2188 re_dfa_t *dfa = preg->buffer;
2145 bin_tree_t *tree, *branch = NULL; 2189 bin_tree_t *tree, *branch = NULL;
2146 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2190 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2147 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2191 if (BE (*err != REG_NOERROR && tree == NULL, 0))
@@ -2183,7 +2227,7 @@ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2183 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2227 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2184{ 2228{
2185 bin_tree_t *tree, *expr; 2229 bin_tree_t *tree, *expr;
2186 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2230 re_dfa_t *dfa = preg->buffer;
2187 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2231 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2188 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2232 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2189 return NULL; 2233 return NULL;
@@ -2194,16 +2238,21 @@ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2194 expr = parse_expression (regexp, preg, token, syntax, nest, err); 2238 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2195 if (BE (*err != REG_NOERROR && expr == NULL, 0)) 2239 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2196 { 2240 {
2241 if (tree != NULL)
2242 postorder (tree, free_tree, NULL);
2197 return NULL; 2243 return NULL;
2198 } 2244 }
2199 if (tree != NULL && expr != NULL) 2245 if (tree != NULL && expr != NULL)
2200 { 2246 {
2201 tree = create_tree (dfa, tree, expr, CONCAT); 2247 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2202 if (tree == NULL) 2248 if (newtree == NULL)
2203 { 2249 {
2250 postorder (expr, free_tree, NULL);
2251 postorder (tree, free_tree, NULL);
2204 *err = REG_ESPACE; 2252 *err = REG_ESPACE;
2205 return NULL; 2253 return NULL;
2206 } 2254 }
2255 tree = newtree;
2207 } 2256 }
2208 else if (tree == NULL) 2257 else if (tree == NULL)
2209 tree = expr; 2258 tree = expr;
@@ -2222,7 +2271,7 @@ static bin_tree_t *
2222parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2271parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2223 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2272 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2224{ 2273{
2225 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2274 re_dfa_t *dfa = preg->buffer;
2226 bin_tree_t *tree; 2275 bin_tree_t *tree;
2227 switch (token->type) 2276 switch (token->type)
2228 { 2277 {
@@ -2378,8 +2427,8 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2378 case OP_WORD: 2427 case OP_WORD:
2379 case OP_NOTWORD: 2428 case OP_NOTWORD:
2380 tree = build_charclass_op (dfa, regexp->trans, 2429 tree = build_charclass_op (dfa, regexp->trans,
2381 (const unsigned char *) "alnum", 2430 "alnum",
2382 (const unsigned char *) "_", 2431 "_",
2383 token->type == OP_NOTWORD, err); 2432 token->type == OP_NOTWORD, err);
2384 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2433 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2385 return NULL; 2434 return NULL;
@@ -2387,8 +2436,8 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2387 case OP_SPACE: 2436 case OP_SPACE:
2388 case OP_NOTSPACE: 2437 case OP_NOTSPACE:
2389 tree = build_charclass_op (dfa, regexp->trans, 2438 tree = build_charclass_op (dfa, regexp->trans,
2390 (const unsigned char *) "space", 2439 "space",
2391 (const unsigned char *) "", 2440 "",
2392 token->type == OP_NOTSPACE, err); 2441 token->type == OP_NOTSPACE, err);
2393 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2442 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2394 return NULL; 2443 return NULL;
@@ -2438,7 +2487,7 @@ static bin_tree_t *
2438parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2487parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2439 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2488 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2440{ 2489{
2441 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2490 re_dfa_t *dfa = preg->buffer;
2442 bin_tree_t *tree; 2491 bin_tree_t *tree;
2443 size_t cur_nsub; 2492 size_t cur_nsub;
2444 cur_nsub = preg->re_nsub++; 2493 cur_nsub = preg->re_nsub++;
@@ -2452,7 +2501,11 @@ parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2452 { 2501 {
2453 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2502 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2454 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2503 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2455 *err = REG_EPAREN; 2504 {
2505 if (tree != NULL)
2506 postorder (tree, free_tree, NULL);
2507 *err = REG_EPAREN;
2508 }
2456 if (BE (*err != REG_NOERROR, 0)) 2509 if (BE (*err != REG_NOERROR, 0))
2457 return NULL; 2510 return NULL;
2458 } 2511 }
@@ -2530,6 +2583,12 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2530 *err = REG_BADBR; 2583 *err = REG_BADBR;
2531 return NULL; 2584 return NULL;
2532 } 2585 }
2586
2587 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2588 {
2589 *err = REG_ESIZE;
2590 return NULL;
2591 }
2533 } 2592 }
2534 else 2593 else
2535 { 2594 {
@@ -2570,7 +2629,10 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2570 old_tree = NULL; 2629 old_tree = NULL;
2571 2630
2572 if (elem->token.type == SUBEXP) 2631 if (elem->token.type == SUBEXP)
2573 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); 2632 {
2633 uintptr_t subidx = elem->token.opr.idx;
2634 postorder (elem, mark_opt_subexp, (void *) subidx);
2635 }
2574 2636
2575 tree = create_tree (dfa, elem, NULL, 2637 tree = create_tree (dfa, elem, NULL,
2576 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); 2638 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
@@ -2616,7 +2678,7 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2616 Build the range expression which starts from START_ELEM, and ends 2678 Build the range expression which starts from START_ELEM, and ends
2617 at END_ELEM. The result are written to MBCSET and SBCSET. 2679 at END_ELEM. The result are written to MBCSET and SBCSET.
2618 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2680 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2619 mbcset->range_ends, is a pointer argument sinse we may 2681 mbcset->range_ends, is a pointer argument since we may
2620 update it. */ 2682 update it. */
2621 2683
2622static reg_errcode_t 2684static reg_errcode_t
@@ -2655,7 +2717,6 @@ build_range_exp (const reg_syntax_t syntax,
2655 wchar_t wc; 2717 wchar_t wc;
2656 wint_t start_wc; 2718 wint_t start_wc;
2657 wint_t end_wc; 2719 wint_t end_wc;
2658 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2659 2720
2660 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2721 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2661 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2722 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
@@ -2669,11 +2730,7 @@ build_range_exp (const reg_syntax_t syntax,
2669 ? __btowc (end_ch) : end_elem->opr.wch); 2730 ? __btowc (end_ch) : end_elem->opr.wch);
2670 if (start_wc == WEOF || end_wc == WEOF) 2731 if (start_wc == WEOF || end_wc == WEOF)
2671 return REG_ECOLLATE; 2732 return REG_ECOLLATE;
2672 cmp_buf[0] = start_wc; 2733 else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2673 cmp_buf[4] = end_wc;
2674
2675 if (BE ((syntax & RE_NO_EMPTY_RANGES)
2676 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2677 return REG_ERANGE; 2734 return REG_ERANGE;
2678 2735
2679 /* Got valid collation sequence values, add them as a new entry. 2736 /* Got valid collation sequence values, add them as a new entry.
@@ -2714,9 +2771,7 @@ build_range_exp (const reg_syntax_t syntax,
2714 /* Build the table for single byte characters. */ 2771 /* Build the table for single byte characters. */
2715 for (wc = 0; wc < SBC_MAX; ++wc) 2772 for (wc = 0; wc < SBC_MAX; ++wc)
2716 { 2773 {
2717 cmp_buf[2] = wc; 2774 if (start_wc <= wc && wc <= end_wc)
2718 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2719 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2720 bitset_set (sbcset, wc); 2775 bitset_set (sbcset, wc);
2721 } 2776 }
2722 } 2777 }
@@ -2750,11 +2805,12 @@ build_range_exp (const reg_syntax_t syntax,
2750 2805
2751static reg_errcode_t 2806static reg_errcode_t
2752internal_function 2807internal_function
2753build_collating_symbol (bitset_t sbcset,
2754# ifdef RE_ENABLE_I18N 2808# ifdef RE_ENABLE_I18N
2755 re_charset_t *mbcset, Idx *coll_sym_alloc, 2809build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2756# endif 2810 Idx *coll_sym_alloc, const unsigned char *name)
2757 const unsigned char *name) 2811# else /* not RE_ENABLE_I18N */
2812build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2813# endif /* not RE_ENABLE_I18N */
2758{ 2814{
2759 size_t name_len = strlen ((const char *) name); 2815 size_t name_len = strlen ((const char *) name);
2760 if (BE (name_len != 1, 0)) 2816 if (BE (name_len != 1, 0))
@@ -2782,42 +2838,31 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2782 const int32_t *symb_table; 2838 const int32_t *symb_table;
2783 const unsigned char *extra; 2839 const unsigned char *extra;
2784 2840
2785 /* Local function for parse_bracket_exp used in _LIBC environement. 2841 /* Local function for parse_bracket_exp used in _LIBC environment.
2786 Seek the collating symbol entry correspondings to NAME. 2842 Seek the collating symbol entry corresponding to NAME.
2787 Return the index of the symbol in the SYMB_TABLE. */ 2843 Return the index of the symbol in the SYMB_TABLE,
2844 or -1 if not found. */
2788 2845
2789 auto inline int32_t 2846 auto inline int32_t
2790 __attribute ((always_inline)) 2847 __attribute__ ((always_inline))
2791 seek_collating_symbol_entry (name, name_len) 2848 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2792 const unsigned char *name;
2793 size_t name_len;
2794 { 2849 {
2795 int32_t hash = elem_hash ((const char *) name, name_len); 2850 int32_t elem;
2796 int32_t elem = hash % table_size;
2797 if (symb_table[2 * elem] != 0)
2798 {
2799 int32_t second = hash % (table_size - 2) + 1;
2800 2851
2801 do 2852 for (elem = 0; elem < table_size; elem++)
2802 { 2853 if (symb_table[2 * elem] != 0)
2803 /* First compare the hashing value. */ 2854 {
2804 if (symb_table[2 * elem] == hash 2855 int32_t idx = symb_table[2 * elem + 1];
2805 /* Compare the length of the name. */ 2856 /* Skip the name of collating element name. */
2806 && name_len == extra[symb_table[2 * elem + 1]] 2857 idx += 1 + extra[idx];
2807 /* Compare the name. */ 2858 if (/* Compare the length of the name. */
2808 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], 2859 name_len == extra[idx]
2809 name_len) == 0) 2860 /* Compare the name. */
2810 { 2861 && memcmp (name, &extra[idx + 1], name_len) == 0)
2811 /* Yep, this is the entry. */ 2862 /* Yep, this is the entry. */
2812 break; 2863 return elem;
2813 } 2864 }
2814 2865 return -1;
2815 /* Next entry. */
2816 elem += second;
2817 }
2818 while (symb_table[2 * elem] != 0);
2819 }
2820 return elem;
2821 } 2866 }
2822 2867
2823 /* Local function for parse_bracket_exp used in _LIBC environment. 2868 /* Local function for parse_bracket_exp used in _LIBC environment.
@@ -2825,9 +2870,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2825 Return the value if succeeded, UINT_MAX otherwise. */ 2870 Return the value if succeeded, UINT_MAX otherwise. */
2826 2871
2827 auto inline unsigned int 2872 auto inline unsigned int
2828 __attribute ((always_inline)) 2873 __attribute__ ((always_inline))
2829 lookup_collation_sequence_value (br_elem) 2874 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2830 bracket_elem_t *br_elem;
2831 { 2875 {
2832 if (br_elem->type == SB_CHAR) 2876 if (br_elem->type == SB_CHAR)
2833 { 2877 {
@@ -2855,7 +2899,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2855 int32_t elem, idx; 2899 int32_t elem, idx;
2856 elem = seek_collating_symbol_entry (br_elem->opr.name, 2900 elem = seek_collating_symbol_entry (br_elem->opr.name,
2857 sym_name_len); 2901 sym_name_len);
2858 if (symb_table[2 * elem] != 0) 2902 if (elem != -1)
2859 { 2903 {
2860 /* We found the entry. */ 2904 /* We found the entry. */
2861 idx = symb_table[2 * elem + 1]; 2905 idx = symb_table[2 * elem + 1];
@@ -2873,7 +2917,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2873 /* Return the collation sequence value. */ 2917 /* Return the collation sequence value. */
2874 return *(unsigned int *) (extra + idx); 2918 return *(unsigned int *) (extra + idx);
2875 } 2919 }
2876 else if (symb_table[2 * elem] == 0 && sym_name_len == 1) 2920 else if (sym_name_len == 1)
2877 { 2921 {
2878 /* No valid character. Match it as a single byte 2922 /* No valid character. Match it as a single byte
2879 character. */ 2923 character. */
@@ -2886,20 +2930,17 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2886 return UINT_MAX; 2930 return UINT_MAX;
2887 } 2931 }
2888 2932
2889 /* Local function for parse_bracket_exp used in _LIBC environement. 2933 /* Local function for parse_bracket_exp used in _LIBC environment.
2890 Build the range expression which starts from START_ELEM, and ends 2934 Build the range expression which starts from START_ELEM, and ends
2891 at END_ELEM. The result are written to MBCSET and SBCSET. 2935 at END_ELEM. The result are written to MBCSET and SBCSET.
2892 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2936 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2893 mbcset->range_ends, is a pointer argument sinse we may 2937 mbcset->range_ends, is a pointer argument since we may
2894 update it. */ 2938 update it. */
2895 2939
2896 auto inline reg_errcode_t 2940 auto inline reg_errcode_t
2897 __attribute ((always_inline)) 2941 __attribute__ ((always_inline))
2898 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2942 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2899 re_charset_t *mbcset; 2943 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2900 Idx *range_alloc;
2901 bitset_t sbcset;
2902 bracket_elem_t *start_elem, *end_elem;
2903 { 2944 {
2904 unsigned int ch; 2945 unsigned int ch;
2905 uint32_t start_collseq; 2946 uint32_t start_collseq;
@@ -2912,6 +2953,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2912 0)) 2953 0))
2913 return REG_ERANGE; 2954 return REG_ERANGE;
2914 2955
2956 /* FIXME: Implement rational ranges here, too. */
2915 start_collseq = lookup_collation_sequence_value (start_elem); 2957 start_collseq = lookup_collation_sequence_value (start_elem);
2916 end_collseq = lookup_collation_sequence_value (end_elem); 2958 end_collseq = lookup_collation_sequence_value (end_elem);
2917 /* Check start/end collation sequence values. */ 2959 /* Check start/end collation sequence values. */
@@ -2970,33 +3012,30 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2970 return REG_NOERROR; 3012 return REG_NOERROR;
2971 } 3013 }
2972 3014
2973 /* Local function for parse_bracket_exp used in _LIBC environement. 3015 /* Local function for parse_bracket_exp used in _LIBC environment.
2974 Build the collating element which is represented by NAME. 3016 Build the collating element which is represented by NAME.
2975 The result are written to MBCSET and SBCSET. 3017 The result are written to MBCSET and SBCSET.
2976 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 3018 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2977 pointer argument sinse we may update it. */ 3019 pointer argument since we may update it. */
2978 3020
2979 auto inline reg_errcode_t 3021 auto inline reg_errcode_t
2980 __attribute ((always_inline)) 3022 __attribute__ ((always_inline))
2981 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 3023 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2982 re_charset_t *mbcset; 3024 Idx *coll_sym_alloc, const unsigned char *name)
2983 Idx *coll_sym_alloc;
2984 bitset_t sbcset;
2985 const unsigned char *name;
2986 { 3025 {
2987 int32_t elem, idx; 3026 int32_t elem, idx;
2988 size_t name_len = strlen ((const char *) name); 3027 size_t name_len = strlen ((const char *) name);
2989 if (nrules != 0) 3028 if (nrules != 0)
2990 { 3029 {
2991 elem = seek_collating_symbol_entry (name, name_len); 3030 elem = seek_collating_symbol_entry (name, name_len);
2992 if (symb_table[2 * elem] != 0) 3031 if (elem != -1)
2993 { 3032 {
2994 /* We found the entry. */ 3033 /* We found the entry. */
2995 idx = symb_table[2 * elem + 1]; 3034 idx = symb_table[2 * elem + 1];
2996 /* Skip the name of collating element name. */ 3035 /* Skip the name of collating element name. */
2997 idx += 1 + extra[idx]; 3036 idx += 1 + extra[idx];
2998 } 3037 }
2999 else if (symb_table[2 * elem] == 0 && name_len == 1) 3038 else if (name_len == 1)
3000 { 3039 {
3001 /* No valid character, treat it as a normal 3040 /* No valid character, treat it as a normal
3002 character. */ 3041 character. */
@@ -3076,6 +3115,10 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3076 if (BE (sbcset == NULL, 0)) 3115 if (BE (sbcset == NULL, 0))
3077#endif /* RE_ENABLE_I18N */ 3116#endif /* RE_ENABLE_I18N */
3078 { 3117 {
3118 re_free (sbcset);
3119#ifdef RE_ENABLE_I18N
3120 re_free (mbcset);
3121#endif
3079 *err = REG_ESPACE; 3122 *err = REG_ESPACE;
3080 return NULL; 3123 return NULL;
3081 } 3124 }
@@ -3235,7 +3278,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3235#ifdef RE_ENABLE_I18N 3278#ifdef RE_ENABLE_I18N
3236 mbcset, &char_class_alloc, 3279 mbcset, &char_class_alloc,
3237#endif /* RE_ENABLE_I18N */ 3280#endif /* RE_ENABLE_I18N */
3238 start_elem.opr.name, syntax); 3281 (const char *) start_elem.opr.name,
3282 syntax);
3239 if (BE (*err != REG_NOERROR, 0)) 3283 if (BE (*err != REG_NOERROR, 0))
3240 goto parse_bracket_exp_free_return; 3284 goto parse_bracket_exp_free_return;
3241 break; 3285 break;
@@ -3414,7 +3458,7 @@ parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3414 Build the equivalence class which is represented by NAME. 3458 Build the equivalence class which is represented by NAME.
3415 The result are written to MBCSET and SBCSET. 3459 The result are written to MBCSET and SBCSET.
3416 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3460 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3417 is a pointer argument sinse we may update it. */ 3461 is a pointer argument since we may update it. */
3418 3462
3419static reg_errcode_t 3463static reg_errcode_t
3420#ifdef RE_ENABLE_I18N 3464#ifdef RE_ENABLE_I18N
@@ -3445,19 +3489,18 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name)
3445 _NL_COLLATE_EXTRAMB); 3489 _NL_COLLATE_EXTRAMB);
3446 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3490 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3447 _NL_COLLATE_INDIRECTMB); 3491 _NL_COLLATE_INDIRECTMB);
3448 idx1 = findidx (&cp); 3492 idx1 = findidx (&cp, -1);
3449 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) 3493 if (BE (idx1 == 0 || *cp != '\0', 0))
3450 /* This isn't a valid character. */ 3494 /* This isn't a valid character. */
3451 return REG_ECOLLATE; 3495 return REG_ECOLLATE;
3452 3496
3453 /* Build single byte matcing table for this equivalence class. */ 3497 /* Build single byte matching table for this equivalence class. */
3454 char_buf[1] = (unsigned char) '\0';
3455 len = weights[idx1 & 0xffffff]; 3498 len = weights[idx1 & 0xffffff];
3456 for (ch = 0; ch < SBC_MAX; ++ch) 3499 for (ch = 0; ch < SBC_MAX; ++ch)
3457 { 3500 {
3458 char_buf[0] = ch; 3501 char_buf[0] = ch;
3459 cp = char_buf; 3502 cp = char_buf;
3460 idx2 = findidx (&cp); 3503 idx2 = findidx (&cp, 1);
3461/* 3504/*
3462 idx2 = table[ch]; 3505 idx2 = table[ch];
3463*/ 3506*/
@@ -3510,20 +3553,20 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name)
3510 Build the character class which is represented by NAME. 3553 Build the character class which is represented by NAME.
3511 The result are written to MBCSET and SBCSET. 3554 The result are written to MBCSET and SBCSET.
3512 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3555 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3513 is a pointer argument sinse we may update it. */ 3556 is a pointer argument since we may update it. */
3514 3557
3515static reg_errcode_t 3558static reg_errcode_t
3516#ifdef RE_ENABLE_I18N 3559#ifdef RE_ENABLE_I18N
3517build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3560build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3518 re_charset_t *mbcset, Idx *char_class_alloc, 3561 re_charset_t *mbcset, Idx *char_class_alloc,
3519 const unsigned char *class_name, reg_syntax_t syntax) 3562 const char *class_name, reg_syntax_t syntax)
3520#else /* not RE_ENABLE_I18N */ 3563#else /* not RE_ENABLE_I18N */
3521build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3564build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3522 const unsigned char *class_name, reg_syntax_t syntax) 3565 const char *class_name, reg_syntax_t syntax)
3523#endif /* not RE_ENABLE_I18N */ 3566#endif /* not RE_ENABLE_I18N */
3524{ 3567{
3525 int i; 3568 int i;
3526 const char *name = (const char *) class_name; 3569 const char *name = class_name;
3527 3570
3528 /* In case of REG_ICASE "upper" and "lower" match the both of 3571 /* In case of REG_ICASE "upper" and "lower" match the both of
3529 upper and lower cases. */ 3572 upper and lower cases. */
@@ -3597,8 +3640,8 @@ build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3597 3640
3598static bin_tree_t * 3641static bin_tree_t *
3599build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, 3642build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3600 const unsigned char *class_name, 3643 const char *class_name,
3601 const unsigned char *extra, bool non_match, 3644 const char *extra, bool non_match,
3602 reg_errcode_t *err) 3645 reg_errcode_t *err)
3603{ 3646{
3604 re_bitset_ptr_t sbcset; 3647 re_bitset_ptr_t sbcset;
@@ -3704,8 +3747,9 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3704} 3747}
3705 3748
3706/* This is intended for the expressions like "a{1,3}". 3749/* This is intended for the expressions like "a{1,3}".
3707 Fetch a number from `input', and return the number. 3750 Fetch a number from 'input', and return the number.
3708 Return REG_MISSING if the number field is empty like "{,1}". 3751 Return REG_MISSING if the number field is empty like "{,1}".
3752 Return RE_DUP_MAX + 1 if the number field is too large.
3709 Return REG_ERROR if an error occurred. */ 3753 Return REG_ERROR if an error occurred. */
3710 3754
3711static Idx 3755static Idx
@@ -3724,8 +3768,9 @@ fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3724 num = ((token->type != CHARACTER || c < '0' || '9' < c 3768 num = ((token->type != CHARACTER || c < '0' || '9' < c
3725 || num == REG_ERROR) 3769 || num == REG_ERROR)
3726 ? REG_ERROR 3770 ? REG_ERROR
3727 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0')); 3771 : num == REG_MISSING
3728 num = (num > RE_DUP_MAX) ? REG_ERROR : num; 3772 ? c - '0'
3773 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3729 } 3774 }
3730 return num; 3775 return num;
3731} 3776}
@@ -3799,7 +3844,7 @@ create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3799static reg_errcode_t 3844static reg_errcode_t
3800mark_opt_subexp (void *extra, bin_tree_t *node) 3845mark_opt_subexp (void *extra, bin_tree_t *node)
3801{ 3846{
3802 Idx idx = (Idx) (long) extra; 3847 Idx idx = (uintptr_t) extra;
3803 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3848 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3804 node->token.opt_subexp = 1; 3849 node->token.opt_subexp = 1;
3805 3850