summaryrefslogtreecommitdiffstats
path: root/gl/regex_internal.c
diff options
context:
space:
mode:
Diffstat (limited to 'gl/regex_internal.c')
-rw-r--r--gl/regex_internal.c1742
1 files changed, 1742 insertions, 0 deletions
diff --git a/gl/regex_internal.c b/gl/regex_internal.c
new file mode 100644
index 0000000..78e16f3
--- /dev/null
+++ b/gl/regex_internal.c
@@ -0,0 +1,1742 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20static void re_string_construct_common (const char *str, Idx len,
21 re_string_t *pstr,
22 RE_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa) internal_function;
24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash) internal_function;
27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 re_hashval_t hash) internal_function;
31
32/* Functions for string operation. */
33
34/* This function allocate the buffers. It is necessary to call
35 re_string_reconstruct before using the object. */
36
37static reg_errcode_t
38internal_function
39re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
40 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
41{
42 reg_errcode_t ret;
43 Idx init_buf_len;
44
45 /* Ensure at least one character fits into the buffers. */
46 if (init_len < dfa->mb_cur_max)
47 init_len = dfa->mb_cur_max;
48 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49 re_string_construct_common (str, len, pstr, trans, icase, dfa);
50
51 ret = re_string_realloc_buffers (pstr, init_buf_len);
52 if (BE (ret != REG_NOERROR, 0))
53 return ret;
54
55 pstr->word_char = dfa->word_char;
56 pstr->word_ops_used = dfa->word_ops_used;
57 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59 pstr->valid_raw_len = pstr->valid_len;
60 return REG_NOERROR;
61}
62
63/* This function allocate the buffers, and initialize them. */
64
65static reg_errcode_t
66internal_function
67re_string_construct (re_string_t *pstr, const char *str, Idx len,
68 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
69{
70 reg_errcode_t ret;
71 memset (pstr, '\0', sizeof (re_string_t));
72 re_string_construct_common (str, len, pstr, trans, icase, dfa);
73
74 if (len > 0)
75 {
76 ret = re_string_realloc_buffers (pstr, len + 1);
77 if (BE (ret != REG_NOERROR, 0))
78 return ret;
79 }
80 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
81
82 if (icase)
83 {
84#ifdef RE_ENABLE_I18N
85 if (dfa->mb_cur_max > 1)
86 {
87 while (1)
88 {
89 ret = build_wcs_upper_buffer (pstr);
90 if (BE (ret != REG_NOERROR, 0))
91 return ret;
92 if (pstr->valid_raw_len >= len)
93 break;
94 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
95 break;
96 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 if (BE (ret != REG_NOERROR, 0))
98 return ret;
99 }
100 }
101 else
102#endif /* RE_ENABLE_I18N */
103 build_upper_buffer (pstr);
104 }
105 else
106 {
107#ifdef RE_ENABLE_I18N
108 if (dfa->mb_cur_max > 1)
109 build_wcs_buffer (pstr);
110 else
111#endif /* RE_ENABLE_I18N */
112 {
113 if (trans != NULL)
114 re_string_translate_buffer (pstr);
115 else
116 {
117 pstr->valid_len = pstr->bufs_len;
118 pstr->valid_raw_len = pstr->bufs_len;
119 }
120 }
121 }
122
123 return REG_NOERROR;
124}
125
126/* Helper functions for re_string_allocate, and re_string_construct. */
127
128static reg_errcode_t
129internal_function
130re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
131{
132#ifdef RE_ENABLE_I18N
133 if (pstr->mb_cur_max > 1)
134 {
135 wint_t *new_wcs;
136
137 /* Avoid overflow. */
138 size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
139 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
140 return REG_ESPACE;
141
142 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
143 if (BE (new_wcs == NULL, 0))
144 return REG_ESPACE;
145 pstr->wcs = new_wcs;
146 if (pstr->offsets != NULL)
147 {
148 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
149 if (BE (new_offsets == NULL, 0))
150 return REG_ESPACE;
151 pstr->offsets = new_offsets;
152 }
153 }
154#endif /* RE_ENABLE_I18N */
155 if (pstr->mbs_allocated)
156 {
157 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
158 new_buf_len);
159 if (BE (new_mbs == NULL, 0))
160 return REG_ESPACE;
161 pstr->mbs = new_mbs;
162 }
163 pstr->bufs_len = new_buf_len;
164 return REG_NOERROR;
165}
166
167
168static void
169internal_function
170re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
171 RE_TRANSLATE_TYPE trans, bool icase,
172 const re_dfa_t *dfa)
173{
174 pstr->raw_mbs = (const unsigned char *) str;
175 pstr->len = len;
176 pstr->raw_len = len;
177 pstr->trans = trans;
178 pstr->icase = icase;
179 pstr->mbs_allocated = (trans != NULL || icase);
180 pstr->mb_cur_max = dfa->mb_cur_max;
181 pstr->is_utf8 = dfa->is_utf8;
182 pstr->map_notascii = dfa->map_notascii;
183 pstr->stop = pstr->len;
184 pstr->raw_stop = pstr->stop;
185}
186
187#ifdef RE_ENABLE_I18N
188
189/* Build wide character buffer PSTR->WCS.
190 If the byte sequence of the string are:
191 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
192 Then wide character buffer will be:
193 <wc1> , WEOF , <wc2> , WEOF , <wc3>
194 We use WEOF for padding, they indicate that the position isn't
195 a first byte of a multibyte character.
196
197 Note that this function assumes PSTR->VALID_LEN elements are already
198 built and starts from PSTR->VALID_LEN. */
199
200static void
201internal_function
202build_wcs_buffer (re_string_t *pstr)
203{
204#ifdef _LIBC
205 unsigned char buf[MB_LEN_MAX];
206 assert (MB_LEN_MAX >= pstr->mb_cur_max);
207#else
208 unsigned char buf[64];
209#endif
210 mbstate_t prev_st;
211 Idx byte_idx, end_idx, remain_len;
212 size_t mbclen;
213
214 /* Build the buffers from pstr->valid_len to either pstr->len or
215 pstr->bufs_len. */
216 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
217 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
218 {
219 wchar_t wc;
220 const char *p;
221
222 remain_len = end_idx - byte_idx;
223 prev_st = pstr->cur_state;
224 /* Apply the translation if we need. */
225 if (BE (pstr->trans != NULL, 0))
226 {
227 int i, ch;
228
229 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
230 {
231 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
232 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
233 }
234 p = (const char *) buf;
235 }
236 else
237 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
238 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
239 if (BE (mbclen == (size_t) -2, 0))
240 {
241 /* The buffer doesn't have enough space, finish to build. */
242 pstr->cur_state = prev_st;
243 break;
244 }
245 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
246 {
247 /* We treat these cases as a singlebyte character. */
248 mbclen = 1;
249 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
250 if (BE (pstr->trans != NULL, 0))
251 wc = pstr->trans[wc];
252 pstr->cur_state = prev_st;
253 }
254
255 /* Write wide character and padding. */
256 pstr->wcs[byte_idx++] = wc;
257 /* Write paddings. */
258 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
259 pstr->wcs[byte_idx++] = WEOF;
260 }
261 pstr->valid_len = byte_idx;
262 pstr->valid_raw_len = byte_idx;
263}
264
265/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
266 but for REG_ICASE. */
267
268static reg_errcode_t
269internal_function
270build_wcs_upper_buffer (re_string_t *pstr)
271{
272 mbstate_t prev_st;
273 Idx src_idx, byte_idx, end_idx, remain_len;
274 size_t mbclen;
275#ifdef _LIBC
276 char buf[MB_LEN_MAX];
277 assert (MB_LEN_MAX >= pstr->mb_cur_max);
278#else
279 char buf[64];
280#endif
281
282 byte_idx = pstr->valid_len;
283 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
284
285 /* The following optimization assumes that ASCII characters can be
286 mapped to wide characters with a simple cast. */
287 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
288 {
289 while (byte_idx < end_idx)
290 {
291 wchar_t wc;
292
293 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
294 && mbsinit (&pstr->cur_state))
295 {
296 /* In case of a singlebyte character. */
297 pstr->mbs[byte_idx]
298 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
299 /* The next step uses the assumption that wchar_t is encoded
300 ASCII-safe: all ASCII values can be converted like this. */
301 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
302 ++byte_idx;
303 continue;
304 }
305
306 remain_len = end_idx - byte_idx;
307 prev_st = pstr->cur_state;
308 mbclen = mbrtowc (&wc,
309 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
310 + byte_idx), remain_len, &pstr->cur_state);
311 if (BE (mbclen < (size_t) -2, 1))
312 {
313 wchar_t wcu = wc;
314 if (iswlower (wc))
315 {
316 size_t mbcdlen;
317
318 wcu = towupper (wc);
319 mbcdlen = wcrtomb (buf, wcu, &prev_st);
320 if (BE (mbclen == mbcdlen, 1))
321 memcpy (pstr->mbs + byte_idx, buf, mbclen);
322 else
323 {
324 src_idx = byte_idx;
325 goto offsets_needed;
326 }
327 }
328 else
329 memcpy (pstr->mbs + byte_idx,
330 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
331 pstr->wcs[byte_idx++] = wcu;
332 /* Write paddings. */
333 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
334 pstr->wcs[byte_idx++] = WEOF;
335 }
336 else if (mbclen == (size_t) -1 || mbclen == 0)
337 {
338 /* It is an invalid character or '\0'. Just use the byte. */
339 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
340 pstr->mbs[byte_idx] = ch;
341 /* And also cast it to wide char. */
342 pstr->wcs[byte_idx++] = (wchar_t) ch;
343 if (BE (mbclen == (size_t) -1, 0))
344 pstr->cur_state = prev_st;
345 }
346 else
347 {
348 /* The buffer doesn't have enough space, finish to build. */
349 pstr->cur_state = prev_st;
350 break;
351 }
352 }
353 pstr->valid_len = byte_idx;
354 pstr->valid_raw_len = byte_idx;
355 return REG_NOERROR;
356 }
357 else
358 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
359 {
360 wchar_t wc;
361 const char *p;
362 offsets_needed:
363 remain_len = end_idx - byte_idx;
364 prev_st = pstr->cur_state;
365 if (BE (pstr->trans != NULL, 0))
366 {
367 int i, ch;
368
369 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
370 {
371 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
372 buf[i] = pstr->trans[ch];
373 }
374 p = (const char *) buf;
375 }
376 else
377 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
378 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
379 if (BE (mbclen < (size_t) -2, 1))
380 {
381 wchar_t wcu = wc;
382 if (iswlower (wc))
383 {
384 size_t mbcdlen;
385
386 wcu = towupper (wc);
387 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
388 if (BE (mbclen == mbcdlen, 1))
389 memcpy (pstr->mbs + byte_idx, buf, mbclen);
390 else if (mbcdlen != (size_t) -1)
391 {
392 size_t i;
393
394 if (byte_idx + mbcdlen > pstr->bufs_len)
395 {
396 pstr->cur_state = prev_st;
397 break;
398 }
399
400 if (pstr->offsets == NULL)
401 {
402 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
403
404 if (pstr->offsets == NULL)
405 return REG_ESPACE;
406 }
407 if (!pstr->offsets_needed)
408 {
409 for (i = 0; i < (size_t) byte_idx; ++i)
410 pstr->offsets[i] = i;
411 pstr->offsets_needed = 1;
412 }
413
414 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
415 pstr->wcs[byte_idx] = wcu;
416 pstr->offsets[byte_idx] = src_idx;
417 for (i = 1; i < mbcdlen; ++i)
418 {
419 pstr->offsets[byte_idx + i]
420 = src_idx + (i < mbclen ? i : mbclen - 1);
421 pstr->wcs[byte_idx + i] = WEOF;
422 }
423 pstr->len += mbcdlen - mbclen;
424 if (pstr->raw_stop > src_idx)
425 pstr->stop += mbcdlen - mbclen;
426 end_idx = (pstr->bufs_len > pstr->len)
427 ? pstr->len : pstr->bufs_len;
428 byte_idx += mbcdlen;
429 src_idx += mbclen;
430 continue;
431 }
432 else
433 memcpy (pstr->mbs + byte_idx, p, mbclen);
434 }
435 else
436 memcpy (pstr->mbs + byte_idx, p, mbclen);
437
438 if (BE (pstr->offsets_needed != 0, 0))
439 {
440 size_t i;
441 for (i = 0; i < mbclen; ++i)
442 pstr->offsets[byte_idx + i] = src_idx + i;
443 }
444 src_idx += mbclen;
445
446 pstr->wcs[byte_idx++] = wcu;
447 /* Write paddings. */
448 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
449 pstr->wcs[byte_idx++] = WEOF;
450 }
451 else if (mbclen == (size_t) -1 || mbclen == 0)
452 {
453 /* It is an invalid character or '\0'. Just use the byte. */
454 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
455
456 if (BE (pstr->trans != NULL, 0))
457 ch = pstr->trans [ch];
458 pstr->mbs[byte_idx] = ch;
459
460 if (BE (pstr->offsets_needed != 0, 0))
461 pstr->offsets[byte_idx] = src_idx;
462 ++src_idx;
463
464 /* And also cast it to wide char. */
465 pstr->wcs[byte_idx++] = (wchar_t) ch;
466 if (BE (mbclen == (size_t) -1, 0))
467 pstr->cur_state = prev_st;
468 }
469 else
470 {
471 /* The buffer doesn't have enough space, finish to build. */
472 pstr->cur_state = prev_st;
473 break;
474 }
475 }
476 pstr->valid_len = byte_idx;
477 pstr->valid_raw_len = src_idx;
478 return REG_NOERROR;
479}
480
481/* Skip characters until the index becomes greater than NEW_RAW_IDX.
482 Return the index. */
483
484static Idx
485internal_function
486re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
487{
488 mbstate_t prev_st;
489 Idx rawbuf_idx;
490 size_t mbclen;
491 wint_t wc = WEOF;
492
493 /* Skip the characters which are not necessary to check. */
494 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
495 rawbuf_idx < new_raw_idx;)
496 {
497 wchar_t wc2;
498 Idx remain_len;
499 remain_len = pstr->len - rawbuf_idx;
500 prev_st = pstr->cur_state;
501 mbclen = mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
502 remain_len, &pstr->cur_state);
503 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
504 {
505 /* We treat these cases as a single byte character. */
506 if (mbclen == 0 || remain_len == 0)
507 wc = L'\0';
508 else
509 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
510 mbclen = 1;
511 pstr->cur_state = prev_st;
512 }
513 else
514 wc = wc2;
515 /* Then proceed the next character. */
516 rawbuf_idx += mbclen;
517 }
518 *last_wc = wc;
519 return rawbuf_idx;
520}
521#endif /* RE_ENABLE_I18N */
522
523/* Build the buffer PSTR->MBS, and apply the translation if we need.
524 This function is used in case of REG_ICASE. */
525
526static void
527internal_function
528build_upper_buffer (re_string_t *pstr)
529{
530 Idx char_idx, end_idx;
531 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
532
533 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
534 {
535 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
536 if (BE (pstr->trans != NULL, 0))
537 ch = pstr->trans[ch];
538 if (islower (ch))
539 pstr->mbs[char_idx] = toupper (ch);
540 else
541 pstr->mbs[char_idx] = ch;
542 }
543 pstr->valid_len = char_idx;
544 pstr->valid_raw_len = char_idx;
545}
546
547/* Apply TRANS to the buffer in PSTR. */
548
549static void
550internal_function
551re_string_translate_buffer (re_string_t *pstr)
552{
553 Idx buf_idx, end_idx;
554 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
555
556 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
557 {
558 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
559 pstr->mbs[buf_idx] = pstr->trans[ch];
560 }
561
562 pstr->valid_len = buf_idx;
563 pstr->valid_raw_len = buf_idx;
564}
565
566/* This function re-construct the buffers.
567 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
568 convert to upper case in case of REG_ICASE, apply translation. */
569
570static reg_errcode_t
571internal_function
572re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
573{
574 Idx offset;
575
576 if (BE (pstr->raw_mbs_idx <= idx, 0))
577 offset = idx - pstr->raw_mbs_idx;
578 else
579 {
580 /* Reset buffer. */
581#ifdef RE_ENABLE_I18N
582 if (pstr->mb_cur_max > 1)
583 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
584#endif /* RE_ENABLE_I18N */
585 pstr->len = pstr->raw_len;
586 pstr->stop = pstr->raw_stop;
587 pstr->valid_len = 0;
588 pstr->raw_mbs_idx = 0;
589 pstr->valid_raw_len = 0;
590 pstr->offsets_needed = 0;
591 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
592 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
593 if (!pstr->mbs_allocated)
594 pstr->mbs = (unsigned char *) pstr->raw_mbs;
595 offset = idx;
596 }
597
598 if (BE (offset != 0, 1))
599 {
600 /* Should the already checked characters be kept? */
601 if (BE (offset < pstr->valid_raw_len, 1))
602 {
603 /* Yes, move them to the front of the buffer. */
604#ifdef RE_ENABLE_I18N
605 if (BE (pstr->offsets_needed, 0))
606 {
607 Idx low = 0, high = pstr->valid_len, mid;
608 do
609 {
610 mid = (high + low) / 2;
611 if (pstr->offsets[mid] > offset)
612 high = mid;
613 else if (pstr->offsets[mid] < offset)
614 low = mid + 1;
615 else
616 break;
617 }
618 while (low < high);
619 if (pstr->offsets[mid] < offset)
620 ++mid;
621 pstr->tip_context = re_string_context_at (pstr, mid - 1,
622 eflags);
623 /* This can be quite complicated, so handle specially
624 only the common and easy case where the character with
625 different length representation of lower and upper
626 case is present at or after offset. */
627 if (pstr->valid_len > offset
628 && mid == offset && pstr->offsets[mid] == offset)
629 {
630 memmove (pstr->wcs, pstr->wcs + offset,
631 (pstr->valid_len - offset) * sizeof (wint_t));
632 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
633 pstr->valid_len -= offset;
634 pstr->valid_raw_len -= offset;
635 for (low = 0; low < pstr->valid_len; low++)
636 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
637 }
638 else
639 {
640 /* Otherwise, just find out how long the partial multibyte
641 character at offset is and fill it with WEOF/255. */
642 pstr->len = pstr->raw_len - idx + offset;
643 pstr->stop = pstr->raw_stop - idx + offset;
644 pstr->offsets_needed = 0;
645 while (mid > 0 && pstr->offsets[mid - 1] == offset)
646 --mid;
647 while (mid < pstr->valid_len)
648 if (pstr->wcs[mid] != WEOF)
649 break;
650 else
651 ++mid;
652 if (mid == pstr->valid_len)
653 pstr->valid_len = 0;
654 else
655 {
656 pstr->valid_len = pstr->offsets[mid] - offset;
657 if (pstr->valid_len)
658 {
659 for (low = 0; low < pstr->valid_len; ++low)
660 pstr->wcs[low] = WEOF;
661 memset (pstr->mbs, 255, pstr->valid_len);
662 }
663 }
664 pstr->valid_raw_len = pstr->valid_len;
665 }
666 }
667 else
668#endif
669 {
670 pstr->tip_context = re_string_context_at (pstr, offset - 1,
671 eflags);
672#ifdef RE_ENABLE_I18N
673 if (pstr->mb_cur_max > 1)
674 memmove (pstr->wcs, pstr->wcs + offset,
675 (pstr->valid_len - offset) * sizeof (wint_t));
676#endif /* RE_ENABLE_I18N */
677 if (BE (pstr->mbs_allocated, 0))
678 memmove (pstr->mbs, pstr->mbs + offset,
679 pstr->valid_len - offset);
680 pstr->valid_len -= offset;
681 pstr->valid_raw_len -= offset;
682#if DEBUG
683 assert (pstr->valid_len > 0);
684#endif
685 }
686 }
687 else
688 {
689 /* No, skip all characters until IDX. */
690 Idx prev_valid_len = pstr->valid_len;
691
692#ifdef RE_ENABLE_I18N
693 if (BE (pstr->offsets_needed, 0))
694 {
695 pstr->len = pstr->raw_len - idx + offset;
696 pstr->stop = pstr->raw_stop - idx + offset;
697 pstr->offsets_needed = 0;
698 }
699#endif
700 pstr->valid_len = 0;
701#ifdef RE_ENABLE_I18N
702 if (pstr->mb_cur_max > 1)
703 {
704 Idx wcs_idx;
705 wint_t wc = WEOF;
706
707 if (pstr->is_utf8)
708 {
709 const unsigned char *raw, *p, *q, *end;
710
711 /* Special case UTF-8. Multi-byte chars start with any
712 byte other than 0x80 - 0xbf. */
713 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
714 end = raw + (offset - pstr->mb_cur_max);
715 if (end < pstr->raw_mbs)
716 end = pstr->raw_mbs;
717 p = raw + offset - 1;
718#ifdef _LIBC
719 /* We know the wchar_t encoding is UCS4, so for the simple
720 case, ASCII characters, skip the conversion step. */
721 if (isascii (*p) && BE (pstr->trans == NULL, 1))
722 {
723 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
724 /* pstr->valid_len = 0; */
725 wc = (wchar_t) *p;
726 }
727 else
728#endif
729 for (; p >= end; --p)
730 if ((*p & 0xc0) != 0x80)
731 {
732 mbstate_t cur_state;
733 wchar_t wc2;
734 Idx mlen = raw + pstr->len - p;
735 unsigned char buf[6];
736 size_t mbclen;
737
738 q = p;
739 if (BE (pstr->trans != NULL, 0))
740 {
741 int i = mlen < 6 ? mlen : 6;
742 while (--i >= 0)
743 buf[i] = pstr->trans[p[i]];
744 q = buf;
745 }
746 /* XXX Don't use mbrtowc, we know which conversion
747 to use (UTF-8 -> UCS4). */
748 memset (&cur_state, 0, sizeof (cur_state));
749 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
750 &cur_state);
751 if (raw + offset - p <= mbclen
752 && mbclen < (size_t) -2)
753 {
754 memset (&pstr->cur_state, '\0',
755 sizeof (mbstate_t));
756 pstr->valid_len = mbclen - (raw + offset - p);
757 wc = wc2;
758 }
759 break;
760 }
761 }
762
763 if (wc == WEOF)
764 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
765 if (wc == WEOF)
766 pstr->tip_context
767 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
768 else
769 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
770 && IS_WIDE_WORD_CHAR (wc))
771 ? CONTEXT_WORD
772 : ((IS_WIDE_NEWLINE (wc)
773 && pstr->newline_anchor)
774 ? CONTEXT_NEWLINE : 0));
775 if (BE (pstr->valid_len, 0))
776 {
777 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
778 pstr->wcs[wcs_idx] = WEOF;
779 if (pstr->mbs_allocated)
780 memset (pstr->mbs, 255, pstr->valid_len);
781 }
782 pstr->valid_raw_len = pstr->valid_len;
783 }
784 else
785#endif /* RE_ENABLE_I18N */
786 {
787 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
788 pstr->valid_raw_len = 0;
789 if (pstr->trans)
790 c = pstr->trans[c];
791 pstr->tip_context = (bitset_contain (pstr->word_char, c)
792 ? CONTEXT_WORD
793 : ((IS_NEWLINE (c) && pstr->newline_anchor)
794 ? CONTEXT_NEWLINE : 0));
795 }
796 }
797 if (!BE (pstr->mbs_allocated, 0))
798 pstr->mbs += offset;
799 }
800 pstr->raw_mbs_idx = idx;
801 pstr->len -= offset;
802 pstr->stop -= offset;
803
804 /* Then build the buffers. */
805#ifdef RE_ENABLE_I18N
806 if (pstr->mb_cur_max > 1)
807 {
808 if (pstr->icase)
809 {
810 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
811 if (BE (ret != REG_NOERROR, 0))
812 return ret;
813 }
814 else
815 build_wcs_buffer (pstr);
816 }
817 else
818#endif /* RE_ENABLE_I18N */
819 if (BE (pstr->mbs_allocated, 0))
820 {
821 if (pstr->icase)
822 build_upper_buffer (pstr);
823 else if (pstr->trans != NULL)
824 re_string_translate_buffer (pstr);
825 }
826 else
827 pstr->valid_len = pstr->len;
828
829 pstr->cur_idx = 0;
830 return REG_NOERROR;
831}
832
833static unsigned char
834internal_function __attribute ((pure))
835re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
836{
837 int ch;
838 Idx off;
839
840 /* Handle the common (easiest) cases first. */
841 if (BE (!pstr->mbs_allocated, 1))
842 return re_string_peek_byte (pstr, idx);
843
844#ifdef RE_ENABLE_I18N
845 if (pstr->mb_cur_max > 1
846 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
847 return re_string_peek_byte (pstr, idx);
848#endif
849
850 off = pstr->cur_idx + idx;
851#ifdef RE_ENABLE_I18N
852 if (pstr->offsets_needed)
853 off = pstr->offsets[off];
854#endif
855
856 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
857
858#ifdef RE_ENABLE_I18N
859 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
860 this function returns CAPITAL LETTER I instead of first byte of
861 DOTLESS SMALL LETTER I. The latter would confuse the parser,
862 since peek_byte_case doesn't advance cur_idx in any way. */
863 if (pstr->offsets_needed && !isascii (ch))
864 return re_string_peek_byte (pstr, idx);
865#endif
866
867 return ch;
868}
869
870static unsigned char
871internal_function __attribute ((pure))
872re_string_fetch_byte_case (re_string_t *pstr)
873{
874 if (BE (!pstr->mbs_allocated, 1))
875 return re_string_fetch_byte (pstr);
876
877#ifdef RE_ENABLE_I18N
878 if (pstr->offsets_needed)
879 {
880 Idx off;
881 int ch;
882
883 /* For tr_TR.UTF-8 [[:islower:]] there is
884 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
885 in that case the whole multi-byte character and return
886 the original letter. On the other side, with
887 [[: DOTLESS SMALL LETTER I return [[:I, as doing
888 anything else would complicate things too much. */
889
890 if (!re_string_first_byte (pstr, pstr->cur_idx))
891 return re_string_fetch_byte (pstr);
892
893 off = pstr->offsets[pstr->cur_idx];
894 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
895
896 if (! isascii (ch))
897 return re_string_fetch_byte (pstr);
898
899 re_string_skip_bytes (pstr,
900 re_string_char_size_at (pstr, pstr->cur_idx));
901 return ch;
902 }
903#endif
904
905 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
906}
907
908static void
909internal_function
910re_string_destruct (re_string_t *pstr)
911{
912#ifdef RE_ENABLE_I18N
913 re_free (pstr->wcs);
914 re_free (pstr->offsets);
915#endif /* RE_ENABLE_I18N */
916 if (pstr->mbs_allocated)
917 re_free (pstr->mbs);
918}
919
920/* Return the context at IDX in INPUT. */
921
922static unsigned int
923internal_function
924re_string_context_at (const re_string_t *input, Idx idx, int eflags)
925{
926 int c;
927 if (BE (! REG_VALID_INDEX (idx), 0))
928 /* In this case, we use the value stored in input->tip_context,
929 since we can't know the character in input->mbs[-1] here. */
930 return input->tip_context;
931 if (BE (idx == input->len, 0))
932 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
933 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
934#ifdef RE_ENABLE_I18N
935 if (input->mb_cur_max > 1)
936 {
937 wint_t wc;
938 Idx wc_idx = idx;
939 while(input->wcs[wc_idx] == WEOF)
940 {
941#ifdef DEBUG
942 /* It must not happen. */
943 assert (REG_VALID_INDEX (wc_idx));
944#endif
945 --wc_idx;
946 if (! REG_VALID_INDEX (wc_idx))
947 return input->tip_context;
948 }
949 wc = input->wcs[wc_idx];
950 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
951 return CONTEXT_WORD;
952 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
953 ? CONTEXT_NEWLINE : 0);
954 }
955 else
956#endif
957 {
958 c = re_string_byte_at (input, idx);
959 if (bitset_contain (input->word_char, c))
960 return CONTEXT_WORD;
961 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
962 }
963}
964
965/* Functions for set operation. */
966
967static reg_errcode_t
968internal_function
969re_node_set_alloc (re_node_set *set, Idx size)
970{
971 set->alloc = size;
972 set->nelem = 0;
973 set->elems = re_malloc (Idx, size);
974 if (BE (set->elems == NULL, 0))
975 return REG_ESPACE;
976 return REG_NOERROR;
977}
978
979static reg_errcode_t
980internal_function
981re_node_set_init_1 (re_node_set *set, Idx elem)
982{
983 set->alloc = 1;
984 set->nelem = 1;
985 set->elems = re_malloc (Idx, 1);
986 if (BE (set->elems == NULL, 0))
987 {
988 set->alloc = set->nelem = 0;
989 return REG_ESPACE;
990 }
991 set->elems[0] = elem;
992 return REG_NOERROR;
993}
994
995static reg_errcode_t
996internal_function
997re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
998{
999 set->alloc = 2;
1000 set->elems = re_malloc (Idx, 2);
1001 if (BE (set->elems == NULL, 0))
1002 return REG_ESPACE;
1003 if (elem1 == elem2)
1004 {
1005 set->nelem = 1;
1006 set->elems[0] = elem1;
1007 }
1008 else
1009 {
1010 set->nelem = 2;
1011 if (elem1 < elem2)
1012 {
1013 set->elems[0] = elem1;
1014 set->elems[1] = elem2;
1015 }
1016 else
1017 {
1018 set->elems[0] = elem2;
1019 set->elems[1] = elem1;
1020 }
1021 }
1022 return REG_NOERROR;
1023}
1024
1025static reg_errcode_t
1026internal_function
1027re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1028{
1029 dest->nelem = src->nelem;
1030 if (src->nelem > 0)
1031 {
1032 dest->alloc = dest->nelem;
1033 dest->elems = re_malloc (Idx, dest->alloc);
1034 if (BE (dest->elems == NULL, 0))
1035 {
1036 dest->alloc = dest->nelem = 0;
1037 return REG_ESPACE;
1038 }
1039 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1040 }
1041 else
1042 re_node_set_init_empty (dest);
1043 return REG_NOERROR;
1044}
1045
1046/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1047 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1048 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1049
1050static reg_errcode_t
1051internal_function
1052re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1053 const re_node_set *src2)
1054{
1055 Idx i1, i2, is, id, delta, sbase;
1056 if (src1->nelem == 0 || src2->nelem == 0)
1057 return REG_NOERROR;
1058
1059 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1060 conservative estimate. */
1061 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1062 {
1063 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1064 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1065 if (BE (new_elems == NULL, 0))
1066 return REG_ESPACE;
1067 dest->elems = new_elems;
1068 dest->alloc = new_alloc;
1069 }
1070
1071 /* Find the items in the intersection of SRC1 and SRC2, and copy
1072 into the top of DEST those that are not already in DEST itself. */
1073 sbase = dest->nelem + src1->nelem + src2->nelem;
1074 i1 = src1->nelem - 1;
1075 i2 = src2->nelem - 1;
1076 id = dest->nelem - 1;
1077 for (;;)
1078 {
1079 if (src1->elems[i1] == src2->elems[i2])
1080 {
1081 /* Try to find the item in DEST. Maybe we could binary search? */
1082 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1083 --id;
1084
1085 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1086 dest->elems[--sbase] = src1->elems[i1];
1087
1088 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1089 break;
1090 }
1091
1092 /* Lower the highest of the two items. */
1093 else if (src1->elems[i1] < src2->elems[i2])
1094 {
1095 if (! REG_VALID_INDEX (--i2))
1096 break;
1097 }
1098 else
1099 {
1100 if (! REG_VALID_INDEX (--i1))
1101 break;
1102 }
1103 }
1104
1105 id = dest->nelem - 1;
1106 is = dest->nelem + src1->nelem + src2->nelem - 1;
1107 delta = is - sbase + 1;
1108
1109 /* Now copy. When DELTA becomes zero, the remaining
1110 DEST elements are already in place; this is more or
1111 less the same loop that is in re_node_set_merge. */
1112 dest->nelem += delta;
1113 if (delta > 0 && REG_VALID_INDEX (id))
1114 for (;;)
1115 {
1116 if (dest->elems[is] > dest->elems[id])
1117 {
1118 /* Copy from the top. */
1119 dest->elems[id + delta--] = dest->elems[is--];
1120 if (delta == 0)
1121 break;
1122 }
1123 else
1124 {
1125 /* Slide from the bottom. */
1126 dest->elems[id + delta] = dest->elems[id];
1127 if (! REG_VALID_INDEX (--id))
1128 break;
1129 }
1130 }
1131
1132 /* Copy remaining SRC elements. */
1133 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1134
1135 return REG_NOERROR;
1136}
1137
1138/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1139 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1140
1141static reg_errcode_t
1142internal_function
1143re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1144 const re_node_set *src2)
1145{
1146 Idx i1, i2, id;
1147 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1148 {
1149 dest->alloc = src1->nelem + src2->nelem;
1150 dest->elems = re_malloc (Idx, dest->alloc);
1151 if (BE (dest->elems == NULL, 0))
1152 return REG_ESPACE;
1153 }
1154 else
1155 {
1156 if (src1 != NULL && src1->nelem > 0)
1157 return re_node_set_init_copy (dest, src1);
1158 else if (src2 != NULL && src2->nelem > 0)
1159 return re_node_set_init_copy (dest, src2);
1160 else
1161 re_node_set_init_empty (dest);
1162 return REG_NOERROR;
1163 }
1164 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1165 {
1166 if (src1->elems[i1] > src2->elems[i2])
1167 {
1168 dest->elems[id++] = src2->elems[i2++];
1169 continue;
1170 }
1171 if (src1->elems[i1] == src2->elems[i2])
1172 ++i2;
1173 dest->elems[id++] = src1->elems[i1++];
1174 }
1175 if (i1 < src1->nelem)
1176 {
1177 memcpy (dest->elems + id, src1->elems + i1,
1178 (src1->nelem - i1) * sizeof (Idx));
1179 id += src1->nelem - i1;
1180 }
1181 else if (i2 < src2->nelem)
1182 {
1183 memcpy (dest->elems + id, src2->elems + i2,
1184 (src2->nelem - i2) * sizeof (Idx));
1185 id += src2->nelem - i2;
1186 }
1187 dest->nelem = id;
1188 return REG_NOERROR;
1189}
1190
1191/* Calculate the union set of the sets DEST and SRC. And store it to
1192 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1193
1194static reg_errcode_t
1195internal_function
1196re_node_set_merge (re_node_set *dest, const re_node_set *src)
1197{
1198 Idx is, id, sbase, delta;
1199 if (src == NULL || src->nelem == 0)
1200 return REG_NOERROR;
1201 if (dest->alloc < 2 * src->nelem + dest->nelem)
1202 {
1203 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1204 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1205 if (BE (new_buffer == NULL, 0))
1206 return REG_ESPACE;
1207 dest->elems = new_buffer;
1208 dest->alloc = new_alloc;
1209 }
1210
1211 if (BE (dest->nelem == 0, 0))
1212 {
1213 dest->nelem = src->nelem;
1214 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1215 return REG_NOERROR;
1216 }
1217
1218 /* Copy into the top of DEST the items of SRC that are not
1219 found in DEST. Maybe we could binary search in DEST? */
1220 for (sbase = dest->nelem + 2 * src->nelem,
1221 is = src->nelem - 1, id = dest->nelem - 1;
1222 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1223 {
1224 if (dest->elems[id] == src->elems[is])
1225 is--, id--;
1226 else if (dest->elems[id] < src->elems[is])
1227 dest->elems[--sbase] = src->elems[is--];
1228 else /* if (dest->elems[id] > src->elems[is]) */
1229 --id;
1230 }
1231
1232 if (REG_VALID_INDEX (is))
1233 {
1234 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1235 sbase -= is + 1;
1236 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1237 }
1238
1239 id = dest->nelem - 1;
1240 is = dest->nelem + 2 * src->nelem - 1;
1241 delta = is - sbase + 1;
1242 if (delta == 0)
1243 return REG_NOERROR;
1244
1245 /* Now copy. When DELTA becomes zero, the remaining
1246 DEST elements are already in place. */
1247 dest->nelem += delta;
1248 for (;;)
1249 {
1250 if (dest->elems[is] > dest->elems[id])
1251 {
1252 /* Copy from the top. */
1253 dest->elems[id + delta--] = dest->elems[is--];
1254 if (delta == 0)
1255 break;
1256 }
1257 else
1258 {
1259 /* Slide from the bottom. */
1260 dest->elems[id + delta] = dest->elems[id];
1261 if (! REG_VALID_INDEX (--id))
1262 {
1263 /* Copy remaining SRC elements. */
1264 memcpy (dest->elems, dest->elems + sbase,
1265 delta * sizeof (Idx));
1266 break;
1267 }
1268 }
1269 }
1270
1271 return REG_NOERROR;
1272}
1273
1274/* Insert the new element ELEM to the re_node_set* SET.
1275 SET should not already have ELEM.
1276 Return true if successful. */
1277
1278static bool
1279internal_function
1280re_node_set_insert (re_node_set *set, Idx elem)
1281{
1282 Idx idx;
1283 /* In case the set is empty. */
1284 if (set->alloc == 0)
1285 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1286
1287 if (BE (set->nelem, 0) == 0)
1288 {
1289 /* We already guaranteed above that set->alloc != 0. */
1290 set->elems[0] = elem;
1291 ++set->nelem;
1292 return true;
1293 }
1294
1295 /* Realloc if we need. */
1296 if (set->alloc == set->nelem)
1297 {
1298 Idx *new_elems;
1299 set->alloc = set->alloc * 2;
1300 new_elems = re_realloc (set->elems, Idx, set->alloc);
1301 if (BE (new_elems == NULL, 0))
1302 return false;
1303 set->elems = new_elems;
1304 }
1305
1306 /* Move the elements which follows the new element. Test the
1307 first element separately to skip a check in the inner loop. */
1308 if (elem < set->elems[0])
1309 {
1310 idx = 0;
1311 for (idx = set->nelem; idx > 0; idx--)
1312 set->elems[idx] = set->elems[idx - 1];
1313 }
1314 else
1315 {
1316 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1317 set->elems[idx] = set->elems[idx - 1];
1318 }
1319
1320 /* Insert the new element. */
1321 set->elems[idx] = elem;
1322 ++set->nelem;
1323 return true;
1324}
1325
1326/* Insert the new element ELEM to the re_node_set* SET.
1327 SET should not already have any element greater than or equal to ELEM.
1328 Return true if successful. */
1329
1330static bool
1331internal_function
1332re_node_set_insert_last (re_node_set *set, Idx elem)
1333{
1334 /* Realloc if we need. */
1335 if (set->alloc == set->nelem)
1336 {
1337 Idx *new_elems;
1338 set->alloc = (set->alloc + 1) * 2;
1339 new_elems = re_realloc (set->elems, Idx, set->alloc);
1340 if (BE (new_elems == NULL, 0))
1341 return false;
1342 set->elems = new_elems;
1343 }
1344
1345 /* Insert the new element. */
1346 set->elems[set->nelem++] = elem;
1347 return true;
1348}
1349
1350/* Compare two node sets SET1 and SET2.
1351 Return true if SET1 and SET2 are equivalent. */
1352
1353static bool
1354internal_function __attribute ((pure))
1355re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1356{
1357 Idx i;
1358 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1359 return false;
1360 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1361 if (set1->elems[i] != set2->elems[i])
1362 return false;
1363 return true;
1364}
1365
1366/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1367
1368static Idx
1369internal_function __attribute ((pure))
1370re_node_set_contains (const re_node_set *set, Idx elem)
1371{
1372 __re_size_t idx, right, mid;
1373 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1374 return 0;
1375
1376 /* Binary search the element. */
1377 idx = 0;
1378 right = set->nelem - 1;
1379 while (idx < right)
1380 {
1381 mid = (idx + right) / 2;
1382 if (set->elems[mid] < elem)
1383 idx = mid + 1;
1384 else
1385 right = mid;
1386 }
1387 return set->elems[idx] == elem ? idx + 1 : 0;
1388}
1389
1390static void
1391internal_function
1392re_node_set_remove_at (re_node_set *set, Idx idx)
1393{
1394 if (idx < 0 || idx >= set->nelem)
1395 return;
1396 --set->nelem;
1397 for (; idx < set->nelem; idx++)
1398 set->elems[idx] = set->elems[idx + 1];
1399}
1400
1401
1402/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1403 Or return REG_MISSING if an error occurred. */
1404
1405static Idx
1406internal_function
1407re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1408{
1409 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1410 {
1411 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1412 Idx *new_nexts, *new_indices;
1413 re_node_set *new_edests, *new_eclosures;
1414 re_token_t *new_nodes;
1415 size_t max_object_size =
1416 MAX (sizeof (re_token_t),
1417 MAX (sizeof (re_node_set),
1418 sizeof (Idx)));
1419
1420 /* Avoid overflows. */
1421 if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1422 return REG_MISSING;
1423
1424 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1425 if (BE (new_nodes == NULL, 0))
1426 return REG_MISSING;
1427 dfa->nodes = new_nodes;
1428 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1429 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1430 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1431 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1432 if (BE (new_nexts == NULL || new_indices == NULL
1433 || new_edests == NULL || new_eclosures == NULL, 0))
1434 return REG_MISSING;
1435 dfa->nexts = new_nexts;
1436 dfa->org_indices = new_indices;
1437 dfa->edests = new_edests;
1438 dfa->eclosures = new_eclosures;
1439 dfa->nodes_alloc = new_nodes_alloc;
1440 }
1441 dfa->nodes[dfa->nodes_len] = token;
1442 dfa->nodes[dfa->nodes_len].constraint = 0;
1443#ifdef RE_ENABLE_I18N
1444 {
1445 int type = token.type;
1446 dfa->nodes[dfa->nodes_len].accept_mb =
1447 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1448 }
1449#endif
1450 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1451 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1453 return dfa->nodes_len++;
1454}
1455
1456static inline re_hashval_t
1457internal_function
1458calc_state_hash (const re_node_set *nodes, unsigned int context)
1459{
1460 re_hashval_t hash = nodes->nelem + context;
1461 Idx i;
1462 for (i = 0 ; i < nodes->nelem ; i++)
1463 hash += nodes->elems[i];
1464 return hash;
1465}
1466
1467/* Search for the state whose node_set is equivalent to NODES.
1468 Return the pointer to the state, if we found it in the DFA.
1469 Otherwise create the new one and return it. In case of an error
1470 return NULL and set the error code in ERR.
1471 Note: - We assume NULL as the invalid state, then it is possible that
1472 return value is NULL and ERR is REG_NOERROR.
1473 - We never return non-NULL value in case of any errors, it is for
1474 optimization. */
1475
1476static re_dfastate_t *
1477internal_function
1478re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1479 const re_node_set *nodes)
1480{
1481 re_hashval_t hash;
1482 re_dfastate_t *new_state;
1483 struct re_state_table_entry *spot;
1484 Idx i;
1485#ifdef lint
1486 /* Suppress bogus uninitialized-variable warnings. */
1487 *err = REG_NOERROR;
1488#endif
1489 if (BE (nodes->nelem == 0, 0))
1490 {
1491 *err = REG_NOERROR;
1492 return NULL;
1493 }
1494 hash = calc_state_hash (nodes, 0);
1495 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1496
1497 for (i = 0 ; i < spot->num ; i++)
1498 {
1499 re_dfastate_t *state = spot->array[i];
1500 if (hash != state->hash)
1501 continue;
1502 if (re_node_set_compare (&state->nodes, nodes))
1503 return state;
1504 }
1505
1506 /* There are no appropriate state in the dfa, create the new one. */
1507 new_state = create_ci_newstate (dfa, nodes, hash);
1508 if (BE (new_state == NULL, 0))
1509 *err = REG_ESPACE;
1510
1511 return new_state;
1512}
1513
1514/* Search for the state whose node_set is equivalent to NODES and
1515 whose context is equivalent to CONTEXT.
1516 Return the pointer to the state, if we found it in the DFA.
1517 Otherwise create the new one and return it. In case of an error
1518 return NULL and set the error code in ERR.
1519 Note: - We assume NULL as the invalid state, then it is possible that
1520 return value is NULL and ERR is REG_NOERROR.
1521 - We never return non-NULL value in case of any errors, it is for
1522 optimization. */
1523
1524static re_dfastate_t *
1525internal_function
1526re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1527 const re_node_set *nodes, unsigned int context)
1528{
1529 re_hashval_t hash;
1530 re_dfastate_t *new_state;
1531 struct re_state_table_entry *spot;
1532 Idx i;
1533#ifdef lint
1534 /* Suppress bogus uninitialized-variable warnings. */
1535 *err = REG_NOERROR;
1536#endif
1537 if (nodes->nelem == 0)
1538 {
1539 *err = REG_NOERROR;
1540 return NULL;
1541 }
1542 hash = calc_state_hash (nodes, context);
1543 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1544
1545 for (i = 0 ; i < spot->num ; i++)
1546 {
1547 re_dfastate_t *state = spot->array[i];
1548 if (state->hash == hash
1549 && state->context == context
1550 && re_node_set_compare (state->entrance_nodes, nodes))
1551 return state;
1552 }
1553 /* There are no appropriate state in `dfa', create the new one. */
1554 new_state = create_cd_newstate (dfa, nodes, context, hash);
1555 if (BE (new_state == NULL, 0))
1556 *err = REG_ESPACE;
1557
1558 return new_state;
1559}
1560
1561/* Finish initialization of the new state NEWSTATE, and using its hash value
1562 HASH put in the appropriate bucket of DFA's state table. Return value
1563 indicates the error code if failed. */
1564
1565static reg_errcode_t
1566register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1567 re_hashval_t hash)
1568{
1569 struct re_state_table_entry *spot;
1570 reg_errcode_t err;
1571 Idx i;
1572
1573 newstate->hash = hash;
1574 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1575 if (BE (err != REG_NOERROR, 0))
1576 return REG_ESPACE;
1577 for (i = 0; i < newstate->nodes.nelem; i++)
1578 {
1579 Idx elem = newstate->nodes.elems[i];
1580 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1581 if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1582 return REG_ESPACE;
1583 }
1584
1585 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1586 if (BE (spot->alloc <= spot->num, 0))
1587 {
1588 Idx new_alloc = 2 * spot->num + 2;
1589 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1590 new_alloc);
1591 if (BE (new_array == NULL, 0))
1592 return REG_ESPACE;
1593 spot->array = new_array;
1594 spot->alloc = new_alloc;
1595 }
1596 spot->array[spot->num++] = newstate;
1597 return REG_NOERROR;
1598}
1599
1600static void
1601free_state (re_dfastate_t *state)
1602{
1603 re_node_set_free (&state->non_eps_nodes);
1604 re_node_set_free (&state->inveclosure);
1605 if (state->entrance_nodes != &state->nodes)
1606 {
1607 re_node_set_free (state->entrance_nodes);
1608 re_free (state->entrance_nodes);
1609 }
1610 re_node_set_free (&state->nodes);
1611 re_free (state->word_trtable);
1612 re_free (state->trtable);
1613 re_free (state);
1614}
1615
1616/* Create the new state which is independ of contexts.
1617 Return the new state if succeeded, otherwise return NULL. */
1618
1619static re_dfastate_t *
1620internal_function
1621create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1622 re_hashval_t hash)
1623{
1624 Idx i;
1625 reg_errcode_t err;
1626 re_dfastate_t *newstate;
1627
1628 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1629 if (BE (newstate == NULL, 0))
1630 return NULL;
1631 err = re_node_set_init_copy (&newstate->nodes, nodes);
1632 if (BE (err != REG_NOERROR, 0))
1633 {
1634 re_free (newstate);
1635 return NULL;
1636 }
1637
1638 newstate->entrance_nodes = &newstate->nodes;
1639 for (i = 0 ; i < nodes->nelem ; i++)
1640 {
1641 re_token_t *node = dfa->nodes + nodes->elems[i];
1642 re_token_type_t type = node->type;
1643 if (type == CHARACTER && !node->constraint)
1644 continue;
1645#ifdef RE_ENABLE_I18N
1646 newstate->accept_mb |= node->accept_mb;
1647#endif /* RE_ENABLE_I18N */
1648
1649 /* If the state has the halt node, the state is a halt state. */
1650 if (type == END_OF_RE)
1651 newstate->halt = 1;
1652 else if (type == OP_BACK_REF)
1653 newstate->has_backref = 1;
1654 else if (type == ANCHOR || node->constraint)
1655 newstate->has_constraint = 1;
1656 }
1657 err = register_state (dfa, newstate, hash);
1658 if (BE (err != REG_NOERROR, 0))
1659 {
1660 free_state (newstate);
1661 newstate = NULL;
1662 }
1663 return newstate;
1664}
1665
1666/* Create the new state which is depend on the context CONTEXT.
1667 Return the new state if succeeded, otherwise return NULL. */
1668
1669static re_dfastate_t *
1670internal_function
1671create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1672 unsigned int context, re_hashval_t hash)
1673{
1674 Idx i, nctx_nodes = 0;
1675 reg_errcode_t err;
1676 re_dfastate_t *newstate;
1677
1678 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1679 if (BE (newstate == NULL, 0))
1680 return NULL;
1681 err = re_node_set_init_copy (&newstate->nodes, nodes);
1682 if (BE (err != REG_NOERROR, 0))
1683 {
1684 re_free (newstate);
1685 return NULL;
1686 }
1687
1688 newstate->context = context;
1689 newstate->entrance_nodes = &newstate->nodes;
1690
1691 for (i = 0 ; i < nodes->nelem ; i++)
1692 {
1693 unsigned int constraint = 0;
1694 re_token_t *node = dfa->nodes + nodes->elems[i];
1695 re_token_type_t type = node->type;
1696 if (node->constraint)
1697 constraint = node->constraint;
1698
1699 if (type == CHARACTER && !constraint)
1700 continue;
1701#ifdef RE_ENABLE_I18N
1702 newstate->accept_mb |= node->accept_mb;
1703#endif /* RE_ENABLE_I18N */
1704
1705 /* If the state has the halt node, the state is a halt state. */
1706 if (type == END_OF_RE)
1707 newstate->halt = 1;
1708 else if (type == OP_BACK_REF)
1709 newstate->has_backref = 1;
1710 else if (type == ANCHOR)
1711 constraint = node->opr.ctx_type;
1712
1713 if (constraint)
1714 {
1715 if (newstate->entrance_nodes == &newstate->nodes)
1716 {
1717 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1718 if (BE (newstate->entrance_nodes == NULL, 0))
1719 {
1720 free_state (newstate);
1721 return NULL;
1722 }
1723 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1724 nctx_nodes = 0;
1725 newstate->has_constraint = 1;
1726 }
1727
1728 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1729 {
1730 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1731 ++nctx_nodes;
1732 }
1733 }
1734 }
1735 err = register_state (dfa, newstate, hash);
1736 if (BE (err != REG_NOERROR, 0))
1737 {
1738 free_state (newstate);
1739 newstate = NULL;
1740 }
1741 return newstate;
1742}