summaryrefslogtreecommitdiffstats
path: root/gl/str-two-way.h
diff options
context:
space:
mode:
Diffstat (limited to 'gl/str-two-way.h')
-rw-r--r--gl/str-two-way.h65
1 files changed, 44 insertions, 21 deletions
diff --git a/gl/str-two-way.h b/gl/str-two-way.h
index c08f60e..707145d 100644
--- a/gl/str-two-way.h
+++ b/gl/str-two-way.h
@@ -1,5 +1,5 @@
1/* Byte-wise substring search, using the Two-Way algorithm. 1/* Byte-wise substring search, using the Two-Way algorithm.
2 Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc. 2 Copyright (C) 2008-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library. 3 This file is part of the GNU C Library.
4 Written by Eric Blake <ebb9@byu.net>, 2008. 4 Written by Eric Blake <ebb9@byu.net>, 2008.
5 5
@@ -14,8 +14,7 @@
14 GNU General Public License for more details. 14 GNU General Public License for more details.
15 15
16 You should have received a copy of the GNU General Public License along 16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation, 17 with this program; if not, see <http://www.gnu.org/licenses/>. */
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19 18
20/* Before including this file, you need to include <config.h> and 19/* Before including this file, you need to include <config.h> and
21 <string.h>, and define: 20 <string.h>, and define:
@@ -44,14 +43,15 @@
44#include <limits.h> 43#include <limits.h>
45#include <stdint.h> 44#include <stdint.h>
46 45
47/* We use the Two-Way string matching algorithm, which guarantees 46/* We use the Two-Way string matching algorithm (also known as
48 linear complexity with constant space. Additionally, for long 47 Chrochemore-Perrin), which guarantees linear complexity with
49 needles, we also use a bad character shift table similar to the 48 constant space. Additionally, for long needles, we also use a bad
50 Boyer-Moore algorithm to achieve improved (potentially sub-linear) 49 character shift table similar to the Boyer-Moore algorithm to
51 performance. 50 achieve improved (potentially sub-linear) performance.
52 51
53 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 52 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260,
54 and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm 53 http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm,
54 http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf
55*/ 55*/
56 56
57/* Point at which computing a bad-byte shift table is likely to be 57/* Point at which computing a bad-byte shift table is likely to be
@@ -95,11 +95,14 @@
95 A critical factorization has the property that the local period 95 A critical factorization has the property that the local period
96 equals the global period. All strings have at least one critical 96 equals the global period. All strings have at least one critical
97 factorization with the left half smaller than the global period. 97 factorization with the left half smaller than the global period.
98 And while some strings have more than one critical factorization,
99 it is provable that with an ordered alphabet, at least one of the
100 critical factorizations corresponds to a maximal suffix.
98 101
99 Given an ordered alphabet, a critical factorization can be computed 102 Given an ordered alphabet, a critical factorization can be computed
100 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the 103 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
101 larger of two ordered maximal suffixes. The ordered maximal 104 shorter of two ordered maximal suffixes. The ordered maximal
102 suffixes are determined by lexicographic comparison of 105 suffixes are determined by lexicographic comparison while tracking
103 periodicity. */ 106 periodicity. */
104static size_t 107static size_t
105critical_factorization (const unsigned char *needle, size_t needle_len, 108critical_factorization (const unsigned char *needle, size_t needle_len,
@@ -112,6 +115,14 @@ critical_factorization (const unsigned char *needle, size_t needle_len,
112 size_t p; /* Intermediate period. */ 115 size_t p; /* Intermediate period. */
113 unsigned char a, b; /* Current comparison bytes. */ 116 unsigned char a, b; /* Current comparison bytes. */
114 117
118 /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered
119 out 0-length needles. */
120 if (needle_len < 3)
121 {
122 *period = 1;
123 return needle_len - 1;
124 }
125
115 /* Invariants: 126 /* Invariants:
116 0 <= j < NEEDLE_LEN - 1 127 0 <= j < NEEDLE_LEN - 1
117 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) 128 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
@@ -190,8 +201,20 @@ critical_factorization (const unsigned char *needle, size_t needle_len,
190 } 201 }
191 } 202 }
192 203
193 /* Choose the longer suffix. Return the first byte of the right 204 /* Choose the shorter suffix. Return the index of the first byte of
194 half, rather than the last byte of the left half. */ 205 the right half, rather than the last byte of the left half.
206
207 For some examples, 'banana' has two critical factorizations, both
208 exposed by the two lexicographic extreme suffixes of 'anana' and
209 'nana', where both suffixes have a period of 2. On the other
210 hand, with 'aab' and 'bba', both strings have a single critical
211 factorization of the last byte, with the suffix having a period
212 of 1. While the maximal lexicographic suffix of 'aab' is 'b',
213 the maximal lexicographic suffix of 'bba' is 'ba', which is not a
214 critical factorization. Conversely, the maximal reverse
215 lexicographic suffix of 'a' works for 'bba', but not 'ab' for
216 'aab'. The shorter suffix of the two will always be a critical
217 factorization. */
195 if (max_suffix_rev + 1 < max_suffix + 1) 218 if (max_suffix_rev + 1 < max_suffix + 1)
196 return max_suffix + 1; 219 return max_suffix + 1;
197 *period = p; 220 *period = p;
@@ -226,9 +249,9 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
226 first. */ 249 first. */
227 if (CMP_FUNC (needle, needle + period, suffix) == 0) 250 if (CMP_FUNC (needle, needle + period, suffix) == 0)
228 { 251 {
229 /* Entire needle is periodic; a mismatch can only advance by the 252 /* Entire needle is periodic; a mismatch in the left half can
230 period, so use memory to avoid rescanning known occurrences 253 only advance by the period, so use memory to avoid rescanning
231 of the period. */ 254 known occurrences of the period in the right half. */
232 size_t memory = 0; 255 size_t memory = 0;
233 j = 0; 256 j = 0;
234 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 257 while (AVAILABLE (haystack, haystack_len, j, needle_len))
@@ -330,9 +353,9 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
330 first. */ 353 first. */
331 if (CMP_FUNC (needle, needle + period, suffix) == 0) 354 if (CMP_FUNC (needle, needle + period, suffix) == 0)
332 { 355 {
333 /* Entire needle is periodic; a mismatch can only advance by the 356 /* Entire needle is periodic; a mismatch in the left half can
334 period, so use memory to avoid rescanning known occurrences 357 only advance by the period, so use memory to avoid rescanning
335 of the period. */ 358 known occurrences of the period in the right half. */
336 size_t memory = 0; 359 size_t memory = 0;
337 size_t shift; 360 size_t shift;
338 j = 0; 361 j = 0;
@@ -349,8 +372,8 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
349 a byte out of place, there can be no match until 372 a byte out of place, there can be no match until
350 after the mismatch. */ 373 after the mismatch. */
351 shift = needle_len - period; 374 shift = needle_len - period;
352 memory = 0;
353 } 375 }
376 memory = 0;
354 j += shift; 377 j += shift;
355 continue; 378 continue;
356 } 379 }