summary refs log tree commit diff stats
path: root/gl/str-two-way.h
diff options
context:
space:
mode:
Diffstat (limited to 'gl/str-two-way.h')
-rw-r--r-- gl/str-two-way.h 38
1 files changed, 20 insertions, 18 deletions
diff --git a/gl/str-two-way.h b/gl/str-two-way.h
index 4d555f9..707145d 100644
--- a/gl/str-two-way.h
+++ b/gl/str-two-way.h
@@ -1,5 +1,5 @@
1/* Byte-wise substring search, using the Two-Way algorithm. 1/* Byte-wise substring search, using the Two-Way algorithm.
2 Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc. 2 Copyright (C) 2008-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library. 3 This file is part of the GNU C Library.
4 Written by Eric Blake <ebb9@byu.net>, 2008. 4 Written by Eric Blake <ebb9@byu.net>, 2008.
5 5
@@ -14,8 +14,7 @@
14 GNU General Public License for more details. 14 GNU General Public License for more details.
15 15
16 You should have received a copy of the GNU General Public License along 16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation, 17 with this program; if not, see <http://www.gnu.org/licenses/>. */
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19 18
20/* Before including this file, you need to include <config.h> and 19/* Before including this file, you need to include <config.h> and
21 <string.h>, and define: 20 <string.h>, and define:
@@ -44,14 +43,15 @@
44#include <limits.h> 43#include <limits.h>
45#include <stdint.h> 44#include <stdint.h>
46 45
47/* We use the Two-Way string matching algorithm, which guarantees 46/* We use the Two-Way string matching algorithm (also known as
48 linear complexity with constant space. Additionally, for long 47 Crochemore-Perrin), which guarantees linear complexity with
49 needles, we also use a bad character shift table similar to the 48 constant space. Additionally, for long needles, we also use a bad
50 Boyer-Moore algorithm to achieve improved (potentially sub-linear) 49 character shift table similar to the Boyer-Moore algorithm to
51 performance. 50 achieve improved (potentially sub-linear) performance.
52 51
53 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 52 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260,
54 and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm 53 http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm,
54 http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf
55*/ 55*/
56 56
57/* Point at which computing a bad-byte shift table is likely to be 57/* Point at which computing a bad-byte shift table is likely to be
@@ -108,7 +108,7 @@ static size_t
108critical_factorization (const unsigned char *needle, size_t needle_len, 108critical_factorization (const unsigned char *needle, size_t needle_len,
109 size_t *period) 109 size_t *period)
110{ 110{
111 /* Index of last byte of left half. */ 111 /* Index of last byte of left half, or SIZE_MAX. */
112 size_t max_suffix, max_suffix_rev; 112 size_t max_suffix, max_suffix_rev;
113 size_t j; /* Index into NEEDLE for current candidate suffix. */ 113 size_t j; /* Index into NEEDLE for current candidate suffix. */
114 size_t k; /* Offset into current period. */ 114 size_t k; /* Offset into current period. */
@@ -124,8 +124,8 @@ critical_factorization (const unsigned char *needle, size_t needle_len,
124 } 124 }
125 125
126 /* Invariants: 126 /* Invariants:
127 1 <= j < NEEDLE_LEN - 1 127 0 <= j < NEEDLE_LEN - 1
128 0 <= max_suffix{,_rev} < j 128 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
129 min(max_suffix, max_suffix_rev) < global period of NEEDLE 129 min(max_suffix, max_suffix_rev) < global period of NEEDLE
130 1 <= p <= global period of NEEDLE 130 1 <= p <= global period of NEEDLE
131 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] 131 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
@@ -133,8 +133,9 @@ critical_factorization (const unsigned char *needle, size_t needle_len,
133 */ 133 */
134 134
135 /* Perform lexicographic search. */ 135 /* Perform lexicographic search. */
136 max_suffix = 0; 136 max_suffix = SIZE_MAX;
137 j = k = p = 1; 137 j = 0;
138 k = p = 1;
138 while (j + k < needle_len) 139 while (j + k < needle_len)
139 { 140 {
140 a = CANON_ELEMENT (needle[j + k]); 141 a = CANON_ELEMENT (needle[j + k]);
@@ -167,8 +168,9 @@ critical_factorization (const unsigned char *needle, size_t needle_len,
167 *period = p; 168 *period = p;
168 169
169 /* Perform reverse lexicographic search. */ 170 /* Perform reverse lexicographic search. */
170 max_suffix_rev = 0; 171 max_suffix_rev = SIZE_MAX;
171 j = k = p = 1; 172 j = 0;
173 k = p = 1;
172 while (j + k < needle_len) 174 while (j + k < needle_len)
173 { 175 {
174 a = CANON_ELEMENT (needle[j + k]); 176 a = CANON_ELEMENT (needle[j + k]);
@@ -370,8 +372,8 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
370 a byte out of place, there can be no match until 372 a byte out of place, there can be no match until
371 after the mismatch. */ 373 after the mismatch. */
372 shift = needle_len - period; 374 shift = needle_len - period;
373 memory = 0;
374 } 375 }
376 memory = 0;
375 j += shift; 377 j += shift;
376 continue; 378 continue;
377 } 379 }