diff options
Diffstat (limited to 'gl/str-two-way.h')
| -rw-r--r-- | gl/str-two-way.h | 38 |
1 files changed, 20 insertions, 18 deletions
diff --git a/gl/str-two-way.h b/gl/str-two-way.h index 4d555f92..707145db 100644 --- a/gl/str-two-way.h +++ b/gl/str-two-way.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Byte-wise substring search, using the Two-Way algorithm. | 1 | /* Byte-wise substring search, using the Two-Way algorithm. |
| 2 | Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc. | 2 | Copyright (C) 2008-2013 Free Software Foundation, Inc. |
| 3 | This file is part of the GNU C Library. | 3 | This file is part of the GNU C Library. |
| 4 | Written by Eric Blake <ebb9@byu.net>, 2008. | 4 | Written by Eric Blake <ebb9@byu.net>, 2008. |
| 5 | 5 | ||
| @@ -14,8 +14,7 @@ | |||
| 14 | GNU General Public License for more details. | 14 | GNU General Public License for more details. |
| 15 | 15 | ||
| 16 | You should have received a copy of the GNU General Public License along | 16 | You should have received a copy of the GNU General Public License along |
| 17 | with this program; if not, write to the Free Software Foundation, | 17 | with this program; if not, see <http://www.gnu.org/licenses/>. */ |
| 18 | Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ | ||
| 19 | 18 | ||
| 20 | /* Before including this file, you need to include <config.h> and | 19 | /* Before including this file, you need to include <config.h> and |
| 21 | <string.h>, and define: | 20 | <string.h>, and define: |
| @@ -44,14 +43,15 @@ | |||
| 44 | #include <limits.h> | 43 | #include <limits.h> |
| 45 | #include <stdint.h> | 44 | #include <stdint.h> |
| 46 | 45 | ||
| 47 | /* We use the Two-Way string matching algorithm, which guarantees | 46 | /* We use the Two-Way string matching algorithm (also known as |
| 48 | linear complexity with constant space. Additionally, for long | 47 | Crochemore-Perrin), which guarantees linear complexity with |
| 49 | needles, we also use a bad character shift table similar to the | 48 | constant space. Additionally, for long needles, we also use a bad |
| 50 | Boyer-Moore algorithm to achieve improved (potentially sub-linear) | 49 | character shift table similar to the Boyer-Moore algorithm to |
| 51 | performance. | 50 | achieve improved (potentially sub-linear) performance. |
| 52 | 51 | ||
| 53 | See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 | 52 | See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260, |
| 54 | and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm | 53 | http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm, |
| 54 | http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf | ||
| 55 | */ | 55 | */ |
| 56 | 56 | ||
| 57 | /* Point at which computing a bad-byte shift table is likely to be | 57 | /* Point at which computing a bad-byte shift table is likely to be |
| @@ -108,7 +108,7 @@ static size_t | |||
| 108 | critical_factorization (const unsigned char *needle, size_t needle_len, | 108 | critical_factorization (const unsigned char *needle, size_t needle_len, |
| 109 | size_t *period) | 109 | size_t *period) |
| 110 | { | 110 | { |
| 111 | /* Index of last byte of left half. */ | 111 | /* Index of last byte of left half, or SIZE_MAX. */ |
| 112 | size_t max_suffix, max_suffix_rev; | 112 | size_t max_suffix, max_suffix_rev; |
| 113 | size_t j; /* Index into NEEDLE for current candidate suffix. */ | 113 | size_t j; /* Index into NEEDLE for current candidate suffix. */ |
| 114 | size_t k; /* Offset into current period. */ | 114 | size_t k; /* Offset into current period. */ |
| @@ -124,8 +124,8 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
| 124 | } | 124 | } |
| 125 | 125 | ||
| 126 | /* Invariants: | 126 | /* Invariants: |
| 127 | 1 <= j < NEEDLE_LEN - 1 | 127 | 0 <= j < NEEDLE_LEN - 1 |
| 128 | 0 <= max_suffix{,_rev} < j | 128 | -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) |
| 129 | min(max_suffix, max_suffix_rev) < global period of NEEDLE | 129 | min(max_suffix, max_suffix_rev) < global period of NEEDLE |
| 130 | 1 <= p <= global period of NEEDLE | 130 | 1 <= p <= global period of NEEDLE |
| 131 | p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] | 131 | p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] |
| @@ -133,8 +133,9 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
| 133 | */ | 133 | */ |
| 134 | 134 | ||
| 135 | /* Perform lexicographic search. */ | 135 | /* Perform lexicographic search. */ |
| 136 | max_suffix = 0; | 136 | max_suffix = SIZE_MAX; |
| 137 | j = k = p = 1; | 137 | j = 0; |
| 138 | k = p = 1; | ||
| 138 | while (j + k < needle_len) | 139 | while (j + k < needle_len) |
| 139 | { | 140 | { |
| 140 | a = CANON_ELEMENT (needle[j + k]); | 141 | a = CANON_ELEMENT (needle[j + k]); |
| @@ -167,8 +168,9 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
| 167 | *period = p; | 168 | *period = p; |
| 168 | 169 | ||
| 169 | /* Perform reverse lexicographic search. */ | 170 | /* Perform reverse lexicographic search. */ |
| 170 | max_suffix_rev = 0; | 171 | max_suffix_rev = SIZE_MAX; |
| 171 | j = k = p = 1; | 172 | j = 0; |
| 173 | k = p = 1; | ||
| 172 | while (j + k < needle_len) | 174 | while (j + k < needle_len) |
| 173 | { | 175 | { |
| 174 | a = CANON_ELEMENT (needle[j + k]); | 176 | a = CANON_ELEMENT (needle[j + k]); |
| @@ -370,8 +372,8 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, | |||
| 370 | a byte out of place, there can be no match until | 372 | a byte out of place, there can be no match until |
| 371 | after the mismatch. */ | 373 | after the mismatch. */ |
| 372 | shift = needle_len - period; | 374 | shift = needle_len - period; |
| 373 | memory = 0; | ||
| 374 | } | 375 | } |
| 376 | memory = 0; | ||
| 375 | j += shift; | 377 | j += shift; |
| 376 | continue; | 378 | continue; |
| 377 | } | 379 | } |
