diff options
Diffstat (limited to 'gl/str-two-way.h')
| -rw-r--r-- | gl/str-two-way.h | 65 |
1 files changed, 44 insertions, 21 deletions
diff --git a/gl/str-two-way.h b/gl/str-two-way.h index c08f60ef..707145db 100644 --- a/gl/str-two-way.h +++ b/gl/str-two-way.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Byte-wise substring search, using the Two-Way algorithm. | 1 | /* Byte-wise substring search, using the Two-Way algorithm. |
| 2 | Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc. | 2 | Copyright (C) 2008-2013 Free Software Foundation, Inc. |
| 3 | This file is part of the GNU C Library. | 3 | This file is part of the GNU C Library. |
| 4 | Written by Eric Blake <ebb9@byu.net>, 2008. | 4 | Written by Eric Blake <ebb9@byu.net>, 2008. |
| 5 | 5 | ||
| @@ -14,8 +14,7 @@ | |||
| 14 | GNU General Public License for more details. | 14 | GNU General Public License for more details. |
| 15 | 15 | ||
| 16 | You should have received a copy of the GNU General Public License along | 16 | You should have received a copy of the GNU General Public License along |
| 17 | with this program; if not, write to the Free Software Foundation, | 17 | with this program; if not, see <http://www.gnu.org/licenses/>. */ |
| 18 | Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ | ||
| 19 | 18 | ||
| 20 | /* Before including this file, you need to include <config.h> and | 19 | /* Before including this file, you need to include <config.h> and |
| 21 | <string.h>, and define: | 20 | <string.h>, and define: |
| @@ -44,14 +43,15 @@ | |||
| 44 | #include <limits.h> | 43 | #include <limits.h> |
| 45 | #include <stdint.h> | 44 | #include <stdint.h> |
| 46 | 45 | ||
| 47 | /* We use the Two-Way string matching algorithm, which guarantees | 46 | /* We use the Two-Way string matching algorithm (also known as |
| 48 | linear complexity with constant space. Additionally, for long | 47 | Chrochemore-Perrin), which guarantees linear complexity with |
| 49 | needles, we also use a bad character shift table similar to the | 48 | constant space. Additionally, for long needles, we also use a bad |
| 50 | Boyer-Moore algorithm to achieve improved (potentially sub-linear) | 49 | character shift table similar to the Boyer-Moore algorithm to |
| 51 | performance. | 50 | achieve improved (potentially sub-linear) performance. |
| 52 | 51 | ||
| 53 | See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 | 52 | See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260, |
| 54 | and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm | 53 | http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm, |
| 54 | http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf | ||
| 55 | */ | 55 | */ |
| 56 | 56 | ||
| 57 | /* Point at which computing a bad-byte shift table is likely to be | 57 | /* Point at which computing a bad-byte shift table is likely to be |
| @@ -95,11 +95,14 @@ | |||
| 95 | A critical factorization has the property that the local period | 95 | A critical factorization has the property that the local period |
| 96 | equals the global period. All strings have at least one critical | 96 | equals the global period. All strings have at least one critical |
| 97 | factorization with the left half smaller than the global period. | 97 | factorization with the left half smaller than the global period. |
| 98 | And while some strings have more than one critical factorization, | ||
| 99 | it is provable that with an ordered alphabet, at least one of the | ||
| 100 | critical factorizations corresponds to a maximal suffix. | ||
| 98 | 101 | ||
| 99 | Given an ordered alphabet, a critical factorization can be computed | 102 | Given an ordered alphabet, a critical factorization can be computed |
| 100 | in linear time, with 2 * NEEDLE_LEN comparisons, by computing the | 103 | in linear time, with 2 * NEEDLE_LEN comparisons, by computing the |
| 101 | larger of two ordered maximal suffixes. The ordered maximal | 104 | shorter of two ordered maximal suffixes. The ordered maximal |
| 102 | suffixes are determined by lexicographic comparison of | 105 | suffixes are determined by lexicographic comparison while tracking |
| 103 | periodicity. */ | 106 | periodicity. */ |
| 104 | static size_t | 107 | static size_t |
| 105 | critical_factorization (const unsigned char *needle, size_t needle_len, | 108 | critical_factorization (const unsigned char *needle, size_t needle_len, |
| @@ -112,6 +115,14 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
| 112 | size_t p; /* Intermediate period. */ | 115 | size_t p; /* Intermediate period. */ |
| 113 | unsigned char a, b; /* Current comparison bytes. */ | 116 | unsigned char a, b; /* Current comparison bytes. */ |
| 114 | 117 | ||
| 118 | /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered | ||
| 119 | out 0-length needles. */ | ||
| 120 | if (needle_len < 3) | ||
| 121 | { | ||
| 122 | *period = 1; | ||
| 123 | return needle_len - 1; | ||
| 124 | } | ||
| 125 | |||
| 115 | /* Invariants: | 126 | /* Invariants: |
| 116 | 0 <= j < NEEDLE_LEN - 1 | 127 | 0 <= j < NEEDLE_LEN - 1 |
| 117 | -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) | 128 | -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) |
| @@ -190,8 +201,20 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
| 190 | } | 201 | } |
| 191 | } | 202 | } |
| 192 | 203 | ||
| 193 | /* Choose the longer suffix. Return the first byte of the right | 204 | /* Choose the shorter suffix. Return the index of the first byte of |
| 194 | half, rather than the last byte of the left half. */ | 205 | the right half, rather than the last byte of the left half. |
| 206 | |||
| 207 | For some examples, 'banana' has two critical factorizations, both | ||
| 208 | exposed by the two lexicographic extreme suffixes of 'anana' and | ||
| 209 | 'nana', where both suffixes have a period of 2. On the other | ||
| 210 | hand, with 'aab' and 'bba', both strings have a single critical | ||
| 211 | factorization of the last byte, with the suffix having a period | ||
| 212 | of 1. While the maximal lexicographic suffix of 'aab' is 'b', | ||
| 213 | the maximal lexicographic suffix of 'bba' is 'ba', which is not a | ||
| 214 | critical factorization. Conversely, the maximal reverse | ||
| 215 | lexicographic suffix of 'a' works for 'bba', but not 'ab' for | ||
| 216 | 'aab'. The shorter suffix of the two will always be a critical | ||
| 217 | factorization. */ | ||
| 195 | if (max_suffix_rev + 1 < max_suffix + 1) | 218 | if (max_suffix_rev + 1 < max_suffix + 1) |
| 196 | return max_suffix + 1; | 219 | return max_suffix + 1; |
| 197 | *period = p; | 220 | *period = p; |
| @@ -226,9 +249,9 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, | |||
| 226 | first. */ | 249 | first. */ |
| 227 | if (CMP_FUNC (needle, needle + period, suffix) == 0) | 250 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
| 228 | { | 251 | { |
| 229 | /* Entire needle is periodic; a mismatch can only advance by the | 252 | /* Entire needle is periodic; a mismatch in the left half can |
| 230 | period, so use memory to avoid rescanning known occurrences | 253 | only advance by the period, so use memory to avoid rescanning |
| 231 | of the period. */ | 254 | known occurrences of the period in the right half. */ |
| 232 | size_t memory = 0; | 255 | size_t memory = 0; |
| 233 | j = 0; | 256 | j = 0; |
| 234 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) | 257 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
| @@ -330,9 +353,9 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, | |||
| 330 | first. */ | 353 | first. */ |
| 331 | if (CMP_FUNC (needle, needle + period, suffix) == 0) | 354 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
| 332 | { | 355 | { |
| 333 | /* Entire needle is periodic; a mismatch can only advance by the | 356 | /* Entire needle is periodic; a mismatch in the left half can |
| 334 | period, so use memory to avoid rescanning known occurrences | 357 | only advance by the period, so use memory to avoid rescanning |
| 335 | of the period. */ | 358 | known occurrences of the period in the right half. */ |
| 336 | size_t memory = 0; | 359 | size_t memory = 0; |
| 337 | size_t shift; | 360 | size_t shift; |
| 338 | j = 0; | 361 | j = 0; |
| @@ -349,8 +372,8 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, | |||
| 349 | a byte out of place, there can be no match until | 372 | a byte out of place, there can be no match until |
| 350 | after the mismatch. */ | 373 | after the mismatch. */ |
| 351 | shift = needle_len - period; | 374 | shift = needle_len - period; |
| 352 | memory = 0; | ||
| 353 | } | 375 | } |
| 376 | memory = 0; | ||
| 354 | j += shift; | 377 | j += shift; |
| 355 | continue; | 378 | continue; |
| 356 | } | 379 | } |
