diff options
Diffstat (limited to 'gl/str-two-way.h')
| -rw-r--r-- | gl/str-two-way.h | 59 |
1 files changed, 40 insertions, 19 deletions
diff --git a/gl/str-two-way.h b/gl/str-two-way.h index c08f60ef..4d555f92 100644 --- a/gl/str-two-way.h +++ b/gl/str-two-way.h | |||
| @@ -95,26 +95,37 @@ | |||
| 95 | A critical factorization has the property that the local period | 95 | A critical factorization has the property that the local period |
| 96 | equals the global period. All strings have at least one critical | 96 | equals the global period. All strings have at least one critical |
| 97 | factorization with the left half smaller than the global period. | 97 | factorization with the left half smaller than the global period. |
| 98 | And while some strings have more than one critical factorization, | ||
| 99 | it is provable that with an ordered alphabet, at least one of the | ||
| 100 | critical factorizations corresponds to a maximal suffix. | ||
| 98 | 101 | ||
| 99 | Given an ordered alphabet, a critical factorization can be computed | 102 | Given an ordered alphabet, a critical factorization can be computed |
| 100 | in linear time, with 2 * NEEDLE_LEN comparisons, by computing the | 103 | in linear time, with 2 * NEEDLE_LEN comparisons, by computing the |
| 101 | larger of two ordered maximal suffixes. The ordered maximal | 104 | shorter of two ordered maximal suffixes. The ordered maximal |
| 102 | suffixes are determined by lexicographic comparison of | 105 | suffixes are determined by lexicographic comparison while tracking |
| 103 | periodicity. */ | 106 | periodicity. */ |
| 104 | static size_t | 107 | static size_t |
| 105 | critical_factorization (const unsigned char *needle, size_t needle_len, | 108 | critical_factorization (const unsigned char *needle, size_t needle_len, |
| 106 | size_t *period) | 109 | size_t *period) |
| 107 | { | 110 | { |
| 108 | /* Index of last byte of left half, or SIZE_MAX. */ | 111 | /* Index of last byte of left half. */ |
| 109 | size_t max_suffix, max_suffix_rev; | 112 | size_t max_suffix, max_suffix_rev; |
| 110 | size_t j; /* Index into NEEDLE for current candidate suffix. */ | 113 | size_t j; /* Index into NEEDLE for current candidate suffix. */ |
| 111 | size_t k; /* Offset into current period. */ | 114 | size_t k; /* Offset into current period. */ |
| 112 | size_t p; /* Intermediate period. */ | 115 | size_t p; /* Intermediate period. */ |
| 113 | unsigned char a, b; /* Current comparison bytes. */ | 116 | unsigned char a, b; /* Current comparison bytes. */ |
| 114 | 117 | ||
| 118 | /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered | ||
| 119 | out 0-length needles. */ | ||
| 120 | if (needle_len < 3) | ||
| 121 | { | ||
| 122 | *period = 1; | ||
| 123 | return needle_len - 1; | ||
| 124 | } | ||
| 125 | |||
| 115 | /* Invariants: | 126 | /* Invariants: |
| 116 | 0 <= j < NEEDLE_LEN - 1 | 127 | 1 <= j < NEEDLE_LEN - 1 |
| 117 | -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) | 128 | 0 <= max_suffix{,_rev} < j |
| 118 | min(max_suffix, max_suffix_rev) < global period of NEEDLE | 129 | min(max_suffix, max_suffix_rev) < global period of NEEDLE |
| 119 | 1 <= p <= global period of NEEDLE | 130 | 1 <= p <= global period of NEEDLE |
| 120 | p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] | 131 | p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] |
| @@ -122,9 +133,8 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
| 122 | */ | 133 | */ |
| 123 | 134 | ||
| 124 | /* Perform lexicographic search. */ | 135 | /* Perform lexicographic search. */ |
| 125 | max_suffix = SIZE_MAX; | 136 | max_suffix = 0; |
| 126 | j = 0; | 137 | j = k = p = 1; |
| 127 | k = p = 1; | ||
| 128 | while (j + k < needle_len) | 138 | while (j + k < needle_len) |
| 129 | { | 139 | { |
| 130 | a = CANON_ELEMENT (needle[j + k]); | 140 | a = CANON_ELEMENT (needle[j + k]); |
| @@ -157,9 +167,8 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
| 157 | *period = p; | 167 | *period = p; |
| 158 | 168 | ||
| 159 | /* Perform reverse lexicographic search. */ | 169 | /* Perform reverse lexicographic search. */ |
| 160 | max_suffix_rev = SIZE_MAX; | 170 | max_suffix_rev = 0; |
| 161 | j = 0; | 171 | j = k = p = 1; |
| 162 | k = p = 1; | ||
| 163 | while (j + k < needle_len) | 172 | while (j + k < needle_len) |
| 164 | { | 173 | { |
| 165 | a = CANON_ELEMENT (needle[j + k]); | 174 | a = CANON_ELEMENT (needle[j + k]); |
| @@ -190,8 +199,20 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
| 190 | } | 199 | } |
| 191 | } | 200 | } |
| 192 | 201 | ||
| 193 | /* Choose the longer suffix. Return the first byte of the right | 202 | /* Choose the shorter suffix. Return the index of the first byte of |
| 194 | half, rather than the last byte of the left half. */ | 203 | the right half, rather than the last byte of the left half. |
| 204 | |||
| 205 | For some examples, 'banana' has two critical factorizations, both | ||
| 206 | exposed by the two lexicographic extreme suffixes of 'anana' and | ||
| 207 | 'nana', where both suffixes have a period of 2. On the other | ||
| 208 | hand, with 'aab' and 'bba', both strings have a single critical | ||
| 209 | factorization of the last byte, with the suffix having a period | ||
| 210 | of 1. While the maximal lexicographic suffix of 'aab' is 'b', | ||
| 211 | the maximal lexicographic suffix of 'bba' is 'ba', which is not a | ||
| 212 | critical factorization. Conversely, the maximal reverse | ||
| 213 | lexicographic suffix of 'a' works for 'bba', but not 'ab' for | ||
| 214 | 'aab'. The shorter suffix of the two will always be a critical | ||
| 215 | factorization. */ | ||
| 195 | if (max_suffix_rev + 1 < max_suffix + 1) | 216 | if (max_suffix_rev + 1 < max_suffix + 1) |
| 196 | return max_suffix + 1; | 217 | return max_suffix + 1; |
| 197 | *period = p; | 218 | *period = p; |
| @@ -226,9 +247,9 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, | |||
| 226 | first. */ | 247 | first. */ |
| 227 | if (CMP_FUNC (needle, needle + period, suffix) == 0) | 248 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
| 228 | { | 249 | { |
| 229 | /* Entire needle is periodic; a mismatch can only advance by the | 250 | /* Entire needle is periodic; a mismatch in the left half can |
| 230 | period, so use memory to avoid rescanning known occurrences | 251 | only advance by the period, so use memory to avoid rescanning |
| 231 | of the period. */ | 252 | known occurrences of the period in the right half. */ |
| 232 | size_t memory = 0; | 253 | size_t memory = 0; |
| 233 | j = 0; | 254 | j = 0; |
| 234 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) | 255 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
| @@ -330,9 +351,9 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, | |||
| 330 | first. */ | 351 | first. */ |
| 331 | if (CMP_FUNC (needle, needle + period, suffix) == 0) | 352 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
| 332 | { | 353 | { |
| 333 | /* Entire needle is periodic; a mismatch can only advance by the | 354 | /* Entire needle is periodic; a mismatch in the left half can |
| 334 | period, so use memory to avoid rescanning known occurrences | 355 | only advance by the period, so use memory to avoid rescanning |
| 335 | of the period. */ | 356 | known occurrences of the period in the right half. */ |
| 336 | size_t memory = 0; | 357 | size_t memory = 0; |
| 337 | size_t shift; | 358 | size_t shift; |
| 338 | j = 0; | 359 | j = 0; |
