diff options
Diffstat (limited to 'gl/str-two-way.h')
| -rw-r--r-- | gl/str-two-way.h | 283 |
1 files changed, 143 insertions, 140 deletions
diff --git a/gl/str-two-way.h b/gl/str-two-way.h index d13fb298..7cf91ffd 100644 --- a/gl/str-two-way.h +++ b/gl/str-two-way.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Byte-wise substring search, using the Two-Way algorithm. | 1 | /* Byte-wise substring search, using the Two-Way algorithm. |
| 2 | Copyright (C) 2008-2025 Free Software Foundation, Inc. | 2 | Copyright (C) 2008-2026 Free Software Foundation, Inc. |
| 3 | This file is part of the GNU C Library. | 3 | This file is part of the GNU C Library. |
| 4 | Written by Eric Blake <ebb9@byu.net>, 2008. | 4 | Written by Eric Blake <ebb9@byu.net>, 2008. |
| 5 | 5 | ||
| @@ -108,13 +108,6 @@ static size_t | |||
| 108 | critical_factorization (const unsigned char *needle, size_t needle_len, | 108 | critical_factorization (const unsigned char *needle, size_t needle_len, |
| 109 | size_t *period) | 109 | size_t *period) |
| 110 | { | 110 | { |
| 111 | /* Index of last byte of left half, or SIZE_MAX. */ | ||
| 112 | size_t max_suffix, max_suffix_rev; | ||
| 113 | size_t j; /* Index into NEEDLE for current candidate suffix. */ | ||
| 114 | size_t k; /* Offset into current period. */ | ||
| 115 | size_t p; /* Intermediate period. */ | ||
| 116 | unsigned char a, b; /* Current comparison bytes. */ | ||
| 117 | |||
| 118 | /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered | 111 | /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered |
| 119 | out 0-length needles. */ | 112 | out 0-length needles. */ |
| 120 | if (needle_len < 3) | 113 | if (needle_len < 3) |
| @@ -133,73 +126,83 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
| 133 | */ | 126 | */ |
| 134 | 127 | ||
| 135 | /* Perform lexicographic search. */ | 128 | /* Perform lexicographic search. */ |
| 136 | max_suffix = SIZE_MAX; | 129 | size_t max_suffix = /* Index of last byte of left half, or SIZE_MAX. */ |
| 137 | j = 0; | 130 | SIZE_MAX; |
| 138 | k = p = 1; | 131 | { |
| 139 | while (j + k < needle_len) | 132 | size_t j = 0; /* Index into NEEDLE for current candidate suffix. */ |
| 140 | { | 133 | size_t k = 1; /* Offset into current period. */ |
| 141 | a = CANON_ELEMENT (needle[j + k]); | 134 | size_t p = 1; /* Intermediate period. */ |
| 142 | b = CANON_ELEMENT (needle[max_suffix + k]); | 135 | while (j + k < needle_len) |
| 143 | if (a < b) | 136 | { |
| 144 | { | 137 | unsigned char a = CANON_ELEMENT (needle[j + k]); |
| 145 | /* Suffix is smaller, period is entire prefix so far. */ | 138 | unsigned char b = CANON_ELEMENT (needle[max_suffix + k]); |
| 146 | j += k; | 139 | if (a < b) |
| 147 | k = 1; | 140 | { |
| 148 | p = j - max_suffix; | 141 | /* Suffix is smaller, period is entire prefix so far. */ |
| 149 | } | 142 | j += k; |
| 150 | else if (a == b) | 143 | k = 1; |
| 151 | { | 144 | p = j - max_suffix; |
| 152 | /* Advance through repetition of the current period. */ | 145 | } |
| 153 | if (k != p) | 146 | else if (a == b) |
| 154 | ++k; | 147 | { |
| 155 | else | 148 | /* Advance through repetition of the current period. */ |
| 156 | { | 149 | if (k != p) |
| 157 | j += p; | 150 | ++k; |
| 158 | k = 1; | 151 | else |
| 159 | } | 152 | { |
| 160 | } | 153 | j += p; |
| 161 | else /* b < a */ | 154 | k = 1; |
| 162 | { | 155 | } |
| 163 | /* Suffix is larger, start over from current location. */ | 156 | } |
| 164 | max_suffix = j++; | 157 | else /* b < a */ |
| 165 | k = p = 1; | 158 | { |
| 166 | } | 159 | /* Suffix is larger, start over from current location. */ |
| 167 | } | 160 | max_suffix = j++; |
| 168 | *period = p; | 161 | k = p = 1; |
| 162 | } | ||
| 163 | } | ||
| 164 | *period = p; | ||
| 165 | } | ||
| 169 | 166 | ||
| 170 | /* Perform reverse lexicographic search. */ | 167 | /* Perform reverse lexicographic search. */ |
| 171 | max_suffix_rev = SIZE_MAX; | 168 | size_t max_suffix_rev = /* Index of last byte of left half, or SIZE_MAX. */ |
| 172 | j = 0; | 169 | SIZE_MAX; |
| 173 | k = p = 1; | 170 | size_t p_rev; |
| 174 | while (j + k < needle_len) | 171 | { |
| 175 | { | 172 | size_t j = 0; /* Index into NEEDLE for current candidate suffix. */ |
| 176 | a = CANON_ELEMENT (needle[j + k]); | 173 | size_t k = 1; /* Offset into current period. */ |
| 177 | b = CANON_ELEMENT (needle[max_suffix_rev + k]); | 174 | size_t p = 1; /* Intermediate period. */ |
| 178 | if (b < a) | 175 | while (j + k < needle_len) |
| 179 | { | 176 | { |
| 180 | /* Suffix is smaller, period is entire prefix so far. */ | 177 | unsigned char a = CANON_ELEMENT (needle[j + k]); |
| 181 | j += k; | 178 | unsigned char b = CANON_ELEMENT (needle[max_suffix_rev + k]); |
| 182 | k = 1; | 179 | if (b < a) |
| 183 | p = j - max_suffix_rev; | 180 | { |
| 184 | } | 181 | /* Suffix is smaller, period is entire prefix so far. */ |
| 185 | else if (a == b) | 182 | j += k; |
| 186 | { | 183 | k = 1; |
| 187 | /* Advance through repetition of the current period. */ | 184 | p = j - max_suffix_rev; |
| 188 | if (k != p) | 185 | } |
| 189 | ++k; | 186 | else if (a == b) |
| 190 | else | 187 | { |
| 191 | { | 188 | /* Advance through repetition of the current period. */ |
| 192 | j += p; | 189 | if (k != p) |
| 193 | k = 1; | 190 | ++k; |
| 194 | } | 191 | else |
| 195 | } | 192 | { |
| 196 | else /* a < b */ | 193 | j += p; |
| 197 | { | 194 | k = 1; |
| 198 | /* Suffix is larger, start over from current location. */ | 195 | } |
| 199 | max_suffix_rev = j++; | 196 | } |
| 200 | k = p = 1; | 197 | else /* a < b */ |
| 201 | } | 198 | { |
| 202 | } | 199 | /* Suffix is larger, start over from current location. */ |
| 200 | max_suffix_rev = j++; | ||
| 201 | k = p = 1; | ||
| 202 | } | ||
| 203 | } | ||
| 204 | p_rev = p; | ||
| 205 | } | ||
| 203 | 206 | ||
| 204 | /* Choose the shorter suffix. Return the index of the first byte of | 207 | /* Choose the shorter suffix. Return the index of the first byte of |
| 205 | the right half, rather than the last byte of the left half. | 208 | the right half, rather than the last byte of the left half. |
| @@ -217,7 +220,7 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
| 217 | factorization. */ | 220 | factorization. */ |
| 218 | if (max_suffix_rev + 1 < max_suffix + 1) | 221 | if (max_suffix_rev + 1 < max_suffix + 1) |
| 219 | return max_suffix + 1; | 222 | return max_suffix + 1; |
| 220 | *period = p; | 223 | *period = p_rev; |
| 221 | return max_suffix_rev + 1; | 224 | return max_suffix_rev + 1; |
| 222 | } | 225 | } |
| 223 | 226 | ||
| @@ -235,15 +238,12 @@ static RETURN_TYPE _GL_ATTRIBUTE_PURE | |||
| 235 | two_way_short_needle (const unsigned char *haystack, size_t haystack_len, | 238 | two_way_short_needle (const unsigned char *haystack, size_t haystack_len, |
| 236 | const unsigned char *needle, size_t needle_len) | 239 | const unsigned char *needle, size_t needle_len) |
| 237 | { | 240 | { |
| 238 | size_t i; /* Index into current byte of NEEDLE. */ | ||
| 239 | size_t j; /* Index into current window of HAYSTACK. */ | ||
| 240 | size_t period; /* The period of the right half of needle. */ | ||
| 241 | size_t suffix; /* The index of the right half of needle. */ | ||
| 242 | |||
| 243 | /* Factor the needle into two halves, such that the left half is | 241 | /* Factor the needle into two halves, such that the left half is |
| 244 | smaller than the global period, and the right half is | 242 | smaller than the global period, and the right half is |
| 245 | periodic (with a period as large as NEEDLE_LEN - suffix). */ | 243 | periodic (with a period as large as NEEDLE_LEN - suffix). */ |
| 246 | suffix = critical_factorization (needle, needle_len, &period); | 244 | size_t period; /* The period of the right half of needle. */ |
| 245 | size_t suffix = /* The index of the right half of needle. */ | ||
| 246 | critical_factorization (needle, needle_len, &period); | ||
| 247 | 247 | ||
| 248 | /* Perform the search. Each iteration compares the right half | 248 | /* Perform the search. Each iteration compares the right half |
| 249 | first. */ | 249 | first. */ |
| @@ -253,11 +253,12 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, | |||
| 253 | only advance by the period, so use memory to avoid rescanning | 253 | only advance by the period, so use memory to avoid rescanning |
| 254 | known occurrences of the period in the right half. */ | 254 | known occurrences of the period in the right half. */ |
| 255 | size_t memory = 0; | 255 | size_t memory = 0; |
| 256 | j = 0; | 256 | size_t j = 0; /* Index into current window of HAYSTACK. */ |
| 257 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) | 257 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
| 258 | { | 258 | { |
| 259 | /* Scan for matches in right half. */ | 259 | /* Scan for matches in right half. */ |
| 260 | i = MAX (suffix, memory); | 260 | size_t i = /* Index into current byte of NEEDLE. */ |
| 261 | MAX (suffix, memory); | ||
| 261 | while (i < needle_len && (CANON_ELEMENT (needle[i]) | 262 | while (i < needle_len && (CANON_ELEMENT (needle[i]) |
| 262 | == CANON_ELEMENT (haystack[i + j]))) | 263 | == CANON_ELEMENT (haystack[i + j]))) |
| 263 | ++i; | 264 | ++i; |
| @@ -287,11 +288,12 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, | |||
| 287 | /* The two halves of needle are distinct; no extra memory is | 288 | /* The two halves of needle are distinct; no extra memory is |
| 288 | required, and any mismatch results in a maximal shift. */ | 289 | required, and any mismatch results in a maximal shift. */ |
| 289 | period = MAX (suffix, needle_len - suffix) + 1; | 290 | period = MAX (suffix, needle_len - suffix) + 1; |
| 290 | j = 0; | 291 | size_t j = 0; /* Index into current window of HAYSTACK. */ |
| 291 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) | 292 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
| 292 | { | 293 | { |
| 293 | /* Scan for matches in right half. */ | 294 | /* Scan for matches in right half. */ |
| 294 | i = suffix; | 295 | size_t i = /* Index into current byte of NEEDLE. */ |
| 296 | suffix; | ||
| 295 | while (i < needle_len && (CANON_ELEMENT (needle[i]) | 297 | while (i < needle_len && (CANON_ELEMENT (needle[i]) |
| 296 | == CANON_ELEMENT (haystack[i + j]))) | 298 | == CANON_ELEMENT (haystack[i + j]))) |
| 297 | ++i; | 299 | ++i; |
| @@ -329,24 +331,21 @@ static RETURN_TYPE _GL_ATTRIBUTE_PURE | |||
| 329 | two_way_long_needle (const unsigned char *haystack, size_t haystack_len, | 331 | two_way_long_needle (const unsigned char *haystack, size_t haystack_len, |
| 330 | const unsigned char *needle, size_t needle_len) | 332 | const unsigned char *needle, size_t needle_len) |
| 331 | { | 333 | { |
| 332 | size_t i; /* Index into current byte of NEEDLE. */ | ||
| 333 | size_t j; /* Index into current window of HAYSTACK. */ | ||
| 334 | size_t period; /* The period of the right half of needle. */ | ||
| 335 | size_t suffix; /* The index of the right half of needle. */ | ||
| 336 | size_t shift_table[1U << CHAR_BIT]; /* See below. */ | ||
| 337 | |||
| 338 | /* Factor the needle into two halves, such that the left half is | 334 | /* Factor the needle into two halves, such that the left half is |
| 339 | smaller than the global period, and the right half is | 335 | smaller than the global period, and the right half is |
| 340 | periodic (with a period as large as NEEDLE_LEN - suffix). */ | 336 | periodic (with a period as large as NEEDLE_LEN - suffix). */ |
| 341 | suffix = critical_factorization (needle, needle_len, &period); | 337 | size_t period; /* The period of the right half of needle. */ |
| 338 | size_t suffix = /* The index of the right half of needle. */ | ||
| 339 | critical_factorization (needle, needle_len, &period); | ||
| 342 | 340 | ||
| 343 | /* Populate shift_table. For each possible byte value c, | 341 | /* Populate shift_table. For each possible byte value c, |
| 344 | shift_table[c] is the distance from the last occurrence of c to | 342 | shift_table[c] is the distance from the last occurrence of c to |
| 345 | the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE. | 343 | the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE. |
| 346 | shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */ | 344 | shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */ |
| 347 | for (i = 0; i < 1U << CHAR_BIT; i++) | 345 | size_t shift_table[1U << CHAR_BIT]; |
| 346 | for (size_t i = 0; i < 1U << CHAR_BIT; i++) | ||
| 348 | shift_table[i] = needle_len; | 347 | shift_table[i] = needle_len; |
| 349 | for (i = 0; i < needle_len; i++) | 348 | for (size_t i = 0; i < needle_len; i++) |
| 350 | shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; | 349 | shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; |
| 351 | 350 | ||
| 352 | /* Perform the search. Each iteration compares the right half | 351 | /* Perform the search. Each iteration compares the right half |
| @@ -357,13 +356,13 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, | |||
| 357 | only advance by the period, so use memory to avoid rescanning | 356 | only advance by the period, so use memory to avoid rescanning |
| 358 | known occurrences of the period in the right half. */ | 357 | known occurrences of the period in the right half. */ |
| 359 | size_t memory = 0; | 358 | size_t memory = 0; |
| 360 | size_t shift; | 359 | size_t j = 0; /* Index into current window of HAYSTACK. */ |
| 361 | j = 0; | ||
| 362 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) | 360 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
| 363 | { | 361 | { |
| 364 | /* Check the last byte first; if it does not match, then | 362 | /* Check the last byte first; if it does not match, then |
| 365 | shift to the next possible match location. */ | 363 | shift to the next possible match location. */ |
| 366 | shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; | 364 | size_t shift = |
| 365 | shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; | ||
| 367 | if (0 < shift) | 366 | if (0 < shift) |
| 368 | { | 367 | { |
| 369 | if (memory && shift < period) | 368 | if (memory && shift < period) |
| @@ -375,32 +374,34 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, | |||
| 375 | } | 374 | } |
| 376 | memory = 0; | 375 | memory = 0; |
| 377 | j += shift; | 376 | j += shift; |
| 378 | continue; | ||
| 379 | } | ||
| 380 | /* Scan for matches in right half. The last byte has | ||
| 381 | already been matched, by virtue of the shift table. */ | ||
| 382 | i = MAX (suffix, memory); | ||
| 383 | while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) | ||
| 384 | == CANON_ELEMENT (haystack[i + j]))) | ||
| 385 | ++i; | ||
| 386 | if (needle_len - 1 <= i) | ||
| 387 | { | ||
| 388 | /* Scan for matches in left half. */ | ||
| 389 | i = suffix - 1; | ||
| 390 | while (memory < i + 1 && (CANON_ELEMENT (needle[i]) | ||
| 391 | == CANON_ELEMENT (haystack[i + j]))) | ||
| 392 | --i; | ||
| 393 | if (i + 1 < memory + 1) | ||
| 394 | return (RETURN_TYPE) (haystack + j); | ||
| 395 | /* No match, so remember how many repetitions of period | ||
| 396 | on the right half were scanned. */ | ||
| 397 | j += period; | ||
| 398 | memory = needle_len - period; | ||
| 399 | } | 377 | } |
| 400 | else | 378 | else |
| 401 | { | 379 | { |
| 402 | j += i - suffix + 1; | 380 | /* Scan for matches in right half. The last byte has |
| 403 | memory = 0; | 381 | already been matched, by virtue of the shift table. */ |
| 382 | size_t i = MAX (suffix, memory); | ||
| 383 | while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) | ||
| 384 | == CANON_ELEMENT (haystack[i + j]))) | ||
| 385 | ++i; | ||
| 386 | if (needle_len - 1 <= i) | ||
| 387 | { | ||
| 388 | /* Scan for matches in left half. */ | ||
| 389 | i = suffix - 1; | ||
| 390 | while (memory < i + 1 && (CANON_ELEMENT (needle[i]) | ||
| 391 | == CANON_ELEMENT (haystack[i + j]))) | ||
| 392 | --i; | ||
| 393 | if (i + 1 < memory + 1) | ||
| 394 | return (RETURN_TYPE) (haystack + j); | ||
| 395 | /* No match, so remember how many repetitions of period | ||
| 396 | on the right half were scanned. */ | ||
| 397 | j += period; | ||
| 398 | memory = needle_len - period; | ||
| 399 | } | ||
| 400 | else | ||
| 401 | { | ||
| 402 | j += i - suffix + 1; | ||
| 403 | memory = 0; | ||
| 404 | } | ||
| 404 | } | 405 | } |
| 405 | } | 406 | } |
| 406 | } | 407 | } |
| @@ -408,38 +409,40 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, | |||
| 408 | { | 409 | { |
| 409 | /* The two halves of needle are distinct; no extra memory is | 410 | /* The two halves of needle are distinct; no extra memory is |
| 410 | required, and any mismatch results in a maximal shift. */ | 411 | required, and any mismatch results in a maximal shift. */ |
| 411 | size_t shift; | ||
| 412 | period = MAX (suffix, needle_len - suffix) + 1; | 412 | period = MAX (suffix, needle_len - suffix) + 1; |
| 413 | j = 0; | 413 | size_t j = 0; /* Index into current window of HAYSTACK. */ |
| 414 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) | 414 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
| 415 | { | 415 | { |
| 416 | /* Check the last byte first; if it does not match, then | 416 | /* Check the last byte first; if it does not match, then |
| 417 | shift to the next possible match location. */ | 417 | shift to the next possible match location. */ |
| 418 | shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; | 418 | size_t shift = |
| 419 | shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; | ||
| 419 | if (0 < shift) | 420 | if (0 < shift) |
| 420 | { | 421 | { |
| 421 | j += shift; | 422 | j += shift; |
| 422 | continue; | ||
| 423 | } | 423 | } |
| 424 | /* Scan for matches in right half. The last byte has | 424 | else |
| 425 | already been matched, by virtue of the shift table. */ | ||
| 426 | i = suffix; | ||
| 427 | while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) | ||
| 428 | == CANON_ELEMENT (haystack[i + j]))) | ||
| 429 | ++i; | ||
| 430 | if (needle_len - 1 <= i) | ||
| 431 | { | 425 | { |
| 432 | /* Scan for matches in left half. */ | 426 | /* Scan for matches in right half. The last byte has |
| 433 | i = suffix - 1; | 427 | already been matched, by virtue of the shift table. */ |
| 434 | while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) | 428 | size_t i = suffix; |
| 435 | == CANON_ELEMENT (haystack[i + j]))) | 429 | while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) |
| 436 | --i; | 430 | == CANON_ELEMENT (haystack[i + j]))) |
| 437 | if (i == SIZE_MAX) | 431 | ++i; |
| 438 | return (RETURN_TYPE) (haystack + j); | 432 | if (needle_len - 1 <= i) |
| 439 | j += period; | 433 | { |
| 434 | /* Scan for matches in left half. */ | ||
| 435 | i = suffix - 1; | ||
| 436 | while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) | ||
| 437 | == CANON_ELEMENT (haystack[i + j]))) | ||
| 438 | --i; | ||
| 439 | if (i == SIZE_MAX) | ||
| 440 | return (RETURN_TYPE) (haystack + j); | ||
| 441 | j += period; | ||
| 442 | } | ||
| 443 | else | ||
| 444 | j += i - suffix + 1; | ||
| 440 | } | 445 | } |
| 441 | else | ||
| 442 | j += i - suffix + 1; | ||
| 443 | } | 446 | } |
| 444 | } | 447 | } |
| 445 | return NULL; | 448 | return NULL; |
