diff options
Diffstat (limited to 'gl/mbiterf.h')
| -rw-r--r-- | gl/mbiterf.h | 214 |
1 files changed, 214 insertions, 0 deletions
diff --git a/gl/mbiterf.h b/gl/mbiterf.h new file mode 100644 index 00000000..99d8d11d --- /dev/null +++ b/gl/mbiterf.h | |||
| @@ -0,0 +1,214 @@ | |||
| 1 | /* Iterating through multibyte strings, faster: macros for multi-byte encodings. | ||
| 2 | Copyright (C) 2001, 2005, 2007, 2009-2025 Free Software Foundation, Inc. | ||
| 3 | |||
| 4 | This file is free software: you can redistribute it and/or modify | ||
| 5 | it under the terms of the GNU Lesser General Public License as | ||
| 6 | published by the Free Software Foundation; either version 2.1 of the | ||
| 7 | License, or (at your option) any later version. | ||
| 8 | |||
| 9 | This file is distributed in the hope that it will be useful, | ||
| 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | GNU Lesser General Public License for more details. | ||
| 13 | |||
| 14 | You should have received a copy of the GNU Lesser General Public License | ||
| 15 | along with this program. If not, see <https://www.gnu.org/licenses/>. */ | ||
| 16 | |||
| 17 | /* Written by Bruno Haible <bruno@clisp.org>, | ||
| 18 | with insights from Paul Eggert. */ | ||
| 19 | |||
| 20 | /* The macros in this file implement forward iteration through a | ||
| 21 | multi-byte string. | ||
| 22 | |||
| 23 | With these macros, an iteration loop that looks like | ||
| 24 | |||
| 25 | char *iter; | ||
| 26 | for (iter = buf; iter < buf + buflen; iter++) | ||
| 27 | { | ||
| 28 | do_something (*iter); | ||
| 29 | } | ||
| 30 | |||
| 31 | becomes | ||
| 32 | |||
| 33 | const char *buf_end = buf + buflen; | ||
| 34 | mbif_state_t state; | ||
| 35 | [const] char *iter; | ||
| 36 | for (mbif_init (state), iter = buf; mbif_avail (state, iter, buf_end); ) | ||
| 37 | { | ||
| 38 | mbchar_t cur = mbif_next (state, iter, buf_end); | ||
| 39 | // Note: Here always mb_ptr (cur) == iter. | ||
| 40 | do_something (iter, mb_len (cur)); | ||
| 41 | iter += mb_len (cur); | ||
| 42 | } | ||
| 43 | |||
| 44 | The benefit of these macros over plain use of mbrtowc or mbrtoc32 is: | ||
| 45 | - Handling of invalid multibyte sequences is possible without | ||
| 46 | making the code more complicated, while still preserving the | ||
| 47 | invalid multibyte sequences. | ||
| 48 | |||
| 49 | The benefit of these macros over those from mbiter.h is that they | ||
| 50 | produce faster code with today's optimizing compilers (because mbif_next | ||
| 51 | returns its result by value). | ||
| 52 | |||
| 53 | mbif_state_t | ||
| 54 | is a type usable for variable declarations. | ||
| 55 | |||
| 56 | mbif_init (state) | ||
| 57 | initializes the state. | ||
| 58 | |||
| 59 | mbif_avail (state, iter, endptr) | ||
| 60 | returns true if another loop round is needed. | ||
| 61 | |||
| 62 | mbif_next (state, iter, endptr) | ||
| 63 | returns the next multibyte character. | ||
| 64 | It assumes that the state is initialized and that iter < endptr. | ||
| 65 | |||
| 66 | Here are the function prototypes of the macros. | ||
| 67 | |||
| 68 | extern void mbif_init (mbif_state_t state); | ||
| 69 | extern bool mbif_avail (mbif_state_t state, const char *iter, const char *endptr); | ||
| 70 | extern mbchar_t mbif_next (mbif_state_t state, const char *iter, const char *endptr); | ||
| 71 | */ | ||
| 72 | |||
| 73 | #ifndef _MBITERF_H | ||
| 74 | #define _MBITERF_H 1 | ||
| 75 | |||
| 76 | /* This file uses _GL_INLINE_HEADER_BEGIN, _GL_INLINE, | ||
| 77 | _GL_ATTRIBUTE_ALWAYS_INLINE. */ | ||
| 78 | #if !_GL_CONFIG_H_INCLUDED | ||
| 79 | #error "Please include config.h first." | ||
| 80 | #endif | ||
| 81 | |||
| 82 | #include <assert.h> | ||
| 83 | #include <stddef.h> | ||
| 84 | #include <string.h> | ||
| 85 | #include <uchar.h> | ||
| 86 | #include <wchar.h> | ||
| 87 | |||
| 88 | #include "mbchar.h" | ||
| 89 | |||
| 90 | _GL_INLINE_HEADER_BEGIN | ||
| 91 | #ifndef MBITERF_INLINE | ||
| 92 | # define MBITERF_INLINE _GL_INLINE _GL_ATTRIBUTE_ALWAYS_INLINE | ||
| 93 | #endif | ||
| 94 | |||
| 95 | #ifdef __cplusplus | ||
| 96 | extern "C" { | ||
| 97 | #endif | ||
| 98 | |||
| 99 | |||
/* Iteration state that is carried from one mbiterf_next call to the next.  */
struct mbif_state
{
#if !GNULIB_MBRTOC32_REGULAR
  /* True while the upcoming byte may not be interpreted as ASCII.
     With GNULIB_MBRTOC32_REGULAR this flag would always be false, so the
     member is left out entirely in that configuration.  */
  bool in_shift;
#endif
  /* Conversion state passed to mbrtoc32.  While in_shift is true, it holds
     the current shift state.  With GNULIB_MBRTOC32_REGULAR, it is in an
     initial state before and after every mbiterf_next invocation.  */
  mbstate_t state;
};
| 112 | |||
| 113 | MBITERF_INLINE mbchar_t | ||
| 114 | mbiterf_next (struct mbif_state *ps, const char *iter, const char *endptr) | ||
| 115 | { | ||
| 116 | #if !GNULIB_MBRTOC32_REGULAR | ||
| 117 | if (ps->in_shift) | ||
| 118 | goto with_shift; | ||
| 119 | #endif | ||
| 120 | /* Handle most ASCII characters quickly, without calling mbrtowc(). */ | ||
| 121 | if (is_basic (*iter)) | ||
| 122 | { | ||
| 123 | /* These characters are part of the POSIX portable character set. | ||
| 124 | For most of them, namely those in the ISO C basic character set, | ||
| 125 | ISO C 99 guarantees that their wide character code is identical to | ||
| 126 | their char code. For the few other ones, this is the case as well, | ||
| 127 | in all locale encodings that are in use. The 32-bit wide character | ||
| 128 | code is the same as well. */ | ||
| 129 | return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = true, .wc = *iter }; | ||
| 130 | } | ||
| 131 | else | ||
| 132 | { | ||
| 133 | assert (mbsinit (&ps->state)); | ||
| 134 | #if !GNULIB_MBRTOC32_REGULAR | ||
| 135 | ps->in_shift = true; | ||
| 136 | with_shift:; | ||
| 137 | #endif | ||
| 138 | size_t bytes; | ||
| 139 | char32_t wc; | ||
| 140 | bytes = mbrtoc32 (&wc, iter, endptr - iter, &ps->state); | ||
| 141 | if (bytes == (size_t) -1) | ||
| 142 | { | ||
| 143 | /* An invalid multibyte sequence was encountered. */ | ||
| 144 | /* Allow the next invocation to continue from a sane state. */ | ||
| 145 | #if !GNULIB_MBRTOC32_REGULAR | ||
| 146 | ps->in_shift = false; | ||
| 147 | #endif | ||
| 148 | mbszero (&ps->state); | ||
| 149 | return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = false }; | ||
| 150 | } | ||
| 151 | else if (bytes == (size_t) -2) | ||
| 152 | { | ||
| 153 | /* An incomplete multibyte character at the end. */ | ||
| 154 | #if !GNULIB_MBRTOC32_REGULAR | ||
| 155 | ps->in_shift = false; | ||
| 156 | #endif | ||
| 157 | /* Whether to reset ps->state or not is not important; the string end | ||
| 158 | is reached anyway. */ | ||
| 159 | return (mbchar_t) { .ptr = iter, .bytes = endptr - iter, .wc_valid = false }; | ||
| 160 | } | ||
| 161 | else | ||
| 162 | { | ||
| 163 | if (bytes == 0) | ||
| 164 | { | ||
| 165 | /* A null wide character was encountered. */ | ||
| 166 | bytes = 1; | ||
| 167 | assert (*iter == '\0'); | ||
| 168 | assert (wc == 0); | ||
| 169 | } | ||
| 170 | #if !GNULIB_MBRTOC32_REGULAR | ||
| 171 | else if (bytes == (size_t) -3) | ||
| 172 | /* The previous multibyte sequence produced an additional 32-bit | ||
| 173 | wide character. */ | ||
| 174 | bytes = 0; | ||
| 175 | #endif | ||
| 176 | |||
| 177 | /* When in an initial state, we can go back treating ASCII | ||
| 178 | characters more quickly. */ | ||
| 179 | #if !GNULIB_MBRTOC32_REGULAR | ||
| 180 | if (mbsinit (&ps->state)) | ||
| 181 | ps->in_shift = false; | ||
| 182 | #endif | ||
| 183 | return (mbchar_t) { .ptr = iter, .bytes = bytes, .wc_valid = true, .wc = wc }; | ||
| 184 | } | ||
| 185 | } | ||
| 186 | } | ||
| 187 | |||
/* The iteration macros and the state type they operate on.  */
typedef struct mbif_state mbif_state_t;

#if GNULIB_MBRTOC32_REGULAR
/* Optimized variants: the state has no in_shift member.  */
# define mbif_init(st) \
   (mbszero (&(st).state))
# define mbif_avail(st, iter, endptr) ((iter) < (endptr))
#else
/* mbif_init clears in_shift and zeroes the conversion state.
   mbif_avail requests another loop round also while in_shift is set,
   even if ITER has reached ENDPTR.  */
# define mbif_init(st) \
   ((st).in_shift = false, mbszero (&(st).state))
# define mbif_avail(st, iter, endptr) ((st).in_shift || ((iter) < (endptr)))
#endif

/* mbif_next passes the address of the state to mbiterf_next.  */
#define mbif_next(st, iter, endptr) \
  mbiterf_next (&(st), (iter), (endptr))
| 206 | |||
| 207 | |||
| 208 | #ifdef __cplusplus | ||
| 209 | } | ||
| 210 | #endif | ||
| 211 | |||
| 212 | _GL_INLINE_HEADER_END | ||
| 213 | |||
| 214 | #endif /* _MBITERF_H */ | ||
