diff options
Diffstat (limited to 'gl/mbuiter.h')
| -rw-r--r-- | gl/mbuiter.h | 203 |
1 files changed, 203 insertions, 0 deletions
diff --git a/gl/mbuiter.h b/gl/mbuiter.h new file mode 100644 index 00000000..9da3a6c7 --- /dev/null +++ b/gl/mbuiter.h | |||
| @@ -0,0 +1,203 @@ | |||
| 1 | /* Iterating through multibyte strings: macros for multi-byte encodings. | ||
| 2 | Copyright (C) 2001, 2005 Free Software Foundation, Inc. | ||
| 3 | |||
| 4 | This program is free software; you can redistribute it and/or modify | ||
| 5 | it under the terms of the GNU General Public License as published by | ||
| 6 | the Free Software Foundation; either version 2, or (at your option) | ||
| 7 | any later version. | ||
| 8 | |||
| 9 | This program is distributed in the hope that it will be useful, | ||
| 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | GNU General Public License for more details. | ||
| 13 | |||
| 14 | You should have received a copy of the GNU General Public License | ||
| 15 | along with this program; if not, write to the Free Software Foundation, | ||
| 16 | Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ | ||
| 17 | |||
| 18 | /* Written by Bruno Haible <bruno@clisp.org>. */ | ||
| 19 | |||
| 20 | /* The macros in this file implement forward iteration through a | ||
| 21 | multi-byte string, without knowing its length a-priori. | ||
| 22 | |||
| 23 | With these macros, an iteration loop that looks like | ||
| 24 | |||
| 25 | char *iter; | ||
| 26 | for (iter = buf; *iter != '\0'; iter++) | ||
| 27 | { | ||
| 28 | do_something (*iter); | ||
| 29 | } | ||
| 30 | |||
| 31 | becomes | ||
| 32 | |||
| 33 | mbui_iterator_t iter; | ||
| 34 | for (mbui_init (iter, buf); mbui_avail (iter); mbui_advance (iter)) | ||
| 35 | { | ||
| 36 | do_something (mbui_cur_ptr (iter), mb_len (mbui_cur (iter))); | ||
| 37 | } | ||
| 38 | |||
| 39 | The benefit of these macros over plain use of mbrtowc is: | ||
| 40 | - Handling of invalid multibyte sequences is possible without | ||
| 41 | making the code more complicated, while still preserving the | ||
| 42 | invalid multibyte sequences. | ||
| 43 | |||
| 44 | Compared to mbiter.h, the macros here don't need to know the string's | ||
| 45 | length a-priori. The downside is that at each step, the look-ahead | ||
| 46 | that guards against overrunning the terminating '\0' is more expensive. | ||
| 47 | The mbui_* macros are therefore suitable when there is a high probability | ||
| 48 | that only the first few multibyte characters need to be inspected. | ||
| 49 | Whereas the mbi_* macros are better if usually the iteration runs | ||
| 50 | through the entire string. | ||
| 51 | |||
| 52 | mbui_iterator_t | ||
| 53 | is a type usable for variable declarations. | ||
| 54 | |||
| 55 | mbui_init (iter, startptr) | ||
| 56 | initializes the iterator, starting at startptr. | ||
| 57 | |||
| 58 | mbui_avail (iter) | ||
| 59 | returns true if there are more multibyte characters available before | ||
| 60 | the end of string is reached. In this case, mbui_cur (iter) is | ||
| 61 | initialized to the next multibyte character. | ||
| 62 | |||
| 63 | mbui_advance (iter) | ||
| 64 | advances the iterator by one multibyte character. | ||
| 65 | |||
| 66 | mbui_cur (iter) | ||
| 67 | returns the current multibyte character, of type mbchar_t. All the | ||
| 68 | macros defined in mbchar.h can be used on it. | ||
| 69 | |||
| 70 | mbui_cur_ptr (iter) | ||
| 71 | returns a pointer to the beginning of the current multibyte character. | ||
| 72 | |||
| 73 | mbui_reloc (iter, ptrdiff) | ||
| 74 | relocates iterator when the string is moved by ptrdiff bytes. | ||
| 75 | |||
| 76 | Here are the function prototypes of the macros. | ||
| 77 | |||
| 78 | extern void mbui_init (mbui_iterator_t iter, const char *startptr); | ||
| 79 | extern bool mbui_avail (mbui_iterator_t iter); | ||
| 80 | extern void mbui_advance (mbui_iterator_t iter); | ||
| 81 | extern mbchar_t mbui_cur (mbui_iterator_t iter); | ||
| 82 | extern const char * mbui_cur_ptr (mbui_iterator_t iter); | ||
| 83 | extern void mbui_reloc (mbui_iterator_t iter, ptrdiff_t ptrdiff); | ||
| 84 | */ | ||
| 85 | |||
| 86 | #ifndef _MBUITER_H | ||
| 87 | #define _MBUITER_H 1 | ||
| 88 | |||
| 89 | #include <assert.h> | ||
| 90 | #include <stdbool.h> | ||
| 91 | #include <stdlib.h> | ||
| 92 | |||
| 93 | /* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before | ||
| 94 | <wchar.h>. | ||
| 95 | BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before | ||
| 96 | <wchar.h>. */ | ||
| 97 | #include <stdio.h> | ||
| 98 | #include <time.h> | ||
| 99 | #include <wchar.h> | ||
| 100 | |||
| 101 | #include "mbchar.h" | ||
| 102 | #include "strnlen1.h" | ||
| 103 | |||
| 104 | struct mbuiter_multi | ||
| 105 | { | ||
| 106 | bool in_shift; /* true if next byte may not be interpreted as ASCII */ | ||
| 107 | mbstate_t state; /* if in_shift: current shift state */ | ||
| 108 | bool next_done; /* true if mbui_avail has already filled the following */ | ||
| 109 | struct mbchar cur; /* the current character: | ||
| 110 | const char *cur.ptr pointer to current character | ||
| 111 | The following are only valid after mbui_avail. | ||
| 112 | size_t cur.bytes number of bytes of current character | ||
| 113 | bool cur.wc_valid true if wc is a valid wide character | ||
| 114 | wchar_t cur.wc if wc_valid: the current character | ||
| 115 | */ | ||
| 116 | }; | ||
| 117 | |||
| 118 | static inline void | ||
| 119 | mbuiter_multi_next (struct mbuiter_multi *iter) | ||
| 120 | { | ||
| 121 | if (iter->next_done) | ||
| 122 | return; | ||
| 123 | if (iter->in_shift) | ||
| 124 | goto with_shift; | ||
| 125 | /* Handle most ASCII characters quickly, without calling mbrtowc(). */ | ||
| 126 | if (is_basic (*iter->cur.ptr)) | ||
| 127 | { | ||
| 128 | /* These characters are part of the basic character set. ISO C 99 | ||
| 129 | guarantees that their wide character code is identical to their | ||
| 130 | char code. */ | ||
| 131 | iter->cur.bytes = 1; | ||
| 132 | iter->cur.wc = *iter->cur.ptr; | ||
| 133 | iter->cur.wc_valid = true; | ||
| 134 | } | ||
| 135 | else | ||
| 136 | { | ||
| 137 | assert (mbsinit (&iter->state)); | ||
| 138 | iter->in_shift = true; | ||
| 139 | with_shift: | ||
| 140 | iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr, | ||
| 141 | strnlen1 (iter->cur.ptr, MB_CUR_MAX), | ||
| 142 | &iter->state); | ||
| 143 | if (iter->cur.bytes == (size_t) -1) | ||
| 144 | { | ||
| 145 | /* An invalid multibyte sequence was encountered. */ | ||
| 146 | iter->cur.bytes = 1; | ||
| 147 | iter->cur.wc_valid = false; | ||
| 148 | /* Whether to set iter->in_shift = false and reset iter->state | ||
| 149 | or not is not very important; the string is bogus anyway. */ | ||
| 150 | } | ||
| 151 | else if (iter->cur.bytes == (size_t) -2) | ||
| 152 | { | ||
| 153 | /* An incomplete multibyte character at the end. */ | ||
| 154 | iter->cur.bytes = strlen (iter->cur.ptr); | ||
| 155 | iter->cur.wc_valid = false; | ||
| 156 | /* Whether to set iter->in_shift = false and reset iter->state | ||
| 157 | or not is not important; the string end is reached anyway. */ | ||
| 158 | } | ||
| 159 | else | ||
| 160 | { | ||
| 161 | if (iter->cur.bytes == 0) | ||
| 162 | { | ||
| 163 | /* A null wide character was encountered. */ | ||
| 164 | iter->cur.bytes = 1; | ||
| 165 | assert (*iter->cur.ptr == '\0'); | ||
| 166 | assert (iter->cur.wc == 0); | ||
| 167 | } | ||
| 168 | iter->cur.wc_valid = true; | ||
| 169 | |||
| 170 | /* When in the initial state, we can go back treating ASCII | ||
| 171 | characters more quickly. */ | ||
| 172 | if (mbsinit (&iter->state)) | ||
| 173 | iter->in_shift = false; | ||
| 174 | } | ||
| 175 | } | ||
| 176 | iter->next_done = true; | ||
| 177 | } | ||
| 178 | |||
| 179 | static inline void | ||
| 180 | mbuiter_multi_reloc (struct mbuiter_multi *iter, ptrdiff_t ptrdiff) | ||
| 181 | { | ||
| 182 | iter->cur.ptr += ptrdiff; | ||
| 183 | } | ||
| 184 | |||
/* Iteration macros.  Each macro takes the iterator object itself (an
   lvalue of type mbui_iterator_t), not a pointer to it.  The iterator
   argument may be evaluated more than once, so it must not have side
   effects.  */
typedef struct mbuiter_multi mbui_iterator_t;
/* Start an iteration at startptr, in the initial shift state and with no
   character decoded yet.  */
#define mbui_init(iter, startptr) \
  ((iter).cur.ptr = (startptr), \
   (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \
   (iter).next_done = false)
/* Decode the current character if that has not been done yet, then test
   whether it is something other than the null character.  */
#define mbui_avail(iter) \
  (mbuiter_multi_next (&(iter)), !mb_isnul ((iter).cur))
/* Step over the current character.  Uses cur.bytes, which is only valid
   after mbui_avail.  */
#define mbui_advance(iter) \
  ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false)

/* Access to the current character.  */
#define mbui_cur(iter) (iter).cur
#define mbui_cur_ptr(iter) (iter).cur.ptr

/* Relocation.  Both arguments are parenthesized, like in the macros above,
   so that arbitrary expressions can be passed.  */
#define mbui_reloc(iter, ptrdiff) mbuiter_multi_reloc (&(iter), (ptrdiff))
| 202 | |||
| 203 | #endif /* _MBUITER_H */ | ||
