summaryrefslogtreecommitdiffstats
path: root/gl/mbrtowc.c
diff options
context:
space:
mode:
Diffstat (limited to 'gl/mbrtowc.c')
-rw-r--r--gl/mbrtowc.c474
1 files changed, 237 insertions, 237 deletions
diff --git a/gl/mbrtowc.c b/gl/mbrtowc.c
index 0fec5f1..5c2650e 100644
--- a/gl/mbrtowc.c
+++ b/gl/mbrtowc.c
@@ -1,5 +1,5 @@
1/* Convert multibyte character to wide character. 1/* Convert multibyte character to wide character.
2 Copyright (C) 1999-2002, 2005-2009 Free Software Foundation, Inc. 2 Copyright (C) 1999-2002, 2005-2010 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2008. 3 Written by Bruno Haible <bruno@clisp.org>, 2008.
4 4
5 This program is free software: you can redistribute it and/or modify 5 This program is free software: you can redistribute it and/or modify
@@ -63,30 +63,30 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
63 switch (nstate) 63 switch (nstate)
64 { 64 {
65 case 0: 65 case 0:
66 p = s; 66 p = s;
67 m = n; 67 m = n;
68 break; 68 break;
69 case 3: 69 case 3:
70 buf[2] = pstate[3]; 70 buf[2] = pstate[3];
71 /*FALLTHROUGH*/ 71 /*FALLTHROUGH*/
72 case 2: 72 case 2:
73 buf[1] = pstate[2]; 73 buf[1] = pstate[2];
74 /*FALLTHROUGH*/ 74 /*FALLTHROUGH*/
75 case 1: 75 case 1:
76 buf[0] = pstate[1]; 76 buf[0] = pstate[1];
77 p = buf; 77 p = buf;
78 m = nstate; 78 m = nstate;
79 buf[m++] = s[0]; 79 buf[m++] = s[0];
80 if (n >= 2 && m < 4) 80 if (n >= 2 && m < 4)
81 { 81 {
82 buf[m++] = s[1]; 82 buf[m++] = s[1];
83 if (n >= 3 && m < 4) 83 if (n >= 3 && m < 4)
84 buf[m++] = s[2]; 84 buf[m++] = s[2];
85 } 85 }
86 break; 86 break;
87 default: 87 default:
88 errno = EINVAL; 88 errno = EINVAL;
89 return (size_t)(-1); 89 return (size_t)(-1);
90 } 90 }
91 91
92 /* Here m > 0. */ 92 /* Here m > 0. */
@@ -99,208 +99,208 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
99 int res = mbtowc (pwc, p, m); 99 int res = mbtowc (pwc, p, m);
100 100
101 if (res >= 0) 101 if (res >= 0)
102 { 102 {
103 if (pwc != NULL && ((*pwc == 0) != (res == 0))) 103 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
104 abort (); 104 abort ();
105 if (nstate >= (res > 0 ? res : 1)) 105 if (nstate >= (res > 0 ? res : 1))
106 abort (); 106 abort ();
107 res -= nstate; 107 res -= nstate;
108 pstate[0] = 0; 108 pstate[0] = 0;
109 return res; 109 return res;
110 } 110 }
111 111
112 /* mbtowc does not distinguish between invalid and incomplete multibyte 112 /* mbtowc does not distinguish between invalid and incomplete multibyte
113 sequences. But mbrtowc needs to make this distinction. 113 sequences. But mbrtowc needs to make this distinction.
114 There are two possible approaches: 114 There are two possible approaches:
115 - Use iconv() and its return value. 115 - Use iconv() and its return value.
116 - Use built-in knowledge about the possible encodings. 116 - Use built-in knowledge about the possible encodings.
117 Given the low quality of implementation of iconv() on the systems that 117 Given the low quality of implementation of iconv() on the systems that
118 lack mbrtowc(), we use the second approach. 118 lack mbrtowc(), we use the second approach.
119 The possible encodings are: 119 The possible encodings are:
120 - 8-bit encodings, 120 - 8-bit encodings,
121 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS, 121 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
122 - UTF-8. 122 - UTF-8.
123 Use specialized code for each. */ 123 Use specialized code for each. */
124 if (m >= 4 || m >= MB_CUR_MAX) 124 if (m >= 4 || m >= MB_CUR_MAX)
125 goto invalid; 125 goto invalid;
126 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */ 126 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
127 { 127 {
128 const char *encoding = locale_charset (); 128 const char *encoding = locale_charset ();
129 129
130 if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) 130 if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
131 { 131 {
132 /* Cf. unistr/u8-mblen.c. */ 132 /* Cf. unistr/u8-mblen.c. */
133 unsigned char c = (unsigned char) p[0]; 133 unsigned char c = (unsigned char) p[0];
134 134
135 if (c >= 0xc2) 135 if (c >= 0xc2)
136 { 136 {
137 if (c < 0xe0) 137 if (c < 0xe0)
138 { 138 {
139 if (m == 1) 139 if (m == 1)
140 goto incomplete; 140 goto incomplete;
141 } 141 }
142 else if (c < 0xf0) 142 else if (c < 0xf0)
143 { 143 {
144 if (m == 1) 144 if (m == 1)
145 goto incomplete; 145 goto incomplete;
146 if (m == 2) 146 if (m == 2)
147 { 147 {
148 unsigned char c2 = (unsigned char) p[1]; 148 unsigned char c2 = (unsigned char) p[1];
149 149
150 if ((c2 ^ 0x80) < 0x40 150 if ((c2 ^ 0x80) < 0x40
151 && (c >= 0xe1 || c2 >= 0xa0) 151 && (c >= 0xe1 || c2 >= 0xa0)
152 && (c != 0xed || c2 < 0xa0)) 152 && (c != 0xed || c2 < 0xa0))
153 goto incomplete; 153 goto incomplete;
154 } 154 }
155 } 155 }
156 else if (c <= 0xf4) 156 else if (c <= 0xf4)
157 { 157 {
158 if (m == 1) 158 if (m == 1)
159 goto incomplete; 159 goto incomplete;
160 else /* m == 2 || m == 3 */ 160 else /* m == 2 || m == 3 */
161 { 161 {
162 unsigned char c2 = (unsigned char) p[1]; 162 unsigned char c2 = (unsigned char) p[1];
163 163
164 if ((c2 ^ 0x80) < 0x40 164 if ((c2 ^ 0x80) < 0x40
165 && (c >= 0xf1 || c2 >= 0x90) 165 && (c >= 0xf1 || c2 >= 0x90)
166 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90))) 166 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
167 { 167 {
168 if (m == 2) 168 if (m == 2)
169 goto incomplete; 169 goto incomplete;
170 else /* m == 3 */ 170 else /* m == 3 */
171 { 171 {
172 unsigned char c3 = (unsigned char) p[2]; 172 unsigned char c3 = (unsigned char) p[2];
173 173
174 if ((c3 ^ 0x80) < 0x40) 174 if ((c3 ^ 0x80) < 0x40)
175 goto incomplete; 175 goto incomplete;
176 } 176 }
177 } 177 }
178 } 178 }
179 } 179 }
180 } 180 }
181 goto invalid; 181 goto invalid;
182 } 182 }
183 183
184 /* As a reference for this code, you can use the GNU libiconv 184 /* As a reference for this code, you can use the GNU libiconv
185 implementation. Look for uses of the RET_TOOFEW macro. */ 185 implementation. Look for uses of the RET_TOOFEW macro. */
186 186
187 if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)) 187 if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
188 { 188 {
189 if (m == 1) 189 if (m == 1)
190 { 190 {
191 unsigned char c = (unsigned char) p[0]; 191 unsigned char c = (unsigned char) p[0];
192 192
193 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f) 193 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
194 goto incomplete; 194 goto incomplete;
195 } 195 }
196 if (m == 2) 196 if (m == 2)
197 { 197 {
198 unsigned char c = (unsigned char) p[0]; 198 unsigned char c = (unsigned char) p[0];
199 199
200 if (c == 0x8f) 200 if (c == 0x8f)
201 { 201 {
202 unsigned char c2 = (unsigned char) p[1]; 202 unsigned char c2 = (unsigned char) p[1];
203 203
204 if (c2 >= 0xa1 && c2 < 0xff) 204 if (c2 >= 0xa1 && c2 < 0xff)
205 goto incomplete; 205 goto incomplete;
206 } 206 }
207 } 207 }
208 goto invalid; 208 goto invalid;
209 } 209 }
210 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0) 210 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
211 || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) 211 || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
212 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)) 212 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
213 { 213 {
214 if (m == 1) 214 if (m == 1)
215 { 215 {
216 unsigned char c = (unsigned char) p[0]; 216 unsigned char c = (unsigned char) p[0];
217 217
218 if (c >= 0xa1 && c < 0xff) 218 if (c >= 0xa1 && c < 0xff)
219 goto incomplete; 219 goto incomplete;
220 } 220 }
221 goto invalid; 221 goto invalid;
222 } 222 }
223 if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)) 223 if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
224 { 224 {
225 if (m == 1) 225 if (m == 1)
226 { 226 {
227 unsigned char c = (unsigned char) p[0]; 227 unsigned char c = (unsigned char) p[0];
228 228
229 if ((c >= 0xa1 && c < 0xff) || c == 0x8e) 229 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
230 goto incomplete; 230 goto incomplete;
231 } 231 }
232 else /* m == 2 || m == 3 */ 232 else /* m == 2 || m == 3 */
233 { 233 {
234 unsigned char c = (unsigned char) p[0]; 234 unsigned char c = (unsigned char) p[0];
235 235
236 if (c == 0x8e) 236 if (c == 0x8e)
237 goto incomplete; 237 goto incomplete;
238 } 238 }
239 goto invalid; 239 goto invalid;
240 } 240 }
241 if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) 241 if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
242 { 242 {
243 if (m == 1) 243 if (m == 1)
244 { 244 {
245 unsigned char c = (unsigned char) p[0]; 245 unsigned char c = (unsigned char) p[0];
246 246
247 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe)) 247 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
248 goto incomplete; 248 goto incomplete;
249 } 249 }
250 else /* m == 2 || m == 3 */ 250 else /* m == 2 || m == 3 */
251 { 251 {
252 unsigned char c = (unsigned char) p[0]; 252 unsigned char c = (unsigned char) p[0];
253 253
254 if (c >= 0x90 && c <= 0xe3) 254 if (c >= 0x90 && c <= 0xe3)
255 { 255 {
256 unsigned char c2 = (unsigned char) p[1]; 256 unsigned char c2 = (unsigned char) p[1];
257 257
258 if (c2 >= 0x30 && c2 <= 0x39) 258 if (c2 >= 0x30 && c2 <= 0x39)
259 { 259 {
260 if (m == 2) 260 if (m == 2)
261 goto incomplete; 261 goto incomplete;
262 else /* m == 3 */ 262 else /* m == 3 */
263 { 263 {
264 unsigned char c3 = (unsigned char) p[2]; 264 unsigned char c3 = (unsigned char) p[2];
265 265
266 if (c3 >= 0x81 && c3 <= 0xfe) 266 if (c3 >= 0x81 && c3 <= 0xfe)
267 goto incomplete; 267 goto incomplete;
268 } 268 }
269 } 269 }
270 } 270 }
271 } 271 }
272 goto invalid; 272 goto invalid;
273 } 273 }
274 if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0)) 274 if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
275 { 275 {
276 if (m == 1) 276 if (m == 1)
277 { 277 {
278 unsigned char c = (unsigned char) p[0]; 278 unsigned char c = (unsigned char) p[0];
279 279
280 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea) 280 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
281 || (c >= 0xf0 && c <= 0xf9)) 281 || (c >= 0xf0 && c <= 0xf9))
282 goto incomplete; 282 goto incomplete;
283 } 283 }
284 goto invalid; 284 goto invalid;
285 } 285 }
286 286
287 /* An unknown multibyte encoding. */ 287 /* An unknown multibyte encoding. */
288 goto incomplete; 288 goto incomplete;
289 } 289 }
290 290
291 incomplete: 291 incomplete:
292 { 292 {
293 size_t k = nstate; 293 size_t k = nstate;
294 /* Here 0 <= k < m < 4. */ 294 /* Here 0 <= k < m < 4. */
295 pstate[++k] = s[0]; 295 pstate[++k] = s[0];
296 if (k < m) 296 if (k < m)
297 { 297 {
298 pstate[++k] = s[1]; 298 pstate[++k] = s[1];
299 if (k < m) 299 if (k < m)
300 pstate[++k] = s[2]; 300 pstate[++k] = s[2];
301 } 301 }
302 if (k != m) 302 if (k != m)
303 abort (); 303 abort ();
304 } 304 }
305 pstate[0] = m; 305 pstate[0] = m;
306 return (size_t)(-2); 306 return (size_t)(-2);
@@ -341,25 +341,25 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
341 341
342 if (!mbsinit (ps)) 342 if (!mbsinit (ps))
343 { 343 {
344 /* Parse the rest of the multibyte character byte for byte. */ 344 /* Parse the rest of the multibyte character byte for byte. */
345 size_t count = 0; 345 size_t count = 0;
346 for (; n > 0; s++, n--) 346 for (; n > 0; s++, n--)
347 { 347 {
348 wchar_t wc; 348 wchar_t wc;
349 size_t ret = mbrtowc (&wc, s, 1, ps); 349 size_t ret = mbrtowc (&wc, s, 1, ps);
350 350
351 if (ret == (size_t)(-1)) 351 if (ret == (size_t)(-1))
352 return (size_t)(-1); 352 return (size_t)(-1);
353 count++; 353 count++;
354 if (ret != (size_t)(-2)) 354 if (ret != (size_t)(-2))
355 { 355 {
356 /* The multibyte character has been completed. */ 356 /* The multibyte character has been completed. */
357 if (pwc != NULL) 357 if (pwc != NULL)
358 *pwc = wc; 358 *pwc = wc;
359 return (wc == 0 ? 0 : count); 359 return (wc == 0 ? 0 : count);
360 } 360 }
361 } 361 }
362 return (size_t)(-2); 362 return (size_t)(-2);
363 } 363 }
364 } 364 }
365# endif 365# endif
@@ -371,10 +371,10 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
371 371
372 if (ret != (size_t)(-1) && ret != (size_t)(-2)) 372 if (ret != (size_t)(-1) && ret != (size_t)(-2))
373 { 373 {
374 if (pwc != NULL) 374 if (pwc != NULL)
375 *pwc = wc; 375 *pwc = wc;
376 if (wc == 0) 376 if (wc == 0)
377 ret = 0; 377 ret = 0;
378 } 378 }
379 return ret; 379 return ret;
380 } 380 }