summaryrefslogtreecommitdiffstats
path: root/gl/localcharset.c
diff options
context:
space:
mode:
Diffstat (limited to 'gl/localcharset.c')
-rw-r--r--gl/localcharset.c1309
1 files changed, 955 insertions, 354 deletions
diff --git a/gl/localcharset.c b/gl/localcharset.c
index a225a2e..3c50858 100644
--- a/gl/localcharset.c
+++ b/gl/localcharset.c
@@ -1,19 +1,19 @@
1/* Determine a canonical name for the current locale's character encoding. 1/* Determine a canonical name for the current locale's character encoding.
2 2
3 Copyright (C) 2000-2006, 2008-2013 Free Software Foundation, Inc. 3 Copyright (C) 2000-2006, 2008-2021 Free Software Foundation, Inc.
4 4
5 This program is free software; you can redistribute it and/or modify 5 This file is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by 6 it under the terms of the GNU Lesser General Public License as
7 the Free Software Foundation; either version 3, or (at your option) 7 published by the Free Software Foundation; either version 2.1 of the
8 any later version. 8 License, or (at your option) any later version.
9 9
10 This program is distributed in the hope that it will be useful, 10 This file is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details. 13 GNU Lesser General Public License for more details.
14 14
15 You should have received a copy of the GNU General Public License along 15 You should have received a copy of the GNU Lesser General Public License
16 with this program; if not, see <http://www.gnu.org/licenses/>. */ 16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 17
18/* Written by Bruno Haible <bruno@clisp.org>. */ 18/* Written by Bruno Haible <bruno@clisp.org>. */
19 19
@@ -22,7 +22,6 @@
22/* Specification. */ 22/* Specification. */
23#include "localcharset.h" 23#include "localcharset.h"
24 24
25#include <fcntl.h>
26#include <stddef.h> 25#include <stddef.h>
27#include <stdio.h> 26#include <stdio.h>
28#include <string.h> 27#include <string.h>
@@ -32,8 +31,9 @@
32# define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */ 31# define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
33#endif 32#endif
34 33
35#if defined _WIN32 || defined __WIN32__ 34#if defined _WIN32 && !defined __CYGWIN__
36# define WINDOWS_NATIVE 35# define WINDOWS_NATIVE
36# include <locale.h>
37#endif 37#endif
38 38
39#if defined __EMX__ 39#if defined __EMX__
@@ -44,11 +44,10 @@
44#endif 44#endif
45 45
46#if !defined WINDOWS_NATIVE 46#if !defined WINDOWS_NATIVE
47# include <unistd.h>
48# if HAVE_LANGINFO_CODESET 47# if HAVE_LANGINFO_CODESET
49# include <langinfo.h> 48# include <langinfo.h>
50# else 49# else
51# if 0 /* see comment below */ 50# if 0 /* see comment regarding use of setlocale(), below */
52# include <locale.h> 51# include <locale.h>
53# endif 52# endif
54# endif 53# endif
@@ -59,6 +58,9 @@
59#elif defined WINDOWS_NATIVE 58#elif defined WINDOWS_NATIVE
60# define WIN32_LEAN_AND_MEAN 59# define WIN32_LEAN_AND_MEAN
61# include <windows.h> 60# include <windows.h>
61 /* For the use of setlocale() below, the Gnulib override in setlocale.c is
62 not needed; see the platform lists in setlocale_null.m4. */
63# undef setlocale
62#endif 64#endif
63#if defined OS2 65#if defined OS2
64# define INCL_DOS 66# define INCL_DOS
@@ -70,288 +72,755 @@
70# include <xlocale.h> 72# include <xlocale.h>
71#endif 73#endif
72 74
73#if ENABLE_RELOCATABLE
74# include "relocatable.h"
75#else
76# define relocate(pathname) (pathname)
77#endif
78 75
79/* Get LIBDIR. */ 76#if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
80#ifndef LIBDIR
81# include "configmake.h"
82#endif
83
84/* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
85#ifndef O_NOFOLLOW
86# define O_NOFOLLOW 0
87#endif
88
89#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
90 /* Native Windows, Cygwin, OS/2, DOS */
91# define ISSLASH(C) ((C) == '/' || (C) == '\\')
92#endif
93
94#ifndef DIRECTORY_SEPARATOR
95# define DIRECTORY_SEPARATOR '/'
96#endif
97 77
98#ifndef ISSLASH 78/* On these platforms, we use a mapping from non-canonical encoding name
99# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 79 to GNU canonical encoding name. */
100#endif
101 80
102#if HAVE_DECL_GETC_UNLOCKED 81/* With glibc-2.1 or newer, we don't need any canonicalization,
103# undef getc 82 because glibc has iconv and both glibc and libiconv support all
104# define getc getc_unlocked 83 GNU canonical names directly. */
105#endif 84# if !((defined __GNU_LIBRARY__ && __GLIBC__ >= 2) || defined __UCLIBC__)
106 85
107/* The following static variable is declared 'volatile' to avoid a 86struct table_entry
108 possible multithread problem in the function get_charset_aliases. If we
109 are running in a threaded environment, and if two threads initialize
110 'charset_aliases' simultaneously, both will produce the same value,
111 and everything will be ok if the two assignments to 'charset_aliases'
112 are atomic. But I don't know what will happen if the two assignments mix. */
113#if __STDC__ != 1
114# define volatile /* empty */
115#endif
116/* Pointer to the contents of the charset.alias file, if it has already been
117 read, else NULL. Its format is:
118 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
119static const char * volatile charset_aliases;
120
121/* Return a pointer to the contents of the charset.alias file. */
122static const char *
123get_charset_aliases (void)
124{ 87{
125 const char *cp; 88 const char alias[11+1];
126 89 const char canonical[11+1];
127 cp = charset_aliases; 90};
128 if (cp == NULL) 91
129 { 92/* Table of platform-dependent mappings, sorted in ascending order. */
130#if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__) 93static const struct table_entry alias_table[] =
131 const char *dir; 94 {
132 const char *base = "charset.alias"; 95# if defined __FreeBSD__ /* FreeBSD */
133 char *file_name; 96 /*{ "ARMSCII-8", "ARMSCII-8" },*/
134 97 { "Big5", "BIG5" },
135 /* Make it possible to override the charset.alias location. This is 98 { "C", "ASCII" },
136 necessary for running the testsuite before "make install". */ 99 /*{ "CP1131", "CP1131" },*/
137 dir = getenv ("CHARSETALIASDIR"); 100 /*{ "CP1251", "CP1251" },*/
138 if (dir == NULL || dir[0] == '\0') 101 /*{ "CP866", "CP866" },*/
139 dir = relocate (LIBDIR); 102 /*{ "GB18030", "GB18030" },*/
140 103 /*{ "GB2312", "GB2312" },*/
141 /* Concatenate dir and base into freshly allocated file_name. */ 104 /*{ "GBK", "GBK" },*/
142 { 105 /*{ "ISCII-DEV", "?" },*/
143 size_t dir_len = strlen (dir); 106 { "ISO8859-1", "ISO-8859-1" },
144 size_t base_len = strlen (base); 107 { "ISO8859-13", "ISO-8859-13" },
145 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 108 { "ISO8859-15", "ISO-8859-15" },
146 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 109 { "ISO8859-2", "ISO-8859-2" },
147 if (file_name != NULL) 110 { "ISO8859-5", "ISO-8859-5" },
148 { 111 { "ISO8859-7", "ISO-8859-7" },
149 memcpy (file_name, dir, dir_len); 112 { "ISO8859-9", "ISO-8859-9" },
150 if (add_slash) 113 /*{ "KOI8-R", "KOI8-R" },*/
151 file_name[dir_len] = DIRECTORY_SEPARATOR; 114 /*{ "KOI8-U", "KOI8-U" },*/
152 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 115 { "SJIS", "SHIFT_JIS" },
153 } 116 { "US-ASCII", "ASCII" },
154 } 117 { "eucCN", "GB2312" },
155 118 { "eucJP", "EUC-JP" },
156 if (file_name == NULL) 119 { "eucKR", "EUC-KR" }
157 /* Out of memory. Treat the file as empty. */ 120# define alias_table_defined
158 cp = ""; 121# endif
159 else 122# if defined __NetBSD__ /* NetBSD */
160 { 123 { "646", "ASCII" },
161 int fd; 124 /*{ "ARMSCII-8", "ARMSCII-8" },*/
162 125 /*{ "BIG5", "BIG5" },*/
163 /* Open the file. Reject symbolic links on platforms that support 126 { "Big5-HKSCS", "BIG5-HKSCS" },
164 O_NOFOLLOW. This is a security feature. Without it, an attacker 127 /*{ "CP1251", "CP1251" },*/
165 could retrieve parts of the contents (namely, the tail of the 128 /*{ "CP866", "CP866" },*/
166 first line that starts with "* ") of an arbitrary file by placing 129 /*{ "GB18030", "GB18030" },*/
167 a symbolic link to that file under the name "charset.alias" in 130 /*{ "GB2312", "GB2312" },*/
168 some writable directory and defining the environment variable 131 { "ISO8859-1", "ISO-8859-1" },
169 CHARSETALIASDIR to point to that directory. */ 132 { "ISO8859-13", "ISO-8859-13" },
170 fd = open (file_name, 133 { "ISO8859-15", "ISO-8859-15" },
171 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0)); 134 { "ISO8859-2", "ISO-8859-2" },
172 if (fd < 0) 135 { "ISO8859-4", "ISO-8859-4" },
173 /* File not found. Treat it as empty. */ 136 { "ISO8859-5", "ISO-8859-5" },
174 cp = ""; 137 { "ISO8859-7", "ISO-8859-7" },
175 else 138 /*{ "KOI8-R", "KOI8-R" },*/
176 { 139 /*{ "KOI8-U", "KOI8-U" },*/
177 FILE *fp; 140 /*{ "PT154", "PT154" },*/
178 141 { "SJIS", "SHIFT_JIS" },
179 fp = fdopen (fd, "r"); 142 { "eucCN", "GB2312" },
180 if (fp == NULL) 143 { "eucJP", "EUC-JP" },
181 { 144 { "eucKR", "EUC-KR" },
182 /* Out of memory. Treat the file as empty. */ 145 { "eucTW", "EUC-TW" }
183 close (fd); 146# define alias_table_defined
184 cp = ""; 147# endif
185 } 148# if defined __OpenBSD__ /* OpenBSD */
186 else 149 { "646", "ASCII" },
187 { 150 { "ISO8859-1", "ISO-8859-1" },
188 /* Parse the file's contents. */ 151 { "ISO8859-13", "ISO-8859-13" },
189 char *res_ptr = NULL; 152 { "ISO8859-15", "ISO-8859-15" },
190 size_t res_size = 0; 153 { "ISO8859-2", "ISO-8859-2" },
191 154 { "ISO8859-4", "ISO-8859-4" },
192 for (;;) 155 { "ISO8859-5", "ISO-8859-5" },
193 { 156 { "ISO8859-7", "ISO-8859-7" },
194 int c; 157 { "US-ASCII", "ASCII" }
195 char buf1[50+1]; 158# define alias_table_defined
196 char buf2[50+1]; 159# endif
197 size_t l1, l2; 160# if defined __APPLE__ && defined __MACH__ /* Mac OS X */
198 char *old_res_ptr; 161 /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
199 162 useless:
200 c = getc (fp); 163 - It returns the empty string when LANG is set to a locale of the
201 if (c == EOF) 164 form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
202 break; 165 LC_CTYPE file.
203 if (c == '\n' || c == ' ' || c == '\t') 166 - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
204 continue; 167 the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
205 if (c == '#') 168 - The documentation says:
206 { 169 "... all code that calls BSD system routines should ensure
207 /* Skip comment, to end of line. */ 170 that the const *char parameters of these routines are in UTF-8
208 do 171 encoding. All BSD system functions expect their string
209 c = getc (fp); 172 parameters to be in UTF-8 encoding and nothing else."
210 while (!(c == EOF || c == '\n')); 173 It also says
211 if (c == EOF) 174 "An additional caveat is that string parameters for files,
212 break; 175 paths, and other file-system entities must be in canonical
213 continue; 176 UTF-8. In a canonical UTF-8 Unicode string, all decomposable
214 } 177 characters are decomposed ..."
215 ungetc (c, fp); 178 but this is not true: You can pass non-decomposed UTF-8 strings
216 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 179 to file system functions, and it is the OS which will convert
217 break; 180 them to decomposed UTF-8 before accessing the file system.
218 l1 = strlen (buf1); 181 - The Apple Terminal application displays UTF-8 by default.
219 l2 = strlen (buf2); 182 - However, other applications are free to use different encodings:
220 old_res_ptr = res_ptr; 183 - xterm uses ISO-8859-1 by default.
221 if (res_size == 0) 184 - TextEdit uses MacRoman by default.
222 { 185 We prefer UTF-8 over decomposed UTF-8-MAC because one should
223 res_size = l1 + 1 + l2 + 1; 186 minimize the use of decomposed Unicode. Unfortunately, through the
224 res_ptr = (char *) malloc (res_size + 1); 187 Darwin file system, decomposed UTF-8 strings are leaked into user
225 } 188 space nevertheless.
226 else 189 Then there are also the locales with encodings other than US-ASCII
227 { 190 and UTF-8. These locales can be occasionally useful to users (e.g.
228 res_size += l1 + 1 + l2 + 1; 191 when grepping through ISO-8859-1 encoded text files), when all their
229 res_ptr = (char *) realloc (res_ptr, res_size + 1); 192 file names are in US-ASCII.
230 } 193 */
231 if (res_ptr == NULL) 194 { "ARMSCII-8", "ARMSCII-8" },
232 { 195 { "Big5", "BIG5" },
233 /* Out of memory. */ 196 { "Big5HKSCS", "BIG5-HKSCS" },
234 res_size = 0; 197 { "CP1131", "CP1131" },
235 free (old_res_ptr); 198 { "CP1251", "CP1251" },
236 break; 199 { "CP866", "CP866" },
237 } 200 { "CP949", "CP949" },
238 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 201 { "GB18030", "GB18030" },
239 strcpy (res_ptr + res_size - (l2 + 1), buf2); 202 { "GB2312", "GB2312" },
240 } 203 { "GBK", "GBK" },
241 fclose (fp); 204 /*{ "ISCII-DEV", "?" },*/
242 if (res_size == 0) 205 { "ISO8859-1", "ISO-8859-1" },
243 cp = ""; 206 { "ISO8859-13", "ISO-8859-13" },
244 else 207 { "ISO8859-15", "ISO-8859-15" },
245 { 208 { "ISO8859-2", "ISO-8859-2" },
246 *(res_ptr + res_size) = '\0'; 209 { "ISO8859-4", "ISO-8859-4" },
247 cp = res_ptr; 210 { "ISO8859-5", "ISO-8859-5" },
248 } 211 { "ISO8859-7", "ISO-8859-7" },
249 } 212 { "ISO8859-9", "ISO-8859-9" },
250 } 213 { "KOI8-R", "KOI8-R" },
214 { "KOI8-U", "KOI8-U" },
215 { "PT154", "PT154" },
216 { "SJIS", "SHIFT_JIS" },
217 { "eucCN", "GB2312" },
218 { "eucJP", "EUC-JP" },
219 { "eucKR", "EUC-KR" }
220# define alias_table_defined
221# endif
222# if defined _AIX /* AIX */
223 /*{ "GBK", "GBK" },*/
224 { "IBM-1046", "CP1046" },
225 { "IBM-1124", "CP1124" },
226 { "IBM-1129", "CP1129" },
227 { "IBM-1252", "CP1252" },
228 { "IBM-850", "CP850" },
229 { "IBM-856", "CP856" },
230 { "IBM-921", "ISO-8859-13" },
231 { "IBM-922", "CP922" },
232 { "IBM-932", "CP932" },
233 { "IBM-943", "CP943" },
234 { "IBM-eucCN", "GB2312" },
235 { "IBM-eucJP", "EUC-JP" },
236 { "IBM-eucKR", "EUC-KR" },
237 { "IBM-eucTW", "EUC-TW" },
238 { "ISO8859-1", "ISO-8859-1" },
239 { "ISO8859-15", "ISO-8859-15" },
240 { "ISO8859-2", "ISO-8859-2" },
241 { "ISO8859-5", "ISO-8859-5" },
242 { "ISO8859-6", "ISO-8859-6" },
243 { "ISO8859-7", "ISO-8859-7" },
244 { "ISO8859-8", "ISO-8859-8" },
245 { "ISO8859-9", "ISO-8859-9" },
246 { "TIS-620", "TIS-620" },
247 /*{ "UTF-8", "UTF-8" },*/
248 { "big5", "BIG5" }
249# define alias_table_defined
250# endif
251# if defined __hpux /* HP-UX */
252 { "SJIS", "SHIFT_JIS" },
253 { "arabic8", "HP-ARABIC8" },
254 { "big5", "BIG5" },
255 { "cp1251", "CP1251" },
256 { "eucJP", "EUC-JP" },
257 { "eucKR", "EUC-KR" },
258 { "eucTW", "EUC-TW" },
259 { "gb18030", "GB18030" },
260 { "greek8", "HP-GREEK8" },
261 { "hebrew8", "HP-HEBREW8" },
262 { "hkbig5", "BIG5-HKSCS" },
263 { "hp15CN", "GB2312" },
264 { "iso88591", "ISO-8859-1" },
265 { "iso885913", "ISO-8859-13" },
266 { "iso885915", "ISO-8859-15" },
267 { "iso88592", "ISO-8859-2" },
268 { "iso88594", "ISO-8859-4" },
269 { "iso88595", "ISO-8859-5" },
270 { "iso88596", "ISO-8859-6" },
271 { "iso88597", "ISO-8859-7" },
272 { "iso88598", "ISO-8859-8" },
273 { "iso88599", "ISO-8859-9" },
274 { "kana8", "HP-KANA8" },
275 { "koi8r", "KOI8-R" },
276 { "roman8", "HP-ROMAN8" },
277 { "tis620", "TIS-620" },
278 { "turkish8", "HP-TURKISH8" },
279 { "utf8", "UTF-8" }
280# define alias_table_defined
281# endif
282# if defined __sgi /* IRIX */
283 { "ISO8859-1", "ISO-8859-1" },
284 { "ISO8859-15", "ISO-8859-15" },
285 { "ISO8859-2", "ISO-8859-2" },
286 { "ISO8859-5", "ISO-8859-5" },
287 { "ISO8859-7", "ISO-8859-7" },
288 { "ISO8859-9", "ISO-8859-9" },
289 { "eucCN", "GB2312" },
290 { "eucJP", "EUC-JP" },
291 { "eucKR", "EUC-KR" },
292 { "eucTW", "EUC-TW" }
293# define alias_table_defined
294# endif
295# if defined __osf__ /* OSF/1 */
296 /*{ "GBK", "GBK" },*/
297 { "ISO8859-1", "ISO-8859-1" },
298 { "ISO8859-15", "ISO-8859-15" },
299 { "ISO8859-2", "ISO-8859-2" },
300 { "ISO8859-4", "ISO-8859-4" },
301 { "ISO8859-5", "ISO-8859-5" },
302 { "ISO8859-7", "ISO-8859-7" },
303 { "ISO8859-8", "ISO-8859-8" },
304 { "ISO8859-9", "ISO-8859-9" },
305 { "KSC5601", "CP949" },
306 { "SJIS", "SHIFT_JIS" },
307 { "TACTIS", "TIS-620" },
308 /*{ "UTF-8", "UTF-8" },*/
309 { "big5", "BIG5" },
310 { "cp850", "CP850" },
311 { "dechanyu", "DEC-HANYU" },
312 { "dechanzi", "GB2312" },
313 { "deckanji", "DEC-KANJI" },
314 { "deckorean", "EUC-KR" },
315 { "eucJP", "EUC-JP" },
316 { "eucKR", "EUC-KR" },
317 { "eucTW", "EUC-TW" },
318 { "sdeckanji", "EUC-JP" }
319# define alias_table_defined
320# endif
321# if defined __sun /* Solaris */
322 { "5601", "EUC-KR" },
323 { "646", "ASCII" },
324 /*{ "BIG5", "BIG5" },*/
325 { "Big5-HKSCS", "BIG5-HKSCS" },
326 { "GB18030", "GB18030" },
327 /*{ "GBK", "GBK" },*/
328 { "ISO8859-1", "ISO-8859-1" },
329 { "ISO8859-11", "TIS-620" },
330 { "ISO8859-13", "ISO-8859-13" },
331 { "ISO8859-15", "ISO-8859-15" },
332 { "ISO8859-2", "ISO-8859-2" },
333 { "ISO8859-3", "ISO-8859-3" },
334 { "ISO8859-4", "ISO-8859-4" },
335 { "ISO8859-5", "ISO-8859-5" },
336 { "ISO8859-6", "ISO-8859-6" },
337 { "ISO8859-7", "ISO-8859-7" },
338 { "ISO8859-8", "ISO-8859-8" },
339 { "ISO8859-9", "ISO-8859-9" },
340 { "PCK", "SHIFT_JIS" },
341 { "TIS620.2533", "TIS-620" },
342 /*{ "UTF-8", "UTF-8" },*/
343 { "ansi-1251", "CP1251" },
344 { "cns11643", "EUC-TW" },
345 { "eucJP", "EUC-JP" },
346 { "gb2312", "GB2312" },
347 { "koi8-r", "KOI8-R" }
348# define alias_table_defined
349# endif
350# if defined __minix /* Minix */
351 { "646", "ASCII" }
352# define alias_table_defined
353# endif
354# if defined WINDOWS_NATIVE || defined __CYGWIN__ /* Windows */
355 { "CP1361", "JOHAB" },
356 { "CP20127", "ASCII" },
357 { "CP20866", "KOI8-R" },
358 { "CP20936", "GB2312" },
359 { "CP21866", "KOI8-RU" },
360 { "CP28591", "ISO-8859-1" },
361 { "CP28592", "ISO-8859-2" },
362 { "CP28593", "ISO-8859-3" },
363 { "CP28594", "ISO-8859-4" },
364 { "CP28595", "ISO-8859-5" },
365 { "CP28596", "ISO-8859-6" },
366 { "CP28597", "ISO-8859-7" },
367 { "CP28598", "ISO-8859-8" },
368 { "CP28599", "ISO-8859-9" },
369 { "CP28605", "ISO-8859-15" },
370 { "CP38598", "ISO-8859-8" },
371 { "CP51932", "EUC-JP" },
372 { "CP51936", "GB2312" },
373 { "CP51949", "EUC-KR" },
374 { "CP51950", "EUC-TW" },
375 { "CP54936", "GB18030" },
376 { "CP65001", "UTF-8" },
377 { "CP936", "GBK" }
378# define alias_table_defined
379# endif
380# if defined OS2 /* OS/2 */
381 /* The list of encodings is taken from "List of OS/2 Codepages"
382 by Alex Taylor:
383 <http://altsan.org/os2/toolkits/uls/index.html#codepages>.
384 See also "__convcp() of kLIBC":
385 <https://github.com/bitwiseworks/libc/blob/master/src/emx/src/lib/locale/__convcp.c>. */
386 { "CP1004", "CP1252" },
387 /*{ "CP1041", "CP943" },*/
388 /*{ "CP1088", "CP949" },*/
389 { "CP1089", "ISO-8859-6" },
390 /*{ "CP1114", "CP950" },*/
391 /*{ "CP1115", "GB2312" },*/
392 { "CP1208", "UTF-8" },
393 /*{ "CP1380", "GB2312" },*/
394 { "CP1381", "GB2312" },
395 { "CP1383", "GB2312" },
396 { "CP1386", "GBK" },
397 /*{ "CP301", "CP943" },*/
398 { "CP3372", "EUC-JP" },
399 { "CP4946", "CP850" },
400 /*{ "CP5048", "JIS_X0208-1990" },*/
401 /*{ "CP5049", "JIS_X0212-1990" },*/
402 /*{ "CP5067", "KS_C_5601-1987" },*/
403 { "CP813", "ISO-8859-7" },
404 { "CP819", "ISO-8859-1" },
405 { "CP878", "KOI8-R" },
406 /*{ "CP897", "CP943" },*/
407 { "CP912", "ISO-8859-2" },
408 { "CP913", "ISO-8859-3" },
409 { "CP914", "ISO-8859-4" },
410 { "CP915", "ISO-8859-5" },
411 { "CP916", "ISO-8859-8" },
412 { "CP920", "ISO-8859-9" },
413 { "CP921", "ISO-8859-13" },
414 { "CP923", "ISO-8859-15" },
415 /*{ "CP941", "CP943" },*/
416 /*{ "CP947", "CP950" },*/
417 /*{ "CP951", "CP949" },*/
418 /*{ "CP952", "JIS_X0208-1990" },*/
419 /*{ "CP953", "JIS_X0212-1990" },*/
420 { "CP954", "EUC-JP" },
421 { "CP964", "EUC-TW" },
422 { "CP970", "EUC-KR" },
423 /*{ "CP971", "KS_C_5601-1987" },*/
424 { "IBM-1004", "CP1252" },
425 /*{ "IBM-1006", "?" },*/
426 /*{ "IBM-1008", "?" },*/
427 /*{ "IBM-1041", "CP943" },*/
428 /*{ "IBM-1051", "?" },*/
429 /*{ "IBM-1088", "CP949" },*/
430 { "IBM-1089", "ISO-8859-6" },
431 /*{ "IBM-1098", "?" },*/
432 /*{ "IBM-1114", "CP950" },*/
433 /*{ "IBM-1115", "GB2312" },*/
434 /*{ "IBM-1116", "?" },*/
435 /*{ "IBM-1117", "?" },*/
436 /*{ "IBM-1118", "?" },*/
437 /*{ "IBM-1119", "?" },*/
438 { "IBM-1124", "CP1124" },
439 { "IBM-1125", "CP1125" },
440 { "IBM-1131", "CP1131" },
441 { "IBM-1208", "UTF-8" },
442 { "IBM-1250", "CP1250" },
443 { "IBM-1251", "CP1251" },
444 { "IBM-1252", "CP1252" },
445 { "IBM-1253", "CP1253" },
446 { "IBM-1254", "CP1254" },
447 { "IBM-1255", "CP1255" },
448 { "IBM-1256", "CP1256" },
449 { "IBM-1257", "CP1257" },
450 /*{ "IBM-1275", "?" },*/
451 /*{ "IBM-1276", "?" },*/
452 /*{ "IBM-1277", "?" },*/
453 /*{ "IBM-1280", "?" },*/
454 /*{ "IBM-1281", "?" },*/
455 /*{ "IBM-1282", "?" },*/
456 /*{ "IBM-1283", "?" },*/
457 /*{ "IBM-1380", "GB2312" },*/
458 { "IBM-1381", "GB2312" },
459 { "IBM-1383", "GB2312" },
460 { "IBM-1386", "GBK" },
461 /*{ "IBM-301", "CP943" },*/
462 { "IBM-3372", "EUC-JP" },
463 { "IBM-367", "ASCII" },
464 { "IBM-437", "CP437" },
465 { "IBM-4946", "CP850" },
466 /*{ "IBM-5048", "JIS_X0208-1990" },*/
467 /*{ "IBM-5049", "JIS_X0212-1990" },*/
468 /*{ "IBM-5067", "KS_C_5601-1987" },*/
469 { "IBM-813", "ISO-8859-7" },
470 { "IBM-819", "ISO-8859-1" },
471 { "IBM-850", "CP850" },
472 /*{ "IBM-851", "?" },*/
473 { "IBM-852", "CP852" },
474 { "IBM-855", "CP855" },
475 { "IBM-856", "CP856" },
476 { "IBM-857", "CP857" },
477 /*{ "IBM-859", "?" },*/
478 { "IBM-860", "CP860" },
479 { "IBM-861", "CP861" },
480 { "IBM-862", "CP862" },
481 { "IBM-863", "CP863" },
482 { "IBM-864", "CP864" },
483 { "IBM-865", "CP865" },
484 { "IBM-866", "CP866" },
485 /*{ "IBM-868", "?" },*/
486 { "IBM-869", "CP869" },
487 { "IBM-874", "CP874" },
488 { "IBM-878", "KOI8-R" },
489 /*{ "IBM-895", "?" },*/
490 /*{ "IBM-897", "CP943" },*/
491 /*{ "IBM-907", "?" },*/
492 /*{ "IBM-909", "?" },*/
493 { "IBM-912", "ISO-8859-2" },
494 { "IBM-913", "ISO-8859-3" },
495 { "IBM-914", "ISO-8859-4" },
496 { "IBM-915", "ISO-8859-5" },
497 { "IBM-916", "ISO-8859-8" },
498 { "IBM-920", "ISO-8859-9" },
499 { "IBM-921", "ISO-8859-13" },
500 { "IBM-922", "CP922" },
501 { "IBM-923", "ISO-8859-15" },
502 { "IBM-932", "CP932" },
503 /*{ "IBM-941", "CP943" },*/
504 /*{ "IBM-942", "?" },*/
505 { "IBM-943", "CP943" },
506 /*{ "IBM-947", "CP950" },*/
507 { "IBM-949", "CP949" },
508 { "IBM-950", "CP950" },
509 /*{ "IBM-951", "CP949" },*/
510 /*{ "IBM-952", "JIS_X0208-1990" },*/
511 /*{ "IBM-953", "JIS_X0212-1990" },*/
512 { "IBM-954", "EUC-JP" },
513 /*{ "IBM-955", "?" },*/
514 { "IBM-964", "EUC-TW" },
515 { "IBM-970", "EUC-KR" },
516 /*{ "IBM-971", "KS_C_5601-1987" },*/
517 { "IBM-eucCN", "GB2312" },
518 { "IBM-eucJP", "EUC-JP" },
519 { "IBM-eucKR", "EUC-KR" },
520 { "IBM-eucTW", "EUC-TW" },
521 { "IBM33722", "EUC-JP" },
522 { "ISO8859-1", "ISO-8859-1" },
523 { "ISO8859-2", "ISO-8859-2" },
524 { "ISO8859-3", "ISO-8859-3" },
525 { "ISO8859-4", "ISO-8859-4" },
526 { "ISO8859-5", "ISO-8859-5" },
527 { "ISO8859-6", "ISO-8859-6" },
528 { "ISO8859-7", "ISO-8859-7" },
529 { "ISO8859-8", "ISO-8859-8" },
530 { "ISO8859-9", "ISO-8859-9" },
531 /*{ "JISX0201-1976", "JISX0201-1976" },*/
532 /*{ "JISX0208-1978", "?" },*/
533 /*{ "JISX0208-1983", "JIS_X0208-1983" },*/
534 /*{ "JISX0208-1990", "JIS_X0208-1990" },*/
535 /*{ "JISX0212-1990", "JIS_X0212-1990" },*/
536 /*{ "KSC5601-1987", "KS_C_5601-1987" },*/
537 { "SJIS-1", "CP943" },
538 { "SJIS-2", "CP943" },
539 { "eucJP", "EUC-JP" },
540 { "eucKR", "EUC-KR" },
541 { "eucTW-1993", "EUC-TW" }
542# define alias_table_defined
543# endif
544# if defined VMS /* OpenVMS */
545 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
546 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
547 section 10.7 "Handling Different Character Sets". */
548 { "DECHANYU", "DEC-HANYU" },
549 { "DECHANZI", "GB2312" },
550 { "DECKANJI", "DEC-KANJI" },
551 { "DECKOREAN", "EUC-KR" },
552 { "ISO8859-1", "ISO-8859-1" },
553 { "ISO8859-2", "ISO-8859-2" },
554 { "ISO8859-5", "ISO-8859-5" },
555 { "ISO8859-7", "ISO-8859-7" },
556 { "ISO8859-8", "ISO-8859-8" },
557 { "ISO8859-9", "ISO-8859-9" },
558 { "SDECKANJI", "EUC-JP" },
559 { "SJIS", "SHIFT_JIS" },
560 { "eucJP", "EUC-JP" },
561 { "eucTW", "EUC-TW" }
562# define alias_table_defined
563# endif
564# ifndef alias_table_defined
565 /* Just a dummy entry, to avoid a C syntax error. */
566 { "", "" }
567# endif
568 };
251 569
252 free (file_name); 570# endif
253 }
254 571
255#else 572#else
256 573
257# if defined DARWIN7 574/* On these platforms, we use a mapping from locale name to GNU canonical
258 /* To avoid the trouble of installing a file that is shared by many 575 encoding name. */
259 GNU packages -- many packaging systems have problems with this --,
260 simply inline the aliases here. */
261 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
262 "ISO8859-2" "\0" "ISO-8859-2" "\0"
263 "ISO8859-4" "\0" "ISO-8859-4" "\0"
264 "ISO8859-5" "\0" "ISO-8859-5" "\0"
265 "ISO8859-7" "\0" "ISO-8859-7" "\0"
266 "ISO8859-9" "\0" "ISO-8859-9" "\0"
267 "ISO8859-13" "\0" "ISO-8859-13" "\0"
268 "ISO8859-15" "\0" "ISO-8859-15" "\0"
269 "KOI8-R" "\0" "KOI8-R" "\0"
270 "KOI8-U" "\0" "KOI8-U" "\0"
271 "CP866" "\0" "CP866" "\0"
272 "CP949" "\0" "CP949" "\0"
273 "CP1131" "\0" "CP1131" "\0"
274 "CP1251" "\0" "CP1251" "\0"
275 "eucCN" "\0" "GB2312" "\0"
276 "GB2312" "\0" "GB2312" "\0"
277 "eucJP" "\0" "EUC-JP" "\0"
278 "eucKR" "\0" "EUC-KR" "\0"
279 "Big5" "\0" "BIG5" "\0"
280 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
281 "GBK" "\0" "GBK" "\0"
282 "GB18030" "\0" "GB18030" "\0"
283 "SJIS" "\0" "SHIFT_JIS" "\0"
284 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
285 "PT154" "\0" "PT154" "\0"
286 /*"ISCII-DEV" "\0" "?" "\0"*/
287 "*" "\0" "UTF-8" "\0";
288# endif
289 576
290# if defined VMS 577struct table_entry
291 /* To avoid the troubles of an extra file charset.alias_vms in the 578{
292 sources of many GNU packages, simply inline the aliases here. */ 579 const char locale[17+1];
293 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 580 const char canonical[11+1];
294 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 581};
295 section 10.7 "Handling Different Character Sets". */ 582
296 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 583/* Table of platform-dependent mappings, sorted in ascending order. */
297 "ISO8859-2" "\0" "ISO-8859-2" "\0" 584static const struct table_entry locale_table[] =
298 "ISO8859-5" "\0" "ISO-8859-5" "\0" 585 {
299 "ISO8859-7" "\0" "ISO-8859-7" "\0" 586# if defined __FreeBSD__ /* FreeBSD 4.2 */
300 "ISO8859-8" "\0" "ISO-8859-8" "\0" 587 { "cs_CZ.ISO_8859-2", "ISO-8859-2" },
301 "ISO8859-9" "\0" "ISO-8859-9" "\0" 588 { "da_DK.DIS_8859-15", "ISO-8859-15" },
302 /* Japanese */ 589 { "da_DK.ISO_8859-1", "ISO-8859-1" },
303 "eucJP" "\0" "EUC-JP" "\0" 590 { "de_AT.DIS_8859-15", "ISO-8859-15" },
304 "SJIS" "\0" "SHIFT_JIS" "\0" 591 { "de_AT.ISO_8859-1", "ISO-8859-1" },
305 "DECKANJI" "\0" "DEC-KANJI" "\0" 592 { "de_CH.DIS_8859-15", "ISO-8859-15" },
306 "SDECKANJI" "\0" "EUC-JP" "\0" 593 { "de_CH.ISO_8859-1", "ISO-8859-1" },
307 /* Chinese */ 594 { "de_DE.DIS_8859-15", "ISO-8859-15" },
308 "eucTW" "\0" "EUC-TW" "\0" 595 { "de_DE.ISO_8859-1", "ISO-8859-1" },
309 "DECHANYU" "\0" "DEC-HANYU" "\0" 596 { "en_AU.DIS_8859-15", "ISO-8859-15" },
310 "DECHANZI" "\0" "GB2312" "\0" 597 { "en_AU.ISO_8859-1", "ISO-8859-1" },
311 /* Korean */ 598 { "en_CA.DIS_8859-15", "ISO-8859-15" },
312 "DECKOREAN" "\0" "EUC-KR" "\0"; 599 { "en_CA.ISO_8859-1", "ISO-8859-1" },
600 { "en_GB.DIS_8859-15", "ISO-8859-15" },
601 { "en_GB.ISO_8859-1", "ISO-8859-1" },
602 { "en_US.DIS_8859-15", "ISO-8859-15" },
603 { "en_US.ISO_8859-1", "ISO-8859-1" },
604 { "es_ES.DIS_8859-15", "ISO-8859-15" },
605 { "es_ES.ISO_8859-1", "ISO-8859-1" },
606 { "fi_FI.DIS_8859-15", "ISO-8859-15" },
607 { "fi_FI.ISO_8859-1", "ISO-8859-1" },
608 { "fr_BE.DIS_8859-15", "ISO-8859-15" },
609 { "fr_BE.ISO_8859-1", "ISO-8859-1" },
610 { "fr_CA.DIS_8859-15", "ISO-8859-15" },
611 { "fr_CA.ISO_8859-1", "ISO-8859-1" },
612 { "fr_CH.DIS_8859-15", "ISO-8859-15" },
613 { "fr_CH.ISO_8859-1", "ISO-8859-1" },
614 { "fr_FR.DIS_8859-15", "ISO-8859-15" },
615 { "fr_FR.ISO_8859-1", "ISO-8859-1" },
616 { "hr_HR.ISO_8859-2", "ISO-8859-2" },
617 { "hu_HU.ISO_8859-2", "ISO-8859-2" },
618 { "is_IS.DIS_8859-15", "ISO-8859-15" },
619 { "is_IS.ISO_8859-1", "ISO-8859-1" },
620 { "it_CH.DIS_8859-15", "ISO-8859-15" },
621 { "it_CH.ISO_8859-1", "ISO-8859-1" },
622 { "it_IT.DIS_8859-15", "ISO-8859-15" },
623 { "it_IT.ISO_8859-1", "ISO-8859-1" },
624 { "ja_JP.EUC", "EUC-JP" },
625 { "ja_JP.SJIS", "SHIFT_JIS" },
626 { "ja_JP.Shift_JIS", "SHIFT_JIS" },
627 { "ko_KR.EUC", "EUC-KR" },
628 { "la_LN.ASCII", "ASCII" },
629 { "la_LN.DIS_8859-15", "ISO-8859-15" },
630 { "la_LN.ISO_8859-1", "ISO-8859-1" },
631 { "la_LN.ISO_8859-2", "ISO-8859-2" },
632 { "la_LN.ISO_8859-4", "ISO-8859-4" },
633 { "lt_LN.ASCII", "ASCII" },
634 { "lt_LN.DIS_8859-15", "ISO-8859-15" },
635 { "lt_LN.ISO_8859-1", "ISO-8859-1" },
636 { "lt_LN.ISO_8859-2", "ISO-8859-2" },
637 { "lt_LT.ISO_8859-4", "ISO-8859-4" },
638 { "nl_BE.DIS_8859-15", "ISO-8859-15" },
639 { "nl_BE.ISO_8859-1", "ISO-8859-1" },
640 { "nl_NL.DIS_8859-15", "ISO-8859-15" },
641 { "nl_NL.ISO_8859-1", "ISO-8859-1" },
642 { "no_NO.DIS_8859-15", "ISO-8859-15" },
643 { "no_NO.ISO_8859-1", "ISO-8859-1" },
644 { "pl_PL.ISO_8859-2", "ISO-8859-2" },
645 { "pt_PT.DIS_8859-15", "ISO-8859-15" },
646 { "pt_PT.ISO_8859-1", "ISO-8859-1" },
647 { "ru_RU.CP866", "CP866" },
648 { "ru_RU.ISO_8859-5", "ISO-8859-5" },
649 { "ru_RU.KOI8-R", "KOI8-R" },
650 { "ru_SU.CP866", "CP866" },
651 { "ru_SU.ISO_8859-5", "ISO-8859-5" },
652 { "ru_SU.KOI8-R", "KOI8-R" },
653 { "sl_SI.ISO_8859-2", "ISO-8859-2" },
654 { "sv_SE.DIS_8859-15", "ISO-8859-15" },
655 { "sv_SE.ISO_8859-1", "ISO-8859-1" },
656 { "uk_UA.KOI8-U", "KOI8-U" },
657 { "zh_CN.EUC", "GB2312" },
658 { "zh_TW.BIG5", "BIG5" },
659 { "zh_TW.Big5", "BIG5" }
660# define locale_table_defined
313# endif 661# endif
314 662# if defined __DJGPP__ /* DOS / DJGPP 2.03 */
315# if defined WINDOWS_NATIVE || defined __CYGWIN__ 663 /* The encodings given here may not all be correct.
316 /* To avoid the troubles of installing a separate file in the same 664 If you find that the encoding given for your language and
317 directory as the DLL and of retrieving the DLL's directory at 665 country is not the one your DOS machine actually uses, just
318 runtime, simply inline the aliases here. */ 666 correct it in this file, and send a mail to
319 667 Juan Manuel Guerrero <juan.guerrero@gmx.de>
320 cp = "CP936" "\0" "GBK" "\0" 668 and <bug-gnulib@gnu.org>. */
321 "CP1361" "\0" "JOHAB" "\0" 669 { "C", "ASCII" },
322 "CP20127" "\0" "ASCII" "\0" 670 { "ar", "CP864" },
323 "CP20866" "\0" "KOI8-R" "\0" 671 { "ar_AE", "CP864" },
324 "CP20936" "\0" "GB2312" "\0" 672 { "ar_DZ", "CP864" },
325 "CP21866" "\0" "KOI8-RU" "\0" 673 { "ar_EG", "CP864" },
326 "CP28591" "\0" "ISO-8859-1" "\0" 674 { "ar_IQ", "CP864" },
327 "CP28592" "\0" "ISO-8859-2" "\0" 675 { "ar_IR", "CP864" },
328 "CP28593" "\0" "ISO-8859-3" "\0" 676 { "ar_JO", "CP864" },
329 "CP28594" "\0" "ISO-8859-4" "\0" 677 { "ar_KW", "CP864" },
330 "CP28595" "\0" "ISO-8859-5" "\0" 678 { "ar_MA", "CP864" },
331 "CP28596" "\0" "ISO-8859-6" "\0" 679 { "ar_OM", "CP864" },
332 "CP28597" "\0" "ISO-8859-7" "\0" 680 { "ar_QA", "CP864" },
333 "CP28598" "\0" "ISO-8859-8" "\0" 681 { "ar_SA", "CP864" },
334 "CP28599" "\0" "ISO-8859-9" "\0" 682 { "ar_SY", "CP864" },
335 "CP28605" "\0" "ISO-8859-15" "\0" 683 { "be", "CP866" },
336 "CP38598" "\0" "ISO-8859-8" "\0" 684 { "be_BE", "CP866" },
337 "CP51932" "\0" "EUC-JP" "\0" 685 { "bg", "CP866" }, /* not CP855 ?? */
338 "CP51936" "\0" "GB2312" "\0" 686 { "bg_BG", "CP866" }, /* not CP855 ?? */
339 "CP51949" "\0" "EUC-KR" "\0" 687 { "ca", "CP850" },
340 "CP51950" "\0" "EUC-TW" "\0" 688 { "ca_ES", "CP850" },
341 "CP54936" "\0" "GB18030" "\0" 689 { "cs", "CP852" },
342 "CP65001" "\0" "UTF-8" "\0"; 690 { "cs_CZ", "CP852" },
691 { "da", "CP865" }, /* not CP850 ?? */
692 { "da_DK", "CP865" }, /* not CP850 ?? */
693 { "de", "CP850" },
694 { "de_AT", "CP850" },
695 { "de_CH", "CP850" },
696 { "de_DE", "CP850" },
697 { "el", "CP869" },
698 { "el_GR", "CP869" },
699 { "en", "CP850" },
700 { "en_AU", "CP850" }, /* not CP437 ?? */
701 { "en_CA", "CP850" },
702 { "en_GB", "CP850" },
703 { "en_NZ", "CP437" },
704 { "en_US", "CP437" },
705 { "en_ZA", "CP850" }, /* not CP437 ?? */
706 { "eo", "CP850" },
707 { "eo_EO", "CP850" },
708 { "es", "CP850" },
709 { "es_AR", "CP850" },
710 { "es_BO", "CP850" },
711 { "es_CL", "CP850" },
712 { "es_CO", "CP850" },
713 { "es_CR", "CP850" },
714 { "es_CU", "CP850" },
715 { "es_DO", "CP850" },
716 { "es_EC", "CP850" },
717 { "es_ES", "CP850" },
718 { "es_GT", "CP850" },
719 { "es_HN", "CP850" },
720 { "es_MX", "CP850" },
721 { "es_NI", "CP850" },
722 { "es_PA", "CP850" },
723 { "es_PE", "CP850" },
724 { "es_PY", "CP850" },
725 { "es_SV", "CP850" },
726 { "es_UY", "CP850" },
727 { "es_VE", "CP850" },
728 { "et", "CP850" },
729 { "et_EE", "CP850" },
730 { "eu", "CP850" },
731 { "eu_ES", "CP850" },
732 { "fi", "CP850" },
733 { "fi_FI", "CP850" },
734 { "fr", "CP850" },
735 { "fr_BE", "CP850" },
736 { "fr_CA", "CP850" },
737 { "fr_CH", "CP850" },
738 { "fr_FR", "CP850" },
739 { "ga", "CP850" },
740 { "ga_IE", "CP850" },
741 { "gd", "CP850" },
742 { "gd_GB", "CP850" },
743 { "gl", "CP850" },
744 { "gl_ES", "CP850" },
745 { "he", "CP862" },
746 { "he_IL", "CP862" },
747 { "hr", "CP852" },
748 { "hr_HR", "CP852" },
749 { "hu", "CP852" },
750 { "hu_HU", "CP852" },
751 { "id", "CP850" }, /* not CP437 ?? */
752 { "id_ID", "CP850" }, /* not CP437 ?? */
753 { "is", "CP861" }, /* not CP850 ?? */
754 { "is_IS", "CP861" }, /* not CP850 ?? */
755 { "it", "CP850" },
756 { "it_CH", "CP850" },
757 { "it_IT", "CP850" },
758 { "ja", "CP932" },
759 { "ja_JP", "CP932" },
760 { "kr", "CP949" }, /* not CP934 ?? */
761 { "kr_KR", "CP949" }, /* not CP934 ?? */
762 { "lt", "CP775" },
763 { "lt_LT", "CP775" },
764 { "lv", "CP775" },
765 { "lv_LV", "CP775" },
766 { "mk", "CP866" }, /* not CP855 ?? */
767 { "mk_MK", "CP866" }, /* not CP855 ?? */
768 { "mt", "CP850" },
769 { "mt_MT", "CP850" },
770 { "nb", "CP865" }, /* not CP850 ?? */
771 { "nb_NO", "CP865" }, /* not CP850 ?? */
772 { "nl", "CP850" },
773 { "nl_BE", "CP850" },
774 { "nl_NL", "CP850" },
775 { "nn", "CP865" }, /* not CP850 ?? */
776 { "nn_NO", "CP865" }, /* not CP850 ?? */
777 { "no", "CP865" }, /* not CP850 ?? */
778 { "no_NO", "CP865" }, /* not CP850 ?? */
779 { "pl", "CP852" },
780 { "pl_PL", "CP852" },
781 { "pt", "CP850" },
782 { "pt_BR", "CP850" },
783 { "pt_PT", "CP850" },
784 { "ro", "CP852" },
785 { "ro_RO", "CP852" },
786 { "ru", "CP866" },
787 { "ru_RU", "CP866" },
788 { "sk", "CP852" },
789 { "sk_SK", "CP852" },
790 { "sl", "CP852" },
791 { "sl_SI", "CP852" },
792 { "sq", "CP852" },
793 { "sq_AL", "CP852" },
794 { "sr", "CP852" }, /* CP852 or CP866 or CP855 ?? */
795 { "sr_CS", "CP852" }, /* CP852 or CP866 or CP855 ?? */
796 { "sr_YU", "CP852" }, /* CP852 or CP866 or CP855 ?? */
797 { "sv", "CP850" },
798 { "sv_SE", "CP850" },
799 { "th", "CP874" },
800 { "th_TH", "CP874" },
801 { "tr", "CP857" },
802 { "tr_TR", "CP857" },
803 { "uk", "CP1125" },
804 { "uk_UA", "CP1125" },
805 { "zh_CN", "GBK" },
806 { "zh_TW", "CP950" } /* not CP938 ?? */
807# define locale_table_defined
343# endif 808# endif
344#endif 809# ifndef locale_table_defined
810 /* Just a dummy entry, to avoid a C syntax error. */
811 { "", "" }
812# endif
813 };
345 814
346 charset_aliases = cp; 815#endif
347 }
348 816
349 return cp;
350}
351 817
352/* Determine the current locale's character encoding, and canonicalize it 818/* Determine the current locale's character encoding, and canonicalize it
353 into one of the canonical names listed in config.charset. 819 into one of the canonical names listed below.
354 The result must not be freed; it is statically allocated. 820 The result must not be freed; it is statically allocated. The result
821 becomes invalid when setlocale() is used to change the global locale, or
822 when the value of one of the environment variables LC_ALL, LC_CTYPE, LANG
823 is changed; threads in multithreaded programs should not do this.
355 If the canonical name cannot be determined, the result is a non-canonical 824 If the canonical name cannot be determined, the result is a non-canonical
356 name. */ 825 name. */
357 826
@@ -362,9 +831,15 @@ const char *
362locale_charset (void) 831locale_charset (void)
363{ 832{
364 const char *codeset; 833 const char *codeset;
365 const char *aliases;
366 834
367#if !(defined WINDOWS_NATIVE || defined OS2) 835 /* This function must be multithread-safe. To achieve this without using
836 thread-local storage, we use a simple strcpy or memcpy to fill this static
837 buffer. Filling it through, for example, strcpy + strcat would not be
838 guaranteed to leave the buffer's contents intact if another thread is
839 currently accessing it. If necessary, the contents is first assembled in
840 a stack-allocated buffer. */
841
842#if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
368 843
369# if HAVE_LANGINFO_CODESET 844# if HAVE_LANGINFO_CODESET
370 845
@@ -378,7 +853,7 @@ locale_charset (void)
378 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0) 853 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
379 { 854 {
380 const char *locale; 855 const char *locale;
381 static char buf[2 + 10 + 1]; 856 static char resultbuf[2 + 10 + 1];
382 857
383 locale = getenv ("LC_ALL"); 858 locale = getenv ("LC_ALL");
384 if (locale == NULL || locale[0] == '\0') 859 if (locale == NULL || locale[0] == '\0')
@@ -402,11 +877,12 @@ locale_charset (void)
402 modifier = strchr (dot, '@'); 877 modifier = strchr (dot, '@');
403 if (modifier == NULL) 878 if (modifier == NULL)
404 return dot; 879 return dot;
405 if (modifier - dot < sizeof (buf)) 880 if (modifier - dot < sizeof (resultbuf))
406 { 881 {
407 memcpy (buf, dot, modifier - dot); 882 /* This way of filling resultbuf is multithread-safe. */
408 buf [modifier - dot] = '\0'; 883 memcpy (resultbuf, dot, modifier - dot);
409 return buf; 884 resultbuf [modifier - dot] = '\0';
885 return resultbuf;
410 } 886 }
411 } 887 }
412 } 888 }
@@ -422,62 +898,65 @@ locale_charset (void)
422 converting to GetConsoleOutputCP(). This leads to correct results, 898 converting to GetConsoleOutputCP(). This leads to correct results,
423 except when SetConsoleOutputCP has been called and a raster font is 899 except when SetConsoleOutputCP has been called and a raster font is
424 in use. */ 900 in use. */
425 sprintf (buf, "CP%u", GetACP ()); 901 {
426 codeset = buf; 902 char buf[2 + 10 + 1];
427 }
428# endif
429
430# else
431
432 /* On old systems which lack it, use setlocale or getenv. */
433 const char *locale = NULL;
434 903
435 /* But most old systems don't have a complete set of locales. Some 904 sprintf (buf, "CP%u", GetACP ());
436 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 905 strcpy (resultbuf, buf);
437 use setlocale here; it would return "C" when it doesn't support the 906 codeset = resultbuf;
438 locale name the user has set. */ 907 }
439# if 0
440 locale = setlocale (LC_CTYPE, NULL);
441# endif
442 if (locale == NULL || locale[0] == '\0')
443 {
444 locale = getenv ("LC_ALL");
445 if (locale == NULL || locale[0] == '\0')
446 {
447 locale = getenv ("LC_CTYPE");
448 if (locale == NULL || locale[0] == '\0')
449 locale = getenv ("LANG");
450 }
451 } 908 }
909# endif
452 910
453 /* On some old systems, one used to set locale = "iso8859_1". On others, 911 if (codeset == NULL)
454 you set it to "language_COUNTRY.charset". In any case, we resolve it 912 /* The canonical name cannot be determined. */
455 through the charset.alias file. */ 913 codeset = "";
456 codeset = locale;
457 914
458# endif 915# elif defined WINDOWS_NATIVE
459 916
460#elif defined WINDOWS_NATIVE 917 char buf[2 + 10 + 1];
918 static char resultbuf[2 + 10 + 1];
461 919
462 static char buf[2 + 10 + 1]; 920 /* The Windows API has a function returning the locale's codepage as
921 a number, but the value doesn't change according to what the
922 'setlocale' call specified. So we use it as a last resort, in
923 case the string returned by 'setlocale' doesn't specify the
924 codepage. */
925 char *current_locale = setlocale (LC_CTYPE, NULL);
926 char *pdot = strrchr (current_locale, '.');
463 927
464 /* The Windows API has a function returning the locale's codepage as a 928 if (pdot && 2 + strlen (pdot + 1) + 1 <= sizeof (buf))
465 number: GetACP(). 929 sprintf (buf, "CP%s", pdot + 1);
466 When the output goes to a console window, it needs to be provided in 930 else
467 GetOEMCP() encoding if the console is using a raster font, or in 931 {
468 GetConsoleOutputCP() encoding if it is using a TrueType font. 932 /* The Windows API has a function returning the locale's codepage as a
469 But in GUI programs and for output sent to files and pipes, GetACP() 933 number: GetACP().
470 encoding is the best bet. */ 934 When the output goes to a console window, it needs to be provided in
471 sprintf (buf, "CP%u", GetACP ()); 935 GetOEMCP() encoding if the console is using a raster font, or in
472 codeset = buf; 936 GetConsoleOutputCP() encoding if it is using a TrueType font.
937 But in GUI programs and for output sent to files and pipes, GetACP()
938 encoding is the best bet. */
939 sprintf (buf, "CP%u", GetACP ());
940 }
941 /* For a locale name such as "French_France.65001", in Windows 10,
942 setlocale now returns "French_France.utf8" instead. */
943 if (strcmp (buf + 2, "65001") == 0 || strcmp (buf + 2, "utf8") == 0)
944 codeset = "UTF-8";
945 else
946 {
947 strcpy (resultbuf, buf);
948 codeset = resultbuf;
949 }
473 950
474#elif defined OS2 951# elif defined OS2
475 952
476 const char *locale; 953 const char *locale;
477 static char buf[2 + 10 + 1]; 954 static char resultbuf[2 + 10 + 1];
478 ULONG cp[3]; 955 ULONG cp[3];
479 ULONG cplen; 956 ULONG cplen;
480 957
958 codeset = NULL;
959
481 /* Allow user to override the codeset, as set in the operating system, 960 /* Allow user to override the codeset, as set in the operating system,
482 with standard language environment variables. */ 961 with standard language environment variables. */
483 locale = getenv ("LC_ALL"); 962 locale = getenv ("LC_ALL");
@@ -501,51 +980,173 @@ locale_charset (void)
501 modifier = strchr (dot, '@'); 980 modifier = strchr (dot, '@');
502 if (modifier == NULL) 981 if (modifier == NULL)
503 return dot; 982 return dot;
504 if (modifier - dot < sizeof (buf)) 983 if (modifier - dot < sizeof (resultbuf))
505 { 984 {
506 memcpy (buf, dot, modifier - dot); 985 /* This way of filling resultbuf is multithread-safe. */
507 buf [modifier - dot] = '\0'; 986 memcpy (resultbuf, dot, modifier - dot);
508 return buf; 987 resultbuf [modifier - dot] = '\0';
988 return resultbuf;
509 } 989 }
510 } 990 }
511 991
512 /* Resolve through the charset.alias file. */ 992 /* For the POSIX locale, don't use the system's codepage. */
513 codeset = locale; 993 if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0)
994 codeset = "";
514 } 995 }
515 else 996
997 if (codeset == NULL)
516 { 998 {
517 /* OS/2 has a function returning the locale's codepage as a number. */ 999 /* OS/2 has a function returning the locale's codepage as a number. */
518 if (DosQueryCp (sizeof (cp), cp, &cplen)) 1000 if (DosQueryCp (sizeof (cp), cp, &cplen))
519 codeset = ""; 1001 codeset = "";
520 else 1002 else
521 { 1003 {
1004 char buf[2 + 10 + 1];
1005
522 sprintf (buf, "CP%u", cp[0]); 1006 sprintf (buf, "CP%u", cp[0]);
523 codeset = buf; 1007 strcpy (resultbuf, buf);
1008 codeset = resultbuf;
524 } 1009 }
525 } 1010 }
526 1011
527#endif 1012# else
528 1013
529 if (codeset == NULL) 1014# error "Add code for other platforms here."
530 /* The canonical name cannot be determined. */ 1015
531 codeset = ""; 1016# endif
1017
1018 /* Resolve alias. */
1019 {
1020# ifdef alias_table_defined
1021 /* On some platforms, UTF-8 locales are the most frequently used ones.
1022 Speed up the common case and slow down the less common cases by
1023 testing for this case first. */
1024# if defined __OpenBSD__ || (defined __APPLE__ && defined __MACH__) || defined __sun || defined __CYGWIN__
1025 if (strcmp (codeset, "UTF-8") == 0)
1026 goto done_table_lookup;
1027 else
1028# endif
1029 {
1030 const struct table_entry * const table = alias_table;
1031 size_t const table_size =
1032 sizeof (alias_table) / sizeof (struct table_entry);
1033 /* The table is sorted. Perform a binary search. */
1034 size_t hi = table_size;
1035 size_t lo = 0;
1036 while (lo < hi)
1037 {
1038 /* Invariant:
1039 for i < lo, strcmp (table[i].alias, codeset) < 0,
1040 for i >= hi, strcmp (table[i].alias, codeset) > 0. */
1041 size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
1042 int cmp = strcmp (table[mid].alias, codeset);
1043 if (cmp < 0)
1044 lo = mid + 1;
1045 else if (cmp > 0)
1046 hi = mid;
1047 else
1048 {
1049 /* Found an i with
1050 strcmp (table[i].alias, codeset) == 0. */
1051 codeset = table[mid].canonical;
1052 goto done_table_lookup;
1053 }
1054 }
1055 }
1056 if (0)
1057 done_table_lookup: ;
1058 else
1059# endif
1060 {
1061 /* Did not find it in the table. */
1062 /* On Mac OS X, all modern locales use the UTF-8 encoding.
1063 BeOS and Haiku have a single locale, and it has UTF-8 encoding. */
1064# if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
1065 codeset = "UTF-8";
1066# else
1067 /* Don't return an empty string. GNU libc and GNU libiconv interpret
1068 the empty string as denoting "the locale's character encoding",
1069 thus GNU libiconv would call this function a second time. */
1070 if (codeset[0] == '\0')
1071 codeset = "ASCII";
1072# endif
1073 }
1074 }
1075
1076#else
532 1077
533 /* Resolve alias. */ 1078 /* On old systems which lack it, use setlocale or getenv. */
534 for (aliases = get_charset_aliases (); 1079 const char *locale = NULL;
535 *aliases != '\0'; 1080
536 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 1081 /* But most old systems don't have a complete set of locales. Some
537 if (strcmp (codeset, aliases) == 0 1082 (like DJGPP) have only the C locale. Therefore we don't use setlocale
538 || (aliases[0] == '*' && aliases[1] == '\0')) 1083 here; it would return "C" when it doesn't support the locale name the
1084 user has set. */
1085# if 0
1086 locale = setlocale (LC_CTYPE, NULL);
1087# endif
1088 if (locale == NULL || locale[0] == '\0')
1089 {
1090 locale = getenv ("LC_ALL");
1091 if (locale == NULL || locale[0] == '\0')
1092 {
1093 locale = getenv ("LC_CTYPE");
1094 if (locale == NULL || locale[0] == '\0')
1095 locale = getenv ("LANG");
1096 if (locale == NULL)
1097 locale = "";
1098 }
1099 }
1100
1101 /* Map locale name to canonical encoding name. */
1102 {
1103# ifdef locale_table_defined
1104 const struct table_entry * const table = locale_table;
1105 size_t const table_size =
1106 sizeof (locale_table) / sizeof (struct table_entry);
1107 /* The table is sorted. Perform a binary search. */
1108 size_t hi = table_size;
1109 size_t lo = 0;
1110 while (lo < hi)
539 { 1111 {
540 codeset = aliases + strlen (aliases) + 1; 1112 /* Invariant:
541 break; 1113 for i < lo, strcmp (table[i].locale, locale) < 0,
1114 for i >= hi, strcmp (table[i].locale, locale) > 0. */
1115 size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
1116 int cmp = strcmp (table[mid].locale, locale);
1117 if (cmp < 0)
1118 lo = mid + 1;
1119 else if (cmp > 0)
1120 hi = mid;
1121 else
1122 {
1123 /* Found an i with
1124 strcmp (table[i].locale, locale) == 0. */
1125 codeset = table[mid].canonical;
1126 goto done_table_lookup;
1127 }
1128 }
1129 if (0)
1130 done_table_lookup: ;
1131 else
1132# endif
1133 {
1134 /* Did not find it in the table. */
1135 /* On Mac OS X, all modern locales use the UTF-8 encoding.
1136 BeOS and Haiku have a single locale, and it has UTF-8 encoding. */
1137# if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
1138 codeset = "UTF-8";
1139# else
1140 /* The canonical name cannot be determined. */
1141 /* Don't return an empty string. GNU libc and GNU libiconv interpret
1142 the empty string as denoting "the locale's character encoding",
1143 thus GNU libiconv would call this function a second time. */
1144 codeset = "ASCII";
1145# endif
542 } 1146 }
1147 }
543 1148
544 /* Don't return an empty string. GNU libc and GNU libiconv interpret 1149#endif
545 the empty string as denoting "the locale's character encoding",
546 thus GNU libiconv would call this function a second time. */
547 if (codeset[0] == '\0')
548 codeset = "ASCII";
549 1150
550#ifdef DARWIN7 1151#ifdef DARWIN7
551 /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8" 1152 /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"