diff options
Diffstat (limited to 'gl/uninorm.h')
| -rw-r--r-- | gl/uninorm.h | 256 |
1 files changed, 256 insertions, 0 deletions
diff --git a/gl/uninorm.h b/gl/uninorm.h new file mode 100644 index 00000000..f6815c49 --- /dev/null +++ b/gl/uninorm.h | |||
| @@ -0,0 +1,256 @@ | |||
| 1 | /* DO NOT EDIT! GENERATED AUTOMATICALLY! */ | ||
| 2 | /* Normalization forms (composition and decomposition) of Unicode strings. | ||
| 3 | Copyright (C) 2001-2002, 2009-2025 Free Software Foundation, Inc. | ||
| 4 | Written by Bruno Haible <bruno@clisp.org>, 2009. | ||
| 5 | |||
| 6 | This file is free software: you can redistribute it and/or modify | ||
| 7 | it under the terms of the GNU Lesser General Public License as | ||
| 8 | published by the Free Software Foundation; either version 2.1 of the | ||
| 9 | License, or (at your option) any later version. | ||
| 10 | |||
| 11 | This file is distributed in the hope that it will be useful, | ||
| 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 14 | GNU Lesser General Public License for more details. | ||
| 15 | |||
| 16 | You should have received a copy of the GNU Lesser General Public License | ||
| 17 | along with this program. If not, see <https://www.gnu.org/licenses/>. */ | ||
| 18 | |||
| 19 | #ifndef _UNINORM_H | ||
| 20 | #define _UNINORM_H | ||
| 21 | |||
| 22 | /* Get size_t. */ | ||
| 23 | #include <stddef.h> | ||
| 24 | |||
| 25 | #include "unitypes.h" | ||
| 26 | |||
| 27 | #if 0 /* Always false in this copy: the include below is never compiled, so the #else branch defines LIBUNISTRING_DLL_VARIABLE as empty. */ | ||
| 28 | # include <unistring/woe32dll.h> | ||
| 29 | #else | ||
| 30 | # define LIBUNISTRING_DLL_VARIABLE | ||
| 31 | #endif | ||
| 32 | |||
| 33 | |||
| 34 | #ifdef __cplusplus | ||
| 35 | extern "C" { | ||
| 36 | #endif | ||
| 37 | |||
| 38 | |||
| 39 | /* Conventions: | ||
| 40 | |||
| 41 | All functions prefixed with u8_ operate on UTF-8 encoded strings. | ||
| 42 | Their unit is a uint8_t (1 byte). | ||
| 43 | |||
| 44 | All functions prefixed with u16_ operate on UTF-16 encoded strings. | ||
| 45 | Their unit is a uint16_t (a 2-byte word). | ||
| 46 | |||
| 47 | All functions prefixed with u32_ operate on UCS-4 encoded strings. | ||
| 48 | Their unit is a uint32_t (a 4-byte word). | ||
| 49 | |||
| 50 | All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly | ||
| 51 | n units. | ||
| 52 | |||
| 53 | Functions returning a string result take a (resultbuf, lengthp) argument | ||
| 54 | pair. If resultbuf is not NULL and the result fits into *lengthp units, | ||
| 55 | it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly | ||
| 56 | allocated string is returned. In both cases, *lengthp is set to the | ||
| 57 | length (number of units) of the returned string. In case of error, | ||
| 58 | NULL is returned and errno is set. */ | ||
| 59 | |||
| 60 | |||
| 61 | enum /* Kinds of character decomposition. Values are implicit: 0 (UC_DECOMP_CANONICAL) through 16 (UC_DECOMP_COMPAT). Presumably the values uc_decomposition stores in *DECOMP_TAG -- TODO confirm. */ | ||
| 62 | { | ||
| 63 | UC_DECOMP_CANONICAL,/* Canonical decomposition. */ | ||
| 64 | UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */ | ||
| 65 | UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */ | ||
| 66 | UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */ | ||
| 67 | UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */ | ||
| 68 | UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */ | ||
| 69 | UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */ | ||
| 70 | UC_DECOMP_CIRCLE, /* <circle> An encircled form. */ | ||
| 71 | UC_DECOMP_SUPER, /* <super> A superscript form. */ | ||
| 72 | UC_DECOMP_SUB, /* <sub> A subscript form. */ | ||
| 73 | UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */ | ||
| 74 | UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */ | ||
| 75 | UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */ | ||
| 76 | UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */ | ||
| 77 | UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */ | ||
| 78 | UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */ | ||
| 79 | UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */ | ||
| 80 | }; | ||
| 81 | |||
| 82 | /* Maximum size, in ucs4_t elements, of the decomposition of a single Unicode character. This is the minimum capacity of the DECOMPOSITION arrays passed to uc_decomposition and uc_canonical_decomposition. */ | ||
| 83 | #define UC_DECOMPOSITION_MAX_LENGTH 32 | ||
| 84 | |||
| 85 | /* Return the character decomposition mapping of the Unicode character UC. | ||
| 86 | DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH | ||
| 87 | ucs4_t elements. | ||
| 88 | When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are | ||
| 89 | filled and N (the decomposition length) is returned. Otherwise -1 is returned. */ | ||
| 90 | extern int | ||
| 91 | uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition); | ||
| 92 | |||
| 93 | /* Return the canonical character decomposition mapping of the Unicode character UC. | ||
| 94 | DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH | ||
| 95 | ucs4_t elements. | ||
| 96 | When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N (the | ||
| 97 | decomposition length) is returned. Otherwise -1 is returned. */ | ||
| 98 | extern int | ||
| 99 | uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition); | ||
| 100 | |||
| 101 | |||
| 102 | /* Attempt to combine the Unicode characters uc1, uc2. | ||
| 103 | The caller passes a uc1 that is known to have canonical combining class 0. | ||
| 104 | Return the combination of uc1 and uc2, if it exists. | ||
| 105 | Return 0 otherwise, i.e. 0 means that no combination exists. | ||
| 106 | Not all decompositions can be recombined using this function. See the | ||
| 107 | Unicode file CompositionExclusions.txt for details. */ | ||
| 108 | extern ucs4_t | ||
| 109 | uc_composition (ucs4_t uc1, ucs4_t uc2) | ||
| 110 | _UC_ATTRIBUTE_CONST; | ||
| 111 | |||
| 112 | |||
| 113 | /* An object of type uninorm_t denotes a Unicode normalization form. It is a pointer to a const struct unicode_normalization_form; the struct is only forward-declared in this header. */ | ||
| 114 | struct unicode_normalization_form; | ||
| 115 | typedef const struct unicode_normalization_form *uninorm_t; | ||
| 116 | |||
| 117 | /* UNINORM_NFD: Normalization form D: canonical decomposition. Like the three macros below, it expands to the address of the corresponding object declared here. */ | ||
| 118 | extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfd; | ||
| 119 | #define UNINORM_NFD (&uninorm_nfd) | ||
| 120 | |||
| 121 | /* UNINORM_NFC: Normalization form C: canonical decomposition, then | ||
| 122 | canonical composition. */ | ||
| 123 | extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfc; | ||
| 124 | #define UNINORM_NFC (&uninorm_nfc) | ||
| 125 | |||
| 126 | /* UNINORM_NFKD: Normalization form KD: compatibility decomposition. */ | ||
| 127 | extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkd; | ||
| 128 | #define UNINORM_NFKD (&uninorm_nfkd) | ||
| 129 | |||
| 130 | /* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then | ||
| 131 | canonical composition. */ | ||
| 132 | extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkc; | ||
| 133 | #define UNINORM_NFKC (&uninorm_nfkc) | ||
| 134 | |||
| 135 | /* Test whether a normalization form does compatibility decomposition. Yields bit 0 (0 or 1) of the unsigned int read through NF; NF is evaluated once. NOTE(review): relies on the object starting with that word -- the struct layout is not visible in this header. */ | ||
| 136 | #define uninorm_is_compat_decomposing(nf) \ | ||
| 137 | ((* (const unsigned int *) (nf) >> 0) & 1) | ||
| 138 | |||
| 139 | /* Test whether a normalization form includes canonical composition. Yields bit 1 (0 or 1) of the unsigned int read through NF; NF is evaluated once. NOTE(review): relies on the object starting with that word -- the struct layout is not visible in this header. */ | ||
| 140 | #define uninorm_is_composing(nf) \ | ||
| 141 | ((* (const unsigned int *) (nf) >> 1) & 1) | ||
| 142 | |||
| 143 | /* Return the decomposing variant of the normalization form NF. | ||
| 144 | This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD. */ | ||
| 145 | extern uninorm_t | ||
| 146 | uninorm_decomposing_form (uninorm_t nf) | ||
| 147 | _UC_ATTRIBUTE_PURE; | ||
| 148 | |||
| 149 | |||
| 150 | /* Return the normalization form NF of the string S[0..N-1]. RESULTBUF and LENGTHP are used as described under "Conventions" near the top of this header; in case of error, NULL is returned and errno is set. */ | ||
| 151 | extern uint8_t * | ||
| 152 | u8_normalize (uninorm_t nf, const uint8_t *s, size_t n, | ||
| 153 | uint8_t *_UC_RESTRICT resultbuf, size_t *lengthp); | ||
| 154 | extern uint16_t * | ||
| 155 | u16_normalize (uninorm_t nf, const uint16_t *s, size_t n, | ||
| 156 | uint16_t *_UC_RESTRICT resultbuf, size_t *lengthp); | ||
| 157 | extern uint32_t * | ||
| 158 | u32_normalize (uninorm_t nf, const uint32_t *s, size_t n, | ||
| 159 | uint32_t *_UC_RESTRICT resultbuf, size_t *lengthp); | ||
| 160 | |||
| 161 | |||
| 162 | /* Compare S1[0..N1-1] and S2[0..N2-1], ignoring differences in normalization. | ||
| 163 | NF must be either UNINORM_NFD or UNINORM_NFKD. | ||
| 164 | If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and | ||
| 165 | return 0. Upon failure, return -1 with errno set. */ | ||
| 166 | extern int | ||
| 167 | u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2, | ||
| 168 | uninorm_t nf, int *resultp); | ||
| 169 | extern int | ||
| 170 | u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2, | ||
| 171 | uninorm_t nf, int *resultp); | ||
| 172 | extern int | ||
| 173 | u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2, | ||
| 174 | uninorm_t nf, int *resultp); | ||
| 175 | |||
| 176 | |||
| 177 | /* Convert the string S of length N to a NUL-terminated byte sequence, in such | ||
| 178 | a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is | ||
| 179 | equivalent to comparing S1 and S2 with uN_normcoll(). | ||
| 180 | NF must be either UNINORM_NFC or UNINORM_NFKC. RESULTBUF and LENGTHP form the (resultbuf, lengthp) pair described under "Conventions" near the top of this header. */ | ||
| 181 | extern char * | ||
| 182 | u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf, | ||
| 183 | char *resultbuf, size_t *lengthp); | ||
| 184 | extern char * | ||
| 185 | u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf, | ||
| 186 | char *resultbuf, size_t *lengthp); | ||
| 187 | extern char * | ||
| 188 | u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf, | ||
| 189 | char *resultbuf, size_t *lengthp); | ||
| 190 | |||
| 191 | |||
| 192 | /* Compare S1[0..N1-1] and S2[0..N2-1], ignoring differences in normalization, | ||
| 193 | using the collation rules of the current locale. | ||
| 194 | NF must be either UNINORM_NFC or UNINORM_NFKC. | ||
| 195 | If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and | ||
| 196 | return 0. Upon failure, return -1 with errno set. */ | ||
| 197 | extern int | ||
| 198 | u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2, | ||
| 199 | uninorm_t nf, int *resultp); | ||
| 200 | extern int | ||
| 201 | u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2, | ||
| 202 | uninorm_t nf, int *resultp); | ||
| 203 | extern int | ||
| 204 | u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2, | ||
| 205 | uninorm_t nf, int *resultp); | ||
| 206 | |||
| 207 | |||
| 208 | /* Normalization of a stream of Unicode characters. | ||
| 209 | |||
| 210 | A "stream of Unicode characters" is essentially a function that accepts a | ||
| 211 | ucs4_t argument repeatedly, optionally combined with a function that | ||
| 212 | "flushes" the stream. */ | ||
| 213 | |||
| 214 | /* Data type of a stream of Unicode characters that normalizes its input | ||
| 215 | according to a given normalization form and passes the normalized character | ||
| 216 | sequence to the encapsulated stream of Unicode characters. The struct is only declared here; the functions below handle filters through pointers. */ | ||
| 217 | struct uninorm_filter; | ||
| 218 | |||
| 219 | /* Bring data buffered in FILTER to its destination, the encapsulated | ||
| 220 | stream, then close and free the filter. | ||
| 221 | Return 0 if successful, or -1 with errno set upon failure. This function is the one named in the _GL_ATTRIBUTE_DEALLOC annotation of uninorm_filter_create. */ | ||
| 222 | extern int | ||
| 223 | uninorm_filter_free (struct uninorm_filter *filter); | ||
| 224 | |||
| 225 | /* Create and return a normalization filter for Unicode characters; NF is the normalization form. | ||
| 226 | The pair (stream_func, stream_data) is the encapsulated stream. | ||
| 227 | stream_func (stream_data, uc) receives the Unicode character uc | ||
| 228 | and returns 0 if successful, or -1 with errno set upon failure. | ||
| 229 | Return the new filter, or NULL with errno set upon failure. NOTE(review): _GL_ATTRIBUTE_DEALLOC is defined outside this header; presumably it marks uninorm_filter_free (argument 1) as the matching deallocator -- confirm. */ | ||
| 230 | extern struct uninorm_filter * | ||
| 231 | uninorm_filter_create (uninorm_t nf, | ||
| 232 | int (*stream_func) (void *stream_data, ucs4_t uc), | ||
| 233 | void *stream_data) | ||
| 234 | _GL_ATTRIBUTE_DEALLOC (uninorm_filter_free, 1); | ||
| 235 | |||
| 236 | /* Stuff the Unicode character UC into the normalizing filter FILTER. | ||
| 237 | Return 0 if successful, or -1 with errno set upon failure. */ | ||
| 238 | extern int | ||
| 239 | uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc); | ||
| 240 | |||
| 241 | /* Bring data buffered in FILTER to its destination, the encapsulated | ||
| 242 | stream. | ||
| 243 | Return 0 if successful, or -1 with errno set upon failure. | ||
| 244 | Note! If after calling this function, additional characters are written | ||
| 245 | into the filter, the resulting character sequence in the encapsulated stream | ||
| 246 | will not necessarily be normalized. */ | ||
| 247 | extern int | ||
| 248 | uninorm_filter_flush (struct uninorm_filter *filter); | ||
| 249 | |||
| 250 | |||
| 251 | #ifdef __cplusplus | ||
| 252 | } | ||
| 253 | #endif | ||
| 254 | |||
| 255 | |||
| 256 | #endif /* _UNINORM_H */ | ||
