diff options
Diffstat (limited to 'gl/uninorm.in.h')
| -rw-r--r-- | gl/uninorm.in.h | 255 |
1 files changed, 255 insertions, 0 deletions
diff --git a/gl/uninorm.in.h b/gl/uninorm.in.h new file mode 100644 index 00000000..76ab32b6 --- /dev/null +++ b/gl/uninorm.in.h | |||
| @@ -0,0 +1,255 @@ | |||
| 1 | /* Normalization forms (composition and decomposition) of Unicode strings. | ||
| 2 | Copyright (C) 2001-2002, 2009-2025 Free Software Foundation, Inc. | ||
| 3 | Written by Bruno Haible <bruno@clisp.org>, 2009. | ||
| 4 | |||
| 5 | This file is free software: you can redistribute it and/or modify | ||
| 6 | it under the terms of the GNU Lesser General Public License as | ||
| 7 | published by the Free Software Foundation; either version 2.1 of the | ||
| 8 | License, or (at your option) any later version. | ||
| 9 | |||
| 10 | This file is distributed in the hope that it will be useful, | ||
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 13 | GNU Lesser General Public License for more details. | ||
| 14 | |||
| 15 | You should have received a copy of the GNU Lesser General Public License | ||
| 16 | along with this program. If not, see <https://www.gnu.org/licenses/>. */ | ||
| 17 | |||
| 18 | #ifndef _UNINORM_H | ||
| 19 | #define _UNINORM_H | ||
| 20 | |||
| 21 | /* Get size_t. */ | ||
| 22 | #include <stddef.h> | ||
| 23 | |||
| 24 | #include "unitypes.h" | ||
| 25 | |||
| 26 | #if @HAVE_UNISTRING_WOE32DLL_H@ | ||
| 27 | # include <unistring/woe32dll.h> | ||
| 28 | #else | ||
| 29 | # define LIBUNISTRING_DLL_VARIABLE | ||
| 30 | #endif | ||
| 31 | |||
| 32 | |||
| 33 | #ifdef __cplusplus | ||
| 34 | extern "C" { | ||
| 35 | #endif | ||
| 36 | |||
| 37 | |||
| 38 | /* Conventions: | ||
| 39 | |||
| 40 | All functions prefixed with u8_ operate on UTF-8 encoded strings. | ||
| 41 | Their unit is a uint8_t (1 byte). | ||
| 42 | |||
| 43 | All functions prefixed with u16_ operate on UTF-16 encoded strings. | ||
| 44 | Their unit is a uint16_t (a 2-byte word). | ||
| 45 | |||
| 46 | All functions prefixed with u32_ operate on UCS-4 encoded strings. | ||
| 47 | Their unit is a uint32_t (a 4-byte word). | ||
| 48 | |||
| 49 | All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly | ||
| 50 | n units. | ||
| 51 | |||
| 52 | Functions returning a string result take a (resultbuf, lengthp) argument | ||
| 53 | pair. If resultbuf is not NULL and the result fits into *lengthp units, | ||
| 54 | it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly | ||
| 55 | allocated string is returned. In both cases, *lengthp is set to the | ||
| 56 | length (number of units) of the returned string. In case of error, | ||
| 57 | NULL is returned and errno is set. */ | ||
| 58 | |||
| 59 | |||
| 60 | enum | ||
| 61 | { | ||
| 62 | UC_DECOMP_CANONICAL,/* Canonical decomposition. */ | ||
| 63 | UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */ | ||
| 64 | UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */ | ||
| 65 | UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */ | ||
| 66 | UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */ | ||
| 67 | UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */ | ||
| 68 | UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */ | ||
| 69 | UC_DECOMP_CIRCLE, /* <circle> An encircled form. */ | ||
| 70 | UC_DECOMP_SUPER, /* <super> A superscript form. */ | ||
| 71 | UC_DECOMP_SUB, /* <sub> A subscript form. */ | ||
| 72 | UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */ | ||
| 73 | UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */ | ||
| 74 | UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */ | ||
| 75 | UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */ | ||
| 76 | UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */ | ||
| 77 | UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */ | ||
| 78 | UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */ | ||
| 79 | }; | ||
| 80 | |||
| 81 | /* Maximum size of decomposition of a single Unicode character. */ | ||
| 82 | #define UC_DECOMPOSITION_MAX_LENGTH 32 | ||
| 83 | |||
| 84 | /* Return the character decomposition mapping of a Unicode character. | ||
| 85 | DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH | ||
| 86 | ucs4_t elements. | ||
| 87 | When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are | ||
| 88 | filled and N is returned. Otherwise -1 is returned. */ | ||
| 89 | extern int | ||
| 90 | uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition); | ||
| 91 | |||
| 92 | /* Return the canonical character decomposition mapping of a Unicode character. | ||
| 93 | DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH | ||
| 94 | ucs4_t elements. | ||
| 95 | When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is | ||
| 96 | returned. Otherwise -1 is returned. */ | ||
| 97 | extern int | ||
| 98 | uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition); | ||
| 99 | |||
| 100 | |||
| 101 | /* Attempt to combine the Unicode characters uc1, uc2. | ||
| 102 | uc1 is known to have canonical combining class 0. | ||
| 103 | Return the combination of uc1 and uc2, if it exists. | ||
| 104 | Return 0 otherwise. | ||
| 105 | Not all decompositions can be recombined using this function. See the | ||
| 106 | Unicode file CompositionExclusions.txt for details. */ | ||
| 107 | extern ucs4_t | ||
| 108 | uc_composition (ucs4_t uc1, ucs4_t uc2) | ||
| 109 | _UC_ATTRIBUTE_CONST; | ||
| 110 | |||
| 111 | |||
| 112 | /* An object of type uninorm_t denotes a Unicode normalization form. */ | ||
| 113 | struct unicode_normalization_form; | ||
| 114 | typedef const struct unicode_normalization_form *uninorm_t; | ||
| 115 | |||
| 116 | /* UNINORM_NFD: Normalization form D: canonical decomposition. */ | ||
| 117 | extern @GNULIB_UNINORM_NFD_DLL_VARIABLE@ const struct unicode_normalization_form uninorm_nfd; | ||
| 118 | #define UNINORM_NFD (&uninorm_nfd) | ||
| 119 | |||
| 120 | /* UNINORM_NFC: Normalization form C: canonical decomposition, then | ||
| 121 | canonical composition. */ | ||
| 122 | extern @GNULIB_UNINORM_NFC_DLL_VARIABLE@ const struct unicode_normalization_form uninorm_nfc; | ||
| 123 | #define UNINORM_NFC (&uninorm_nfc) | ||
| 124 | |||
| 125 | /* UNINORM_NFKD: Normalization form KD: compatibility decomposition. */ | ||
| 126 | extern @GNULIB_UNINORM_NFKD_DLL_VARIABLE@ const struct unicode_normalization_form uninorm_nfkd; | ||
| 127 | #define UNINORM_NFKD (&uninorm_nfkd) | ||
| 128 | |||
| 129 | /* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then | ||
| 130 | canonical composition. */ | ||
| 131 | extern @GNULIB_UNINORM_NFKC_DLL_VARIABLE@ const struct unicode_normalization_form uninorm_nfkc; | ||
| 132 | #define UNINORM_NFKC (&uninorm_nfkc) | ||
| 133 | |||
| 134 | /* Test whether a normalization form does compatibility decomposition. This reads bit 0 of the unsigned int stored at the address NF. */ | ||
| 135 | #define uninorm_is_compat_decomposing(nf) \ | ||
| 136 | ((* (const unsigned int *) (nf) >> 0) & 1) | ||
| 137 | |||
| 138 | /* Test whether a normalization form includes canonical composition. This reads bit 1 of the unsigned int stored at the address NF. */ | ||
| 139 | #define uninorm_is_composing(nf) \ | ||
| 140 | ((* (const unsigned int *) (nf) >> 1) & 1) | ||
| 141 | |||
| 142 | /* Return the decomposing variant of a normalization form. | ||
| 143 | This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD. */ | ||
| 144 | extern uninorm_t | ||
| 145 | uninorm_decomposing_form (uninorm_t nf) | ||
| 146 | _UC_ATTRIBUTE_PURE; | ||
| 147 | |||
| 148 | |||
| 149 | /* Return the specified normalization form of a string. */ | ||
| 150 | extern uint8_t * | ||
| 151 | u8_normalize (uninorm_t nf, const uint8_t *s, size_t n, | ||
| 152 | uint8_t *_UC_RESTRICT resultbuf, size_t *lengthp); | ||
| 153 | extern uint16_t * | ||
| 154 | u16_normalize (uninorm_t nf, const uint16_t *s, size_t n, | ||
| 155 | uint16_t *_UC_RESTRICT resultbuf, size_t *lengthp); | ||
| 156 | extern uint32_t * | ||
| 157 | u32_normalize (uninorm_t nf, const uint32_t *s, size_t n, | ||
| 158 | uint32_t *_UC_RESTRICT resultbuf, size_t *lengthp); | ||
| 159 | |||
| 160 | |||
| 161 | /* Compare S1 and S2, ignoring differences in normalization. | ||
| 162 | NF must be either UNINORM_NFD or UNINORM_NFKD. | ||
| 163 | If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and | ||
| 164 | return 0. Upon failure, return -1 with errno set. */ | ||
| 165 | extern int | ||
| 166 | u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2, | ||
| 167 | uninorm_t nf, int *resultp); | ||
| 168 | extern int | ||
| 169 | u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2, | ||
| 170 | uninorm_t nf, int *resultp); | ||
| 171 | extern int | ||
| 172 | u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2, | ||
| 173 | uninorm_t nf, int *resultp); | ||
| 174 | |||
| 175 | |||
| 176 | /* Convert the string S of length N to a NUL-terminated byte sequence, in such | ||
| 177 | a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is | ||
| 178 | equivalent to comparing S1 and S2 with uN_normcoll(). | ||
| 179 | NF must be either UNINORM_NFC or UNINORM_NFKC. */ | ||
| 180 | extern char * | ||
| 181 | u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf, | ||
| 182 | char *resultbuf, size_t *lengthp); | ||
| 183 | extern char * | ||
| 184 | u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf, | ||
| 185 | char *resultbuf, size_t *lengthp); | ||
| 186 | extern char * | ||
| 187 | u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf, | ||
| 188 | char *resultbuf, size_t *lengthp); | ||
| 189 | |||
| 190 | |||
| 191 | /* Compare S1 and S2, ignoring differences in normalization, using the | ||
| 192 | collation rules of the current locale. | ||
| 193 | NF must be either UNINORM_NFC or UNINORM_NFKC. | ||
| 194 | If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and | ||
| 195 | return 0. Upon failure, return -1 with errno set. */ | ||
| 196 | extern int | ||
| 197 | u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2, | ||
| 198 | uninorm_t nf, int *resultp); | ||
| 199 | extern int | ||
| 200 | u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2, | ||
| 201 | uninorm_t nf, int *resultp); | ||
| 202 | extern int | ||
| 203 | u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2, | ||
| 204 | uninorm_t nf, int *resultp); | ||
| 205 | |||
| 206 | |||
| 207 | /* Normalization of a stream of Unicode characters. | ||
| 208 | |||
| 209 | A "stream of Unicode characters" is essentially a function that accepts a | ||
| 210 | ucs4_t argument repeatedly, optionally combined with a function that | ||
| 211 | "flushes" the stream. */ | ||
| 212 | |||
| 213 | /* Data type of a stream of Unicode characters that normalizes its input | ||
| 214 | according to a given normalization form and passes the normalized character | ||
| 215 | sequence to the encapsulated stream of Unicode characters. */ | ||
| 216 | struct uninorm_filter; | ||
| 217 | |||
| 218 | /* Bring data buffered in the filter to its destination, the encapsulated | ||
| 219 | stream, then close and free the filter. | ||
| 220 | Return 0 if successful, or -1 with errno set upon failure. */ | ||
| 221 | extern int | ||
| 222 | uninorm_filter_free (struct uninorm_filter *filter); | ||
| 223 | |||
| 224 | /* Create and return a normalization filter for Unicode characters. | ||
| 225 | The pair (stream_func, stream_data) is the encapsulated stream. | ||
| 226 | stream_func (stream_data, uc) receives the Unicode character uc | ||
| 227 | and returns 0 if successful, or -1 with errno set upon failure. | ||
| 228 | Return the new filter, or NULL with errno set upon failure. */ | ||
| 229 | extern struct uninorm_filter * | ||
| 230 | uninorm_filter_create (uninorm_t nf, | ||
| 231 | int (*stream_func) (void *stream_data, ucs4_t uc), | ||
| 232 | void *stream_data) | ||
| 233 | _GL_ATTRIBUTE_DEALLOC (uninorm_filter_free, 1); | ||
| 234 | |||
| 235 | /* Stuff a Unicode character into a normalizing filter. | ||
| 236 | Return 0 if successful, or -1 with errno set upon failure. */ | ||
| 237 | extern int | ||
| 238 | uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc); | ||
| 239 | |||
| 240 | /* Bring data buffered in the filter to its destination, the encapsulated | ||
| 241 | stream. | ||
| 242 | Return 0 if successful, or -1 with errno set upon failure. | ||
| 243 | Note! If after calling this function, additional characters are written | ||
| 244 | into the filter, the resulting character sequence in the encapsulated stream | ||
| 245 | will not necessarily be normalized. */ | ||
| 246 | extern int | ||
| 247 | uninorm_filter_flush (struct uninorm_filter *filter); | ||
| 248 | |||
| 249 | |||
| 250 | #ifdef __cplusplus | ||
| 251 | } | ||
| 252 | #endif | ||
| 253 | |||
| 254 | |||
| 255 | #endif /* _UNINORM_H */ | ||
