summaryrefslogtreecommitdiffstats
path: root/gl/uninorm.h
diff options
context:
space:
mode:
Diffstat (limited to 'gl/uninorm.h')
-rw-r--r--gl/uninorm.h256
1 files changed, 256 insertions, 0 deletions
diff --git a/gl/uninorm.h b/gl/uninorm.h
new file mode 100644
index 00000000..f6815c49
--- /dev/null
+++ b/gl/uninorm.h
@@ -0,0 +1,256 @@
1/* DO NOT EDIT! GENERATED AUTOMATICALLY! */
2/* Normalization forms (composition and decomposition) of Unicode strings.
3 Copyright (C) 2001-2002, 2009-2025 Free Software Foundation, Inc.
4 Written by Bruno Haible <bruno@clisp.org>, 2009.
5
6 This file is free software: you can redistribute it and/or modify
7 it under the terms of the GNU Lesser General Public License as
8 published by the Free Software Foundation; either version 2.1 of the
9 License, or (at your option) any later version.
10
11 This file is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19#ifndef _UNINORM_H
20#define _UNINORM_H
21
22/* Get size_t. */
23#include <stddef.h>
24
25#include "unitypes.h"
26
27#if 0
28# include <unistring/woe32dll.h>
29#else
30# define LIBUNISTRING_DLL_VARIABLE
31#endif
32
33
34#ifdef __cplusplus
35extern "C" {
36#endif
37
38
39/* Conventions:
40
41 All functions prefixed with u8_ operate on UTF-8 encoded strings.
42 Their unit is a uint8_t (1 byte).
43
44 All functions prefixed with u16_ operate on UTF-16 encoded strings.
45 Their unit is a uint16_t (a 2-byte word).
46
47 All functions prefixed with u32_ operate on UCS-4 encoded strings.
48 Their unit is a uint32_t (a 4-byte word).
49
50 All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
51 n units.
52
53 Functions returning a string result take a (resultbuf, lengthp) argument
54 pair. If resultbuf is not NULL and the result fits into *lengthp units,
55 it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly
56 allocated string is returned. In both cases, *lengthp is set to the
57 length (number of units) of the returned string. In case of error,
58 NULL is returned and errno is set. */
59
60
61enum
62{
63 UC_DECOMP_CANONICAL,/* Canonical decomposition. */
64 UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
65 UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
66 UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
67 UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
68 UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
69 UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
70 UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
71 UC_DECOMP_SUPER, /* <super> A superscript form. */
72 UC_DECOMP_SUB, /* <sub> A subscript form. */
73 UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
74 UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
75 UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
76 UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
77 UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
78 UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
79 UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
80};
81
82/* Maximum size of decomposition of a single Unicode character. */
83#define UC_DECOMPOSITION_MAX_LENGTH 32
84
85/* Return the character decomposition mapping of a Unicode character.
86 DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
87 ucs4_t elements.
88 When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
89 filled and N is returned. Otherwise -1 is returned. */
90extern int
91 uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);
92
93/* Return the canonical character decomposition mapping of a Unicode character.
94 DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
95 ucs4_t elements.
96 When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
97 returned. Otherwise -1 is returned. */
98extern int
99 uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);
100
101
102/* Attempt to combine the Unicode characters uc1, uc2.
103 uc1 is known to have canonical combining class 0.
104 Return the combination of uc1 and uc2, if it exists.
105 Return 0 otherwise.
106 Not all decompositions can be recombined using this function. See the
107 Unicode file CompositionExclusions.txt for details. */
108extern ucs4_t
109 uc_composition (ucs4_t uc1, ucs4_t uc2)
110 _UC_ATTRIBUTE_CONST;
111
112
113/* An object of type uninorm_t denotes a Unicode normalization form. */
114struct unicode_normalization_form;
115typedef const struct unicode_normalization_form *uninorm_t;
116
117/* UNINORM_NFD: Normalization form D: canonical decomposition. */
118extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfd;
119#define UNINORM_NFD (&uninorm_nfd)
120
121/* UNINORM_NFC: Normalization form C: canonical decomposition, then
122 canonical composition. */
123extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfc;
124#define UNINORM_NFC (&uninorm_nfc)
125
126/* UNINORM_NFKD: Normalization form KD: compatibility decomposition. */
127extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkd;
128#define UNINORM_NFKD (&uninorm_nfkd)
129
130/* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
131 canonical composition. */
132extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkc;
133#define UNINORM_NFKC (&uninorm_nfkc)
134
135/* Test whether a normalization form does compatibility decomposition. */
136#define uninorm_is_compat_decomposing(nf) \
137 ((* (const unsigned int *) (nf) >> 0) & 1)
138
139/* Test whether a normalization form includes canonical composition. */
140#define uninorm_is_composing(nf) \
141 ((* (const unsigned int *) (nf) >> 1) & 1)
142
143/* Return the decomposing variant of a normalization form.
144 This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD. */
145extern uninorm_t
146 uninorm_decomposing_form (uninorm_t nf)
147 _UC_ATTRIBUTE_PURE;
148
149
150/* Return the specified normalization form of a string. */
151extern uint8_t *
152 u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
153 uint8_t *_UC_RESTRICT resultbuf, size_t *lengthp);
154extern uint16_t *
155 u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
156 uint16_t *_UC_RESTRICT resultbuf, size_t *lengthp);
157extern uint32_t *
158 u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
159 uint32_t *_UC_RESTRICT resultbuf, size_t *lengthp);
160
161
162/* Compare S1 and S2, ignoring differences in normalization.
163 NF must be either UNINORM_NFD or UNINORM_NFKD.
164 If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
165 return 0. Upon failure, return -1 with errno set. */
166extern int
167 u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
168 uninorm_t nf, int *resultp);
169extern int
170 u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
171 uninorm_t nf, int *resultp);
172extern int
173 u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
174 uninorm_t nf, int *resultp);
175
176
177/* Convert the string S of length N to a NUL-terminated byte sequence, in such
178 a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is
179 equivalent to comparing S1 and S2 with uN_normcoll().
180 NF must be either UNINORM_NFC or UNINORM_NFKC. */
181extern char *
182 u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf,
183 char *resultbuf, size_t *lengthp);
184extern char *
185 u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf,
186 char *resultbuf, size_t *lengthp);
187extern char *
188 u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf,
189 char *resultbuf, size_t *lengthp);
190
191
192/* Compare S1 and S2, ignoring differences in normalization, using the
193 collation rules of the current locale.
194 NF must be either UNINORM_NFC or UNINORM_NFKC.
195 If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
196 return 0. Upon failure, return -1 with errno set. */
197extern int
198 u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
199 uninorm_t nf, int *resultp);
200extern int
201 u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
202 uninorm_t nf, int *resultp);
203extern int
204 u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
205 uninorm_t nf, int *resultp);
206
207
208/* Normalization of a stream of Unicode characters.
209
210 A "stream of Unicode characters" is essentially a function that accepts an
211 a ucs4_t argument repeatedly, optionally combined with a function that
212 "flushes" the stream. */
213
214/* Data type of a stream of Unicode characters that normalizes its input
215 according to a given normalization form and passes the normalized character
216 sequence to the encapsulated stream of Unicode characters. */
217struct uninorm_filter;
218
219/* Bring data buffered in the filter to its destination, the encapsulated
220 stream, then close and free the filter.
221 Return 0 if successful, or -1 with errno set upon failure. */
222extern int
223 uninorm_filter_free (struct uninorm_filter *filter);
224
225/* Create and return a normalization filter for Unicode characters.
226 The pair (stream_func, stream_data) is the encapsulated stream.
227 stream_func (stream_data, uc) receives the Unicode character uc
228 and returns 0 if successful, or -1 with errno set upon failure.
229 Return the new filter, or NULL with errno set upon failure. */
230extern struct uninorm_filter *
231 uninorm_filter_create (uninorm_t nf,
232 int (*stream_func) (void *stream_data, ucs4_t uc),
233 void *stream_data)
234 _GL_ATTRIBUTE_DEALLOC (uninorm_filter_free, 1);
235
236/* Stuff a Unicode character into a normalizing filter.
237 Return 0 if successful, or -1 with errno set upon failure. */
238extern int
239 uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc);
240
241/* Bring data buffered in the filter to its destination, the encapsulated
242 stream.
243 Return 0 if successful, or -1 with errno set upon failure.
244 Note! If after calling this function, additional characters are written
245 into the filter, the resulting character sequence in the encapsulated stream
246 will not necessarily be normalized. */
247extern int
248 uninorm_filter_flush (struct uninorm_filter *filter);
249
250
251#ifdef __cplusplus
252}
253#endif
254
255
256#endif /* _UNINORM_H */