/*
 * Copyright (c) 2016-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#ifndef unicode_h
#define unicode_h

#ifdef KERNEL_PRIVATE

#include <sys/cdefs.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/*
 * WARNING - callers that use the following Unicode normalization interface for on-disk
 * structures should be aware that the implementation will be periodically updated to
 * track the latest version of the Unicode standard.
 */

enum {
	/* Maximum size of the UTF-32 reordering buffer for stream-safe format */
	kNCFStreamSafeBufMax = 32
};

/*
 * utf8_normalizeOptCaseFoldAndHash
 *
 * Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,
 * as specified by the case_sens parameter, and feed the result incrementally to
 * the provided hash function callback:
 * - "canonical caseless form" (case-folded NFD, as described by definition D145
 *    in chapter 3 of The Unicode Standard); for case-insensitive behavior.
 * - standard NFD; for case-sensitive behavior (if case_sens = true).
 *
 * The input string should be valid UTF-8 that meets the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * It should not contain ASCII 0x00 or '/'.
 *
 * str:       The input UTF-8 string (need not be 0 terminated)
 * str_len:   The byte length of the input string (excluding any 0 terminator)
 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
 *            True for case-sensitive behavior; generates standard NFD.
 * hash_func: A pointer to a hashing function to compute the hash of the
 *            normalized/case-folded UTF-32 result. Because the result is fed
 *            incrementally, the callback may be invoked more than once; on each
 *            call buf contains buf_len bytes of data to be added to the hash
 *            using the caller-supplied context (ctx).
 * hash_ctx:  The context for the hash function; passed to hash_func as ctx.
 *
 * Returns: 0 on success, or
 *          EILSEQ: The input string contains illegal ASCII-range characters
 *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
 *                  contains codepoints that are non-characters or unassigned in
 *                  the version of Unicode currently supported.
 */
int utf8_normalizeOptCaseFoldAndHash(const char *str,
    size_t      str_len,
    bool        case_sens,
    void      (*hash_func)(void *buf, size_t buf_len, void *ctx),
    void       *hash_ctx);

/*
 * utf8_normalizeOptCaseFoldAndCompare
 *
 * Determine whether two UTF-8 strings are equal after converting each to one of the
 * following normalized forms, as specified by the case_sens parameter:
 * - "canonical caseless form" (case-folded NFD); for case-insensitive comparison.
 * - standard NFD; for case-sensitive comparison (if case_sens = true).
 * On success, sets *are_equal to true if the strings are equal, or false if they are not.
 *
 * The input strings should be valid UTF-8 that meet the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * They should not contain ASCII 0x00 or '/'.
 *
 * strA:      A UTF-8 string to be compared (need not be 0 terminated)
 * strA_len:  The byte length of strA (excluding any 0 terminator)
 * strB:      The second UTF-8 string to be compared (need not be 0 terminated)
 * strB_len:  The byte length of strB (excluding any 0 terminator)
 * case_sens: False for case-insensitive behavior; compares canonical caseless forms.
 *            True for case-sensitive behavior; compares standard NFD forms.
 * are_equal: On success, set to true if the strings are equal, or set to false
 *            if they are not.
 *
 * Returns: 0 on success, or
 *          EILSEQ: One or both of the input strings contains illegal ASCII-range
 *                  characters (0x00 or '/'), or is not well-formed stream-safe UTF-8,
 *                  or contains codepoints that are non-characters or unassigned in
 *                  the version of Unicode currently supported.
 *
 * Note: The comparison may terminate early when a difference is detected, and may
 *       return 0 and set *are_equal=false even if one or both strings are invalid.
 *       A return of 0 therefore does not guarantee that both inputs were valid.
 */
int utf8_normalizeOptCaseFoldAndCompare(const char *strA,
    size_t      strA_len,
    const char *strB,
    size_t      strB_len,
    bool        case_sens,
    bool       *are_equal);

/*
 * utf8_normalizeOptCaseFold
 *
 * Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,
 * as specified by the case_sens parameter, and copy the result to the ustr
 * buffer:
 * - "canonical caseless form" (case-folded NFD, as described by definition D145
 *    in chapter 3 of The Unicode Standard); for case-insensitive behavior.
 * - standard NFD; for case-sensitive behavior (if case_sens = true).
 *
 * The input string should be valid UTF-8 that meets the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * It should not contain ASCII 0x00 or '/'.
 *
 * str:       The input UTF-8 string (need not be 0 terminated)
 * str_len:   The byte length of the input string (excluding any 0 terminator)
 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
 *            True for case-sensitive behavior; generates standard NFD.
 * ustr:      A pointer to a buffer for the resulting UTF-32 string.
 * ustr_size: The capacity of ustr, in UTF-32 units (not bytes).
 * ustr_len:  Pointer to a value that will be filled in with the actual length
 *            in UTF-32 units of the string copied to ustr. Only meaningful
 *            when the function returns 0.
 *
 * Returns: 0 on success, or
 *          EILSEQ: The input string contains illegal ASCII-range characters
 *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
 *                  contains codepoints that are non-characters or unassigned in
 *                  the version of Unicode currently supported.
 *          ENOMEM: ustr_size is insufficient for the resulting string. In this
 *                  case the value returned in *ustr_len is invalid.
 */
int utf8_normalizeOptCaseFold(const char *str,
    size_t      str_len,
    bool        case_sens,
    int32_t    *ustr,
    int32_t     ustr_size,
    int32_t    *ustr_len);

/*
 * utf8_normalizeOptCaseFoldToUTF8
 *
 * Convert a given UTF-8 string to UTF-8 in one of the following normalized forms,
 * as specified by the case_sens parameter, and copy the result to the ustr
 * buffer:
 * - "canonical caseless form" (case-folded NFD, as described by definition D145
 *    in chapter 3 of The Unicode Standard); for case-insensitive behavior.
 * - standard NFD; for case-sensitive behavior (if case_sens = true).
 *
 * The input string should be valid UTF-8 that meets the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * It should not contain ASCII 0x00 or '/'.
 *
 * str:       The input UTF-8 string (need not be 0 terminated)
 * str_len:   The byte length of the input string (excluding any 0 terminator)
 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
 *            True for case-sensitive behavior; generates standard NFD.
 * ustr:      A pointer to a buffer for the resulting UTF-8 string.
 * ustr_size: The capacity of ustr, in bytes.
 * ustr_len:  Pointer to a value that will be filled in with the actual length
 *            in bytes of the string copied to ustr. Only meaningful when the
 *            function returns 0.
 *
 * Returns: 0 on success, or
 *          EILSEQ: The input string contains illegal ASCII-range characters
 *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
 *                  contains codepoints that are non-characters or unassigned in
 *                  the version of Unicode currently supported.
 *          ENOMEM: ustr_size is insufficient for the resulting string. In this
 *                  case the value returned in *ustr_len is invalid.
 */
int utf8_normalizeOptCaseFoldToUTF8(const char *str,
    size_t      str_len,
    bool        case_sens,
    char       *ustr,
    size_t      ustr_size,
    size_t     *ustr_len);

/*
 * utf8_normalizeOptCaseFoldAndMatchSubstring
 *
 * Determine whether the normalized UTF-32 string derived from a specified UTF-8 string
 * strA contains another UTF-32 string ustrB which has already been normalized, typically
 * with utf8_normalizeOptCaseFold. The normalization for both strings is one of the
 * following, as specified by the case_sens parameter:
 * - "canonical caseless form" (case-folded NFD); for case-insensitive comparison.
 * - standard NFD; for case-sensitive comparison (if case_sens = true).
 * On success, sets *has_match to true if strA contains ustrB, or false otherwise.
 *
 * The input string strA should be valid UTF-8 that meets the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * It should not contain ASCII 0x00 or '/'.
 *
 * strA:      A UTF-8 string (need not be 0 terminated) in which to search for the
 *            substring specified by ustrB.
 * strA_len:  The byte length of strA (excluding any 0 terminator)
 * ustrB:     A normalized UTF-32 substring (need not be 0 terminated) to be searched
 *            for in the UTF-32 string resulting from converting strA to the normalized
 *            UTF-32 form specified by the case_sens parameter; ustrB must already be
 *            in that form. Normally this will be produced using
 *            utf8_normalizeOptCaseFold with the same case_sens value.
 * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator).
 * case_sens: False for case-insensitive matching; compares canonical caseless forms.
 *            True for case-sensitive matching; compares standard NFD forms.
 * buf:       Pointer to caller-supplied working memory for storing the portion of
 *            strA which has been converted to normalized UTF-32.
 * buf_size:  The size of buf.
 *            NOTE(review): the units are not stated; presumably bytes, since buf is
 *            untyped and buf_size is a size_t - confirm against the implementation.
 * has_match: On success, set to true if strA (when converted to UTF-32 and normalized
 *            per case_sens) contains ustrB, set to false otherwise.
 *
 * Returns: 0 on success, or
 *          EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is
 *                  not well-formed stream-safe UTF-8, or contains codepoints that are
 *                  non-characters or unassigned in the version of Unicode currently
 *                  supported.
 *          ENOMEM: buf_size is insufficient.
 *
 * Note: The search may terminate early when a match is detected, and may return 0
 *       and set *has_match=true even if strA is invalid. A return of 0 therefore
 *       does not guarantee that strA was valid.
 */
int utf8_normalizeOptCaseFoldAndMatchSubstring(const char    *strA,
    size_t         strA_len,
    const int32_t *ustrB,
    int32_t        ustrB_len,
    bool           case_sens,
    void          *buf,
    size_t         buf_size,
    bool          *has_match);

/*
 * utf8_normalizeOptCaseFoldGetUVersion
 *
 * Get the Unicode and code version currently associated with the
 * utf8_normalizeOptCaseFold* functions. The caller allocates the version array
 * (which must have room for at least 4 elements) and passes it to the function,
 * which will fill out the array as follows:
 * version[0] = Unicode major version; for Unicode 6.3.0 this would be 6
 * version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3
 * version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0
 * version[3] = Code revision level; for any given Unicode version, this value starts
 *              at 0 and is incremented for each significant revision to the
 *              utf8_normalizeOptCaseFold* functions.
 */
void utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4]);

#endif /* KERNEL_PRIVATE */

#endif  /* unicode_h */