/*
 * Copyright (c) 2016-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#ifndef unicode_h
#define unicode_h

#ifdef KERNEL_PRIVATE

#include <sys/cdefs.h>
#include <stdbool.h>
#include <stddef.h>     /* size_t  - used by the prototypes below */
#include <stdint.h>     /* int32_t - used by the prototypes below */

/*
 * WARNING - callers that use the following Unicode normalization interface for on-disk
 * structures should be aware that the implementation will be periodically updated for
 * the latest Unicode standard version.
 */

enum {
	/* Maximum size of UTF32 reordering buffer for stream-safe format */
	kNCFStreamSafeBufMax = 32
};

/*
 * utf8_normalizeOptCaseFoldAndHash
 *
 * Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,
 * as specified by the case_sens parameter, and feed the result incrementally to
 * the provided hash function callback:
 * - "canonical caseless form" (case-folded NFD, as described by definition D145
 *   in chapter 3 of The Unicode Standard); for case-insensitive behavior.
 * - standard NFD; for case-sensitive behavior (if case_sens = true).
 *
 * The input string should be valid UTF-8 that meets the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * It should not contain ASCII 0x00 or '/'.
 *
 * str:       The input UTF-8 string (need not be 0 terminated)
 * str_len:   The byte length of the input string (excluding any 0 terminator)
 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
 *            True for case-sensitive behavior; generates standard NFD.
 * hash_func: A pointer to a hashing function to compute the hash of the
 *            normalized/case-folded result. buf contains buf_len bytes
 *            of data to be added to the hash using the caller-supplied
 *            context (ctx).
 * hash_ctx:  The context for the hash function.
 *
 * Returns:   0 on success, or
 *            EILSEQ: The input string contains illegal ASCII-range characters
 *                    (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
 *                    contains codepoints that are non-characters or unassigned in
 *                    the version of Unicode currently supported.
 */
int utf8_normalizeOptCaseFoldAndHash(const char *str,
    size_t      str_len,
    bool        case_sens,
    void      (*hash_func)(void *buf, size_t buf_len, void *ctx),
    void       *hash_ctx);

/*
 * utf8_normalizeOptCaseFoldAndCompare
 *
 * Determine whether two UTF-8 strings are equal after converting each to one of the
 * following normalized forms, as specified by the case_sens parameter:
 * - "canonical caseless form" (case-folded NFD); for case-insensitive comparison.
 * - standard NFD; for case-sensitive comparison (if case_sens = true).
 * On success, sets are_equal to true if the strings are equal, or false if they are not.
 *
 * The input strings should be valid UTF-8 that meet the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * They should not contain ASCII 0x00 or '/'.
 *
 * strA:      A UTF-8 string to be compared (need not be 0 terminated)
 * strA_len:  The byte length of strA (excluding any 0 terminator)
 * strB:      The second UTF-8 string to be compared (need not be 0 terminated)
 * strB_len:  The byte length of strB (excluding any 0 terminator)
 * case_sens: False for case-insensitive behavior; compares canonical caseless forms.
 *            True for case-sensitive behavior; compares standard NFD forms.
 * are_equal: On success, set to true if the strings are equal, or set to false
 *            if they are not.
 *
 * Returns:   0 on success, or
 *            EILSEQ: One or both of the input strings contains illegal ASCII-range
 *                    characters (0x00 or '/'), or is not well-formed stream-safe UTF-8,
 *                    or contains codepoints that are non-characters or unassigned in
 *                    the version of Unicode currently supported.
 *                    Note: The comparison may terminate early when a difference is
 *                          detected, and may return 0 and set *are_equal=false even
 *                          if one or both strings are invalid.
 */
int utf8_normalizeOptCaseFoldAndCompare(const char *strA,
    size_t      strA_len,
    const char *strB,
    size_t      strB_len,
    bool        case_sens,
    bool       *are_equal);

/*
 * utf8_normalizeOptCaseFold
 *
 * Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,
 * as specified by the case_sens parameter, and copy the result to the ustr
 * buffer:
 * - "canonical caseless form" (case-folded NFD, as described by definition D145
 *   in chapter 3 of The Unicode Standard); for case-insensitive behavior.
 * - standard NFD; for case-sensitive behavior (if case_sens = true).
 *
 * The input string should be valid UTF-8 that meets the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * It should not contain ASCII 0x00 or '/'.
 *
 * str:       The input UTF-8 string (need not be 0 terminated)
 * str_len:   The byte length of the input string (excluding any 0 terminator)
 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
 *            True for case-sensitive behavior; generates standard NFD.
 * ustr:      A pointer to a buffer for the resulting UTF-32 string.
 * ustr_size: The capacity of ustr, in UTF-32 units.
 * ustr_len:  Pointer to a value that will be filled in with the actual length
 *            in UTF-32 units of the string copied to ustr.
 *
 * Returns:   0 on success, or
 *            EILSEQ: The input string contains illegal ASCII-range characters
 *                    (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
 *                    contains codepoints that are non-characters or unassigned in
 *                    the version of Unicode currently supported.
 *            ENOMEM: ustr_size is insufficient for the resulting string. In this
 *                    case the value returned in *ustr_len is invalid.
 */
int utf8_normalizeOptCaseFold(const char *str,
    size_t      str_len,
    bool        case_sens,
    int32_t    *ustr,
    int32_t     ustr_size,
    int32_t    *ustr_len);

/*
 * utf8_normalizeOptCaseFoldToUTF8
 *
 * Convert a given UTF-8 string to UTF-8 in one of the following normalized forms,
 * as specified by the case_sens parameter, and copy the result to the ustr
 * buffer:
 * - "canonical caseless form" (case-folded NFD, as described by definition D145
 *   in chapter 3 of The Unicode Standard); for case-insensitive behavior.
 * - standard NFD; for case-sensitive behavior (if case_sens = true).
 *
 * The input string should be valid UTF-8 that meets the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * It should not contain ASCII 0x00 or '/'.
 *
 * str:       The input UTF-8 string (need not be 0 terminated)
 * str_len:   The byte length of the input string (excluding any 0 terminator)
 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
 *            True for case-sensitive behavior; generates standard NFD.
 * ustr:      A pointer to a buffer for the resulting UTF-8 string.
 * ustr_size: The capacity of ustr, in bytes.
 * ustr_len:  Pointer to a value that will be filled in with the actual length
 *            in bytes of the string copied to ustr.
 *
 * Returns:   0 on success, or
 *            EILSEQ: The input string contains illegal ASCII-range characters
 *                    (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
 *                    contains codepoints that are non-characters or unassigned in
 *                    the version of Unicode currently supported.
 *            ENOMEM: ustr_size is insufficient for the resulting string. In this
 *                    case the value returned in *ustr_len is invalid.
 */
int utf8_normalizeOptCaseFoldToUTF8(const char *str,
    size_t      str_len,
    bool        case_sens,
    char       *ustr,
    size_t      ustr_size,
    size_t     *ustr_len);

/*
 * utf8_normalizeOptCaseFoldToUTF8ForPath
 *
 * Convert a given UTF-8 path string to UTF-8 in one of the following normalized forms,
 * as specified by the case_sens parameter, and copy the result to the ustr
 * buffer:
 * - "canonical caseless form" (case-folded NFD, as described by definition D145
 *   in chapter 3 of The Unicode Standard); for case-insensitive behavior.
 * - standard NFD; for case-sensitive behavior (if case_sens = true).
 *
 * The input string should be valid UTF-8 that meets the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * Unlike the other conversion functions declared here, '/' is not listed as an
 * illegal character for this function; only ASCII 0x00 is.
 *
 * str:       The input UTF-8 path string
 * str_len:   The byte length of the input path string (excluding any 0 terminator)
 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
 *            True for case-sensitive behavior; generates standard NFD.
 * ustr:      A pointer to a buffer for the resulting UTF-8 string.
 * ustr_size: The capacity of ustr, in bytes.
 * ustr_len:  Pointer to a value that will be filled in with the actual length
 *            in bytes of the string copied to ustr.
 *
 * Returns:   0 on success, or
 *            EILSEQ: The input string contains illegal ASCII-range characters
 *                    (0x00), or is not well-formed stream-safe UTF-8, or
 *                    contains codepoints that are non-characters or unassigned in
 *                    the version of Unicode currently supported.
 *            ENOMEM: ustr_size is insufficient for the resulting string. In this
 *                    case the value returned in *ustr_len is invalid.
 */
int utf8_normalizeOptCaseFoldToUTF8ForPath(const char *str,
    size_t      str_len,
    bool        case_sens,
    char       *ustr,
    size_t      ustr_size,
    size_t     *ustr_len);

/*
 * utf8_normalizeOptCaseFoldAndMatchSubstring
 *
 * Determine whether the normalized UTF32 string derived from a specified UTF-8 string
 * strA contains another UTF32 string ustrB which has already been normalized, typically
 * with utf8_normalizeOptCaseFold. The normalization for both strings is one of the
 * following, as specified by the case_sens parameter:
 * - "canonical caseless form" (case-folded NFD); for case-insensitive comparison.
 * - standard NFD; for case-sensitive comparison (if case_sens = true).
 * On success, sets has_match to true if strA contains ustrB, or false otherwise.
 *
 * The input string strA should be valid UTF-8 that meets the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * It should not contain ASCII 0x00 or '/'.
 *
 * strA:      A UTF-8 string (need not be 0 terminated) in which to search for the
 *            substring specified by ustrB.
 * strA_len:  The byte length of strA (excluding any 0 terminator)
 * ustrB:     A normalized UTF-32 substring (need not be 0 terminated) to be searched
 *            for in the UTF-32 string resulting from converting strA to the normalized
 *            UTF-32 form specified by the case_sens parameter; ustrB must already be
 *            in that form. Normally this will be produced using
 *            utf8_normalizeOptCaseFold.
 * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator).
 * case_sens: False for case-insensitive matching; compares canonical caseless forms.
 *            True for case-sensitive matching; compares standard NFD forms.
 * buf:       Pointer to caller-supplied working memory for storing the portion of
 *            strA which has been converted to normalized UTF-32.
 * buf_size:  The size of buf.
 * has_match: On success, set to true if strA (when converted to UTF-32 and normalized
 *            per case_sens) contains ustrB, set to false otherwise.
 *
 * Returns:   0 on success, or
 *            EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is
 *                    not well-formed stream-safe UTF-8, or contains codepoints that are
 *                    non-characters or unassigned in the version of Unicode currently
 *                    supported.
 *                    Note: The search may terminate early when a match is detected, and
 *                          may return 0 and set *has_match=true even if strA is invalid.
 *            ENOMEM: buf_size is insufficient.
 */
int utf8_normalizeOptCaseFoldAndMatchSubstring(const char *strA,
    size_t         strA_len,
    const int32_t *ustrB,
    int32_t        ustrB_len,
    bool           case_sens,
    void          *buf,
    size_t         buf_size,
    bool          *has_match);

/*
 * utf8_normalizeOptCaseFoldGetUVersion
 *
 * Get the Unicode and code version currently associated with the
 * utf8_normalizeOptCaseFold functions. The caller allocates the version array and
 * passes it to the function, which will fill out the array as follows:
 * version[0] = Unicode major version; for Unicode 6.3.0 this would be 6
 * version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3
 * version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0
 * version[3] = Code revision level; for any given Unicode version, this value starts
 *              at 0 and is incremented for each significant revision to the
 *              utf8_normalizeOptCaseFold functions.
 */
void utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4]);

#endif /* KERNEL_PRIVATE */

#endif /* unicode_h */