1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2009-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: unorm2.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009dec15 16 * created by: Markus W. Scherer 17 */ 18 19 #ifndef __UNORM2_H__ 20 #define __UNORM2_H__ 21 22 /** 23 * @addtogroup icu4c ICU4C 24 * @{ 25 * \file 26 * \brief C API: New API for Unicode Normalization. 27 * 28 * Unicode normalization functionality for standard Unicode normalization or 29 * for using custom mapping tables. 30 * All instances of UNormalizer2 are unmodifiable/immutable. 31 * Instances returned by unorm2_getInstance() are singletons that must not be deleted by the caller. 32 * For more details see the Normalizer2 C++ class. 33 */ 34 35 #include "unicode/utypes.h" 36 #include "unicode/stringoptions.h" 37 38 #if U_SHOW_CPLUSPLUS_API 39 #include "unicode/localpointer.h" 40 #endif // U_SHOW_CPLUSPLUS_API 41 42 /** 43 * Constants for normalization modes. 44 * For details about standard Unicode normalization forms 45 * and about the algorithms which are also used with custom mapping tables 46 * see http://www.unicode.org/unicode/reports/tr15/ 47 * \xrefitem stable "Stable" "Stable List" ICU 4.4 48 */ 49 typedef enum { 50 /** 51 * Decomposition followed by composition. 52 * Same as standard NFC when using an "nfc" instance. 53 * Same as standard NFKC when using an "nfkc" instance. 54 * For details about standard Unicode normalization forms 55 * see http://www.unicode.org/unicode/reports/tr15/ 56 * \xrefitem stable "Stable" "Stable List" ICU 4.4 57 */ 58 UNORM2_COMPOSE, 59 /** 60 * Map, and reorder canonically. 61 * Same as standard NFD when using an "nfc" instance. 62 * Same as standard NFKD when using an "nfkc" instance. 63 * For details about standard Unicode normalization forms 64 * see http://www.unicode.org/unicode/reports/tr15/ 65 * \xrefitem stable "Stable" "Stable List" ICU 4.4 66 */ 67 UNORM2_DECOMPOSE, 68 /** 69 * "Fast C or D" form. 70 * If a string is in this form, then further decomposition <i>without reordering</i> 71 * would yield the same form as DECOMPOSE. 72 * Text in "Fast C or D" form can be processed efficiently with data tables 73 * that are "canonically closed", that is, that provide equivalent data for 74 * equivalent text, without having to be fully normalized. 75 * Not a standard Unicode normalization form. 76 * Not a unique form: Different FCD strings can be canonically equivalent. 77 * For details see http://www.unicode.org/notes/tn5/#FCD 78 * \xrefitem stable "Stable" "Stable List" ICU 4.4 79 */ 80 UNORM2_FCD, 81 /** 82 * Compose only contiguously. 83 * Also known as "FCC" or "Fast C Contiguous". 84 * The result will often but not always be in NFC. 85 * The result will conform to FCD which is useful for processing. 86 * Not a standard Unicode normalization form. 87 * For details see http://www.unicode.org/notes/tn5/#FCC 88 * \xrefitem stable "Stable" "Stable List" ICU 4.4 89 */ 90 UNORM2_COMPOSE_CONTIGUOUS 91 } UNormalization2Mode; 92 93 /** 94 * Result values for normalization quick check functions. 95 * For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms 96 * \xrefitem stable "Stable" "Stable List" ICU 2.0 97 */ 98 typedef enum UNormalizationCheckResult { 99 /** 100 * The input string is not in the normalization form. 101 * \xrefitem stable "Stable" "Stable List" ICU 2.0 102 */ 103 UNORM_NO, 104 /** 105 * The input string is in the normalization form. 106 * \xrefitem stable "Stable" "Stable List" ICU 2.0 107 */ 108 UNORM_YES, 109 /** 110 * The input string may or may not be in the normalization form. 111 * This value is only returned for composition forms like NFC and FCC, 112 * when a backward-combining character is found for which the surrounding text 113 * would have to be analyzed further. 114 * \xrefitem stable "Stable" "Stable List" ICU 2.0 115 */ 116 UNORM_MAYBE 117 } UNormalizationCheckResult; 118 119 /** 120 * Opaque C service object type for the new normalization API. 121 * \xrefitem stable "Stable" "Stable List" ICU 4.4 122 */ 123 struct UNormalizer2; 124 typedef struct UNormalizer2 UNormalizer2; /**< C typedef for struct UNormalizer2. \xrefitem stable "Stable" "Stable List" ICU 4.4 */ 125 126 #if !UCONFIG_NO_NORMALIZATION 127 128 /** 129 * Returns a UNormalizer2 instance for Unicode NFC normalization. 130 * Same as unorm2_getInstance(NULL, "nfc", UNORM2_COMPOSE, pErrorCode). 131 * Returns an unmodifiable singleton instance. Do not delete it. 132 * @param pErrorCode Standard ICU error code. Its input value must 133 * pass the U_SUCCESS() test, or else the function returns 134 * immediately. Check for U_FAILURE() on output or use with 135 * function chaining. (See User Guide for details.) 136 * @return the requested Normalizer2, if successful 137 * \xrefitem stable "Stable" "Stable List" ICU 49 138 */ 139 U_CAPI const UNormalizer2 * U_EXPORT2 140 unorm2_getNFCInstance(UErrorCode *pErrorCode) __INTRODUCED_IN(31); 141 142 143 144 /** 145 * Returns a UNormalizer2 instance for Unicode NFD normalization. 146 * Same as unorm2_getInstance(NULL, "nfc", UNORM2_DECOMPOSE, pErrorCode). 147 * Returns an unmodifiable singleton instance. Do not delete it. 148 * @param pErrorCode Standard ICU error code. Its input value must 149 * pass the U_SUCCESS() test, or else the function returns 150 * immediately. Check for U_FAILURE() on output or use with 151 * function chaining. (See User Guide for details.) 152 * @return the requested Normalizer2, if successful 153 * \xrefitem stable "Stable" "Stable List" ICU 49 154 */ 155 U_CAPI const UNormalizer2 * U_EXPORT2 156 unorm2_getNFDInstance(UErrorCode *pErrorCode) __INTRODUCED_IN(31); 157 158 159 160 /** 161 * Returns a UNormalizer2 instance for Unicode NFKC normalization. 162 * Same as unorm2_getInstance(NULL, "nfkc", UNORM2_COMPOSE, pErrorCode). 163 * Returns an unmodifiable singleton instance. Do not delete it. 164 * @param pErrorCode Standard ICU error code. Its input value must 165 * pass the U_SUCCESS() test, or else the function returns 166 * immediately. Check for U_FAILURE() on output or use with 167 * function chaining. (See User Guide for details.) 168 * @return the requested Normalizer2, if successful 169 * \xrefitem stable "Stable" "Stable List" ICU 49 170 */ 171 U_CAPI const UNormalizer2 * U_EXPORT2 172 unorm2_getNFKCInstance(UErrorCode *pErrorCode) __INTRODUCED_IN(31); 173 174 175 176 /** 177 * Returns a UNormalizer2 instance for Unicode NFKD normalization. 178 * Same as unorm2_getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, pErrorCode). 179 * Returns an unmodifiable singleton instance. Do not delete it. 180 * @param pErrorCode Standard ICU error code. Its input value must 181 * pass the U_SUCCESS() test, or else the function returns 182 * immediately. Check for U_FAILURE() on output or use with 183 * function chaining. (See User Guide for details.) 184 * @return the requested Normalizer2, if successful 185 * \xrefitem stable "Stable" "Stable List" ICU 49 186 */ 187 U_CAPI const UNormalizer2 * U_EXPORT2 188 unorm2_getNFKDInstance(UErrorCode *pErrorCode) __INTRODUCED_IN(31); 189 190 191 192 /** 193 * Returns a UNormalizer2 instance for Unicode toNFKC_Casefold() normalization 194 * which is equivalent to applying the NFKC_Casefold mappings and then NFC. 195 * See https://www.unicode.org/reports/tr44/#NFKC_Casefold 196 * 197 * Same as unorm2_getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, pErrorCode). 198 * Returns an unmodifiable singleton instance. Do not delete it. 199 * @param pErrorCode Standard ICU error code. Its input value must 200 * pass the U_SUCCESS() test, or else the function returns 201 * immediately. Check for U_FAILURE() on output or use with 202 * function chaining. (See User Guide for details.) 203 * @return the requested Normalizer2, if successful 204 * \xrefitem stable "Stable" "Stable List" ICU 49 205 */ 206 U_CAPI const UNormalizer2 * U_EXPORT2 207 unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode) __INTRODUCED_IN(31); 208 209 210 211 #ifndef U_HIDE_DRAFT_API 212 213 #endif // U_HIDE_DRAFT_API 214 215 216 217 218 219 /** 220 * Closes a UNormalizer2 instance from unorm2_openFiltered(). 221 * Do not close instances from unorm2_getInstance()! 222 * @param norm2 UNormalizer2 instance to be closed 223 * \xrefitem stable "Stable" "Stable List" ICU 4.4 224 */ 225 U_CAPI void U_EXPORT2 226 unorm2_close(UNormalizer2 *norm2) __INTRODUCED_IN(31); 227 228 229 230 #if U_SHOW_CPLUSPLUS_API 231 232 U_NAMESPACE_BEGIN 233 234 /** 235 * \class LocalUNormalizer2Pointer 236 * "Smart pointer" class, closes a UNormalizer2 via unorm2_close(). 237 * For most methods see the LocalPointerBase base class. 238 * 239 * @see LocalPointerBase 240 * @see LocalPointer 241 * \xrefitem stable "Stable" "Stable List" ICU 4.4 242 */ 243 U_DEFINE_LOCAL_OPEN_POINTER(LocalUNormalizer2Pointer, UNormalizer2, unorm2_close); 244 245 U_NAMESPACE_END 246 247 #endif 248 249 /** 250 * Writes the normalized form of the source string to the destination string 251 * (replacing its contents) and returns the length of the destination string. 252 * The source and destination strings must be different buffers. 253 * @param norm2 UNormalizer2 instance 254 * @param src source string 255 * @param length length of the source string, or -1 if NUL-terminated 256 * @param dest destination string; its contents is replaced with normalized src 257 * @param capacity number of UChars that can be written to dest 258 * @param pErrorCode Standard ICU error code. Its input value must 259 * pass the U_SUCCESS() test, or else the function returns 260 * immediately. Check for U_FAILURE() on output or use with 261 * function chaining. (See User Guide for details.) 262 * @return dest 263 * \xrefitem stable "Stable" "Stable List" ICU 4.4 264 */ 265 U_CAPI int32_t U_EXPORT2 266 unorm2_normalize(const UNormalizer2 *norm2, 267 const UChar *src, int32_t length, 268 UChar *dest, int32_t capacity, 269 UErrorCode *pErrorCode) __INTRODUCED_IN(31); 270 271 272 /** 273 * Appends the normalized form of the second string to the first string 274 * (merging them at the boundary) and returns the length of the first string. 275 * The result is normalized if the first string was normalized. 276 * The first and second strings must be different buffers. 277 * @param norm2 UNormalizer2 instance 278 * @param first string, should be normalized 279 * @param firstLength length of the first string, or -1 if NUL-terminated 280 * @param firstCapacity number of UChars that can be written to first 281 * @param second string, will be normalized 282 * @param secondLength length of the source string, or -1 if NUL-terminated 283 * @param pErrorCode Standard ICU error code. Its input value must 284 * pass the U_SUCCESS() test, or else the function returns 285 * immediately. Check for U_FAILURE() on output or use with 286 * function chaining. (See User Guide for details.) 287 * @return first 288 * \xrefitem stable "Stable" "Stable List" ICU 4.4 289 */ 290 U_CAPI int32_t U_EXPORT2 291 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2, 292 UChar *first, int32_t firstLength, int32_t firstCapacity, 293 const UChar *second, int32_t secondLength, 294 UErrorCode *pErrorCode) __INTRODUCED_IN(31); 295 296 297 /** 298 * Appends the second string to the first string 299 * (merging them at the boundary) and returns the length of the first string. 300 * The result is normalized if both the strings were normalized. 301 * The first and second strings must be different buffers. 302 * @param norm2 UNormalizer2 instance 303 * @param first string, should be normalized 304 * @param firstLength length of the first string, or -1 if NUL-terminated 305 * @param firstCapacity number of UChars that can be written to first 306 * @param second string, should be normalized 307 * @param secondLength length of the source string, or -1 if NUL-terminated 308 * @param pErrorCode Standard ICU error code. Its input value must 309 * pass the U_SUCCESS() test, or else the function returns 310 * immediately. Check for U_FAILURE() on output or use with 311 * function chaining. (See User Guide for details.) 312 * @return first 313 * \xrefitem stable "Stable" "Stable List" ICU 4.4 314 */ 315 U_CAPI int32_t U_EXPORT2 316 unorm2_append(const UNormalizer2 *norm2, 317 UChar *first, int32_t firstLength, int32_t firstCapacity, 318 const UChar *second, int32_t secondLength, 319 UErrorCode *pErrorCode) __INTRODUCED_IN(31); 320 321 322 323 /** 324 * Gets the decomposition mapping of c. 325 * Roughly equivalent to normalizing the String form of c 326 * on a UNORM2_DECOMPOSE UNormalizer2 instance, but much faster, and except that this function 327 * returns a negative value and does not write a string 328 * if c does not have a decomposition mapping in this instance's data. 329 * This function is independent of the mode of the UNormalizer2. 330 * @param norm2 UNormalizer2 instance 331 * @param c code point 332 * @param decomposition String buffer which will be set to c's 333 * decomposition mapping, if there is one. 334 * @param capacity number of UChars that can be written to decomposition 335 * @param pErrorCode Standard ICU error code. Its input value must 336 * pass the U_SUCCESS() test, or else the function returns 337 * immediately. Check for U_FAILURE() on output or use with 338 * function chaining. (See User Guide for details.) 339 * @return the non-negative length of c's decomposition, if there is one; otherwise a negative value 340 * \xrefitem stable "Stable" "Stable List" ICU 4.6 341 */ 342 U_CAPI int32_t U_EXPORT2 343 unorm2_getDecomposition(const UNormalizer2 *norm2, 344 UChar32 c, UChar *decomposition, int32_t capacity, 345 UErrorCode *pErrorCode) __INTRODUCED_IN(31); 346 347 348 349 /** 350 * Gets the raw decomposition mapping of c. 351 * 352 * This is similar to the unorm2_getDecomposition() function but returns the 353 * raw decomposition mapping as specified in UnicodeData.txt or 354 * (for custom data) in the mapping files processed by the gennorm2 tool. 355 * By contrast, unorm2_getDecomposition() returns the processed, 356 * recursively-decomposed version of this mapping. 357 * 358 * When used on a standard NFKC Normalizer2 instance, 359 * unorm2_getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. 360 * 361 * When used on a standard NFC Normalizer2 instance, 362 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); 363 * in this case, the result contains either one or two code points (=1..4 UChars). 364 * 365 * This function is independent of the mode of the UNormalizer2. 366 * @param norm2 UNormalizer2 instance 367 * @param c code point 368 * @param decomposition String buffer which will be set to c's 369 * raw decomposition mapping, if there is one. 370 * @param capacity number of UChars that can be written to decomposition 371 * @param pErrorCode Standard ICU error code. Its input value must 372 * pass the U_SUCCESS() test, or else the function returns 373 * immediately. Check for U_FAILURE() on output or use with 374 * function chaining. (See User Guide for details.) 375 * @return the non-negative length of c's raw decomposition, if there is one; otherwise a negative value 376 * \xrefitem stable "Stable" "Stable List" ICU 49 377 */ 378 U_CAPI int32_t U_EXPORT2 379 unorm2_getRawDecomposition(const UNormalizer2 *norm2, 380 UChar32 c, UChar *decomposition, int32_t capacity, 381 UErrorCode *pErrorCode) __INTRODUCED_IN(31); 382 383 384 385 /** 386 * Performs pairwise composition of a & b and returns the composite if there is one. 387 * 388 * Returns a composite code point c only if c has a two-way mapping to a+b. 389 * In standard Unicode normalization, this means that 390 * c has a canonical decomposition to a+b 391 * and c does not have the Full_Composition_Exclusion property. 392 * 393 * This function is independent of the mode of the UNormalizer2. 394 * @param norm2 UNormalizer2 instance 395 * @param a A (normalization starter) code point. 396 * @param b Another code point. 397 * @return The non-negative composite code point if there is one; otherwise a negative value. 398 * \xrefitem stable "Stable" "Stable List" ICU 49 399 */ 400 U_CAPI UChar32 U_EXPORT2 401 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) __INTRODUCED_IN(31); 402 403 404 405 /** 406 * Gets the combining class of c. 407 * The default implementation returns 0 408 * but all standard implementations return the Unicode Canonical_Combining_Class value. 409 * @param norm2 UNormalizer2 instance 410 * @param c code point 411 * @return c's combining class 412 * \xrefitem stable "Stable" "Stable List" ICU 49 413 */ 414 U_CAPI uint8_t U_EXPORT2 415 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) __INTRODUCED_IN(31); 416 417 418 419 /** 420 * Tests if the string is normalized. 421 * Internally, in cases where the quickCheck() method would return "maybe" 422 * (which is only possible for the two COMPOSE modes) this method 423 * resolves to "yes" or "no" to provide a definitive result, 424 * at the cost of doing more work in those cases. 425 * @param norm2 UNormalizer2 instance 426 * @param s input string 427 * @param length length of the string, or -1 if NUL-terminated 428 * @param pErrorCode Standard ICU error code. Its input value must 429 * pass the U_SUCCESS() test, or else the function returns 430 * immediately. Check for U_FAILURE() on output or use with 431 * function chaining. (See User Guide for details.) 432 * @return true if s is normalized 433 * \xrefitem stable "Stable" "Stable List" ICU 4.4 434 */ 435 U_CAPI UBool U_EXPORT2 436 unorm2_isNormalized(const UNormalizer2 *norm2, 437 const UChar *s, int32_t length, 438 UErrorCode *pErrorCode) __INTRODUCED_IN(31); 439 440 441 442 /** 443 * Tests if the string is normalized. 444 * For the two COMPOSE modes, the result could be "maybe" in cases that 445 * would take a little more work to resolve definitively. 446 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster 447 * combination of quick check + normalization, to avoid 448 * re-checking the "yes" prefix. 449 * @param norm2 UNormalizer2 instance 450 * @param s input string 451 * @param length length of the string, or -1 if NUL-terminated 452 * @param pErrorCode Standard ICU error code. Its input value must 453 * pass the U_SUCCESS() test, or else the function returns 454 * immediately. Check for U_FAILURE() on output or use with 455 * function chaining. (See User Guide for details.) 456 * @return UNormalizationCheckResult 457 * \xrefitem stable "Stable" "Stable List" ICU 4.4 458 */ 459 U_CAPI UNormalizationCheckResult U_EXPORT2 460 unorm2_quickCheck(const UNormalizer2 *norm2, 461 const UChar *s, int32_t length, 462 UErrorCode *pErrorCode) __INTRODUCED_IN(31); 463 464 465 466 /** 467 * Returns the end of the normalized substring of the input string. 468 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code> 469 * the substring <code>UnicodeString(s, 0, end)</code> 470 * will pass the quick check with a "yes" result. 471 * 472 * The returned end index is usually one or more characters before the 473 * "no" or "maybe" character: The end index is at a normalization boundary. 474 * (See the class documentation for more about normalization boundaries.) 475 * 476 * When the goal is a normalized string and most input strings are expected 477 * to be normalized already, then call this method, 478 * and if it returns a prefix shorter than the input string, 479 * copy that prefix and use normalizeSecondAndAppend() for the remainder. 480 * @param norm2 UNormalizer2 instance 481 * @param s input string 482 * @param length length of the string, or -1 if NUL-terminated 483 * @param pErrorCode Standard ICU error code. Its input value must 484 * pass the U_SUCCESS() test, or else the function returns 485 * immediately. Check for U_FAILURE() on output or use with 486 * function chaining. (See User Guide for details.) 487 * @return "yes" span end index 488 * \xrefitem stable "Stable" "Stable List" ICU 4.4 489 */ 490 U_CAPI int32_t U_EXPORT2 491 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2, 492 const UChar *s, int32_t length, 493 UErrorCode *pErrorCode) __INTRODUCED_IN(31); 494 495 496 497 /** 498 * Tests if the character always has a normalization boundary before it, 499 * regardless of context. 500 * For details see the Normalizer2 base class documentation. 501 * @param norm2 UNormalizer2 instance 502 * @param c character to test 503 * @return true if c has a normalization boundary before it 504 * \xrefitem stable "Stable" "Stable List" ICU 4.4 505 */ 506 U_CAPI UBool U_EXPORT2 507 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) __INTRODUCED_IN(31); 508 509 510 511 /** 512 * Tests if the character always has a normalization boundary after it, 513 * regardless of context. 514 * For details see the Normalizer2 base class documentation. 515 * @param norm2 UNormalizer2 instance 516 * @param c character to test 517 * @return true if c has a normalization boundary after it 518 * \xrefitem stable "Stable" "Stable List" ICU 4.4 519 */ 520 U_CAPI UBool U_EXPORT2 521 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) __INTRODUCED_IN(31); 522 523 524 525 /** 526 * Tests if the character is normalization-inert. 527 * For details see the Normalizer2 base class documentation. 528 * @param norm2 UNormalizer2 instance 529 * @param c character to test 530 * @return true if c is normalization-inert 531 * \xrefitem stable "Stable" "Stable List" ICU 4.4 532 */ 533 U_CAPI UBool U_EXPORT2 534 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) __INTRODUCED_IN(31); 535 536 537 538 539 540 #endif /* !UCONFIG_NO_NORMALIZATION */ 541 #endif /* __UNORM2_H__ */ 542 543 /** @} */ // addtogroup 544