1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2009-2015, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  unorm2.h
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2009dec15
16 *   created by: Markus W. Scherer
17 */
18 
19 #ifndef __UNORM2_H__
20 #define __UNORM2_H__
21 
22 /**
23  * @addtogroup icu4c ICU4C
24  * @{
25  * \file
26  * \brief C API: New API for Unicode Normalization.
27  *
28  * Unicode normalization functionality for standard Unicode normalization or
29  * for using custom mapping tables.
30  * All instances of UNormalizer2 are unmodifiable/immutable.
31  * Instances returned by unorm2_getInstance() are singletons that must not be deleted by the caller.
32  * For more details see the Normalizer2 C++ class.
33  */
34 
35 #include "unicode/utypes.h"
36 #include "unicode/stringoptions.h"
37 
38 #if U_SHOW_CPLUSPLUS_API
39 #include "unicode/localpointer.h"
40 #endif   // U_SHOW_CPLUSPLUS_API
41 
42 /**
43  * Constants for normalization modes.
44  * For details about standard Unicode normalization forms
45  * and about the algorithms which are also used with custom mapping tables
46  * see http://www.unicode.org/reports/tr15/
47  * \xrefitem stable "Stable" "Stable List" ICU 4.4
48  */
49 typedef enum {
50     /**
51      * Decomposition followed by composition.
52      * Same as standard NFC when using an "nfc" instance.
53      * Same as standard NFKC when using an "nfkc" instance.
54      * For details about standard Unicode normalization forms
55      * see http://www.unicode.org/reports/tr15/
56      * \xrefitem stable "Stable" "Stable List" ICU 4.4
57      */
58     UNORM2_COMPOSE,
59     /**
60      * Map, and reorder canonically.
61      * Same as standard NFD when using an "nfc" instance.
62      * Same as standard NFKD when using an "nfkc" instance.
63      * For details about standard Unicode normalization forms
64      * see http://www.unicode.org/reports/tr15/
65      * \xrefitem stable "Stable" "Stable List" ICU 4.4
66      */
67     UNORM2_DECOMPOSE,
68     /**
69      * "Fast C or D" form.
70      * If a string is in this form, then further decomposition <i>without reordering</i>
71      * would yield the same form as DECOMPOSE.
72      * Text in "Fast C or D" form can be processed efficiently with data tables
73      * that are "canonically closed", that is, that provide equivalent data for
74      * equivalent text, without having to be fully normalized.
75      * Not a standard Unicode normalization form.
76      * Not a unique form: Different FCD strings can be canonically equivalent.
77      * For details see http://www.unicode.org/notes/tn5/#FCD
78      * \xrefitem stable "Stable" "Stable List" ICU 4.4
79      */
80     UNORM2_FCD,
81     /**
82      * Compose only contiguously.
83      * Also known as "FCC" or "Fast C Contiguous".
84      * The result will often but not always be in NFC.
85      * The result will conform to FCD which is useful for processing.
86      * Not a standard Unicode normalization form.
87      * For details see http://www.unicode.org/notes/tn5/#FCC
88      * \xrefitem stable "Stable" "Stable List" ICU 4.4
89      */
90     UNORM2_COMPOSE_CONTIGUOUS
91 } UNormalization2Mode;
92 
93 /**
94  * Result values for normalization quick check functions.
95  * For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
96  * \xrefitem stable "Stable" "Stable List" ICU 2.0
97  */
98 typedef enum UNormalizationCheckResult {
99   /**
100    * The input string is not in the normalization form.
101    * \xrefitem stable "Stable" "Stable List" ICU 2.0
102    */
103   UNORM_NO,
104   /**
105    * The input string is in the normalization form.
106    * \xrefitem stable "Stable" "Stable List" ICU 2.0
107    */
108   UNORM_YES,
109   /**
110    * The input string may or may not be in the normalization form.
111    * This value is only returned for composition forms like NFC and FCC,
112    * when a backward-combining character is found for which the surrounding text
113    * would have to be analyzed further (unorm2_isNormalized() resolves such cases to a definitive result).
114    * \xrefitem stable "Stable" "Stable List" ICU 2.0
115    */
116   UNORM_MAYBE
117 } UNormalizationCheckResult;
118 
119 /**
120  * Opaque C service object type for the new normalization API. All instances are unmodifiable/immutable.
121  * \xrefitem stable "Stable" "Stable List" ICU 4.4
122  */
123 struct UNormalizer2;
124 typedef struct UNormalizer2 UNormalizer2;  /**< C typedef for struct UNormalizer2. \xrefitem stable "Stable" "Stable List" ICU 4.4 */
125 
126 #if !UCONFIG_NO_NORMALIZATION
127 
128 /**
129  * Returns a UNormalizer2 instance for Unicode NFC normalization.
130  * Same as unorm2_getInstance(NULL, "nfc", UNORM2_COMPOSE, pErrorCode).
131  * Returns an unmodifiable singleton instance. Do not delete it or pass it to unorm2_close().
132  * @param pErrorCode Standard ICU error code. Its input value must
133  *                  pass the U_SUCCESS() test, or else the function returns
134  *                  immediately. Check for U_FAILURE() on output or use with
135  *                  function chaining. (See User Guide for details.)
136  * @return the requested UNormalizer2 singleton, if successful
137  * \xrefitem stable "Stable" "Stable List" ICU 49
138  */
139 U_CAPI const UNormalizer2 * U_EXPORT2
140 unorm2_getNFCInstance(UErrorCode *pErrorCode) __INTRODUCED_IN(31);
141 
142 
143 
144 /**
145  * Returns a UNormalizer2 instance for Unicode NFD normalization.
146  * Same as unorm2_getInstance(NULL, "nfc", UNORM2_DECOMPOSE, pErrorCode).
147  * Returns an unmodifiable singleton instance. Do not delete it or pass it to unorm2_close().
148  * @param pErrorCode Standard ICU error code. Its input value must
149  *                  pass the U_SUCCESS() test, or else the function returns
150  *                  immediately. Check for U_FAILURE() on output or use with
151  *                  function chaining. (See User Guide for details.)
152  * @return the requested UNormalizer2 singleton, if successful
153  * \xrefitem stable "Stable" "Stable List" ICU 49
154  */
155 U_CAPI const UNormalizer2 * U_EXPORT2
156 unorm2_getNFDInstance(UErrorCode *pErrorCode) __INTRODUCED_IN(31);
157 
158 
159 
160 /**
161  * Returns a UNormalizer2 instance for Unicode NFKC normalization.
162  * Same as unorm2_getInstance(NULL, "nfkc", UNORM2_COMPOSE, pErrorCode).
163  * Returns an unmodifiable singleton instance. Do not delete it or pass it to unorm2_close().
164  * @param pErrorCode Standard ICU error code. Its input value must
165  *                  pass the U_SUCCESS() test, or else the function returns
166  *                  immediately. Check for U_FAILURE() on output or use with
167  *                  function chaining. (See User Guide for details.)
168  * @return the requested UNormalizer2 singleton, if successful
169  * \xrefitem stable "Stable" "Stable List" ICU 49
170  */
171 U_CAPI const UNormalizer2 * U_EXPORT2
172 unorm2_getNFKCInstance(UErrorCode *pErrorCode) __INTRODUCED_IN(31);
173 
174 
175 
176 /**
177  * Returns a UNormalizer2 instance for Unicode NFKD normalization.
178  * Same as unorm2_getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, pErrorCode).
179  * Returns an unmodifiable singleton instance. Do not delete it or pass it to unorm2_close().
180  * @param pErrorCode Standard ICU error code. Its input value must
181  *                  pass the U_SUCCESS() test, or else the function returns
182  *                  immediately. Check for U_FAILURE() on output or use with
183  *                  function chaining. (See User Guide for details.)
184  * @return the requested UNormalizer2 singleton, if successful
185  * \xrefitem stable "Stable" "Stable List" ICU 49
186  */
187 U_CAPI const UNormalizer2 * U_EXPORT2
188 unorm2_getNFKDInstance(UErrorCode *pErrorCode) __INTRODUCED_IN(31);
189 
190 
191 
192 /**
193  * Returns a UNormalizer2 instance for Unicode toNFKC_Casefold() normalization
194  * which is equivalent to applying the NFKC_Casefold mappings and then NFC.
195  * See https://www.unicode.org/reports/tr44/#NFKC_Casefold
196  *
197  * Same as unorm2_getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, pErrorCode).
198  * Returns an unmodifiable singleton instance. Do not delete it or pass it to unorm2_close().
199  * @param pErrorCode Standard ICU error code. Its input value must
200  *                  pass the U_SUCCESS() test, or else the function returns
201  *                  immediately. Check for U_FAILURE() on output or use with
202  *                  function chaining. (See User Guide for details.)
203  * @return the requested UNormalizer2 singleton, if successful
204  * \xrefitem stable "Stable" "Stable List" ICU 49
205  */
206 U_CAPI const UNormalizer2 * U_EXPORT2
207 unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode) __INTRODUCED_IN(31);
208 
209 
210 
211 #ifndef U_HIDE_DRAFT_API
212 
213 #endif  // U_HIDE_DRAFT_API
214 
215 
216 
217 
218 
219 /**
220  * Closes a UNormalizer2 instance from unorm2_openFiltered().
221  * Do not close instances from unorm2_getInstance() or from the unorm2_get...Instance() singleton getters!
222  * @param norm2 UNormalizer2 instance to be closed
223  * \xrefitem stable "Stable" "Stable List" ICU 4.4
224  */
225 U_CAPI void U_EXPORT2
226 unorm2_close(UNormalizer2 *norm2) __INTRODUCED_IN(31);
227 
228 
229 
230 #if U_SHOW_CPLUSPLUS_API
231 
232 U_NAMESPACE_BEGIN
233 
234 /**
235  * \class LocalUNormalizer2Pointer
236  * "Smart pointer" class, closes a UNormalizer2 via unorm2_close().
237  * Because it calls unorm2_close(), do not use it for singleton instances from unorm2_getInstance().
238  * For most methods see the LocalPointerBase base class.
239  * @see LocalPointerBase
240  * @see LocalPointer
241  * \xrefitem stable "Stable" "Stable List" ICU 4.4
242  */
243 U_DEFINE_LOCAL_OPEN_POINTER(LocalUNormalizer2Pointer, UNormalizer2, unorm2_close);
244 
245 U_NAMESPACE_END
246 
247 #endif
248 
249 /**
250  * Writes the normalized form of the source string to the destination string
251  * (replacing its contents) and returns the length of the destination string.
252  * The source and destination strings must be different buffers.
253  * @param norm2 UNormalizer2 instance
254  * @param src source string
255  * @param length length of the source string, or -1 if NUL-terminated
256  * @param dest destination string; its contents are replaced with normalized src
257  * @param capacity number of UChars that can be written to dest
258  * @param pErrorCode Standard ICU error code. Its input value must
259  *                   pass the U_SUCCESS() test, or else the function returns
260  *                   immediately. Check for U_FAILURE() on output or use with
261  *                   function chaining. (See User Guide for details.)
262  * @return the length of the destination string (the normalized form of src)
263  * \xrefitem stable "Stable" "Stable List" ICU 4.4
264  */
265 U_CAPI int32_t U_EXPORT2
266 unorm2_normalize(const UNormalizer2 *norm2,
267                  const UChar *src, int32_t length,
268                  UChar *dest, int32_t capacity,
269                  UErrorCode *pErrorCode) __INTRODUCED_IN(31);
270 
271 
272 /**
273  * Appends the normalized form of the second string to the first string
274  * (merging them at the boundary) and returns the length of the first string.
275  * The result is normalized if the first string was normalized.
276  * The first and second strings must be different buffers.
277  * @param norm2 UNormalizer2 instance
278  * @param first string, should be normalized
279  * @param firstLength length of the first string, or -1 if NUL-terminated
280  * @param firstCapacity number of UChars that can be written to first
281  * @param second string, will be normalized
282  * @param secondLength length of the second string, or -1 if NUL-terminated
283  * @param pErrorCode Standard ICU error code. Its input value must
284  *                   pass the U_SUCCESS() test, or else the function returns
285  *                   immediately. Check for U_FAILURE() on output or use with
286  *                   function chaining. (See User Guide for details.)
287  * @return the length of the first string after the append
288  * \xrefitem stable "Stable" "Stable List" ICU 4.4
289  */
290 U_CAPI int32_t U_EXPORT2
291 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
292                                 UChar *first, int32_t firstLength, int32_t firstCapacity,
293                                 const UChar *second, int32_t secondLength,
294                                 UErrorCode *pErrorCode) __INTRODUCED_IN(31);
295 
296 
297 /**
298  * Appends the second string to the first string
299  * (merging them at the boundary) and returns the length of the first string.
300  * The result is normalized if both the strings were normalized.
301  * The first and second strings must be different buffers.
302  * @param norm2 UNormalizer2 instance
303  * @param first string, should be normalized
304  * @param firstLength length of the first string, or -1 if NUL-terminated
305  * @param firstCapacity number of UChars that can be written to first
306  * @param second string, should be normalized
307  * @param secondLength length of the second string, or -1 if NUL-terminated
308  * @param pErrorCode Standard ICU error code. Its input value must
309  *                   pass the U_SUCCESS() test, or else the function returns
310  *                   immediately. Check for U_FAILURE() on output or use with
311  *                   function chaining. (See User Guide for details.)
312  * @return the length of the first string after the append
313  * \xrefitem stable "Stable" "Stable List" ICU 4.4
314  */
315 U_CAPI int32_t U_EXPORT2
316 unorm2_append(const UNormalizer2 *norm2,
317               UChar *first, int32_t firstLength, int32_t firstCapacity,
318               const UChar *second, int32_t secondLength,
319               UErrorCode *pErrorCode) __INTRODUCED_IN(31);
320 
321 
322 
323 /**
324  * Gets the decomposition mapping of c.
325  * Roughly equivalent to normalizing the string form of c
326  * on a UNORM2_DECOMPOSE UNormalizer2 instance, but much faster, and except that this function
327  * returns a negative value and does not write a string
328  * if c does not have a decomposition mapping in this instance's data.
329  * This function is independent of the mode of the UNormalizer2.
330  * @param norm2 UNormalizer2 instance
331  * @param c code point
332  * @param decomposition Caller-provided UChar buffer which will be set to c's
333  *                      decomposition mapping, if there is one.
334  * @param capacity number of UChars that can be written to decomposition
335  * @param pErrorCode Standard ICU error code. Its input value must
336  *                   pass the U_SUCCESS() test, or else the function returns
337  *                   immediately. Check for U_FAILURE() on output or use with
338  *                   function chaining. (See User Guide for details.)
339  * @return the non-negative length of c's decomposition, if there is one; otherwise a negative value
340  * \xrefitem stable "Stable" "Stable List" ICU 4.6
341  */
342 U_CAPI int32_t U_EXPORT2
343 unorm2_getDecomposition(const UNormalizer2 *norm2,
344                         UChar32 c, UChar *decomposition, int32_t capacity,
345                         UErrorCode *pErrorCode) __INTRODUCED_IN(31);
346 
347 
348 
349 /**
350  * Gets the raw decomposition mapping of c.
351  *
352  * This is similar to the unorm2_getDecomposition() function but returns the
353  * raw decomposition mapping as specified in UnicodeData.txt or
354  * (for custom data) in the mapping files processed by the gennorm2 tool.
355  * By contrast, unorm2_getDecomposition() returns the processed,
356  * recursively-decomposed version of this mapping.
357  *
358  * When used on a standard NFKC UNormalizer2 instance,
359  * unorm2_getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
360  *
361  * When used on a standard NFC UNormalizer2 instance,
362  * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
363  * in this case, the result contains either one or two code points (=1..4 UChars).
364  *
365  * This function is independent of the mode of the UNormalizer2.
366  * @param norm2 UNormalizer2 instance
367  * @param c code point
368  * @param decomposition Caller-provided UChar buffer which will be set to c's
369  *                      raw decomposition mapping, if there is one.
370  * @param capacity number of UChars that can be written to decomposition
371  * @param pErrorCode Standard ICU error code. Its input value must
372  *                   pass the U_SUCCESS() test, or else the function returns
373  *                   immediately. Check for U_FAILURE() on output or use with
374  *                   function chaining. (See User Guide for details.)
375  * @return the non-negative length of c's raw decomposition, if there is one; otherwise a negative value
376  * \xrefitem stable "Stable" "Stable List" ICU 49
377  */
378 U_CAPI int32_t U_EXPORT2
379 unorm2_getRawDecomposition(const UNormalizer2 *norm2,
380                            UChar32 c, UChar *decomposition, int32_t capacity,
381                            UErrorCode *pErrorCode) __INTRODUCED_IN(31);
382 
383 
384 
385 /**
386  * Performs pairwise composition of a and b and returns the composite if there is one.
387  *
388  * Returns a composite code point c only if c has a two-way mapping to a+b.
389  * In standard Unicode normalization, this means that
390  * c has a canonical decomposition to a+b
391  * and c does not have the Full_Composition_Exclusion property.
392  *
393  * This function is independent of the mode of the UNormalizer2.
394  * @param norm2 UNormalizer2 instance
395  * @param a A (normalization starter) code point.
396  * @param b Another code point.
397  * @return The non-negative composite code point if there is one; otherwise a negative value.
398  * \xrefitem stable "Stable" "Stable List" ICU 49
399  */
400 U_CAPI UChar32 U_EXPORT2
401 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) __INTRODUCED_IN(31);
402 
403 
404 
405 /**
406  * Gets the combining class of c.
407  * The default implementation returns 0
408  * but all standard implementations return the Unicode Canonical_Combining_Class value.
409  * @param norm2 UNormalizer2 instance
410  * @param c code point
411  * @return c's combining class (the Canonical_Combining_Class value for standard instances; 0 from the default implementation)
412  * \xrefitem stable "Stable" "Stable List" ICU 49
413  */
414 U_CAPI uint8_t U_EXPORT2
415 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) __INTRODUCED_IN(31);
416 
417 
418 
419 /**
420  * Tests if the string is normalized.
421  * Internally, in cases where unorm2_quickCheck() would return "maybe"
422  * (which is only possible for the two COMPOSE modes) this function
423  * resolves to "yes" or "no" to provide a definitive result,
424  * at the cost of doing more work in those cases.
425  * @param norm2 UNormalizer2 instance
426  * @param s input string
427  * @param length length of the string, or -1 if NUL-terminated
428  * @param pErrorCode Standard ICU error code. Its input value must
429  *                   pass the U_SUCCESS() test, or else the function returns
430  *                   immediately. Check for U_FAILURE() on output or use with
431  *                   function chaining. (See User Guide for details.)
432  * @return true if s is normalized
433  * \xrefitem stable "Stable" "Stable List" ICU 4.4
434  */
435 U_CAPI UBool U_EXPORT2
436 unorm2_isNormalized(const UNormalizer2 *norm2,
437                     const UChar *s, int32_t length,
438                     UErrorCode *pErrorCode) __INTRODUCED_IN(31);
439 
440 
441 
442 /**
443  * Tests if the string is normalized.
444  * For the two COMPOSE modes, the result could be "maybe" in cases that
445  * would take a little more work to resolve definitively.
446  * Use unorm2_spanQuickCheckYes() and unorm2_normalizeSecondAndAppend() for a faster
447  * combination of quick check + normalization, to avoid
448  * re-checking the "yes" prefix.
449  * @param norm2 UNormalizer2 instance
450  * @param s input string
451  * @param length length of the string, or -1 if NUL-terminated
452  * @param pErrorCode Standard ICU error code. Its input value must
453  *                   pass the U_SUCCESS() test, or else the function returns
454  *                   immediately. Check for U_FAILURE() on output or use with
455  *                   function chaining. (See User Guide for details.)
456  * @return UNormalizationCheckResult: UNORM_YES, UNORM_NO, or (for the two COMPOSE modes) possibly UNORM_MAYBE
457  * \xrefitem stable "Stable" "Stable List" ICU 4.4
458  */
459 U_CAPI UNormalizationCheckResult U_EXPORT2
460 unorm2_quickCheck(const UNormalizer2 *norm2,
461                   const UChar *s, int32_t length,
462                   UErrorCode *pErrorCode) __INTRODUCED_IN(31);
463 
464 
465 
466 /**
467  * Returns the end of the normalized substring of the input string.
468  * In other words, with <code>end=unorm2_spanQuickCheckYes(norm2, s, length, pErrorCode);</code>
469  * the prefix of s that ends before index <code>end</code>
470  * will pass the quick check with a "yes" result.
471  *
472  * The returned end index is usually one or more characters before the
473  * "no" or "maybe" character: The end index is at a normalization boundary.
474  * (See the Normalizer2 C++ class documentation for more about normalization boundaries.)
475  *
476  * When the goal is a normalized string and most input strings are expected
477  * to be normalized already, then call this function,
478  * and if it returns a prefix shorter than the input string,
479  * copy that prefix and use unorm2_normalizeSecondAndAppend() for the remainder.
480  * @param norm2 UNormalizer2 instance
481  * @param s input string
482  * @param length length of the string, or -1 if NUL-terminated
483  * @param pErrorCode Standard ICU error code. Its input value must
484  *                   pass the U_SUCCESS() test, or else the function returns
485  *                   immediately. Check for U_FAILURE() on output or use with
486  *                   function chaining. (See User Guide for details.)
487  * @return "yes" span end index
488  * \xrefitem stable "Stable" "Stable List" ICU 4.4
489  */
490 U_CAPI int32_t U_EXPORT2
491 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
492                          const UChar *s, int32_t length,
493                          UErrorCode *pErrorCode) __INTRODUCED_IN(31);
494 
495 
496 
497 /**
498  * Tests if the character always has a normalization boundary before it,
499  * regardless of context.
500  * For details see the Normalizer2 C++ base class documentation.
501  * @param norm2 UNormalizer2 instance
502  * @param c character to test
503  * @return true if c has a normalization boundary before it
504  * \xrefitem stable "Stable" "Stable List" ICU 4.4
505  */
506 U_CAPI UBool U_EXPORT2
507 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) __INTRODUCED_IN(31);
508 
509 
510 
511 /**
512  * Tests if the character always has a normalization boundary after it,
513  * regardless of context.
514  * For details see the Normalizer2 C++ base class documentation.
515  * @param norm2 UNormalizer2 instance
516  * @param c character to test
517  * @return true if c has a normalization boundary after it
518  * \xrefitem stable "Stable" "Stable List" ICU 4.4
519  */
520 U_CAPI UBool U_EXPORT2
521 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) __INTRODUCED_IN(31);
522 
523 
524 
525 /**
526  * Tests if the character is normalization-inert.
527  * For details see the Normalizer2 C++ base class documentation.
528  * @param norm2 UNormalizer2 instance
529  * @param c character to test
530  * @return true if c is normalization-inert
531  * \xrefitem stable "Stable" "Stable List" ICU 4.4
532  */
533 U_CAPI UBool U_EXPORT2
534 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) __INTRODUCED_IN(31);
535 
536 
537 
538 
539 
540 #endif  /* !UCONFIG_NO_NORMALIZATION */
541 #endif  /* __UNORM2_H__ */
542 
543 /** @} */ // addtogroup
544