1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2002-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: uset.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2002mar07 16 * created by: Markus W. Scherer 17 * 18 * C version of UnicodeSet. 19 */ 20 21 22 /** 23 * \file 24 * \brief C API: Unicode Set 25 * 26 * <p>This is a C wrapper around the C++ UnicodeSet class.</p> 27 */ 28 29 #ifndef __USET_H__ 30 #define __USET_H__ 31 32 #include "unicode/utypes.h" 33 #include "unicode/uchar.h" 34 35 #if U_SHOW_CPLUSPLUS_API 36 #include "unicode/localpointer.h" 37 #endif // U_SHOW_CPLUSPLUS_API 38 39 #ifndef USET_DEFINED 40 41 #ifndef U_IN_DOXYGEN 42 #define USET_DEFINED 43 #endif 44 /** 45 * USet is the C API type corresponding to C++ class UnicodeSet. 46 * Use the uset_* API to manipulate. Create with 47 * uset_open*, and destroy with uset_close. 48 * @stable ICU 2.4 49 */ 50 typedef struct USet USet; 51 #endif 52 53 /** 54 * Bitmask values to be passed to uset_openPatternOptions() or 55 * uset_applyPattern() taking an option parameter. 56 * 57 * Use at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 58 * These case options are mutually exclusive. 59 * 60 * Undefined options bits are ignored, and reserved for future use. 61 * 62 * @stable ICU 2.4 63 */ 64 enum { 65 /** 66 * Ignore white space within patterns unless quoted or escaped. 67 * @stable ICU 2.4 68 */ 69 USET_IGNORE_SPACE = 1, 70 71 /** 72 * Enable case insensitive matching. E.g., "[ab]" with this flag 73 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 74 * match all except 'a', 'A', 'b', and 'B'. 
This performs a full 75 * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'. 76 * 77 * The resulting set is a superset of the input for the code points but 78 * not for the strings. 79 * It performs a case mapping closure of the code points and adds 80 * full case folding strings for the code points, and reduces strings of 81 * the original set to their full case folding equivalents. 82 * 83 * This is designed for case-insensitive matches, for example 84 * in regular expressions. The full code point case closure allows checking of 85 * an input character directly against the closure set. 86 * Strings are matched by comparing the case-folded form from the closure 87 * set with an incremental case folding of the string in question. 88 * 89 * The closure set will also contain single code points if the original 90 * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). 91 * This is not necessary (that is, redundant) for the above matching method 92 * but results in the same closure sets regardless of whether the original 93 * set contained the code point or a string. 94 * 95 * @stable ICU 2.4 96 */ 97 USET_CASE_INSENSITIVE = 2, 98 99 /** 100 * Adds all case mappings for each element in the set. 101 * This adds the full lower-, title-, and uppercase mappings as well as the full case folding 102 * of each existing element in the set. 103 * 104 * Unlike the “case insensitive” options, this does not perform a closure. 105 * For example, it does not add 'ſ' (U+017F long s) for 's', 106 * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions. 107 * 108 * @stable ICU 3.2 109 */ 110 USET_ADD_CASE_MAPPINGS = 4, 111 112 #ifndef U_HIDE_DRAFT_API 113 /** 114 * Enable case insensitive matching. 
115 * Same as USET_CASE_INSENSITIVE but using only Simple_Case_Folding (scf) mappings, 116 * which map each code point to one code point, 117 * not full Case_Folding (cf) mappings, which map some code points to multiple code points. 118 * 119 * This is designed for case-insensitive matches, for example in certain 120 * regular expression implementations where only Simple_Case_Folding mappings are used, 121 * such as in ECMAScript (JavaScript) regular expressions. 122 * 123 * @draft ICU 73 124 */ 125 USET_SIMPLE_CASE_INSENSITIVE = 6 126 #endif // U_HIDE_DRAFT_API 127 }; 128 129 /** 130 * Argument values for whether span() and similar functions continue while 131 * the current character is contained vs. not contained in the set. 132 * 133 * The functionality is straightforward for sets with only single code points, 134 * without strings (which is the common case): 135 * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same. 136 * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED. 137 * - span() and spanBack() partition any string the same way when 138 * alternating between span(USET_SPAN_NOT_CONTAINED) and 139 * span(either "contained" condition). 140 * - Using a complemented (inverted) set and the opposite span conditions 141 * yields the same results. 142 * 143 * When a set contains multi-code point strings, then these statements may not 144 * be true, depending on the strings in the set (for example, whether they 145 * overlap with each other) and the string that is processed. 146 * For a set with strings: 147 * - The complement of the set contains the opposite set of code points, 148 * but the same set of strings. 149 * Therefore, complementing both the set and the span conditions 150 * may yield different results. 151 * - When starting spans at different positions in a string 152 * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different 153 * because a set string may start before the later position. 
154 * - span(USET_SPAN_SIMPLE) may be shorter than 155 * span(USET_SPAN_CONTAINED) because it will not recursively try 156 * all possible paths. 157 * For example, with a set which contains the three strings "xy", "xya" and "ax", 158 * span("xyax", USET_SPAN_CONTAINED) will return 4 but 159 * span("xyax", USET_SPAN_SIMPLE) will return 3. 160 * span(USET_SPAN_SIMPLE) will never be longer than 161 * span(USET_SPAN_CONTAINED). 162 * - With either "contained" condition, span() and spanBack() may partition 163 * a string in different ways. 164 * For example, with a set which contains the two strings "ab" and "ba", 165 * and when processing the string "aba", 166 * span() will yield contained/not-contained boundaries of { 0, 2, 3 } 167 * while spanBack() will yield boundaries of { 0, 1, 3 }. 168 * 169 * Note: If it is important to get the same boundaries whether iterating forward 170 * or backward through a string, then either only span() should be used and 171 * the boundaries cached for backward operation, or an ICU BreakIterator 172 * could be used. 173 * 174 * Note: Unpaired surrogates are treated like surrogate code points. 175 * Similarly, set strings match only on code point boundaries, 176 * never in the middle of a surrogate pair. 177 * Illegal UTF-8 sequences are treated like U+FFFD. 178 * When processing UTF-8 strings, malformed set strings 179 * (strings with unpaired surrogates which cannot be converted to UTF-8) 180 * are ignored. 181 * 182 * @stable ICU 3.8 183 */ 184 typedef enum USetSpanCondition { 185 /** 186 * Continues a span() while there is no set element at the current position. 187 * Increments by one code point at a time. 188 * Stops before the first set element (character or string). 189 * (For code points only, this is like while contains(current)==false). 
190 * 191 * When span() returns, the substring between where it started and the position 192 * it returned consists only of characters that are not in the set, 193 * and none of its strings overlap with the span. 194 * 195 * @stable ICU 3.8 196 */ 197 USET_SPAN_NOT_CONTAINED = 0, 198 /** 199 * Spans the longest substring that is a concatenation of set elements (characters or strings). 200 * (For characters only, this is like while contains(current)==true). 201 * 202 * When span() returns, the substring between where it started and the position 203 * it returned consists only of set elements (characters or strings) that are in the set. 204 * 205 * If a set contains strings, then the span will be the longest substring for which there 206 * exists at least one non-overlapping concatenation of set elements (characters or strings). 207 * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>. 208 * (Java/ICU/Perl regex stops at the first match of an OR.) 209 * 210 * @stable ICU 3.8 211 */ 212 USET_SPAN_CONTAINED = 1, 213 /** 214 * Continues a span() while there is a set element at the current position. 215 * Increments by the longest matching element at each position. 216 * (For characters only, this is like while contains(current)==true). 217 * 218 * When span() returns, the substring between where it started and the position 219 * it returned consists only of set elements (characters or strings) that are in the set. 220 * 221 * If a set only contains single characters, then this is the same 222 * as USET_SPAN_CONTAINED. 223 * 224 * If a set contains strings, then the span will be the longest substring 225 * with a match at each position with the longest single set element (character or string). 226 * 227 * Use this span condition together with other longest-match algorithms, 228 * such as ICU converters (ucnv_getUnicodeSet()). 
229 * 230 * @stable ICU 3.8 231 */ 232 USET_SPAN_SIMPLE = 2, 233 #ifndef U_HIDE_DEPRECATED_API 234 /** 235 * One more than the last span condition. 236 * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. 237 */ 238 USET_SPAN_CONDITION_COUNT 239 #endif // U_HIDE_DEPRECATED_API 240 } USetSpanCondition; 241 242 enum { 243 /** 244 * Capacity of USerializedSet::staticArray. 245 * Enough for any single-code point set. 246 * Also provides padding for nice sizeof(USerializedSet). 247 * @stable ICU 2.4 248 */ 249 USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8 250 }; 251 252 /** 253 * A serialized form of a Unicode set. Limited manipulations are 254 * possible directly on a serialized set. See below. 255 * @stable ICU 2.4 256 */ 257 typedef struct USerializedSet { 258 /** 259 * The serialized Unicode Set. 260 * @stable ICU 2.4 261 */ 262 const uint16_t *array; 263 /** 264 * The length of the array that contains BMP characters. 265 * @stable ICU 2.4 266 */ 267 int32_t bmpLength; 268 /** 269 * The total length of the array. 270 * @stable ICU 2.4 271 */ 272 int32_t length; 273 /** 274 * A small buffer for the array to reduce memory allocations. 275 * @stable ICU 2.4 276 */ 277 uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY]; 278 } USerializedSet; 279 280 /********************************************************************* 281 * USet API 282 *********************************************************************/ 283 284 /** 285 * Create an empty USet object. 286 * Equivalent to uset_open(1, 0). 287 * @return a newly created USet. The caller must call uset_close() on 288 * it when done. 289 * @stable ICU 4.2 290 */ 291 U_CAPI USet* U_EXPORT2 292 uset_openEmpty(void); 293 294 /** 295 * Creates a USet object that contains the range of characters 296 * start..end, inclusive. If <code>start > end</code> 297 * then an empty set is created (same as using uset_openEmpty()). 
298 * @param start first character of the range, inclusive 299 * @param end last character of the range, inclusive 300 * @return a newly created USet. The caller must call uset_close() on 301 * it when done. 302 * @stable ICU 2.4 303 */ 304 U_CAPI USet* U_EXPORT2 305 uset_open(UChar32 start, UChar32 end); 306 307 /** 308 * Creates a set from the given pattern. See the UnicodeSet class 309 * description for the syntax of the pattern language. 310 * @param pattern a string specifying what characters are in the set 311 * @param patternLength the length of the pattern, or -1 if null 312 * terminated 313 * @param ec the error code 314 * @stable ICU 2.4 315 */ 316 U_CAPI USet* U_EXPORT2 317 uset_openPattern(const UChar* pattern, int32_t patternLength, 318 UErrorCode* ec); 319 320 /** 321 * Creates a set from the given pattern. See the UnicodeSet class 322 * description for the syntax of the pattern language. 323 * @param pattern a string specifying what characters are in the set 324 * @param patternLength the length of the pattern, or -1 if null 325 * terminated 326 * @param options bitmask for options to apply to the pattern. 327 * Valid options are USET_IGNORE_SPACE and 328 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 329 * These case options are mutually exclusive. 330 * @param ec the error code 331 * @stable ICU 2.4 332 */ 333 U_CAPI USet* U_EXPORT2 334 uset_openPatternOptions(const UChar* pattern, int32_t patternLength, 335 uint32_t options, 336 UErrorCode* ec); 337 338 /** 339 * Disposes of the storage used by a USet object. This function should 340 * be called exactly once for objects returned by uset_open(). 341 * @param set the object to dispose of 342 * @stable ICU 2.4 343 */ 344 U_CAPI void U_EXPORT2 345 uset_close(USet* set); 346 347 #if U_SHOW_CPLUSPLUS_API 348 349 U_NAMESPACE_BEGIN 350 351 /** 352 * \class LocalUSetPointer 353 * "Smart pointer" class, closes a USet via uset_close(). 
354 * For most methods see the LocalPointerBase base class. 355 * 356 * @see LocalPointerBase 357 * @see LocalPointer 358 * @stable ICU 4.4 359 */ 360 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close); 361 362 U_NAMESPACE_END 363 364 #endif 365 366 /** 367 * Returns a copy of this object. 368 * If this set is frozen, then the clone will be frozen as well. 369 * Use uset_cloneAsThawed() for a mutable clone of a frozen set. 370 * @param set the original set 371 * @return the newly allocated copy of the set 372 * @see uset_cloneAsThawed 373 * @stable ICU 3.8 374 */ 375 U_CAPI USet * U_EXPORT2 376 uset_clone(const USet *set); 377 378 /** 379 * Determines whether the set has been frozen (made immutable) or not. 380 * See the ICU4J Freezable interface for details. 381 * @param set the set 382 * @return true/false for whether the set has been frozen 383 * @see uset_freeze 384 * @see uset_cloneAsThawed 385 * @stable ICU 3.8 386 */ 387 U_CAPI UBool U_EXPORT2 388 uset_isFrozen(const USet *set); 389 390 /** 391 * Freeze the set (make it immutable). 392 * Once frozen, it cannot be unfrozen and is therefore thread-safe 393 * until it is deleted. 394 * See the ICU4J Freezable interface for details. 395 * Freezing the set may also make some operations faster, for example 396 * uset_contains() and uset_span(). 397 * A frozen set will not be modified. (It remains frozen.) 398 * @param set the set 399 * The set is frozen in place; this function does not return a value. 400 * @see uset_isFrozen 401 * @see uset_cloneAsThawed 402 * @stable ICU 3.8 403 */ 404 U_CAPI void U_EXPORT2 405 uset_freeze(USet *set); 406 407 /** 408 * Clone the set and make the clone mutable. 409 * See the ICU4J Freezable interface for details.
410 * @param set the set 411 * @return the mutable clone 412 * @see uset_freeze 413 * @see uset_isFrozen 414 * @see uset_clone 415 * @stable ICU 3.8 416 */ 417 U_CAPI USet * U_EXPORT2 418 uset_cloneAsThawed(const USet *set); 419 420 /** 421 * Causes the USet object to represent the range <code>start - end</code>. 422 * If <code>start > end</code> then this USet is set to an empty range. 423 * A frozen set will not be modified. 424 * @param set the object to set to the given range 425 * @param start first character in the set, inclusive 426 * @param end last character in the set, inclusive 427 * @stable ICU 3.2 428 */ 429 U_CAPI void U_EXPORT2 430 uset_set(USet* set, 431 UChar32 start, UChar32 end); 432 433 /** 434 * Modifies the set to represent the set specified by the given 435 * pattern. See the UnicodeSet class description for the syntax of 436 * the pattern language. See also the User Guide chapter about UnicodeSet. 437 * <em>Empties the set passed before applying the pattern.</em> 438 * A frozen set will not be modified. 439 * @param set The set to which the pattern is to be applied. 440 * @param pattern A pointer to UChar string specifying what characters are in the set. 441 * The character at pattern[0] must be a '['. 442 * @param patternLength The length of the UChar string. -1 if NUL terminated. 443 * @param options A bitmask for options to apply to the pattern. 444 * Valid options are USET_IGNORE_SPACE and 445 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, 446 * USET_SIMPLE_CASE_INSENSITIVE. 447 * These case options are mutually exclusive. 448 * @param status Returns an error if the pattern cannot be parsed. 449 * @return Upon successful parse, the value is 450 * the index of the character after the closing ']' 451 * of the parsed pattern. 452 * If the status code indicates failure, then the return value 453 * is the index of the error in the source.
454 * 455 * @stable ICU 2.8 456 */ 457 U_CAPI int32_t U_EXPORT2 458 uset_applyPattern(USet *set, 459 const UChar *pattern, int32_t patternLength, 460 uint32_t options, 461 UErrorCode *status); 462 463 /** 464 * Modifies the set to contain those code points which have the given value 465 * for the given binary or enumerated property, as returned by 466 * u_getIntPropertyValue. Prior contents of this set are lost. 467 * A frozen set will not be modified. 468 * 469 * @param set the object to contain the code points defined by the property 470 * 471 * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 472 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 473 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. 474 * 475 * @param value a value in the range u_getIntPropertyMinValue(prop).. 476 * u_getIntPropertyMaxValue(prop), with one exception. If prop is 477 * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but 478 * rather a mask value produced by U_GET_GC_MASK(). This allows grouped 479 * categories such as [:L:] to be represented. 480 * 481 * @param ec error code input/output parameter 482 * 483 * @stable ICU 3.2 484 */ 485 U_CAPI void U_EXPORT2 486 uset_applyIntPropertyValue(USet* set, 487 UProperty prop, int32_t value, UErrorCode* ec); 488 489 /** 490 * Modifies the set to contain those code points which have the 491 * given value for the given property. Prior contents of this 492 * set are lost. 493 * A frozen set will not be modified. 494 * 495 * @param set the object to contain the code points defined by the given 496 * property and value alias 497 * 498 * @param prop a string specifying a property alias, either short or long. 499 * The name is matched loosely. See PropertyAliases.txt for names and a 500 * description of loose matching. If the value string is empty, then this 501 * string is interpreted as either a General_Category value alias, a Script 502 * value alias, a binary property alias, or a special ID. 
Special IDs are 503 * matched loosely and correspond to the following sets: 504 * 505 * "ANY" = [\\u0000-\\U0010FFFF], 506 * "ASCII" = [\\u0000-\\u007F], 507 * "Assigned" = [:^Cn:]. 508 * 509 * @param propLength the length of the prop, or -1 if NUL-terminated 510 * 511 * @param value a string specifying a value alias, either short or long. 512 * The name is matched loosely. See PropertyValueAliases.txt for names 513 * and a description of loose matching. In addition to aliases listed, 514 * numeric values and canonical combining classes may be expressed 515 * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string 516 * may also be empty. 517 * 518 * @param valueLength the length of the value, or -1 if NUL-terminated 519 * 520 * @param ec error code input/output parameter 521 * 522 * @stable ICU 3.2 523 */ 524 U_CAPI void U_EXPORT2 525 uset_applyPropertyAlias(USet* set, 526 const UChar *prop, int32_t propLength, 527 const UChar *value, int32_t valueLength, 528 UErrorCode* ec); 529 530 /** 531 * Return true if the given position, in the given pattern, appears 532 * to be the start of a UnicodeSet pattern. 533 * 534 * @param pattern a string specifying the pattern 535 * @param patternLength the length of the pattern, or -1 if NUL-terminated 536 * @param pos the given position 537 * @stable ICU 3.2 538 */ 539 U_CAPI UBool U_EXPORT2 540 uset_resemblesPattern(const UChar *pattern, int32_t patternLength, 541 int32_t pos); 542 543 /** 544 * Returns a string representation of this set. If the result of 545 * calling this function is passed to a uset_openPattern(), it 546 * will produce another set that is equal to this one. 547 * @param set the set 548 * @param result the string to receive the rules, may be NULL 549 * @param resultCapacity the capacity of result, may be 0 if result is NULL 550 * @param escapeUnprintable if true then convert unprintable 551 * characters to their hex escape representations, \\uxxxx or 552 * \\Uxxxxxxxx.
Unprintable characters are those other than 553 * U+000A, U+0020..U+007E. 554 * @param ec error code. 555 * @return length of string, possibly larger than resultCapacity 556 * @stable ICU 2.4 557 */ 558 U_CAPI int32_t U_EXPORT2 559 uset_toPattern(const USet* set, 560 UChar* result, int32_t resultCapacity, 561 UBool escapeUnprintable, 562 UErrorCode* ec); 563 564 /** 565 * Adds the given character to the given USet. After this call, 566 * uset_contains(set, c) will return true. 567 * A frozen set will not be modified. 568 * @param set the object to which to add the character 569 * @param c the character to add 570 * @stable ICU 2.4 571 */ 572 U_CAPI void U_EXPORT2 573 uset_add(USet* set, UChar32 c); 574 575 /** 576 * Adds all of the elements in the specified set to this set if 577 * they're not already present. This operation effectively 578 * modifies this set so that its value is the <i>union</i> of the two 579 * sets. The behavior of this operation is unspecified if the specified 580 * collection is modified while the operation is in progress. 581 * A frozen set will not be modified. 582 * 583 * @param set the object to which to add the set 584 * @param additionalSet the source set whose elements are to be added to this set. 585 * @stable ICU 2.6 586 */ 587 U_CAPI void U_EXPORT2 588 uset_addAll(USet* set, const USet *additionalSet); 589 590 /** 591 * Adds the given range of characters to the given USet. After this call, 592 * uset_containsRange(set, start, end) will return true. 593 * A frozen set will not be modified. 594 * @param set the object to which to add the range 595 * @param start the first character of the range to add, inclusive 596 * @param end the last character of the range to add, inclusive 597 * @stable ICU 2.2 598 */ 599 U_CAPI void U_EXPORT2 600 uset_addRange(USet* set, UChar32 start, UChar32 end); 601 602 /** 603 * Adds the given string to the given USet. After this call, 604 * uset_containsString(set, str, strLen) will return true.
605 * A frozen set will not be modified. 606 * @param set the object to which to add the string 607 * @param str the string to add 608 * @param strLen the length of the string or -1 if null terminated. 609 * @stable ICU 2.4 610 */ 611 U_CAPI void U_EXPORT2 612 uset_addString(USet* set, const UChar* str, int32_t strLen); 613 614 /** 615 * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"} 616 * If this set already contains any particular character, it has no effect on that character. 617 * A frozen set will not be modified. 618 * @param set the object to which to add the characters 619 * @param str the source string 620 * @param strLen the length of the string or -1 if null terminated. 621 * @stable ICU 3.4 622 */ 623 U_CAPI void U_EXPORT2 624 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen); 625 626 /** 627 * Removes the given character from the given USet. After this call, 628 * uset_contains(set, c) will return false. 629 * A frozen set will not be modified. 630 * @param set the object from which to remove the character 631 * @param c the character to remove 632 * @stable ICU 2.4 633 */ 634 U_CAPI void U_EXPORT2 635 uset_remove(USet* set, UChar32 c); 636 637 /** 638 * Removes the given range of characters from the given USet. After this call, 639 * uset_contains(set, c) will return false for every character c in start..end. 640 * A frozen set will not be modified. 641 * @param set the object from which to remove the range 642 * @param start the first character of the range to remove, inclusive 643 * @param end the last character of the range to remove, inclusive 644 * @stable ICU 2.2 645 */ 646 U_CAPI void U_EXPORT2 647 uset_removeRange(USet* set, UChar32 start, UChar32 end); 648 649 /** 650 * Removes the given string from the given USet. After this call, 651 * uset_containsString(set, str, strLen) will return false. 652 * A frozen set will not be modified.
653 * @param set the object from which to remove the string 654 * @param str the string to remove 655 * @param strLen the length of the string or -1 if null terminated. 656 * @stable ICU 2.4 657 */ 658 U_CAPI void U_EXPORT2 659 uset_removeString(USet* set, const UChar* str, int32_t strLen); 660 661 /** 662 * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"} 663 * A frozen set will not be modified. 664 * 665 * @param set the object to be modified 666 * @param str the string 667 * @param length the length of the string, or -1 if NUL-terminated 668 * @stable ICU 69 669 */ 670 U_CAPI void U_EXPORT2 671 uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length); 672 673 /** 674 * Removes from this set all of its elements that are contained in the 675 * specified set. This operation effectively modifies this 676 * set so that its value is the <i>asymmetric set difference</i> of 677 * the two sets. 678 * A frozen set will not be modified. 679 * @param set the object from which the elements are to be removed 680 * @param removeSet the object that defines which elements will be 681 * removed from this set 682 * @stable ICU 3.2 683 */ 684 U_CAPI void U_EXPORT2 685 uset_removeAll(USet* set, const USet* removeSet); 686 687 /** 688 * Retain only the elements in this set that are contained in the 689 * specified range. If <code>start > end</code> then an empty range is 690 * retained, leaving the set empty. This is equivalent to 691 * a boolean logic AND, or a set INTERSECTION. 692 * A frozen set will not be modified. 693 * 694 * @param set the object for which to retain only the specified range 695 * @param start first character, inclusive, of range 696 * @param end last character, inclusive, of range 697 * @stable ICU 3.2 698 */ 699 U_CAPI void U_EXPORT2 700 uset_retain(USet* set, UChar32 start, UChar32 end); 701 702 /** 703 * Retains only the specified string from this set if it is present.
704 * Upon return this set will be empty if it did not contain str, or 705 * will only contain str if it did contain str. 706 * A frozen set will not be modified. 707 * 708 * @param set the object to be modified 709 * @param str the string 710 * @param length the length of the string, or -1 if NUL-terminated 711 * @stable ICU 69 712 */ 713 U_CAPI void U_EXPORT2 714 uset_retainString(USet *set, const UChar *str, int32_t length); 715 716 /** 717 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 718 * A frozen set will not be modified. 719 * 720 * @param set the object to be modified 721 * @param str the string 722 * @param length the length of the string, or -1 if NUL-terminated 723 * @stable ICU 69 724 */ 725 U_CAPI void U_EXPORT2 726 uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length); 727 728 /** 729 * Retains only the elements in this set that are contained in the 730 * specified set. In other words, removes from this set all of 731 * its elements that are not contained in the specified set. This 732 * operation effectively modifies this set so that its value is 733 * the <i>intersection</i> of the two sets. 734 * A frozen set will not be modified. 735 * 736 * @param set the object on which to perform the retain 737 * @param retain set that defines which elements this set will retain 738 * @stable ICU 3.2 739 */ 740 U_CAPI void U_EXPORT2 741 uset_retainAll(USet* set, const USet* retain); 742 743 /** 744 * Reallocate this object's internal structures to take up the least 745 * possible space, without changing this object's value. 746 * A frozen set will not be modified. 747 * 748 * @param set the object on which to perform the compact 749 * @stable ICU 3.2 750 */ 751 U_CAPI void U_EXPORT2 752 uset_compact(USet* set); 753 754 /** 755 * This is equivalent to 756 * <code>uset_complementRange(set, 0, 0x10FFFF)</code>.
757 * 758 * <strong>Note:</strong> This performs a symmetric difference with all code points 759 * <em>and thus retains all multicharacter strings</em>. 760 * In order to achieve a “code point complement” (all code points minus this set), 761 * the easiest is to <code>uset_complement(set); uset_removeAllStrings(set);</code>. 762 * 763 * A frozen set will not be modified. 764 * @param set the set 765 * @stable ICU 2.4 766 */ 767 U_CAPI void U_EXPORT2 768 uset_complement(USet* set); 769 770 /** 771 * Complements the specified range in this set. Any character in 772 * the range will be removed if it is in this set, or will be 773 * added if it is not in this set. If <code>start > end</code> 774 * then an empty range is complemented, leaving the set unchanged. 775 * This is equivalent to a boolean logic XOR. 776 * A frozen set will not be modified. 777 * 778 * @param set the object to be modified 779 * @param start first character, inclusive, of range 780 * @param end last character, inclusive, of range 781 * @stable ICU 69 782 */ 783 U_CAPI void U_EXPORT2 784 uset_complementRange(USet *set, UChar32 start, UChar32 end); 785 786 /** 787 * Complements the specified string in this set. 788 * The string will be removed if it is in this set, or will be added if it is not in this set. 789 * A frozen set will not be modified. 790 * 791 * @param set the object to be modified 792 * @param str the string 793 * @param length the length of the string, or -1 if NUL-terminated 794 * @stable ICU 69 795 */ 796 U_CAPI void U_EXPORT2 797 uset_complementString(USet *set, const UChar *str, int32_t length); 798 799 /** 800 * Complements EACH of the characters in this string. Note: "ch" == {"c", "h"} 801 * A frozen set will not be modified. 
802 * 803 * @param set the object to be modified 804 * @param str the string 805 * @param length the length of the string, or -1 if NUL-terminated 806 * @stable ICU 69 807 */ 808 U_CAPI void U_EXPORT2 809 uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length); 810 811 /** 812 * Complements in this set all elements contained in the specified 813 * set. Any character in the other set will be removed if it is 814 * in this set, or will be added if it is not in this set. 815 * A frozen set will not be modified. 816 * 817 * @param set the set with which to complement 818 * @param complement set that defines which elements will be xor'ed 819 * from this set. 820 * @stable ICU 3.2 821 */ 822 U_CAPI void U_EXPORT2 823 uset_complementAll(USet* set, const USet* complement); 824 825 /** 826 * Removes all of the elements from this set. This set will be 827 * empty after this call returns. 828 * A frozen set will not be modified. 829 * @param set the set 830 * @stable ICU 2.4 831 */ 832 U_CAPI void U_EXPORT2 833 uset_clear(USet* set); 834 835 /** 836 * Close this set over the given attribute. For the attribute 837 * USET_CASE_INSENSITIVE, the result is to modify this set so that: 838 * 839 * 1. For each character or string 'a' in this set, all strings or 840 * characters 'b' such that foldCase(a) == foldCase(b) are added 841 * to this set. 842 * 843 * 2. For each string 'e' in the resulting set, if e != 844 * foldCase(e), 'e' will be removed. 845 * 846 * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] 847 * 848 * (Here foldCase(x) refers to the operation u_strFoldCase, and a 849 * == b denotes that the contents are the same, not pointer 850 * comparison.) 851 * 852 * A frozen set will not be modified. 853 * 854 * @param set the set 855 * 856 * @param attributes bitmask for attributes to close over. 857 * Valid options: 858 * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 
859 * These case options are mutually exclusive. 860 * Unrelated options bits are ignored. 861 * @stable ICU 4.2 862 */ 863 U_CAPI void U_EXPORT2 864 uset_closeOver(USet* set, int32_t attributes); 865 866 /** 867 * Remove all strings from this set. 868 * 869 * @param set the set 870 * @stable ICU 4.2 871 */ 872 U_CAPI void U_EXPORT2 873 uset_removeAllStrings(USet* set); 874 875 /** 876 * Returns true if the given USet contains no characters and no 877 * strings. 878 * @param set the set 879 * @return true if set is empty 880 * @stable ICU 2.4 881 */ 882 U_CAPI UBool U_EXPORT2 883 uset_isEmpty(const USet* set); 884 885 /** * Returns whether this set contains any strings (multi-character strings or the empty string). 886 * @param set the set 887 * @return true if this set contains multi-character strings or the empty string. 888 * @stable ICU 70 889 */ 890 U_CAPI UBool U_EXPORT2 891 uset_hasStrings(const USet *set); 892 893 /** 894 * Returns true if the given USet contains the given character. 895 * This function works faster with a frozen set. 896 * @param set the set 897 * @param c The code point to check for within the set 898 * @return true if set contains c 899 * @stable ICU 2.4 900 */ 901 U_CAPI UBool U_EXPORT2 902 uset_contains(const USet* set, UChar32 c); 903 904 /** 905 * Returns true if the given USet contains all characters c 906 * where start <= c && c <= end. 907 * @param set the set 908 * @param start the first character of the range to test, inclusive 909 * @param end the last character of the range to test, inclusive 910 * @return true if set contains the range 911 * @stable ICU 2.2 912 */ 913 U_CAPI UBool U_EXPORT2 914 uset_containsRange(const USet* set, UChar32 start, UChar32 end); 915 916 /** 917 * Returns true if the given USet contains the given string. 918 * @param set the set 919 * @param str the string 920 * @param strLen the length of the string, or -1 if NUL-terminated.
921 * @return true if set contains str 922 * @stable ICU 2.4 923 */ 924 U_CAPI UBool U_EXPORT2 925 uset_containsString(const USet* set, const UChar* str, int32_t strLen); 926 927 /** 928 * Returns the index of the given character within this set, where 929 * the set is ordered by ascending code point. If the character 930 * is not in this set, returns -1. The inverse of this function is 931 * <code>uset_charAt()</code>. 932 * @param set the set 933 * @param c the character to obtain the index for 934 * @return an index from 0..size()-1, or -1 935 * @stable ICU 3.2 936 */ 937 U_CAPI int32_t U_EXPORT2 938 uset_indexOf(const USet* set, UChar32 c); 939 940 /** 941 * Returns the character at the given index within this set, where 942 * the set is ordered by ascending code point. If the index is 943 * out of range for characters, returns (UChar32)-1. 944 * The inverse of this function is <code>uset_indexOf()</code>. 945 * 946 * For iteration, this is slower than uset_getRangeCount()/uset_getItemCount() 947 * with uset_getItem(), because for each call it skips linearly over <code>charIndex</code> 948 * characters in the ranges. 949 * 950 * @param set the set 951 * @param charIndex an index from 0..size()-1 to obtain the char for 952 * @return the character at the given index, or (UChar32)-1. 953 * @stable ICU 3.2 954 */ 955 U_CAPI UChar32 U_EXPORT2 956 uset_charAt(const USet* set, int32_t charIndex); 957 958 /** 959 * Returns the number of characters and strings contained in this set. 960 * The last (uset_getItemCount() - uset_getRangeCount()) items are strings. 961 * 962 * This is slower than uset_getRangeCount() and uset_getItemCount() because 963 * it counts the code points of all ranges.
964 * 965 * @param set the set 966 * @return a non-negative integer counting the characters and strings 967 * contained in set 968 * @stable ICU 2.4 969 * @see uset_getRangeCount 970 */ 971 U_CAPI int32_t U_EXPORT2 972 uset_size(const USet* set); 973 974 /** * Returns the number of ranges of characters in this set. 975 * @param set the set 976 * @return the number of ranges in this set. 977 * @stable ICU 70 978 * @see uset_getItemCount 979 * @see uset_getItem 980 * @see uset_size 981 */ 982 U_CAPI int32_t U_EXPORT2 983 uset_getRangeCount(const USet *set); 984 985 /** 986 * Returns the number of items in this set. An item is either a range 987 * of characters or a single multicharacter string. 988 * @param set the set 989 * @return a non-negative integer counting the character ranges 990 * and/or strings contained in set 991 * @stable ICU 2.4 * @see uset_getRangeCount * @see uset_getItem 992 */ 993 U_CAPI int32_t U_EXPORT2 994 uset_getItemCount(const USet* set); 995 996 /** 997 * Returns an item of this set. An item is either a range of 998 * characters or a single multicharacter string (which can be the empty string). 999 * 1000 * If <code>itemIndex</code> is less than uset_getRangeCount(), then this function returns 0, 1001 * and the range is <code>*start</code>..<code>*end</code>. 1002 * 1003 * If <code>itemIndex</code> is at least uset_getRangeCount() and less than uset_getItemCount(), then 1004 * this function copies the string into <code>str[strCapacity]</code> and 1005 * returns the length of the string (0 for the empty string). 1006 * 1007 * If <code>itemIndex</code> is out of range, then this function returns -1. 1008 * 1009 * Note that 0 is returned for each range as well as for the empty string.
1010 * 1011 * @param set the set 1012 * @param itemIndex a non-negative integer in the range 0..uset_getItemCount(set)-1 1013 * @param start pointer to variable to receive first character in range, inclusive; 1014 * can be NULL for a string item 1015 * @param end pointer to variable to receive last character in range, inclusive; 1016 * can be NULL for a string item 1017 * @param str buffer to receive the string, may be NULL 1018 * @param strCapacity capacity of str, or 0 if str is NULL 1019 * @param ec error code; U_INDEX_OUTOFBOUNDS_ERROR if the itemIndex is out of range 1020 * @return the length of the string (0 or >= 2), or 0 if the item is a range, 1021 * or -1 if the itemIndex is out of range 1022 * @stable ICU 2.4 1023 */ 1024 U_CAPI int32_t U_EXPORT2 1025 uset_getItem(const USet* set, int32_t itemIndex, 1026 UChar32* start, UChar32* end, 1027 UChar* str, int32_t strCapacity, 1028 UErrorCode* ec); 1029 1030 /** 1031 * Returns true if set1 contains all the characters and strings 1032 * of set2. It answers the question, 'Is set1 a superset of set2?' 1033 * @param set1 the set that is tested for being a superset 1034 * @param set2 the set whose characters and strings are looked up in set1 1035 * @return true if set1 contains all the characters and strings of set2 1036 * @stable ICU 3.2 1037 */ 1038 U_CAPI UBool U_EXPORT2 1039 uset_containsAll(const USet* set1, const USet* set2); 1040 1041 /** 1042 * Returns true if this set contains all the characters 1043 * of the given string. This does not check containment of grapheme 1044 * clusters, unlike uset_containsString. 1045 * @param set set of characters to be checked for containment 1046 * @param str string containing code points to be checked for containment 1047 * @param strLen the length of the string, or -1 if NUL-terminated.
1048 * @return true if the test condition is met 1049 * @stable ICU 3.4 1050 */ 1051 U_CAPI UBool U_EXPORT2 1052 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen); 1053 1054 /** 1055 * Returns true if set1 contains none of the characters and strings 1056 * of set2. It answers the question, 'Are set1 and set2 disjoint?' 1057 * @param set1 set to be checked for containment 1058 * @param set2 set to be checked for containment 1059 * @return true if set1 contains none of the characters and strings of set2 1060 * @stable ICU 3.2 1061 */ 1062 U_CAPI UBool U_EXPORT2 1063 uset_containsNone(const USet* set1, const USet* set2); 1064 1065 /** 1066 * Returns true if set1 contains some of the characters and strings 1067 * of set2. It answers the question, 'Do set1 and set2 have an intersection?' 1068 * @param set1 set to be checked for containment 1069 * @param set2 set to be checked for containment 1070 * @return true if set1 contains some of the characters and strings of set2 1071 * @stable ICU 3.2 1072 */ 1073 U_CAPI UBool U_EXPORT2 1074 uset_containsSome(const USet* set1, const USet* set2); 1075 1076 /** 1077 * Returns the length of the initial substring of the input string which 1078 * consists only of characters and strings that are contained in this set 1079 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1080 * or only of characters and strings that are not contained 1081 * in this set (USET_SPAN_NOT_CONTAINED). 1082 * See USetSpanCondition for details. 1083 * Similar to the strspn() C library function. 1084 * Unpaired surrogates are treated according to uset_contains() of their surrogate code points. 1085 * This function works faster with a frozen set and with a non-negative string length argument.
1086 * @param set the set 1087 * @param s start of the string 1088 * @param length of the string; can be -1 for NUL-terminated 1089 * @param spanCondition specifies the containment condition 1090 * @return the length of the initial substring according to the spanCondition; 1091 * 0 if the start of the string does not fit the spanCondition 1092 * @stable ICU 3.8 1093 * @see USetSpanCondition 1094 */ 1095 U_CAPI int32_t U_EXPORT2 1096 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); 1097 1098 /** 1099 * Returns the start of the trailing substring of the input string which 1100 * consists only of characters and strings that are contained in this set 1101 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1102 * or only of characters and strings that are not contained 1103 * in this set (USET_SPAN_NOT_CONTAINED). 1104 * See USetSpanCondition for details. 1105 * Unpaired surrogates are treated according to uset_contains() of their surrogate code points. 1106 * This function works faster with a frozen set and with a non-negative string length argument. 1107 * @param set the set 1108 * @param s start of the string 1109 * @param length length of the string; can be -1 for NUL-terminated 1110 * @param spanCondition specifies the containment condition 1111 * @return the start of the trailing substring according to the spanCondition; 1112 * the string length if the end of the string does not fit the spanCondition 1113 * @stable ICU 3.8 1114 * @see USetSpanCondition * @see uset_span 1115 */ 1116 U_CAPI int32_t U_EXPORT2 1117 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); 1118 1119 /** 1120 * Returns the length of the initial substring of the input string which 1121 * consists only of characters and strings that are contained in this set 1122 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1123 * or only of characters and strings that are not contained 1124 * in this set (USET_SPAN_NOT_CONTAINED).
1125 * See USetSpanCondition for details. 1126 * Similar to the strspn() C library function. 1127 * Malformed byte sequences are treated according to contains(0xfffd). 1128 * This function works faster with a frozen set and with a non-negative string length argument. 1129 * @param set the set 1130 * @param s start of the string (UTF-8) 1131 * @param length of the string; can be -1 for NUL-terminated 1132 * @param spanCondition specifies the containment condition 1133 * @return the length of the initial substring according to the spanCondition; 1134 * 0 if the start of the string does not fit the spanCondition 1135 * @stable ICU 3.8 1136 * @see USetSpanCondition 1137 */ 1138 U_CAPI int32_t U_EXPORT2 1139 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); 1140 1141 /** 1142 * Returns the start of the trailing substring of the input string which 1143 * consists only of characters and strings that are contained in this set 1144 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1145 * or only of characters and strings that are not contained 1146 * in this set (USET_SPAN_NOT_CONTAINED). 1147 * See USetSpanCondition for details. 1148 * Malformed byte sequences are treated according to uset_contains(set, 0xfffd). 1149 * This function works faster with a frozen set and with a non-negative string length argument.
1150 * @param set the set 1151 * @param s start of the string (UTF-8) 1152 * @param length of the string; can be -1 for NUL-terminated 1153 * @param spanCondition specifies the containment condition 1154 * @return the start of the trailing substring according to the spanCondition; 1155 * the string length if the end of the string does not fit the spanCondition 1156 * @stable ICU 3.8 1157 * @see USetSpanCondition 1158 */ 1159 U_CAPI int32_t U_EXPORT2 1160 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); 1161 1162 /** 1163 * Returns true if set1 contains all of the characters and strings 1164 * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?' 1165 * @param set1 the first set to compare 1166 * @param set2 the second set to compare 1167 * @return true if the two sets contain the same characters and strings 1168 * @stable ICU 3.2 1169 */ 1170 U_CAPI UBool U_EXPORT2 1171 uset_equals(const USet* set1, const USet* set2); 1172 1173 /********************************************************************* 1174 * Serialized set API 1175 *********************************************************************/ 1176 1177 /** 1178 * Serializes this set into an array of 16-bit integers. Serialization 1179 * (currently) only records the characters in the set; multicharacter 1180 * strings are ignored. 1181 * 1182 * The array 1183 * has the following format (each line is one 16-bit integer): 1184 * 1185 * length = (n+2*m) | (m!=0?0x8000:0) 1186 * bmpLength = n; present if m!=0 1187 * bmp[0] 1188 * bmp[1] 1189 * ... 1190 * bmp[n-1] 1191 * supp-high[0] 1192 * supp-low[0] 1193 * supp-high[1] 1194 * supp-low[1] 1195 * ... 1196 * supp-high[m-1] 1197 * supp-low[m-1] 1198 * 1199 * The array starts with a header. After the header are n bmp 1200 * code points, then m supplementary code points. Either n or m 1201 * or both may be zero. n+2*m is always <= 0x7FFF.
1202 * 1203 * If there are no supplementary characters (if m==0) then the 1204 * header is one 16-bit integer, 'length', with value n. 1205 * 1206 * If there are supplementary characters (if m!=0) then the header 1207 * is two 16-bit integers. The first, 'length', has value 1208 * (n+2*m)|0x8000. The second, 'bmpLength', has value n. 1209 * 1210 * After the header the code points are stored in ascending order. 1211 * Supplementary code points are stored as most significant 16 1212 * bits followed by least significant 16 bits. 1213 * 1214 * @param set the set 1215 * @param dest pointer to buffer of destCapacity 16-bit integers. 1216 * May be NULL only if destCapacity is zero. 1217 * @param destCapacity size of dest, or zero. Must not be negative. 1218 * @param pErrorCode pointer to the error code. Will be set to 1219 * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to 1220 * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity. 1221 * @return the total length of the serialized format, including 1222 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other 1223 * than U_BUFFER_OVERFLOW_ERROR. 1224 * @stable ICU 2.4 1225 */ 1226 U_CAPI int32_t U_EXPORT2 1227 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode); 1228 1229 /** 1230 * Given a serialized array, fills in the given serialized set object. 1231 * @param fillSet pointer to the serialized set object that receives the result 1232 * @param src pointer to the start of the serialized array 1233 * @param srcLength length of the serialized array 1234 * @return true if the given array is valid, otherwise false 1235 * @stable ICU 2.4 1236 */ 1237 U_CAPI UBool U_EXPORT2 1238 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength); 1239 1240 /** 1241 * Sets the USerializedSet to contain the given character (and nothing 1242 * else).
1243 * @param fillSet pointer to result 1244 * @param c The codepoint to set 1245 * @stable ICU 2.4 1246 */ 1247 U_CAPI void U_EXPORT2 1248 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c); 1249 1250 /** 1251 * Returns true if the given USerializedSet contains the given 1252 * character. 1253 * @param set the serialized set 1254 * @param c the code point to check for within the set 1255 * @return true if set contains c 1256 * @stable ICU 2.4 1257 */ 1258 U_CAPI UBool U_EXPORT2 1259 uset_serializedContains(const USerializedSet* set, UChar32 c); 1260 1261 /** 1262 * Returns the number of disjoint ranges of characters contained in 1263 * the given serialized set. Ignores any strings contained in the 1264 * set. 1265 * @param set the serialized set 1266 * @return a non-negative integer counting the character ranges 1267 * contained in set 1268 * @stable ICU 2.4 * @see uset_getSerializedRange 1269 */ 1270 U_CAPI int32_t U_EXPORT2 1271 uset_getSerializedRangeCount(const USerializedSet* set); 1272 1273 /** 1274 * Returns a range of characters contained in the given serialized 1275 * set. 1276 * @param set the serialized set 1277 * @param rangeIndex a non-negative integer in the range 0.. 1278 * uset_getSerializedRangeCount(set)-1 1279 * @param pStart pointer to variable to receive first character 1280 * in range, inclusive 1281 * @param pEnd pointer to variable to receive last character in range, 1282 * inclusive 1283 * @return true if rangeIndex is valid, otherwise false 1284 * @stable ICU 2.4 * @see uset_getSerializedRangeCount 1285 */ 1286 U_CAPI UBool U_EXPORT2 1287 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, 1288 UChar32* pStart, UChar32* pEnd); 1289 1290 #endif 1291