1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2004-2012, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: utext.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2004oct06 16 * created by: Markus W. Scherer 17 */ 18 19 #ifndef __UTEXT_H__ 20 #define __UTEXT_H__ 21 22 /** 23 * @addtogroup icu4c ICU4C 24 * @{ 25 * \file 26 * \brief C API: Abstract Unicode Text API 27 * 28 * The Text Access API provides a means to allow text that is stored in alternative 29 * formats to work with ICU services. ICU normally operates on text that is 30 * stored in UTF-16 format, in (UChar *) arrays for the C APIs or as type 31 * UnicodeString for C++ APIs. 32 * 33 * ICU Text Access allows other formats, such as UTF-8 or non-contiguous 34 * UTF-16 strings, to be placed in a UText wrapper and then passed to ICU services. 35 * 36 * There are three general classes of usage for UText: 37 * 38 * Application Level Use. This is the simplest usage - applications would 39 * use one of the utext_open() functions on their input text, and pass 40 * the resulting UText to the desired ICU service. 41 * 42 * Second is usage in ICU Services, such as break iteration, that will need to 43 * operate on input presented to them as a UText. These implementations 44 * will need to use the iteration and related UText functions to gain 45 * access to the actual text. 46 * 47 * The third class of UText users are "text providers." These are the 48 * UText implementations for the various text storage formats. An application 49 * or system with a unique text storage format can implement a set of 50 * UText provider functions for that format, which will then allow 51 * ICU services to operate on that format. 52 * 53 * 54 * <em>Iterating over text</em> 55 * 56 * Here is sample code for a forward iteration over the contents of a UText 57 * 58 * \code 59 * UChar32 c; 60 * UText *ut = whatever(); 61 * 62 * for (c=utext_next32From(ut, 0); c>=0; c=utext_next32(ut)) { 63 * // do whatever with the codepoint c here. 64 * } 65 * \endcode 66 * 67 * And here is similar code to iterate in the reverse direction, from the end 68 * of the text towards the beginning. 69 * 70 * \code 71 * UChar32 c; 72 * UText *ut = whatever(); 73 * int textLength = utext_nativeLength(ut); 74 * for (c=utext_previous32From(ut, textLength); c>=0; c=utext_previous32(ut)) { 75 * // do whatever with the codepoint c here. 76 * } 77 * \endcode 78 * 79 * <em>Characters and Indexing</em> 80 * 81 * Indexing into text by UText functions is nearly always in terms of the native 82 * indexing of the underlying text storage. The storage format could be UTF-8 83 * or UTF-32, for example. When coding to the UText access API, no assumptions 84 * can be made regarding the size of characters, or how far an index 85 * may move when iterating between characters. 86 * 87 * All indices supplied to UText functions are pinned to the length of the 88 * text. An out-of-bounds index is not considered to be an error, but is 89 * adjusted to be in the range 0 <= index <= length of input text. 90 * 91 * 92 * When an index position is returned from a UText function, it will be 93 * a native index to the underlying text. In the case of multi-unit characters, 94 * it will always refer to the first position of the character, 95 * never to the interior. This is essentially the same thing as saying that 96 * a returned index will always point to a boundary between characters. 97 * 98 * When a native index is supplied to a UText function, all indices that 99 * refer to any part of a multi-unit character representation are considered 100 * to be equivalent. In the case of multi-unit characters, an incoming index 101 * will be logically normalized to refer to the start of the character. 102 * 103 * It is possible to test whether a native index is on a code point boundary 104 * by doing a utext_setNativeIndex() followed by a utext_getNativeIndex(). 105 * If the index is returned unchanged, it was on a code point boundary. If 106 * an adjusted index is returned, the original index referred to the 107 * interior of a character. 108 * 109 * <em>Conventions for calling UText functions</em> 110 * 111 * Most UText access functions have as their first parameter a (UText *) pointer, 112 * which specifies the UText to be used. Unless otherwise noted, the 113 * pointer must refer to a valid, open UText. Attempting to 114 * use a closed UText or passing a NULL pointer is a programming error and 115 * will produce undefined results or NULL pointer exceptions. 116 * 117 * The UText_Open family of functions can either open an existing (closed) 118 * UText, or heap allocate a new UText. Here is sample code for creating 119 * a stack-allocated UText. 120 * 121 * \code 122 * char *s = whatever(); // A utf-8 string 123 * U_ErrorCode status = U_ZERO_ERROR; 124 * UText ut = UTEXT_INITIALIZER; 125 * utext_openUTF8(ut, s, -1, &status); 126 * if (U_FAILURE(status)) { 127 * // error handling 128 * } else { 129 * // work with the UText 130 * } 131 * \endcode 132 * 133 * Any existing UText passed to an open function _must_ have been initialized, 134 * either by the UTEXT_INITIALIZER, or by having been originally heap-allocated 135 * by an open function. Passing NULL will cause the open function to 136 * heap-allocate and fully initialize a new UText. 137 * 138 */ 139 140 141 142 #include "unicode/utypes.h" 143 #include "unicode/uchar.h" 144 #if U_SHOW_CPLUSPLUS_API 145 #include "unicode/localpointer.h" 146 #include "unicode/rep.h" 147 #include "unicode/unistr.h" 148 #include "unicode/chariter.h" 149 #endif 150 151 152 U_CDECL_BEGIN 153 154 struct UText; 155 typedef struct UText UText; /**< C typedef for struct UText. \xrefitem stable "Stable" "Stable List" ICU 3.6 */ 156 157 158 /*************************************************************************************** 159 * 160 * C Functions for creating UText wrappers around various kinds of text strings. 161 * 162 ****************************************************************************************/ 163 164 165 /** 166 * Close function for UText instances. 167 * Cleans up, releases any resources being held by an open UText. 168 * <p> 169 * If the UText was originally allocated by one of the utext_open functions, 170 * the storage associated with the utext will also be freed. 171 * If the UText storage originated with the application, as it would with 172 * a local or static instance, the storage will not be deleted. 173 * 174 * An open UText can be reset to refer to new string by using one of the utext_open() 175 * functions without first closing the UText. 176 * 177 * @param ut The UText to be closed. 178 * @return NULL if the UText struct was deleted by the close. If the UText struct 179 * was originally provided by the caller to the open function, it is 180 * returned by this function, and may be safely used again in 181 * a subsequent utext_open. 182 * 183 * \xrefitem stable "Stable" "Stable List" ICU 3.4 184 */ 185 U_CAPI UText * U_EXPORT2 186 utext_close(UText *ut) __INTRODUCED_IN(31); 187 188 189 190 /** 191 * Open a read-only UText implementation for UTF-8 strings. 192 * 193 * \htmlonly 194 * Any invalid UTF-8 in the input will be handled in this way: 195 * a sequence of bytes that has the form of a truncated, but otherwise valid, 196 * UTF-8 sequence will be replaced by a single unicode replacement character, \uFFFD. 197 * Any other illegal bytes will each be replaced by a \uFFFD. 198 * \endhtmlonly 199 * 200 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 201 * If non-NULL, must refer to an initialized UText struct, which will then 202 * be reset to reference the specified UTF-8 string. 203 * @param s A UTF-8 string. Must not be NULL. 204 * @param length The length of the UTF-8 string in bytes, or -1 if the string is 205 * zero terminated. 206 * @param status Errors are returned here. 207 * @return A pointer to the UText. If a pre-allocated UText was provided, it 208 * will always be used and returned. 209 * \xrefitem stable "Stable" "Stable List" ICU 3.4 210 */ 211 U_CAPI UText * U_EXPORT2 212 utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status) __INTRODUCED_IN(31); 213 214 215 216 217 /** 218 * Open a read-only UText for UChar * string. 219 * 220 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 221 * If non-NULL, must refer to an initialized UText struct, which will then 222 * be reset to reference the specified UChar string. 223 * @param s A UChar (UTF-16) string 224 * @param length The number of UChars in the input string, or -1 if the string is 225 * zero terminated. 226 * @param status Errors are returned here. 227 * @return A pointer to the UText. If a pre-allocated UText was provided, it 228 * will always be used and returned. 229 * \xrefitem stable "Stable" "Stable List" ICU 3.4 230 */ 231 U_CAPI UText * U_EXPORT2 232 utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status) __INTRODUCED_IN(31); 233 234 235 236 237 #if U_SHOW_CPLUSPLUS_API 238 239 240 241 242 243 244 245 246 247 248 #endif 249 250 251 /** 252 * Clone a UText. This is much like opening a UText where the source text is itself 253 * another UText. 254 * 255 * A deep clone will copy both the UText data structures and the underlying text. 256 * The original and cloned UText will operate completely independently; modifications 257 * made to the text in one will not affect the other. Text providers are not 258 * required to support deep clones. The user of clone() must check the status return 259 * and be prepared to handle failures. 260 * 261 * The standard UText implementations for UTF8, UChar *, UnicodeString and 262 * Replaceable all support deep cloning. 263 * 264 * The UText returned from a deep clone will be writable, assuming that the text 265 * provider is able to support writing, even if the source UText had been made 266 * non-writable by means of UText_freeze(). 267 * 268 * A shallow clone replicates only the UText data structures; it does not make 269 * a copy of the underlying text. Shallow clones can be used as an efficient way to 270 * have multiple iterators active in a single text string that is not being 271 * modified. 272 * 273 * A shallow clone operation will not fail, barring truly exceptional conditions such 274 * as memory allocation failures. 275 * 276 * Shallow UText clones should be avoided if the UText functions that modify the 277 * text are expected to be used, either on the original or the cloned UText. 278 * Any such modifications can cause unpredictable behavior. Read Only 279 * shallow clones provide some protection against errors of this type by 280 * disabling text modification via the cloned UText. 281 * 282 * A shallow clone made with the readOnly parameter == false will preserve the 283 * utext_isWritable() state of the source object. Note, however, that 284 * write operations must be avoided while more than one UText exists that refer 285 * to the same underlying text. 286 * 287 * A UText and its clone may be safely concurrently accessed by separate threads. 288 * This is true for read access only with shallow clones, and for both read and 289 * write access with deep clones. 290 * It is the responsibility of the Text Provider to ensure that this thread safety 291 * constraint is met. 292 * 293 * @param dest A UText struct to be filled in with the result of the clone operation, 294 * or NULL if the clone function should heap-allocate a new UText struct. 295 * If non-NULL, must refer to an already existing UText, which will then 296 * be reset to become the clone. 297 * @param src The UText to be cloned. 298 * @param deep true to request a deep clone, false for a shallow clone. 299 * @param readOnly true to request that the cloned UText have read only access to the 300 * underlying text. 301 302 * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR 303 * will be returned if the text provider is unable to clone the 304 * original text. 305 * @return The newly created clone, or NULL if the clone operation failed. 306 * \xrefitem stable "Stable" "Stable List" ICU 3.4 307 */ 308 U_CAPI UText * U_EXPORT2 309 utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) __INTRODUCED_IN(31); 310 311 312 313 314 /** 315 * Compare two UText objects for equality. 316 * UTexts are equal if they are iterating over the same text, and 317 * have the same iteration position within the text. 318 * If either or both of the parameters are NULL, the comparison is false. 319 * 320 * @param a The first of the two UTexts to compare. 321 * @param b The other UText to be compared. 322 * @return true if the two UTexts are equal. 323 * \xrefitem stable "Stable" "Stable List" ICU 3.6 324 */ 325 U_CAPI UBool U_EXPORT2 326 utext_equals(const UText *a, const UText *b) __INTRODUCED_IN(31); 327 328 329 330 331 /***************************************************************************** 332 * 333 * Functions to work with the text represented by a UText wrapper 334 * 335 *****************************************************************************/ 336 337 /** 338 * Get the length of the text. Depending on the characteristics 339 * of the underlying text representation, this may be expensive. 340 * @see utext_isLengthExpensive() 341 * 342 * 343 * @param ut the text to be accessed. 344 * @return the length of the text, expressed in native units. 345 * 346 * \xrefitem stable "Stable" "Stable List" ICU 3.4 347 */ 348 U_CAPI int64_t U_EXPORT2 349 utext_nativeLength(UText *ut) __INTRODUCED_IN(31); 350 351 352 353 354 355 /** 356 * Returns the code point at the requested index, 357 * or U_SENTINEL (-1) if it is out of bounds. 358 * 359 * If the specified index points to the interior of a multi-unit 360 * character - one of the trail bytes of a UTF-8 sequence, for example - 361 * the complete code point will be returned. 362 * 363 * The iteration position will be set to the start of the returned code point. 364 * 365 * This function is roughly equivalent to the sequence 366 * utext_setNativeIndex(index); 367 * utext_current32(); 368 * (There is a subtle difference if the index is out of bounds by being less than zero - 369 * utext_setNativeIndex(negative value) sets the index to zero, after which utext_current() 370 * will return the char at zero. utext_char32At(negative index), on the other hand, will 371 * return the U_SENTINEL value of -1.) 372 * 373 * @param ut the text to be accessed 374 * @param nativeIndex the native index of the character to be accessed. If the index points 375 * to other than the first unit of a multi-unit character, it will be adjusted 376 * to the start of the character. 377 * @return the code point at the specified index. 378 * \xrefitem stable "Stable" "Stable List" ICU 3.4 379 */ 380 U_CAPI UChar32 U_EXPORT2 381 utext_char32At(UText *ut, int64_t nativeIndex) __INTRODUCED_IN(31); 382 383 384 385 386 /** 387 * 388 * Get the code point at the current iteration position, 389 * or U_SENTINEL (-1) if the iteration has reached the end of 390 * the input text. 391 * 392 * @param ut the text to be accessed. 393 * @return the Unicode code point at the current iterator position. 394 * \xrefitem stable "Stable" "Stable List" ICU 3.4 395 */ 396 U_CAPI UChar32 U_EXPORT2 397 utext_current32(UText *ut) __INTRODUCED_IN(31); 398 399 400 401 402 /** 403 * Get the code point at the current iteration position of the UText, and 404 * advance the position to the first index following the character. 405 * 406 * If the position is at the end of the text (the index following 407 * the last character, which is also the length of the text), 408 * return U_SENTINEL (-1) and do not advance the index. 409 * 410 * This is a post-increment operation. 411 * 412 * An inline macro version of this function, UTEXT_NEXT32(), 413 * is available for performance critical use. 414 * 415 * @param ut the text to be accessed. 416 * @return the Unicode code point at the iteration position. 417 * @see UTEXT_NEXT32 418 * \xrefitem stable "Stable" "Stable List" ICU 3.4 419 */ 420 U_CAPI UChar32 U_EXPORT2 421 utext_next32(UText *ut) __INTRODUCED_IN(31); 422 423 424 425 426 /** 427 * Move the iterator position to the character (code point) whose 428 * index precedes the current position, and return that character. 429 * This is a pre-decrement operation. 430 * 431 * If the initial position is at the start of the text (index of 0) 432 * return U_SENTINEL (-1), and leave the position unchanged. 433 * 434 * An inline macro version of this function, UTEXT_PREVIOUS32(), 435 * is available for performance critical use. 436 * 437 * @param ut the text to be accessed. 438 * @return the previous UChar32 code point, or U_SENTINEL (-1) 439 * if the iteration has reached the start of the text. 440 * @see UTEXT_PREVIOUS32 441 * \xrefitem stable "Stable" "Stable List" ICU 3.4 442 */ 443 U_CAPI UChar32 U_EXPORT2 444 utext_previous32(UText *ut) __INTRODUCED_IN(31); 445 446 447 448 449 /** 450 * Set the iteration index and return the code point at that index. 451 * Leave the iteration index at the start of the following code point. 452 * 453 * This function is the most efficient and convenient way to 454 * begin a forward iteration. The results are identical to the those 455 * from the sequence 456 * \code 457 * utext_setIndex(); 458 * utext_next32(); 459 * \endcode 460 * 461 * @param ut the text to be accessed. 462 * @param nativeIndex Iteration index, in the native units of the text provider. 463 * @return Code point which starts at or before index, 464 * or U_SENTINEL (-1) if it is out of bounds. 465 * \xrefitem stable "Stable" "Stable List" ICU 3.4 466 */ 467 U_CAPI UChar32 U_EXPORT2 468 utext_next32From(UText *ut, int64_t nativeIndex) __INTRODUCED_IN(31); 469 470 471 472 473 474 /** 475 * Set the iteration index, and return the code point preceding the 476 * one specified by the initial index. Leave the iteration position 477 * at the start of the returned code point. 478 * 479 * This function is the most efficient and convenient way to 480 * begin a backwards iteration. 481 * 482 * @param ut the text to be accessed. 483 * @param nativeIndex Iteration index in the native units of the text provider. 484 * @return Code point preceding the one at the initial index, 485 * or U_SENTINEL (-1) if it is out of bounds. 486 * 487 * \xrefitem stable "Stable" "Stable List" ICU 3.4 488 */ 489 U_CAPI UChar32 U_EXPORT2 490 utext_previous32From(UText *ut, int64_t nativeIndex) __INTRODUCED_IN(31); 491 492 493 494 /** 495 * Get the current iterator position, which can range from 0 to 496 * the length of the text. 497 * The position is a native index into the input text, in whatever format it 498 * may have (possibly UTF-8 for example), and may not always be the same as 499 * the corresponding UChar (UTF-16) index. 500 * The returned position will always be aligned to a code point boundary. 501 * 502 * @param ut the text to be accessed. 503 * @return the current index position, in the native units of the text provider. 504 * \xrefitem stable "Stable" "Stable List" ICU 3.4 505 */ 506 U_CAPI int64_t U_EXPORT2 507 utext_getNativeIndex(const UText *ut) __INTRODUCED_IN(31); 508 509 510 511 /** 512 * Set the current iteration position to the nearest code point 513 * boundary at or preceding the specified index. 514 * The index is in the native units of the original input text. 515 * If the index is out of range, it will be pinned to be within 516 * the range of the input text. 517 * <p> 518 * It will usually be more efficient to begin an iteration 519 * using the functions utext_next32From() or utext_previous32From() 520 * rather than setIndex(). 521 * <p> 522 * Moving the index position to an adjacent character is best done 523 * with utext_next32(), utext_previous32() or utext_moveIndex32(). 524 * Attempting to do direct arithmetic on the index position is 525 * complicated by the fact that the size (in native units) of a 526 * character depends on the underlying representation of the character 527 * (UTF-8, UTF-16, UTF-32, arbitrary codepage), and is not 528 * easily knowable. 529 * 530 * @param ut the text to be accessed. 531 * @param nativeIndex the native unit index of the new iteration position. 532 * \xrefitem stable "Stable" "Stable List" ICU 3.4 533 */ 534 U_CAPI void U_EXPORT2 535 utext_setNativeIndex(UText *ut, int64_t nativeIndex) __INTRODUCED_IN(31); 536 537 538 539 /** 540 * Move the iterator position by delta code points. The number of code points 541 * is a signed number; a negative delta will move the iterator backwards, 542 * towards the start of the text. 543 * <p> 544 * The index is moved by <code>delta</code> code points 545 * forward or backward, but no further backward than to 0 and 546 * no further forward than to utext_nativeLength(). 547 * The resulting index value will be in between 0 and length, inclusive. 548 * 549 * @param ut the text to be accessed. 550 * @param delta the signed number of code points to move the iteration position. 551 * @return true if the position could be moved the requested number of positions while 552 * staying within the range [0 - text length]. 553 * \xrefitem stable "Stable" "Stable List" ICU 3.4 554 */ 555 U_CAPI UBool U_EXPORT2 556 utext_moveIndex32(UText *ut, int32_t delta) __INTRODUCED_IN(31); 557 558 559 560 /** 561 * Get the native index of the character preceding the current position. 562 * If the iteration position is already at the start of the text, zero 563 * is returned. 564 * The value returned is the same as that obtained from the following sequence, 565 * but without the side effect of changing the iteration position. 566 * 567 * \code 568 * UText *ut = whatever; 569 * ... 570 * utext_previous(ut) 571 * utext_getNativeIndex(ut); 572 * \endcode 573 * 574 * This function is most useful during forwards iteration, where it will get the 575 * native index of the character most recently returned from utext_next(). 576 * 577 * @param ut the text to be accessed 578 * @return the native index of the character preceding the current index position, 579 * or zero if the current position is at the start of the text. 580 * \xrefitem stable "Stable" "Stable List" ICU 3.6 581 */ 582 U_CAPI int64_t U_EXPORT2 583 utext_getPreviousNativeIndex(UText *ut); 584 585 586 /** 587 * 588 * Extract text from a UText into a UChar buffer. The range of text to be extracted 589 * is specified in the native indices of the UText provider. These may not necessarily 590 * be UTF-16 indices. 591 * <p> 592 * The size (number of 16 bit UChars) of the data to be extracted is returned. The 593 * full number of UChars is returned, even when the extracted text is truncated 594 * because the specified buffer size is too small. 595 * <p> 596 * The extracted string will (if you are a user) / must (if you are a text provider) 597 * be NUL-terminated if there is sufficient space in the destination buffer. This 598 * terminating NUL is not included in the returned length. 599 * <p> 600 * The iteration index is left at the position following the last extracted character. 601 * 602 * @param ut the UText from which to extract data. 603 * @param nativeStart the native index of the first character to extract.\ 604 * If the specified index is out of range, 605 * it will be pinned to be within 0 <= index <= textLength 606 * @param nativeLimit the native string index of the position following the last 607 * character to extract. If the specified index is out of range, 608 * it will be pinned to be within 0 <= index <= textLength. 609 * nativeLimit must be >= nativeStart. 610 * @param dest the UChar (UTF-16) buffer into which the extracted text is placed 611 * @param destCapacity The size, in UChars, of the destination buffer. May be zero 612 * for precomputing the required size. 613 * @param status receives any error status. 614 * U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the 615 * buffer was too small. Returns number of UChars for preflighting. 616 * @return Number of UChars in the data to be extracted. Does not include a trailing NUL. 617 * 618 * \xrefitem stable "Stable" "Stable List" ICU 3.4 619 */ 620 U_CAPI int32_t U_EXPORT2 621 utext_extract(UText *ut, 622 int64_t nativeStart, int64_t nativeLimit, 623 UChar *dest, int32_t destCapacity, 624 UErrorCode *status) __INTRODUCED_IN(31); 625 626 627 628 629 630 631 632 U_CDECL_END 633 634 635 #if U_SHOW_CPLUSPLUS_API 636 637 U_NAMESPACE_BEGIN 638 639 /** 640 * \class LocalUTextPointer 641 * "Smart pointer" class, closes a UText via utext_close(). 642 * For most methods see the LocalPointerBase base class. 643 * 644 * @see LocalPointerBase 645 * @see LocalPointer 646 * \xrefitem stable "Stable" "Stable List" ICU 4.4 647 */ 648 U_DEFINE_LOCAL_OPEN_POINTER(LocalUTextPointer, UText, utext_close); 649 650 U_NAMESPACE_END 651 652 #endif 653 654 655 #endif 656 657 /** @} */ // addtogroup 658