1 /* 2 ************************************************************************* 3 * © 2016 and later: Unicode, Inc. and others. 4 * License & terms of use: http://www.unicode.org/copyright.html 5 ************************************************************************* 6 ************************************************************************* 7 * Copyright (C) 2007, International Business Machines 8 * Corporation and others. All Rights Reserved. 9 ************************************************************************* 10 * file name: trieset.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2007jan15 16 * created by: Markus Scherer 17 * 18 * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet 19 * using a UTrie with 8-bit (byte) results per code point. 20 * Modifies the trie index to make the BMP linear, and uses the original set 21 * for supplementary code points. 22 */ 23 24 #include "cmemory.h" 25 #include "unicode/uniset.h" 26 #include "unicode/uobject.h" 27 #include "unicode/usetiter.h" 28 #include "unicode/utypes.h" 29 #include "unicont.h" 30 #include "utrie.h" 31 32 using icu::UObject; 33 using icu::UnicodeSet; 34 using icu::UnicodeSetIterator; 35 36 #define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOCK_LENGTH) 37 38 #define UTRIE_GET8_FROM_LEAD(trie, c16) \ 39 ((const uint8_t *)(trie)->data32)[ \ 40 ((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \ 41 ((c16)&UTRIE_MASK) \ 42 ] 43 44 class TrieSet : public UObject, public UnicodeContainable { 45 public: TrieSet(const UnicodeSet & set,UErrorCode & errorCode)46 TrieSet(const UnicodeSet &set, UErrorCode &errorCode) 47 : trieData(nullptr), latin1(nullptr), restSet(set.clone()) { 48 if(U_FAILURE(errorCode)) { 49 return; 50 } 51 if(restSet==nullptr) { 52 errorCode=U_MEMORY_ALLOCATION_ERROR; 53 return; 54 } 55 56 UNewTrie *newTrie=utrie_open(nullptr, nullptr, 0x11000, 0, 0, true); 57 UChar32 start, end; 58 59 UnicodeSetIterator iter(set); 60 61 while(iter.nextRange() && !iter.isString()) { 62 start=iter.getCodepoint(); 63 end=iter.getCodepointEnd(); 64 if(start>0xffff) { 65 break; 66 } 67 if(end>0xffff) { 68 end=0xffff; 69 } 70 if(!utrie_setRange32(newTrie, start, end+1, true, true)) { 71 errorCode=U_INTERNAL_PROGRAM_ERROR; 72 return; 73 } 74 } 75 76 // Preflight the trie length. 77 int32_t length=utrie_serialize(newTrie, nullptr, 0, nullptr, 8, &errorCode); 78 if(errorCode!=U_BUFFER_OVERFLOW_ERROR) { 79 return; 80 } 81 82 trieData=(uint32_t *)uprv_malloc(length); 83 if(trieData==nullptr) { 84 errorCode=U_MEMORY_ALLOCATION_ERROR; 85 return; 86 } 87 88 errorCode=U_ZERO_ERROR; 89 utrie_serialize(newTrie, trieData, length, nullptr, 8, &errorCode); 90 utrie_unserialize(&trie, trieData, length, &errorCode); // TODO: Implement for 8-bit UTrie! 91 92 if(U_SUCCESS(errorCode)) { 93 // Copy the indexes for surrogate code points into the BMP range 94 // for simple access across the entire BMP. 95 uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT), 96 trie.index+UTRIE_BMP_INDEX_LENGTH, 97 (0x800>>UTRIE_SHIFT)*2); 98 latin1=UTRIE_GET8_LATIN1(&trie); 99 } 100 101 restSet->remove(0, 0xffff); 102 } 103 ~TrieSet()104 ~TrieSet() { 105 uprv_free(trieData); 106 delete restSet; 107 } 108 contains(UChar32 c) const109 UBool contains(UChar32 c) const { 110 if((uint32_t)c<=0xff) { 111 return (UBool)latin1[c]; 112 } else if((uint32_t)c<0xffff) { 113 return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c); 114 } else { 115 return restSet->contains(c); 116 } 117 } 118 119 private: 120 uint32_t *trieData; 121 const uint8_t *latin1; 122 UTrie trie; 123 UnicodeSet *restSet; 124 }; 125