1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others. 2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html 3*0e209d39SAndroid Build Coastguard Worker /* 4*0e209d39SAndroid Build Coastguard Worker ******************************************************************************* 5*0e209d39SAndroid Build Coastguard Worker * Copyright (C) 2012-2014, International Business Machines 6*0e209d39SAndroid Build Coastguard Worker * Corporation and others. All Rights Reserved. 7*0e209d39SAndroid Build Coastguard Worker ******************************************************************************* 8*0e209d39SAndroid Build Coastguard Worker * collationdatabuilder.h 9*0e209d39SAndroid Build Coastguard Worker * 10*0e209d39SAndroid Build Coastguard Worker * created on: 2012apr01 11*0e209d39SAndroid Build Coastguard Worker * created by: Markus W. Scherer 12*0e209d39SAndroid Build Coastguard Worker */ 13*0e209d39SAndroid Build Coastguard Worker 14*0e209d39SAndroid Build Coastguard Worker #ifndef __COLLATIONDATABUILDER_H__ 15*0e209d39SAndroid Build Coastguard Worker #define __COLLATIONDATABUILDER_H__ 16*0e209d39SAndroid Build Coastguard Worker 17*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h" 18*0e209d39SAndroid Build Coastguard Worker 19*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_COLLATION 20*0e209d39SAndroid Build Coastguard Worker 21*0e209d39SAndroid Build Coastguard Worker #include "unicode/uniset.h" 22*0e209d39SAndroid Build Coastguard Worker #include "unicode/unistr.h" 23*0e209d39SAndroid Build Coastguard Worker #include "unicode/uversion.h" 24*0e209d39SAndroid Build Coastguard Worker #include "collation.h" 25*0e209d39SAndroid Build Coastguard Worker #include "collationdata.h" 26*0e209d39SAndroid Build Coastguard Worker #include "collationsettings.h" 27*0e209d39SAndroid Build Coastguard Worker #include "normalizer2impl.h" 28*0e209d39SAndroid Build Coastguard Worker #include "utrie2.h" 29*0e209d39SAndroid Build Coastguard Worker #include "uvectr32.h" 30*0e209d39SAndroid Build Coastguard Worker #include "uvectr64.h" 31*0e209d39SAndroid Build Coastguard Worker #include "uvector.h" 32*0e209d39SAndroid Build Coastguard Worker 33*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN 34*0e209d39SAndroid Build Coastguard Worker 35*0e209d39SAndroid Build Coastguard Worker struct ConditionalCE32; 36*0e209d39SAndroid Build Coastguard Worker 37*0e209d39SAndroid Build Coastguard Worker class CollationFastLatinBuilder; 38*0e209d39SAndroid Build Coastguard Worker class CopyHelper; 39*0e209d39SAndroid Build Coastguard Worker class DataBuilderCollationIterator; 40*0e209d39SAndroid Build Coastguard Worker class UCharsTrieBuilder; 41*0e209d39SAndroid Build Coastguard Worker 42*0e209d39SAndroid Build Coastguard Worker /** 43*0e209d39SAndroid Build Coastguard Worker * Low-level CollationData builder. 44*0e209d39SAndroid Build Coastguard Worker * Takes (character, CE) pairs and builds them into runtime data structures. 45*0e209d39SAndroid Build Coastguard Worker * Supports characters with context prefixes and contraction suffixes. 46*0e209d39SAndroid Build Coastguard Worker */ 47*0e209d39SAndroid Build Coastguard Worker class U_I18N_API CollationDataBuilder : public UObject { 48*0e209d39SAndroid Build Coastguard Worker public: 49*0e209d39SAndroid Build Coastguard Worker /** 50*0e209d39SAndroid Build Coastguard Worker * Collation element modifier. Interface class for a modifier 51*0e209d39SAndroid Build Coastguard Worker * that changes a tailoring builder's temporary CEs to final CEs. 52*0e209d39SAndroid Build Coastguard Worker * Called for every non-special CE32 and every expansion CE. 53*0e209d39SAndroid Build Coastguard Worker */ 54*0e209d39SAndroid Build Coastguard Worker class CEModifier : public UObject { 55*0e209d39SAndroid Build Coastguard Worker public: 56*0e209d39SAndroid Build Coastguard Worker virtual ~CEModifier(); 57*0e209d39SAndroid Build Coastguard Worker /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */ 58*0e209d39SAndroid Build Coastguard Worker virtual int64_t modifyCE32(uint32_t ce32) const = 0; 59*0e209d39SAndroid Build Coastguard Worker /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */ 60*0e209d39SAndroid Build Coastguard Worker virtual int64_t modifyCE(int64_t ce) const = 0; 61*0e209d39SAndroid Build Coastguard Worker }; 62*0e209d39SAndroid Build Coastguard Worker 63*0e209d39SAndroid Build Coastguard Worker CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode); 64*0e209d39SAndroid Build Coastguard Worker 65*0e209d39SAndroid Build Coastguard Worker virtual ~CollationDataBuilder(); 66*0e209d39SAndroid Build Coastguard Worker 67*0e209d39SAndroid Build Coastguard Worker void initForTailoring(const CollationData *b, UErrorCode &errorCode); 68*0e209d39SAndroid Build Coastguard Worker 69*0e209d39SAndroid Build Coastguard Worker virtual UBool isCompressibleLeadByte(uint32_t b) const; 70*0e209d39SAndroid Build Coastguard Worker isCompressiblePrimary(uint32_t p)71*0e209d39SAndroid Build Coastguard Worker inline UBool isCompressiblePrimary(uint32_t p) const { 72*0e209d39SAndroid Build Coastguard Worker return isCompressibleLeadByte(p >> 24); 73*0e209d39SAndroid Build Coastguard Worker } 74*0e209d39SAndroid Build Coastguard Worker 75*0e209d39SAndroid Build Coastguard Worker /** 76*0e209d39SAndroid Build Coastguard Worker * @return true if this builder has mappings (e.g., add() has been called) 77*0e209d39SAndroid Build Coastguard Worker */ hasMappings()78*0e209d39SAndroid Build Coastguard Worker UBool hasMappings() const { return modified; } 79*0e209d39SAndroid Build Coastguard Worker 80*0e209d39SAndroid Build Coastguard Worker /** 81*0e209d39SAndroid Build Coastguard Worker * @return true if c has CEs in this builder 82*0e209d39SAndroid Build Coastguard Worker */ 83*0e209d39SAndroid Build Coastguard Worker UBool isAssigned(UChar32 c) const; 84*0e209d39SAndroid Build Coastguard Worker 85*0e209d39SAndroid Build Coastguard Worker /** 86*0e209d39SAndroid Build Coastguard Worker * @return the three-byte primary if c maps to a single such CE and has no context data, 87*0e209d39SAndroid Build Coastguard Worker * otherwise returns 0. 88*0e209d39SAndroid Build Coastguard Worker */ 89*0e209d39SAndroid Build Coastguard Worker uint32_t getLongPrimaryIfSingleCE(UChar32 c) const; 90*0e209d39SAndroid Build Coastguard Worker 91*0e209d39SAndroid Build Coastguard Worker /** 92*0e209d39SAndroid Build Coastguard Worker * @return the single CE for c. 93*0e209d39SAndroid Build Coastguard Worker * Sets an error code if c does not have a single CE. 94*0e209d39SAndroid Build Coastguard Worker */ 95*0e209d39SAndroid Build Coastguard Worker int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const; 96*0e209d39SAndroid Build Coastguard Worker 97*0e209d39SAndroid Build Coastguard Worker void add(const UnicodeString &prefix, const UnicodeString &s, 98*0e209d39SAndroid Build Coastguard Worker const int64_t ces[], int32_t cesLength, 99*0e209d39SAndroid Build Coastguard Worker UErrorCode &errorCode); 100*0e209d39SAndroid Build Coastguard Worker 101*0e209d39SAndroid Build Coastguard Worker /** 102*0e209d39SAndroid Build Coastguard Worker * Encodes the ces as either the returned ce32 by itself, 103*0e209d39SAndroid Build Coastguard Worker * or by storing an expansion, with the returned ce32 referring to that. 104*0e209d39SAndroid Build Coastguard Worker * 105*0e209d39SAndroid Build Coastguard Worker * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength)) 106*0e209d39SAndroid Build Coastguard Worker */ 107*0e209d39SAndroid Build Coastguard Worker virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode); 108*0e209d39SAndroid Build Coastguard Worker void addCE32(const UnicodeString &prefix, const UnicodeString &s, 109*0e209d39SAndroid Build Coastguard Worker uint32_t ce32, UErrorCode &errorCode); 110*0e209d39SAndroid Build Coastguard Worker 111*0e209d39SAndroid Build Coastguard Worker /** 112*0e209d39SAndroid Build Coastguard Worker * Sets three-byte-primary CEs for a range of code points in code point order, 113*0e209d39SAndroid Build Coastguard Worker * if it is worth doing; otherwise no change is made. 114*0e209d39SAndroid Build Coastguard Worker * None of the code points in the range should have complex mappings so far 115*0e209d39SAndroid Build Coastguard Worker * (expansions/contractions/prefixes). 116*0e209d39SAndroid Build Coastguard Worker * @param start first code point 117*0e209d39SAndroid Build Coastguard Worker * @param end last code point (inclusive) 118*0e209d39SAndroid Build Coastguard Worker * @param primary primary weight for 'start' 119*0e209d39SAndroid Build Coastguard Worker * @param step per-code point primary-weight increment 120*0e209d39SAndroid Build Coastguard Worker * @param errorCode ICU in/out error code 121*0e209d39SAndroid Build Coastguard Worker * @return true if an OFFSET_TAG range was used for start..end 122*0e209d39SAndroid Build Coastguard Worker */ 123*0e209d39SAndroid Build Coastguard Worker UBool maybeSetPrimaryRange(UChar32 start, UChar32 end, 124*0e209d39SAndroid Build Coastguard Worker uint32_t primary, int32_t step, 125*0e209d39SAndroid Build Coastguard Worker UErrorCode &errorCode); 126*0e209d39SAndroid Build Coastguard Worker 127*0e209d39SAndroid Build Coastguard Worker /** 128*0e209d39SAndroid Build Coastguard Worker * Sets three-byte-primary CEs for a range of code points in code point order. 129*0e209d39SAndroid Build Coastguard Worker * Sets range values if that is worth doing, or else individual values. 130*0e209d39SAndroid Build Coastguard Worker * None of the code points in the range should have complex mappings so far 131*0e209d39SAndroid Build Coastguard Worker * (expansions/contractions/prefixes). 132*0e209d39SAndroid Build Coastguard Worker * @param start first code point 133*0e209d39SAndroid Build Coastguard Worker * @param end last code point (inclusive) 134*0e209d39SAndroid Build Coastguard Worker * @param primary primary weight for 'start' 135*0e209d39SAndroid Build Coastguard Worker * @param step per-code point primary-weight increment 136*0e209d39SAndroid Build Coastguard Worker * @param errorCode ICU in/out error code 137*0e209d39SAndroid Build Coastguard Worker * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step 138*0e209d39SAndroid Build Coastguard Worker */ 139*0e209d39SAndroid Build Coastguard Worker uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end, 140*0e209d39SAndroid Build Coastguard Worker uint32_t primary, int32_t step, 141*0e209d39SAndroid Build Coastguard Worker UErrorCode &errorCode); 142*0e209d39SAndroid Build Coastguard Worker 143*0e209d39SAndroid Build Coastguard Worker /** 144*0e209d39SAndroid Build Coastguard Worker * Copies all mappings from the src builder, with modifications. 145*0e209d39SAndroid Build Coastguard Worker * This builder here must not be built yet, and should be empty. 146*0e209d39SAndroid Build Coastguard Worker */ 147*0e209d39SAndroid Build Coastguard Worker void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier, 148*0e209d39SAndroid Build Coastguard Worker UErrorCode &errorCode); 149*0e209d39SAndroid Build Coastguard Worker 150*0e209d39SAndroid Build Coastguard Worker void optimize(const UnicodeSet &set, UErrorCode &errorCode); 151*0e209d39SAndroid Build Coastguard Worker void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode); 152*0e209d39SAndroid Build Coastguard Worker enableFastLatin()153*0e209d39SAndroid Build Coastguard Worker void enableFastLatin() { fastLatinEnabled = true; } 154*0e209d39SAndroid Build Coastguard Worker virtual void build(CollationData &data, UErrorCode &errorCode); 155*0e209d39SAndroid Build Coastguard Worker 156*0e209d39SAndroid Build Coastguard Worker /** 157*0e209d39SAndroid Build Coastguard Worker * Looks up CEs for s and appends them to the ces array. 158*0e209d39SAndroid Build Coastguard Worker * Does not handle normalization: s should be in FCD form. 159*0e209d39SAndroid Build Coastguard Worker * 160*0e209d39SAndroid Build Coastguard Worker * Does not write completely ignorable CEs. 161*0e209d39SAndroid Build Coastguard Worker * Does not write beyond Collation::MAX_EXPANSION_LENGTH. 162*0e209d39SAndroid Build Coastguard Worker * 163*0e209d39SAndroid Build Coastguard Worker * @return incremented cesLength 164*0e209d39SAndroid Build Coastguard Worker */ 165*0e209d39SAndroid Build Coastguard Worker int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength); 166*0e209d39SAndroid Build Coastguard Worker int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s, 167*0e209d39SAndroid Build Coastguard Worker int64_t ces[], int32_t cesLength); 168*0e209d39SAndroid Build Coastguard Worker 169*0e209d39SAndroid Build Coastguard Worker protected: 170*0e209d39SAndroid Build Coastguard Worker friend class CopyHelper; 171*0e209d39SAndroid Build Coastguard Worker friend class DataBuilderCollationIterator; 172*0e209d39SAndroid Build Coastguard Worker 173*0e209d39SAndroid Build Coastguard Worker uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const; 174*0e209d39SAndroid Build Coastguard Worker 175*0e209d39SAndroid Build Coastguard Worker int32_t addCE(int64_t ce, UErrorCode &errorCode); 176*0e209d39SAndroid Build Coastguard Worker int32_t addCE32(uint32_t ce32, UErrorCode &errorCode); 177*0e209d39SAndroid Build Coastguard Worker int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode); 178*0e209d39SAndroid Build Coastguard Worker getConditionalCE32(int32_t index)179*0e209d39SAndroid Build Coastguard Worker inline ConditionalCE32 *getConditionalCE32(int32_t index) const { 180*0e209d39SAndroid Build Coastguard Worker return static_cast<ConditionalCE32 *>(conditionalCE32s[index]); 181*0e209d39SAndroid Build Coastguard Worker } getConditionalCE32ForCE32(uint32_t ce32)182*0e209d39SAndroid Build Coastguard Worker inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const { 183*0e209d39SAndroid Build Coastguard Worker return getConditionalCE32(Collation::indexFromCE32(ce32)); 184*0e209d39SAndroid Build Coastguard Worker } 185*0e209d39SAndroid Build Coastguard Worker makeBuilderContextCE32(int32_t index)186*0e209d39SAndroid Build Coastguard Worker static uint32_t makeBuilderContextCE32(int32_t index) { 187*0e209d39SAndroid Build Coastguard Worker return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index); 188*0e209d39SAndroid Build Coastguard Worker } isBuilderContextCE32(uint32_t ce32)189*0e209d39SAndroid Build Coastguard Worker static inline UBool isBuilderContextCE32(uint32_t ce32) { 190*0e209d39SAndroid Build Coastguard Worker return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG); 191*0e209d39SAndroid Build Coastguard Worker } 192*0e209d39SAndroid Build Coastguard Worker 193*0e209d39SAndroid Build Coastguard Worker static uint32_t encodeOneCEAsCE32(int64_t ce); 194*0e209d39SAndroid Build Coastguard Worker uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode); 195*0e209d39SAndroid Build Coastguard Worker uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode); 196*0e209d39SAndroid Build Coastguard Worker uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode); 197*0e209d39SAndroid Build Coastguard Worker 198*0e209d39SAndroid Build Coastguard Worker uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode); 199*0e209d39SAndroid Build Coastguard Worker /** 200*0e209d39SAndroid Build Coastguard Worker * Copies base contractions to a list of ConditionalCE32. 201*0e209d39SAndroid Build Coastguard Worker * Sets cond->next to the index of the first new item 202*0e209d39SAndroid Build Coastguard Worker * and returns the index of the last new item. 203*0e209d39SAndroid Build Coastguard Worker */ 204*0e209d39SAndroid Build Coastguard Worker int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32, 205*0e209d39SAndroid Build Coastguard Worker ConditionalCE32 *cond, UErrorCode &errorCode); 206*0e209d39SAndroid Build Coastguard Worker 207*0e209d39SAndroid Build Coastguard Worker UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode); 208*0e209d39SAndroid Build Coastguard Worker void setDigitTags(UErrorCode &errorCode); 209*0e209d39SAndroid Build Coastguard Worker void setLeadSurrogates(UErrorCode &errorCode); 210*0e209d39SAndroid Build Coastguard Worker 211*0e209d39SAndroid Build Coastguard Worker void buildMappings(CollationData &data, UErrorCode &errorCode); 212*0e209d39SAndroid Build Coastguard Worker 213*0e209d39SAndroid Build Coastguard Worker void clearContexts(); 214*0e209d39SAndroid Build Coastguard Worker void buildContexts(UErrorCode &errorCode); 215*0e209d39SAndroid Build Coastguard Worker uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode); 216*0e209d39SAndroid Build Coastguard Worker int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder, 217*0e209d39SAndroid Build Coastguard Worker UErrorCode &errorCode); 218*0e209d39SAndroid Build Coastguard Worker 219*0e209d39SAndroid Build Coastguard Worker void buildFastLatinTable(CollationData &data, UErrorCode &errorCode); 220*0e209d39SAndroid Build Coastguard Worker 221*0e209d39SAndroid Build Coastguard Worker int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength); 222*0e209d39SAndroid Build Coastguard Worker jamoCpFromIndex(int32_t i)223*0e209d39SAndroid Build Coastguard Worker static UChar32 jamoCpFromIndex(int32_t i) { 224*0e209d39SAndroid Build Coastguard Worker // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27 225*0e209d39SAndroid Build Coastguard Worker if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; } 226*0e209d39SAndroid Build Coastguard Worker i -= Hangul::JAMO_L_COUNT; 227*0e209d39SAndroid Build Coastguard Worker if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; } 228*0e209d39SAndroid Build Coastguard Worker i -= Hangul::JAMO_V_COUNT; 229*0e209d39SAndroid Build Coastguard Worker // i < 27 230*0e209d39SAndroid Build Coastguard Worker return Hangul::JAMO_T_BASE + 1 + i; 231*0e209d39SAndroid Build Coastguard Worker } 232*0e209d39SAndroid Build Coastguard Worker 233*0e209d39SAndroid Build Coastguard Worker /** @see Collation::BUILDER_DATA_TAG */ 234*0e209d39SAndroid Build Coastguard Worker static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100; 235*0e209d39SAndroid Build Coastguard Worker 236*0e209d39SAndroid Build Coastguard Worker const Normalizer2Impl &nfcImpl; 237*0e209d39SAndroid Build Coastguard Worker const CollationData *base; 238*0e209d39SAndroid Build Coastguard Worker const CollationSettings *baseSettings; 239*0e209d39SAndroid Build Coastguard Worker UTrie2 *trie; 240*0e209d39SAndroid Build Coastguard Worker UVector32 ce32s; 241*0e209d39SAndroid Build Coastguard Worker UVector64 ce64s; 242*0e209d39SAndroid Build Coastguard Worker UVector conditionalCE32s; // vector of ConditionalCE32 243*0e209d39SAndroid Build Coastguard Worker // Characters that have context (prefixes or contraction suffixes). 244*0e209d39SAndroid Build Coastguard Worker UnicodeSet contextChars; 245*0e209d39SAndroid Build Coastguard Worker // Serialized UCharsTrie structures for finalized contexts. 246*0e209d39SAndroid Build Coastguard Worker UnicodeString contexts; 247*0e209d39SAndroid Build Coastguard Worker private: 248*0e209d39SAndroid Build Coastguard Worker /** 249*0e209d39SAndroid Build Coastguard Worker * The "era" of building intermediate contexts. 250*0e209d39SAndroid Build Coastguard Worker * When the array of cached, temporary contexts overflows, then clearContexts() 251*0e209d39SAndroid Build Coastguard Worker * removes them all and invalidates the builtCE32 that used to point to built tries. 252*0e209d39SAndroid Build Coastguard Worker * See ConditionalCE32::era. 253*0e209d39SAndroid Build Coastguard Worker */ 254*0e209d39SAndroid Build Coastguard Worker int32_t contextsEra = 0; 255*0e209d39SAndroid Build Coastguard Worker protected: 256*0e209d39SAndroid Build Coastguard Worker UnicodeSet unsafeBackwardSet; 257*0e209d39SAndroid Build Coastguard Worker UBool modified; 258*0e209d39SAndroid Build Coastguard Worker UBool icu4xMode; 259*0e209d39SAndroid Build Coastguard Worker 260*0e209d39SAndroid Build Coastguard Worker UBool fastLatinEnabled; 261*0e209d39SAndroid Build Coastguard Worker CollationFastLatinBuilder *fastLatinBuilder; 262*0e209d39SAndroid Build Coastguard Worker 263*0e209d39SAndroid Build Coastguard Worker DataBuilderCollationIterator *collIter; 264*0e209d39SAndroid Build Coastguard Worker }; 265*0e209d39SAndroid Build Coastguard Worker 266*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END 267*0e209d39SAndroid Build Coastguard Worker 268*0e209d39SAndroid Build Coastguard Worker #endif // !UCONFIG_NO_COLLATION 269*0e209d39SAndroid Build Coastguard Worker #endif // __COLLATIONDATABUILDER_H__ 270