xref: /aosp_15_r20/external/icu/icu4c/source/test/perf/unisetperf/draft/trieset.cpp (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1 /*
2 *************************************************************************
3 *   © 2016 and later: Unicode, Inc. and others.
4 *   License & terms of use: http://www.unicode.org/copyright.html
5 *************************************************************************
6 *************************************************************************
7 *   Copyright (C) 2007, International Business Machines
8 *   Corporation and others.  All Rights Reserved.
9 *************************************************************************
10 *   file name:  trieset.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2007jan15
16 *   created by: Markus Scherer
17 *
18 *   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
19 *   using a UTrie with 8-bit (byte) results per code point.
20 *   Modifies the trie index to make the BMP linear, and uses the original set
21 *   for supplementary code points.
22 */
23 
24 #include "cmemory.h"
25 #include "unicode/uniset.h"
26 #include "unicode/uobject.h"
27 #include "unicode/usetiter.h"
28 #include "unicode/utypes.h"
29 #include "unicont.h"
30 #include "utrie.h"
31 
32 using icu::UObject;
33 using icu::UnicodeSet;
34 using icu::UnicodeSetIterator;
35 
// Yields a const uint8_t pointer into the trie's data32 array, viewed as bytes
// and advanced by UTRIE_DATA_BLOCK_LENGTH. The cast binds before the '+', so
// the offset is counted in bytes, not in 32-bit units.
// TrieSet::contains() indexes the result directly with code points 0..0xff.
// NOTE(review): presumably the offset skips the trie's leading data block so
// that the pointer lands on the linear Latin-1 results - confirm against utrie.h.
#define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
37 
// Reads the byte-wide result for the 16-bit value c16:
//   - index[c16>>UTRIE_SHIFT] selects the index entry for c16's block,
//   - <<UTRIE_INDEX_SHIFT scales that entry into an offset,
//   - (c16&UTRIE_MASK) adds the position within the block,
//   - the sum indexes data32 viewed as an array of uint8_t.
// Both macro arguments are expanded more than once, so pass only
// side-effect-free expressions.
#define UTRIE_GET8_FROM_LEAD(trie, c16) \
    ((const uint8_t *)(trie)->data32)[ \
        ((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \
        ((c16)&UTRIE_MASK) \
    ]
43 
44 class TrieSet : public UObject, public UnicodeContainable {
45 public:
TrieSet(const UnicodeSet & set,UErrorCode & errorCode)46     TrieSet(const UnicodeSet &set, UErrorCode &errorCode)
47             : trieData(nullptr), latin1(nullptr), restSet(set.clone()) {
48         if(U_FAILURE(errorCode)) {
49             return;
50         }
51         if(restSet==nullptr) {
52             errorCode=U_MEMORY_ALLOCATION_ERROR;
53             return;
54         }
55 
56         UNewTrie *newTrie=utrie_open(nullptr, nullptr, 0x11000, 0, 0, true);
57         UChar32 start, end;
58 
59         UnicodeSetIterator iter(set);
60 
61         while(iter.nextRange() && !iter.isString()) {
62             start=iter.getCodepoint();
63             end=iter.getCodepointEnd();
64             if(start>0xffff) {
65                 break;
66             }
67             if(end>0xffff) {
68                 end=0xffff;
69             }
70             if(!utrie_setRange32(newTrie, start, end+1, true, true)) {
71                 errorCode=U_INTERNAL_PROGRAM_ERROR;
72                 return;
73             }
74         }
75 
76         // Preflight the trie length.
77         int32_t length=utrie_serialize(newTrie, nullptr, 0, nullptr, 8, &errorCode);
78         if(errorCode!=U_BUFFER_OVERFLOW_ERROR) {
79             return;
80         }
81 
82         trieData=(uint32_t *)uprv_malloc(length);
83         if(trieData==nullptr) {
84             errorCode=U_MEMORY_ALLOCATION_ERROR;
85             return;
86         }
87 
88         errorCode=U_ZERO_ERROR;
89         utrie_serialize(newTrie, trieData, length, nullptr, 8, &errorCode);
90         utrie_unserialize(&trie, trieData, length, &errorCode);  // TODO: Implement for 8-bit UTrie!
91 
92         if(U_SUCCESS(errorCode)) {
93             // Copy the indexes for surrogate code points into the BMP range
94             // for simple access across the entire BMP.
95             uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT),
96                         trie.index+UTRIE_BMP_INDEX_LENGTH,
97                         (0x800>>UTRIE_SHIFT)*2);
98             latin1=UTRIE_GET8_LATIN1(&trie);
99         }
100 
101         restSet->remove(0, 0xffff);
102     }
103 
~TrieSet()104     ~TrieSet() {
105         uprv_free(trieData);
106         delete restSet;
107     }
108 
contains(UChar32 c) const109     UBool contains(UChar32 c) const {
110         if((uint32_t)c<=0xff) {
111             return (UBool)latin1[c];
112         } else if((uint32_t)c<0xffff) {
113             return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c);
114         } else {
115             return restSet->contains(c);
116         }
117     }
118 
119 private:
120     uint32_t *trieData;
121     const uint8_t *latin1;
122     UTrie trie;
123     UnicodeSet *restSet;
124 };
125