xref: /aosp_15_r20/external/cronet/third_party/icu/source/common/rbbirb.cpp (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  file:  rbbirb.cpp
5 //
6 //  Copyright (C) 2002-2011, International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
10 //    building (compiling) break rules into the tables required by the runtime
11 //    RBBI engine.
12 //
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_BREAK_ITERATION
17 
18 #include "unicode/brkiter.h"
19 #include "unicode/rbbi.h"
20 #include "unicode/ubrk.h"
21 #include "unicode/unistr.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uchriter.h"
25 #include "unicode/ustring.h"
26 #include "unicode/parsepos.h"
27 #include "unicode/parseerr.h"
28 
29 #include "cmemory.h"
30 #include "cstring.h"
31 #include "rbbirb.h"
32 #include "rbbinode.h"
33 #include "rbbiscan.h"
34 #include "rbbisetb.h"
35 #include "rbbitblb.h"
36 #include "rbbidata.h"
37 #include "uassert.h"
38 
39 
40 U_NAMESPACE_BEGIN
41 
42 
43 //----------------------------------------------------------------------------------------
44 //
45 //  Constructor.
46 //
47 //----------------------------------------------------------------------------------------
RBBIRuleBuilder(const UnicodeString & rules,UParseError * parseErr,UErrorCode & status)48 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
49                                        UParseError     *parseErr,
50                                        UErrorCode      &status)
51  : fRules(rules), fStrippedRules(rules)
52 {
53     fStatus = &status; // status is checked below
54     fParseError = parseErr;
55     fDebugEnv   = nullptr;
56 #ifdef RBBI_DEBUG
57     fDebugEnv   = getenv("U_RBBIDEBUG");
58 #endif
59 
60 
61     fForwardTree        = nullptr;
62     fReverseTree        = nullptr;
63     fSafeFwdTree        = nullptr;
64     fSafeRevTree        = nullptr;
65     fDefaultTree        = &fForwardTree;
66     fForwardTable       = nullptr;
67     fRuleStatusVals     = nullptr;
68     fChainRules         = false;
69     fLookAheadHardBreak = false;
70     fUSetNodes          = nullptr;
71     fRuleStatusVals     = nullptr;
72     fScanner            = nullptr;
73     fSetBuilder         = nullptr;
74     if (parseErr) {
75         uprv_memset(parseErr, 0, sizeof(UParseError));
76     }
77 
78     if (U_FAILURE(status)) {
79         return;
80     }
81 
82     fUSetNodes          = new UVector(status); // bcos status gets overwritten here
83     fRuleStatusVals     = new UVector(status);
84     fScanner            = new RBBIRuleScanner(this);
85     fSetBuilder         = new RBBISetBuilder(this);
86     if (U_FAILURE(status)) {
87         return;
88     }
89     if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
90         status = U_MEMORY_ALLOCATION_ERROR;
91     }
92 }
93 
94 
95 
96 //----------------------------------------------------------------------------------------
97 //
98 //  Destructor
99 //
100 //----------------------------------------------------------------------------------------
~RBBIRuleBuilder()101 RBBIRuleBuilder::~RBBIRuleBuilder() {
102 
103     int        i;
104     for (i=0; ; i++) {
105         RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
106         if (n==nullptr) {
107             break;
108         }
109         delete n;
110     }
111 
112     delete fUSetNodes;
113     delete fSetBuilder;
114     delete fForwardTable;
115     delete fForwardTree;
116     delete fReverseTree;
117     delete fSafeFwdTree;
118     delete fSafeRevTree;
119     delete fScanner;
120     delete fRuleStatusVals;
121 }
122 
123 
124 
125 
126 
127 //----------------------------------------------------------------------------------------
128 //
129 //   flattenData() -  Collect up the compiled RBBI rule data and put it into
130 //                    the format for saving in ICU data files,
131 //                    which is also the format needed by the RBBI runtime engine.
132 //
133 //----------------------------------------------------------------------------------------
align8(int32_t i)134 static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
135 
flattenData()136 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
137     int32_t    i;
138 
139     if (U_FAILURE(*fStatus)) {
140         return nullptr;
141     }
142 
143     // Remove whitespace from the rules to make it smaller.
144     // The rule parser has already removed comments.
145     fStrippedRules = fScanner->stripRules(fStrippedRules);
146 
147     // Calculate the size of each section in the data.
148     //   Sizes here are padded up to a multiple of 8 for better memory alignment.
149     //   Sections sizes actually stored in the header are for the actual data
150     //     without the padding.
151     //
152     int32_t headerSize        = align8(sizeof(RBBIDataHeader));
153     int32_t forwardTableSize  = align8(fForwardTable->getTableSize());
154     int32_t reverseTableSize  = align8(fForwardTable->getSafeTableSize());
155     int32_t trieSize          = align8(fSetBuilder->getTrieSize());
156     int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
157 
158     int32_t rulesLengthInUTF8 = 0;
159     u_strToUTF8WithSub(0, 0, &rulesLengthInUTF8,
160                        fStrippedRules.getBuffer(), fStrippedRules.length(),
161                        0xfffd, nullptr, fStatus);
162     *fStatus = U_ZERO_ERROR;
163 
164     int32_t rulesSize         = align8((rulesLengthInUTF8+1));
165 
166     int32_t         totalSize = headerSize
167                                 + forwardTableSize
168                                 + reverseTableSize
169                                 + statusTableSize + trieSize + rulesSize;
170 
171 #ifdef RBBI_DEBUG
172     if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) {
173         RBBIDebugPrintf("Header Size:        %8d\n", headerSize);
174         RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize);
175         RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize);
176         RBBIDebugPrintf("Trie Size:          %8d\n", trieSize);
177         RBBIDebugPrintf("Status Table Size:  %8d\n", statusTableSize);
178         RBBIDebugPrintf("Rules Size:         %8d\n", rulesSize);
179         RBBIDebugPrintf("-----------------------------\n");
180         RBBIDebugPrintf("Total Size:         %8d\n", totalSize);
181     }
182 #endif
183 
184     RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
185     if (data == nullptr) {
186         *fStatus = U_MEMORY_ALLOCATION_ERROR;
187         return nullptr;
188     }
189     uprv_memset(data, 0, totalSize);
190 
191 
192     data->fMagic            = 0xb1a0;
193     data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
194     data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
195     data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
196     data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
197     data->fLength           = totalSize;
198     data->fCatCount         = fSetBuilder->getNumCharCategories();
199 
200     data->fFTable        = headerSize;
201     data->fFTableLen     = forwardTableSize;
202 
203     data->fRTable        = data->fFTable  + data->fFTableLen;
204     data->fRTableLen     = reverseTableSize;
205 
206     data->fTrie          = data->fRTable + data->fRTableLen;
207     data->fTrieLen       = trieSize;
208     data->fStatusTable   = data->fTrie    + data->fTrieLen;
209     data->fStatusTableLen= statusTableSize;
210     data->fRuleSource    = data->fStatusTable + statusTableSize;
211     data->fRuleSourceLen = rulesLengthInUTF8;
212 
213     uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
214 
215     fForwardTable->exportTable((uint8_t *)data + data->fFTable);
216     fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
217     fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
218 
219     int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
220     for (i=0; i<fRuleStatusVals->size(); i++) {
221         ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
222     }
223 
224     u_strToUTF8WithSub((char *)data+data->fRuleSource, rulesSize, &rulesLengthInUTF8,
225                        fStrippedRules.getBuffer(), fStrippedRules.length(),
226                        0xfffd, nullptr, fStatus);
227     if (U_FAILURE(*fStatus)) {
228         return nullptr;
229     }
230 
231     return data;
232 }
233 
234 
235 //----------------------------------------------------------------------------------------
236 //
237 //  createRuleBasedBreakIterator    construct from source rules that are passed in
238 //                                  in a UnicodeString
239 //
240 //----------------------------------------------------------------------------------------
241 BreakIterator *
createRuleBasedBreakIterator(const UnicodeString & rules,UParseError * parseError,UErrorCode & status)242 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
243                                     UParseError      *parseError,
244                                     UErrorCode       &status)
245 {
246     //
247     // Read the input rules, generate a parse tree, symbol table,
248     // and list of all Unicode Sets referenced by the rules.
249     //
250     RBBIRuleBuilder  builder(rules, parseError, status);
251     if (U_FAILURE(status)) { // status checked here bcos build below doesn't
252         return nullptr;
253     }
254 
255     RBBIDataHeader *data = builder.build(status);
256 
257     if (U_FAILURE(status)) {
258         return nullptr;
259     }
260 
261     //
262     //  Create a break iterator from the compiled rules.
263     //     (Identical to creation from stored pre-compiled rules)
264     //
265     // status is checked after init in construction.
266     RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
267     if (U_FAILURE(status)) {
268         delete This;
269         This = nullptr;
270     }
271     else if(This == nullptr) { // test for nullptr
272         status = U_MEMORY_ALLOCATION_ERROR;
273     }
274     return This;
275 }
276 
build(UErrorCode & status)277 RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
278     if (U_FAILURE(status)) {
279         return nullptr;
280     }
281 
282     fScanner->parse();
283     if (U_FAILURE(status)) {
284         return nullptr;
285     }
286 
287     //
288     // UnicodeSet processing.
289     //    Munge the Unicode Sets to create an initial set of character categories.
290     //
291     fSetBuilder->buildRanges();
292 
293     //
294     //   Generate the DFA state transition table.
295     //
296     fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
297     if (fForwardTable == nullptr) {
298         status = U_MEMORY_ALLOCATION_ERROR;
299         return nullptr;
300     }
301 
302     fForwardTable->buildForwardTable();
303 
304     // State table and character category optimization.
305     // Merge equivalent rows and columns.
306     // Note that this process alters the initial set of character categories,
307     // causing the representation of UnicodeSets in the parse tree to become invalid.
308 
309     optimizeTables();
310     fForwardTable->buildSafeReverseTable(status);
311 
312 
313 #ifdef RBBI_DEBUG
314     if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
315         fForwardTable->printStates();
316         fForwardTable->printRuleStatusTable();
317         fForwardTable->printReverseTable();
318     }
319 #endif
320 
321     //    Generate the mapping tables (TRIE) from input code points to
322     //    the character categories.
323     //
324     fSetBuilder->buildTrie();
325 
326     //
327     //   Package up the compiled data into a memory image
328     //      in the run-time format.
329     //
330     RBBIDataHeader *data = flattenData(); // returns nullptr if error
331     if (U_FAILURE(status)) {
332         return nullptr;
333     }
334     return data;
335 }
336 
optimizeTables()337 void RBBIRuleBuilder::optimizeTables() {
338     bool didSomething;
339     do {
340         didSomething = false;
341 
342         // Begin looking for duplicates with char class 3.
343         // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
344         // and should not have other categories merged into them.
345         IntPair duplPair = {3, 0};
346         while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
347             fSetBuilder->mergeCategories(duplPair);
348             fForwardTable->removeColumn(duplPair.second);
349             didSomething = true;
350         }
351 
352         while (fForwardTable->removeDuplicateStates() > 0) {
353             didSomething = true;
354         }
355     } while (didSomething);
356 }
357 
358 U_NAMESPACE_END
359 
360 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
361