1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 // file: rbbirb.cpp
5 //
6 // Copyright (C) 2002-2011, International Business Machines Corporation and others.
7 // All Rights Reserved.
8 //
9 // This file contains the RBBIRuleBuilder class implementation. This is the main class for
10 // building (compiling) break rules into the tables required by the runtime
11 // RBBI engine.
12 //
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_BREAK_ITERATION
17
18 #include "unicode/brkiter.h"
19 #include "unicode/rbbi.h"
20 #include "unicode/ubrk.h"
21 #include "unicode/unistr.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uchriter.h"
25 #include "unicode/ustring.h"
26 #include "unicode/parsepos.h"
27 #include "unicode/parseerr.h"
28
29 #include "cmemory.h"
30 #include "cstring.h"
31 #include "rbbirb.h"
32 #include "rbbinode.h"
33 #include "rbbiscan.h"
34 #include "rbbisetb.h"
35 #include "rbbitblb.h"
36 #include "rbbidata.h"
37 #include "uassert.h"
38
39
40 U_NAMESPACE_BEGIN
41
42
43 //----------------------------------------------------------------------------------------
44 //
45 // Constructor.
46 //
47 //----------------------------------------------------------------------------------------
RBBIRuleBuilder(const UnicodeString & rules,UParseError * parseErr,UErrorCode & status)48 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
49 UParseError *parseErr,
50 UErrorCode &status)
51 : fRules(rules), fStrippedRules(rules)
52 {
53 fStatus = &status; // status is checked below
54 fParseError = parseErr;
55 fDebugEnv = nullptr;
56 #ifdef RBBI_DEBUG
57 fDebugEnv = getenv("U_RBBIDEBUG");
58 #endif
59
60
61 fForwardTree = nullptr;
62 fReverseTree = nullptr;
63 fSafeFwdTree = nullptr;
64 fSafeRevTree = nullptr;
65 fDefaultTree = &fForwardTree;
66 fForwardTable = nullptr;
67 fRuleStatusVals = nullptr;
68 fChainRules = false;
69 fLookAheadHardBreak = false;
70 fUSetNodes = nullptr;
71 fRuleStatusVals = nullptr;
72 fScanner = nullptr;
73 fSetBuilder = nullptr;
74 if (parseErr) {
75 uprv_memset(parseErr, 0, sizeof(UParseError));
76 }
77
78 if (U_FAILURE(status)) {
79 return;
80 }
81
82 fUSetNodes = new UVector(status); // bcos status gets overwritten here
83 fRuleStatusVals = new UVector(status);
84 fScanner = new RBBIRuleScanner(this);
85 fSetBuilder = new RBBISetBuilder(this);
86 if (U_FAILURE(status)) {
87 return;
88 }
89 if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
90 status = U_MEMORY_ALLOCATION_ERROR;
91 }
92 }
93
94
95
96 //----------------------------------------------------------------------------------------
97 //
98 // Destructor
99 //
100 //----------------------------------------------------------------------------------------
~RBBIRuleBuilder()101 RBBIRuleBuilder::~RBBIRuleBuilder() {
102
103 int i;
104 for (i=0; ; i++) {
105 RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
106 if (n==nullptr) {
107 break;
108 }
109 delete n;
110 }
111
112 delete fUSetNodes;
113 delete fSetBuilder;
114 delete fForwardTable;
115 delete fForwardTree;
116 delete fReverseTree;
117 delete fSafeFwdTree;
118 delete fSafeRevTree;
119 delete fScanner;
120 delete fRuleStatusVals;
121 }
122
123
124
125
126
127 //----------------------------------------------------------------------------------------
128 //
129 // flattenData() - Collect up the compiled RBBI rule data and put it into
130 // the format for saving in ICU data files,
131 // which is also the format needed by the RBBI runtime engine.
132 //
133 //----------------------------------------------------------------------------------------
align8(int32_t i)134 static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
135
flattenData()136 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
137 int32_t i;
138
139 if (U_FAILURE(*fStatus)) {
140 return nullptr;
141 }
142
143 // Remove whitespace from the rules to make it smaller.
144 // The rule parser has already removed comments.
145 fStrippedRules = fScanner->stripRules(fStrippedRules);
146
147 // Calculate the size of each section in the data.
148 // Sizes here are padded up to a multiple of 8 for better memory alignment.
149 // Sections sizes actually stored in the header are for the actual data
150 // without the padding.
151 //
152 int32_t headerSize = align8(sizeof(RBBIDataHeader));
153 int32_t forwardTableSize = align8(fForwardTable->getTableSize());
154 int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize());
155 int32_t trieSize = align8(fSetBuilder->getTrieSize());
156 int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
157
158 int32_t rulesLengthInUTF8 = 0;
159 u_strToUTF8WithSub(0, 0, &rulesLengthInUTF8,
160 fStrippedRules.getBuffer(), fStrippedRules.length(),
161 0xfffd, nullptr, fStatus);
162 *fStatus = U_ZERO_ERROR;
163
164 int32_t rulesSize = align8((rulesLengthInUTF8+1));
165
166 int32_t totalSize = headerSize
167 + forwardTableSize
168 + reverseTableSize
169 + statusTableSize + trieSize + rulesSize;
170
171 #ifdef RBBI_DEBUG
172 if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) {
173 RBBIDebugPrintf("Header Size: %8d\n", headerSize);
174 RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize);
175 RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize);
176 RBBIDebugPrintf("Trie Size: %8d\n", trieSize);
177 RBBIDebugPrintf("Status Table Size: %8d\n", statusTableSize);
178 RBBIDebugPrintf("Rules Size: %8d\n", rulesSize);
179 RBBIDebugPrintf("-----------------------------\n");
180 RBBIDebugPrintf("Total Size: %8d\n", totalSize);
181 }
182 #endif
183
184 RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
185 if (data == nullptr) {
186 *fStatus = U_MEMORY_ALLOCATION_ERROR;
187 return nullptr;
188 }
189 uprv_memset(data, 0, totalSize);
190
191
192 data->fMagic = 0xb1a0;
193 data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
194 data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
195 data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
196 data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
197 data->fLength = totalSize;
198 data->fCatCount = fSetBuilder->getNumCharCategories();
199
200 data->fFTable = headerSize;
201 data->fFTableLen = forwardTableSize;
202
203 data->fRTable = data->fFTable + data->fFTableLen;
204 data->fRTableLen = reverseTableSize;
205
206 data->fTrie = data->fRTable + data->fRTableLen;
207 data->fTrieLen = trieSize;
208 data->fStatusTable = data->fTrie + data->fTrieLen;
209 data->fStatusTableLen= statusTableSize;
210 data->fRuleSource = data->fStatusTable + statusTableSize;
211 data->fRuleSourceLen = rulesLengthInUTF8;
212
213 uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
214
215 fForwardTable->exportTable((uint8_t *)data + data->fFTable);
216 fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
217 fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
218
219 int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
220 for (i=0; i<fRuleStatusVals->size(); i++) {
221 ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
222 }
223
224 u_strToUTF8WithSub((char *)data+data->fRuleSource, rulesSize, &rulesLengthInUTF8,
225 fStrippedRules.getBuffer(), fStrippedRules.length(),
226 0xfffd, nullptr, fStatus);
227 if (U_FAILURE(*fStatus)) {
228 return nullptr;
229 }
230
231 return data;
232 }
233
234
235 //----------------------------------------------------------------------------------------
236 //
237 // createRuleBasedBreakIterator construct from source rules that are passed in
238 // in a UnicodeString
239 //
240 //----------------------------------------------------------------------------------------
241 BreakIterator *
createRuleBasedBreakIterator(const UnicodeString & rules,UParseError * parseError,UErrorCode & status)242 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
243 UParseError *parseError,
244 UErrorCode &status)
245 {
246 //
247 // Read the input rules, generate a parse tree, symbol table,
248 // and list of all Unicode Sets referenced by the rules.
249 //
250 RBBIRuleBuilder builder(rules, parseError, status);
251 if (U_FAILURE(status)) { // status checked here bcos build below doesn't
252 return nullptr;
253 }
254
255 RBBIDataHeader *data = builder.build(status);
256
257 if (U_FAILURE(status)) {
258 return nullptr;
259 }
260
261 //
262 // Create a break iterator from the compiled rules.
263 // (Identical to creation from stored pre-compiled rules)
264 //
265 // status is checked after init in construction.
266 RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
267 if (U_FAILURE(status)) {
268 delete This;
269 This = nullptr;
270 }
271 else if(This == nullptr) { // test for nullptr
272 status = U_MEMORY_ALLOCATION_ERROR;
273 }
274 return This;
275 }
276
build(UErrorCode & status)277 RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
278 if (U_FAILURE(status)) {
279 return nullptr;
280 }
281
282 fScanner->parse();
283 if (U_FAILURE(status)) {
284 return nullptr;
285 }
286
287 //
288 // UnicodeSet processing.
289 // Munge the Unicode Sets to create an initial set of character categories.
290 //
291 fSetBuilder->buildRanges();
292
293 //
294 // Generate the DFA state transition table.
295 //
296 fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
297 if (fForwardTable == nullptr) {
298 status = U_MEMORY_ALLOCATION_ERROR;
299 return nullptr;
300 }
301
302 fForwardTable->buildForwardTable();
303
304 // State table and character category optimization.
305 // Merge equivalent rows and columns.
306 // Note that this process alters the initial set of character categories,
307 // causing the representation of UnicodeSets in the parse tree to become invalid.
308
309 optimizeTables();
310 fForwardTable->buildSafeReverseTable(status);
311
312
313 #ifdef RBBI_DEBUG
314 if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
315 fForwardTable->printStates();
316 fForwardTable->printRuleStatusTable();
317 fForwardTable->printReverseTable();
318 }
319 #endif
320
321 // Generate the mapping tables (TRIE) from input code points to
322 // the character categories.
323 //
324 fSetBuilder->buildTrie();
325
326 //
327 // Package up the compiled data into a memory image
328 // in the run-time format.
329 //
330 RBBIDataHeader *data = flattenData(); // returns nullptr if error
331 if (U_FAILURE(status)) {
332 return nullptr;
333 }
334 return data;
335 }
336
optimizeTables()337 void RBBIRuleBuilder::optimizeTables() {
338 bool didSomething;
339 do {
340 didSomething = false;
341
342 // Begin looking for duplicates with char class 3.
343 // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
344 // and should not have other categories merged into them.
345 IntPair duplPair = {3, 0};
346 while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
347 fSetBuilder->mergeCategories(duplPair);
348 fForwardTable->removeColumn(duplPair.second);
349 didSomething = true;
350 }
351
352 while (fForwardTable->removeDuplicateStates() > 0) {
353 didSomething = true;
354 }
355 } while (didSomething);
356 }
357
358 U_NAMESPACE_END
359
360 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
361