1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * Copyright (c) 1999-2016, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 ********************************************************************
7 * Date Name Description
8 * 12/14/99 Madhu Creation.
9 * 01/12/2000 Madhu updated for changed API
10 ********************************************************************/
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_BREAK_ITERATION
15
16 #include "unicode/uchar.h"
17 #include "intltest.h"
18 #include "unicode/rbbi.h"
19 #include "unicode/schriter.h"
20 #include "rbbiapts.h"
21 #include "rbbidata.h"
22 #include "cstring.h"
23 #include "ubrkimpl.h"
24 #include "unicode/locid.h"
25 #include "unicode/ustring.h"
26 #include "unicode/utext.h"
27 #include "cmemory.h"
28 #if !UCONFIG_NO_BREAK_ITERATION
29 #include "unicode/filteredbrk.h"
30 #include <stdio.h> // for snprintf
31 #endif
/**
 * API tests for the RuleBasedBreakIterator class.
 */
35
36
// TEST_ASSERT_SUCCESS(status)
//   If `status` holds a failure code, reports it through dataerrln() together
//   with the file and line of the expansion site and the error's name.
//   Does nothing on success. dataerrln() must be callable where the macro is
//   expanded. Execution continues after the report.
#define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        dataerrln("Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status)); \
    } \
} UPRV_BLOCK_MACRO_END

// TEST_ASSERT(expr)
//   If `expr` compares equal to false, reports a failure through errln() that
//   includes the file, line and the stringified text of `expr`.
//   errln() must be callable where the macro is expanded. Execution continues
//   after the report.
#define TEST_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if ((expr) == false) { \
        errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr); \
    } \
} UPRV_BLOCK_MACRO_END
48
TestCloneEquals()49 void RBBIAPITest::TestCloneEquals()
50 {
51
52 UErrorCode status=U_ZERO_ERROR;
53 RuleBasedBreakIterator* bi1 = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status));
54 RuleBasedBreakIterator* biequal = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status));
55 RuleBasedBreakIterator* bi3 = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status));
56 RuleBasedBreakIterator* bi2 = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status));
57 if(U_FAILURE(status)){
58 errcheckln(status, "Fail : in construction - %s", u_errorName(status));
59 return;
60 }
61
62
63 UnicodeString testString="Testing word break iterators's clone() and equals()";
64 bi1->setText(testString);
65 bi2->setText(testString);
66 biequal->setText(testString);
67
68 bi3->setText("hello");
69
70 logln((UnicodeString)"Testing equals()");
71
72 logln((UnicodeString)"Testing == and !=");
73 bool b = (*bi1 != *biequal);
74 b |= *bi1 == *bi2;
75 b |= *bi1 == *bi3;
76 if (b) {
77 errln("%s:%d ERROR:1 RBBI's == and != operator failed.", __FILE__, __LINE__);
78 }
79
80 if(*bi2 == *biequal || *bi2 == *bi1 || *biequal == *bi3)
81 errln("%s:%d ERROR:2 RBBI's == and != operator failed.", __FILE__, __LINE__);
82
83
84 // Quick test of RulesBasedBreakIterator assignment -
85 // Check that
86 // two different iterators are !=
87 // they are == after assignment
88 // source and dest iterator produce the same next() after assignment.
89 // deleting one doesn't disable the other.
90 logln("Testing assignment");
91 RuleBasedBreakIterator *bix = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(Locale::getDefault(), status));
92 if(U_FAILURE(status)){
93 errcheckln(status, "Fail : in construction - %s", u_errorName(status));
94 return;
95 }
96
97 RuleBasedBreakIterator biDefault, biDefault2;
98 if(U_FAILURE(status)){
99 errln("%s:%d FAIL : in construction of default iterator", __FILE__, __LINE__);
100 return;
101 }
102 if (biDefault == *bix) {
103 errln("%s:%d ERROR: iterators should not compare ==", __FILE__, __LINE__);
104 return;
105 }
106 if (biDefault != biDefault2) {
107 errln("%s:%d ERROR: iterators should compare ==", __FILE__, __LINE__);
108 return;
109 }
110
111
112 UnicodeString HelloString("Hello Kitty");
113 bix->setText(HelloString);
114 if (*bix == *bi2) {
115 errln("%s:%d ERROR: strings should not be equal before assignment.", __FILE__, __LINE__);
116 }
117 *bix = *bi2;
118 if (*bix != *bi2) {
119 errln("%s:%d ERROR: strings should be equal before assignment.", __FILE__, __LINE__);
120 }
121
122 int bixnext = bix->next();
123 int bi2next = bi2->next();
124 if (! (bixnext == bi2next && bixnext == 7)) {
125 errln("%s:%d ERROR: iterators behaved differently after assignment.", __FILE__, __LINE__);
126 }
127 delete bix;
128 if (bi2->next() != 8) {
129 errln("%s:%d ERROR: iterator.next() failed after deleting copy.", __FILE__, __LINE__);
130 }
131
132
133
134 logln((UnicodeString)"Testing clone()");
135 RuleBasedBreakIterator* bi1clone = bi1->clone();
136 RuleBasedBreakIterator* bi2clone = bi2->clone();
137
138 if(*bi1clone != *bi1 || *bi1clone != *biequal ||
139 *bi1clone == *bi3 || *bi1clone == *bi2)
140 errln("%s:%d ERROR:1 RBBI's clone() method failed", __FILE__, __LINE__);
141
142 if(*bi2clone == *bi1 || *bi2clone == *biequal ||
143 *bi2clone == *bi3 || *bi2clone != *bi2)
144 errln("%s:%d ERROR:2 RBBI's clone() method failed", __FILE__, __LINE__);
145
146 if(bi1->getText() != bi1clone->getText() ||
147 bi2clone->getText() != bi2->getText() ||
148 *bi2clone == *bi1clone )
149 errln("%s:%d ERROR: RBBI's clone() method failed", __FILE__, __LINE__);
150
151 delete bi1clone;
152 delete bi2clone;
153 delete bi1;
154 delete bi3;
155 delete bi2;
156 delete biequal;
157 }
158
TestBoilerPlate()159 void RBBIAPITest::TestBoilerPlate()
160 {
161 UErrorCode status = U_ZERO_ERROR;
162 BreakIterator* a = BreakIterator::createWordInstance(Locale("hi"), status);
163 BreakIterator* b = BreakIterator::createWordInstance(Locale("hi_IN"),status);
164 if (U_FAILURE(status)) {
165 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
166 return;
167 }
168 if(*a!=*b){
169 errln("Failed: boilerplate method operator!= does not return correct results");
170 }
171 // Japanese word break iterators are identical to root with
172 // a dictionary-based break iterator
173 BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),status);
174 BreakIterator* d = BreakIterator::createCharacterInstance(Locale("root"),status);
175 if(c && d){
176 if(*c!=*d){
177 errln("Failed: boilerplate method operator== does not return correct results");
178 }
179 }else{
180 errln("creation of break iterator failed");
181 }
182 delete a;
183 delete b;
184 delete c;
185 delete d;
186 }
187
TestgetRules()188 void RBBIAPITest::TestgetRules()
189 {
190 UErrorCode status=U_ZERO_ERROR;
191
192 LocalPointer<RuleBasedBreakIterator> bi1(
193 dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status)), status);
194 LocalPointer<RuleBasedBreakIterator> bi2(
195 dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status)), status);
196 if(U_FAILURE(status)){
197 errcheckln(status, "%s:%d, FAIL: in construction - %s", __FILE__, __LINE__, u_errorName(status));
198 return;
199 }
200
201 logln((UnicodeString)"Testing getRules()");
202
203 UnicodeString text(u"Hello there");
204 bi1->setText(text);
205
206 LocalPointer <RuleBasedBreakIterator> bi3(bi1->clone());
207
208 UnicodeString temp=bi1->getRules();
209 UnicodeString temp2=bi2->getRules();
210 UnicodeString temp3=bi3->getRules();
211 if( temp2.compare(temp3) ==0 || temp.compare(temp2) == 0 || temp.compare(temp3) != 0)
212 errln("%s:%d ERROR: error in getRules() method", __FILE__, __LINE__);
213
214 RuleBasedBreakIterator bi4; // Default RuleBasedBreakIterator constructor gives empty shell with empty rules.
215 if (!bi4.getRules().isEmpty()) {
216 errln("%s:%d Empty string expected.", __FILE__, __LINE__);
217 }
218 }
219
TestHashCode()220 void RBBIAPITest::TestHashCode()
221 {
222 UErrorCode status=U_ZERO_ERROR;
223 RuleBasedBreakIterator* bi1 = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status));
224 RuleBasedBreakIterator* bi3 = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status));
225 RuleBasedBreakIterator* bi2 = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status));
226 if(U_FAILURE(status)){
227 errcheckln(status, "Fail : in construction - %s", u_errorName(status));
228 delete bi1;
229 delete bi2;
230 delete bi3;
231 return;
232 }
233
234
235 logln((UnicodeString)"Testing hashCode()");
236
237 bi1->setText((UnicodeString)"Hash code");
238 bi2->setText((UnicodeString)"Hash code");
239 bi3->setText((UnicodeString)"Hash code");
240
241 RuleBasedBreakIterator* bi1clone= bi1->clone();
242 RuleBasedBreakIterator* bi2clone= bi2->clone();
243
244 if(bi1->hashCode() != bi1clone->hashCode() || bi1->hashCode() != bi3->hashCode() ||
245 bi1clone->hashCode() != bi3->hashCode() || bi2->hashCode() != bi2clone->hashCode())
246 errln((UnicodeString)"ERROR: identical objects have different hashcodes");
247
248 if(bi1->hashCode() == bi2->hashCode() || bi2->hashCode() == bi3->hashCode() ||
249 bi1clone->hashCode() == bi2clone->hashCode() || bi1clone->hashCode() == bi2->hashCode())
250 errln((UnicodeString)"ERROR: different objects have same hashcodes");
251
252 delete bi1clone;
253 delete bi2clone;
254 delete bi1;
255 delete bi2;
256 delete bi3;
257
258 }
/**
 * Exercises the text accessors of RuleBasedBreakIterator:
 * setText(UnicodeString), getText(), adoptText(CharacterIterator*),
 * setText(UText*) and getUText().
 *
 * Every CharacterIterator allocated here is handed to an iterator through
 * adoptText(), so none of them is deleted explicitly in this function.
 */
void RBBIAPITest::TestGetSetAdoptText()
{
    logln((UnicodeString)"Testing getText setText ");
    IcuTestErrorCode status(*this, "TestGetSetAdoptText");
    UnicodeString str1="first string.";
    UnicodeString str2="Second string.";
    LocalPointer<RuleBasedBreakIterator> charIter1(dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status)));
    LocalPointer<RuleBasedBreakIterator> wordIter1(dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status)));
    if(status.isFailure()){
        errcheckln(status, "Fail : in construction - %s", status.errorName());
        return;
    }


    // text1Clone is adopted by charIter1 below; text1, text2 and text3 are
    // adopted, one after another, by rb.
    CharacterIterator* text1= new StringCharacterIterator(str1);
    CharacterIterator* text1Clone = text1->clone();
    CharacterIterator* text2= new StringCharacterIterator(str2);
    CharacterIterator* text3= new StringCharacterIterator(str2, 3, 10, 3); // "ond str"

    // setText() must make the text retrievable and leave the position at 0.
    wordIter1->setText(str1);
    CharacterIterator *tci = &wordIter1->getText();
    UnicodeString tstr;
    tci->getText(tstr);
    TEST_ASSERT(tstr == str1);
    if(wordIter1->current() != 0)
        errln((UnicodeString)"ERROR:1 setText did not set the iteration position to the beginning of the text, it is" + wordIter1->current() + (UnicodeString)"\n");

    // Move away from the start so the next setText() has a position to reset.
    wordIter1->next(2);

    wordIter1->setText(str2);
    if(wordIter1->current() != 0)
        errln((UnicodeString)"ERROR:2 setText did not reset the iteration position to the beginning of the text, it is" + wordIter1->current() + (UnicodeString)"\n");


    // After this, wordIter1 holds str2 and charIter1 holds str1.
    charIter1->adoptText(text1Clone);
    TEST_ASSERT(wordIter1->getText() != charIter1->getText());
    tci = &wordIter1->getText();
    tci->getText(tstr);
    TEST_ASSERT(tstr == str2);
    tci = &charIter1->getText();
    tci->getText(tstr);
    TEST_ASSERT(tstr == str1);


    // Each comparison dereferences the pointer that was just adopted, so it
    // must run before the next adoptText() call replaces that text.
    LocalPointer<RuleBasedBreakIterator> rb(wordIter1->clone());
    rb->adoptText(text1);
    if(rb->getText() != *text1)
        errln((UnicodeString)"ERROR:1 error in adoptText ");
    rb->adoptText(text2);
    if(rb->getText() != *text2)
        errln((UnicodeString)"ERROR:2 error in adoptText ");

    // Adopt where iterator range is less than the entire original source string.
    // (With the change of the break engine to working with UText internally,
    // CharacterIterators starting at positions other than zero are not supported)
    rb->adoptText(text3);
    TEST_ASSERT(rb->preceding(2) == 0);
    TEST_ASSERT(rb->following(11) == BreakIterator::DONE);
    // Earlier expectations, kept for reference (disabled):
    //if(rb->preceding(2) != 3) {
    //    errln((UnicodeString)"ERROR:3 error in adoptText ");
    //}
    //if(rb->following(11) != BreakIterator::DONE) {
    //    errln((UnicodeString)"ERROR:4 error in adoptText ");
    //}

    // UText API
    //
    // Quick test to see if UText is working at all.
    //
    const char *s1 = "\x68\x65\x6C\x6C\x6F\x20\x77\x6F\x72\x6C\x64"; /* "hello world" in UTF-8 */
    const char *s2 = "\x73\x65\x65\x20\x79\x61"; /* "see ya" in UTF-8 */
    // s1 is 11 bytes long and s2 is 6 bytes long; the positions asserted
    // below are offsets into these strings.

    status.reset();
    LocalUTextPointer ut(utext_openUTF8(nullptr, s1, -1, status));
    wordIter1->setText(ut.getAlias(), status);
    TEST_ASSERT_SUCCESS(status);

    // Forward iteration over "hello world".
    int32_t pos;
    pos = wordIter1->first();
    TEST_ASSERT(pos==0);
    pos = wordIter1->next();
    TEST_ASSERT(pos==5);
    pos = wordIter1->next();
    TEST_ASSERT(pos==6);
    pos = wordIter1->next();
    TEST_ASSERT(pos==11);
    pos = wordIter1->next();
    TEST_ASSERT(pos==UBRK_DONE);

    status.reset();
    LocalUTextPointer ut2(utext_openUTF8(nullptr, s2, -1, status));
    TEST_ASSERT_SUCCESS(status);
    wordIter1->setText(ut2.getAlias(), status);
    TEST_ASSERT_SUCCESS(status);

    // Forward, then backward, iteration over "see ya".
    pos = wordIter1->first();
    TEST_ASSERT(pos==0);
    pos = wordIter1->next();
    TEST_ASSERT(pos==3);
    pos = wordIter1->next();
    TEST_ASSERT(pos==4);

    pos = wordIter1->last();
    TEST_ASSERT(pos==6);
    pos = wordIter1->previous();
    TEST_ASSERT(pos==4);
    pos = wordIter1->previous();
    TEST_ASSERT(pos==3);
    pos = wordIter1->previous();
    TEST_ASSERT(pos==0);
    pos = wordIter1->previous();
    TEST_ASSERT(pos==UBRK_DONE);

    // getUText() is called with an already-open UText as the fill-in argument;
    // only the resulting status is checked, the returned pointer is ignored.
    status.reset();
    UnicodeString sEmpty;
    LocalUTextPointer gut2(utext_openUnicodeString(nullptr, &sEmpty, status));
    wordIter1->getUText(gut2.getAlias(), status);
    TEST_ASSERT_SUCCESS(status);
    status.reset();
}
380
381
TestIteration()382 void RBBIAPITest::TestIteration()
383 {
384 // This test just verifies that the API is present.
385 // Testing for correct operation of the break rules happens elsewhere.
386
387 UErrorCode status=U_ZERO_ERROR;
388 RuleBasedBreakIterator* bi = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status));
389 if (U_FAILURE(status) || bi == nullptr) {
390 errcheckln(status, "Failure creating character break iterator. Status = %s", u_errorName(status));
391 }
392 delete bi;
393
394 status=U_ZERO_ERROR;
395 bi = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status));
396 if (U_FAILURE(status) || bi == nullptr) {
397 errcheckln(status, "Failure creating Word break iterator. Status = %s", u_errorName(status));
398 }
399 delete bi;
400
401 status=U_ZERO_ERROR;
402 bi = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createLineInstance(Locale::getDefault(), status));
403 if (U_FAILURE(status) || bi == nullptr) {
404 errcheckln(status, "Failure creating Line break iterator. Status = %s", u_errorName(status));
405 }
406 delete bi;
407
408 status=U_ZERO_ERROR;
409 bi = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createSentenceInstance(Locale::getDefault(), status));
410 if (U_FAILURE(status) || bi == nullptr) {
411 errcheckln(status, "Failure creating Sentence break iterator. Status = %s", u_errorName(status));
412 }
413 delete bi;
414
415 status=U_ZERO_ERROR;
416 bi = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createTitleInstance(Locale::getDefault(), status));
417 if (U_FAILURE(status) || bi == nullptr) {
418 errcheckln(status, "Failure creating Title break iterator. Status = %s", u_errorName(status));
419 }
420 delete bi;
421
422 status=U_ZERO_ERROR;
423 bi = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status));
424 if (U_FAILURE(status) || bi == nullptr) {
425 errcheckln(status, "Failure creating character break iterator. Status = %s", u_errorName(status));
426 return; // Skip the rest of these tests.
427 }
428
429
430 UnicodeString testString="0123456789";
431 bi->setText(testString);
432
433 int32_t i;
434 i = bi->first();
435 if (i != 0) {
436 errln("%s:%d Incorrect value from bi->first(). Expected 0, got %d.", __FILE__, __LINE__, i);
437 }
438
439 i = bi->last();
440 if (i != 10) {
441 errln("%s:%d Incorrect value from bi->last(). Expected 10, got %d", __FILE__, __LINE__, i);
442 }
443
444 //
445 // Previous
446 //
447 bi->last();
448 i = bi->previous();
449 if (i != 9) {
450 errln("%s:%d Incorrect value from bi->last(). Expected 9, got %d", __FILE__, __LINE__, i);
451 }
452
453
454 bi->first();
455 i = bi->previous();
456 if (i != BreakIterator::DONE) {
457 errln("%s:%d Incorrect value from bi->previous(). Expected DONE, got %d", __FILE__, __LINE__, i);
458 }
459
460 //
461 // next()
462 //
463 bi->first();
464 i = bi->next();
465 if (i != 1) {
466 errln("%s:%d Incorrect value from bi->next(). Expected 1, got %d", __FILE__, __LINE__, i);
467 }
468
469 bi->last();
470 i = bi->next();
471 if (i != BreakIterator::DONE) {
472 errln("%s:%d Incorrect value from bi->next(). Expected DONE, got %d", __FILE__, __LINE__, i);
473 }
474
475
476 //
477 // current()
478 //
479 bi->first();
480 i = bi->current();
481 if (i != 0) {
482 errln("%s:%d Incorrect value from bi->current(). Expected 0, got %d", __FILE__, __LINE__, i);
483 }
484
485 bi->next();
486 i = bi->current();
487 if (i != 1) {
488 errln("%s:%d Incorrect value from bi->current(). Expected 1, got %d", __FILE__, __LINE__, i);
489 }
490
491 bi->last();
492 bi->next();
493 i = bi->current();
494 if (i != 10) {
495 errln("%s:%d Incorrect value from bi->current(). Expected 10, got %d", __FILE__, __LINE__, i);
496 }
497
498 bi->first();
499 bi->previous();
500 i = bi->current();
501 if (i != 0) {
502 errln("%s:%d Incorrect value from bi->current(). Expected 0, got %d", __FILE__, __LINE__, i);
503 }
504
505
506 //
507 // Following()
508 //
509 i = bi->following(4);
510 if (i != 5) {
511 errln("%s:%d Incorrect value from bi->following(). Expected 5, got %d", __FILE__, __LINE__, i);
512 }
513
514 i = bi->following(9);
515 if (i != 10) {
516 errln("%s:%d Incorrect value from bi->following(). Expected 10, got %d", __FILE__, __LINE__, i);
517 }
518
519 i = bi->following(10);
520 if (i != BreakIterator::DONE) {
521 errln("%s:%d Incorrect value from bi->following(). Expected DONE, got %d", __FILE__, __LINE__, i);
522 }
523
524
525 //
526 // Preceding
527 //
528 i = bi->preceding(4);
529 if (i != 3) {
530 errln("%s:%d Incorrect value from bi->preceding(). Expected 3, got %d", __FILE__, __LINE__, i);
531 }
532
533 i = bi->preceding(10);
534 if (i != 9) {
535 errln("%s:%d Incorrect value from bi->preceding(). Expected 9, got %d", __FILE__, __LINE__, i);
536 }
537
538 i = bi->preceding(1);
539 if (i != 0) {
540 errln("%s:%d Incorrect value from bi->preceding(). Expected 0, got %d", __FILE__, __LINE__, i);
541 }
542
543 i = bi->preceding(0);
544 if (i != BreakIterator::DONE) {
545 errln("%s:%d Incorrect value from bi->preceding(). Expected DONE, got %d", __FILE__, __LINE__, i);
546 }
547
548
549 //
550 // isBoundary()
551 //
552 bi->first();
553 if (bi->isBoundary(3) != true) {
554 errln("%s:%d Incorrect value from bi->isBoundary(). Expected true, got false", __FILE__, __LINE__, i);
555 }
556 i = bi->current();
557 if (i != 3) {
558 errln("%s:%d Incorrect value from bi->current(). Expected 3, got %d", __FILE__, __LINE__, i);
559 }
560
561
562 if (bi->isBoundary(11) != false) {
563 errln("%s:%d Incorrect value from bi->isBoundary(). Expected false, got true", __FILE__, __LINE__, i);
564 }
565 i = bi->current();
566 if (i != 10) {
567 errln("%s:%d Incorrect value from bi->current(). Expected 10, got %d", __FILE__, __LINE__, i);
568 }
569
570 //
571 // next(n)
572 //
573 bi->first();
574 i = bi->next(4);
575 if (i != 4) {
576 errln("%s:%d Incorrect value from bi->next(). Expected 4, got %d", __FILE__, __LINE__, i);
577 }
578
579 i = bi->next(6);
580 if (i != 10) {
581 errln("%s:%d Incorrect value from bi->next(). Expected 10, got %d", __FILE__, __LINE__, i);
582 }
583
584 bi->first();
585 i = bi->next(11);
586 if (i != BreakIterator::DONE) {
587 errln("%s:%d Incorrect value from bi->next(). Expected BreakIterator::DONE, got %d", __FILE__, __LINE__, i);
588 }
589
590 delete bi;
591
592 }
593
594
595
596
597
598
TestBuilder()599 void RBBIAPITest::TestBuilder() {
600 UnicodeString rulesString1 = "$Letters = [:L:];\n"
601 "$Numbers = [:N:];\n"
602 "$Letters+;\n"
603 "$Numbers+;\n"
604 "[^$Letters $Numbers];\n"
605 "!.*;\n";
606 UnicodeString testString1 = "abc123..abc";
607 // 01234567890
608 int32_t bounds1[] = {0, 3, 6, 7, 8, 11};
609 UErrorCode status=U_ZERO_ERROR;
610 UParseError parseError;
611
612 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
613 if(U_FAILURE(status)) {
614 dataerrln("Fail : in construction - %s", u_errorName(status));
615 } else {
616 bi->setText(testString1);
617 doBoundaryTest(*bi, testString1, bounds1);
618 }
619 delete bi;
620 }
621
622
623 //
624 // TestQuoteGrouping
625 // Single quotes within rules imply a grouping, so that a modifier
626 // following the quoted text (* or +) applies to all of the quoted chars.
627 //
TestQuoteGrouping()628 void RBBIAPITest::TestQuoteGrouping() {
629 UnicodeString rulesString1 = "#Here comes the rule...\n"
630 "'$@!'*;\n" // (\$\@\!)*
631 ".;\n";
632
633 UnicodeString testString1 = "$@!$@!X$@!!X";
634 // 0123456789012
635 int32_t bounds1[] = {0, 6, 7, 10, 11, 12};
636 UErrorCode status=U_ZERO_ERROR;
637 UParseError parseError;
638
639 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
640 if(U_FAILURE(status)) {
641 dataerrln("Fail : in construction - %s", u_errorName(status));
642 } else {
643 bi->setText(testString1);
644 doBoundaryTest(*bi, testString1, bounds1);
645 }
646 delete bi;
647 }
648
649 //
650 // TestRuleStatus
651 // Test word break rule status constants.
652 //
TestRuleStatus()653 void RBBIAPITest::TestRuleStatus() {
654 char16_t str[30];
655 //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing
656 // changed UBRK_WORD_KANA to UBRK_WORD_IDEO
657 u_unescape("plain word 123.45 \\u30a1\\u30a2 ",
658 // 012345678901234567 8 9 0
659 // Katakana
660 str, 30);
661 UnicodeString testString1(str);
662 int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21};
663 int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER,
664 UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE,
665 UBRK_WORD_IDEO, UBRK_WORD_NONE};
666
667 int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
668 UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
669 UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT};
670
671 UErrorCode status=U_ZERO_ERROR;
672
673 BreakIterator *bi = BreakIterator::createWordInstance(Locale::getEnglish(), status);
674 if(U_FAILURE(status)) {
675 errcheckln(status, "%s:%d Fail in construction - %s", __FILE__, __LINE__, u_errorName(status));
676 } else {
677 bi->setText(testString1);
678 // First test that the breaks are in the right spots.
679 doBoundaryTest(*bi, testString1, bounds1);
680
681 // Then go back and check tag values
682 int32_t i = 0;
683 int32_t pos, tag;
684 for (pos = bi->first(); pos != BreakIterator::DONE; pos = bi->next(), i++) {
685 if (pos != bounds1[i]) {
686 errln("%s:%d FAIL: unexpected word break at position %d", __FILE__, __LINE__, pos);
687 break;
688 }
689 tag = bi->getRuleStatus();
690 if (tag < tag_lo[i] || tag >= tag_hi[i]) {
691 errln("%s:%d FAIL: incorrect tag value %d at position %d", __FILE__, __LINE__, tag, pos);
692 break;
693 }
694
695 // Check that we get the same tag values from getRuleStatusVec()
696 int32_t vec[10];
697 int t = bi->getRuleStatusVec(vec, 10, status);
698 TEST_ASSERT_SUCCESS(status);
699 TEST_ASSERT(t==1);
700 TEST_ASSERT(vec[0] == tag);
701 }
702 }
703 delete bi;
704
705 // Now test line break status. This test mostly is to confirm that the status constants
706 // are correctly declared in the header.
707 testString1 = "test line. \n";
708 // break type s s h
709
710 bi = BreakIterator::createLineInstance(Locale::getEnglish(), status);
711 if(U_FAILURE(status)) {
712 errcheckln(status, "%s:%d failed to create line break iterator. - %s", __FILE__, __LINE__, u_errorName(status));
713 } else {
714 int32_t i = 0;
715 int32_t pos, tag;
716 UBool success;
717
718 bi->setText(testString1);
719 pos = bi->current();
720 tag = bi->getRuleStatus();
721 for (i=0; i<3; i++) {
722 switch (i) {
723 case 0:
724 success = pos==0 && tag==UBRK_LINE_SOFT; break;
725 case 1:
726 success = pos==5 && tag==UBRK_LINE_SOFT; break;
727 case 2:
728 success = pos==12 && tag==UBRK_LINE_HARD; break;
729 default:
730 success = false; break;
731 }
732 if (success == false) {
733 errln("%s:%d: incorrect line break status or position. i=%d, pos=%d, tag=%d",
734 __FILE__, __LINE__, i, pos, tag);
735 break;
736 }
737 pos = bi->next();
738 tag = bi->getRuleStatus();
739 }
740 if (UBRK_LINE_SOFT >= UBRK_LINE_SOFT_LIMIT ||
741 UBRK_LINE_HARD >= UBRK_LINE_HARD_LIMIT ||
742 (UBRK_LINE_HARD > UBRK_LINE_SOFT && UBRK_LINE_HARD < UBRK_LINE_SOFT_LIMIT)) {
743 errln("%s:%d UBRK_LINE_* constants from header are inconsistent.", __FILE__, __LINE__);
744 }
745 }
746 delete bi;
747
748 }
749
750
751 //
752 // TestRuleStatusVec
753 // Test the vector form of break rule status.
754 //
TestRuleStatusVec()755 void RBBIAPITest::TestRuleStatusVec() {
756 UnicodeString rulesString( "[A-N]{100}; \n"
757 "[a-w]{200}; \n"
758 "[\\p{L}]{300}; \n"
759 "[\\p{N}]{400}; \n"
760 "[0-5]{500}; \n"
761 "!.*;\n", -1, US_INV);
762 UnicodeString testString1 = "Aapz5?";
763 int32_t statusVals[10];
764 int32_t numStatuses;
765 int32_t pos;
766
767 UErrorCode status=U_ZERO_ERROR;
768 UParseError parseError;
769
770 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString, parseError, status);
771 if (U_FAILURE(status)) {
772 dataerrln("Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));
773 } else {
774 bi->setText(testString1);
775
776 // A
777 pos = bi->next();
778 TEST_ASSERT(pos==1);
779 numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
780 TEST_ASSERT_SUCCESS(status);
781 TEST_ASSERT(numStatuses == 2);
782 TEST_ASSERT(statusVals[0] == 100);
783 TEST_ASSERT(statusVals[1] == 300);
784
785 // a
786 pos = bi->next();
787 TEST_ASSERT(pos==2);
788 numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
789 TEST_ASSERT_SUCCESS(status);
790 TEST_ASSERT(numStatuses == 2);
791 TEST_ASSERT(statusVals[0] == 200);
792 TEST_ASSERT(statusVals[1] == 300);
793
794 // p
795 pos = bi->next();
796 TEST_ASSERT(pos==3);
797 numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
798 TEST_ASSERT_SUCCESS(status);
799 TEST_ASSERT(numStatuses == 2);
800 TEST_ASSERT(statusVals[0] == 200);
801 TEST_ASSERT(statusVals[1] == 300);
802
803 // z
804 pos = bi->next();
805 TEST_ASSERT(pos==4);
806 numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
807 TEST_ASSERT_SUCCESS(status);
808 TEST_ASSERT(numStatuses == 1);
809 TEST_ASSERT(statusVals[0] == 300);
810
811 // 5
812 pos = bi->next();
813 TEST_ASSERT(pos==5);
814 numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
815 TEST_ASSERT_SUCCESS(status);
816 TEST_ASSERT(numStatuses == 2);
817 TEST_ASSERT(statusVals[0] == 400);
818 TEST_ASSERT(statusVals[1] == 500);
819
820 // ?
821 pos = bi->next();
822 TEST_ASSERT(pos==6);
823 numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
824 TEST_ASSERT_SUCCESS(status);
825 TEST_ASSERT(numStatuses == 1);
826 TEST_ASSERT(statusVals[0] == 0);
827
828 //
829 // Check buffer overflow error handling. Char == A
830 //
831 bi->first();
832 pos = bi->next();
833 TEST_ASSERT(pos==1);
834 memset(statusVals, -1, sizeof(statusVals));
835 numStatuses = bi->getRuleStatusVec(statusVals, 0, status);
836 TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
837 TEST_ASSERT(numStatuses == 2);
838 TEST_ASSERT(statusVals[0] == -1);
839
840 status = U_ZERO_ERROR;
841 memset(statusVals, -1, sizeof(statusVals));
842 numStatuses = bi->getRuleStatusVec(statusVals, 1, status);
843 TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
844 TEST_ASSERT(numStatuses == 2);
845 TEST_ASSERT(statusVals[0] == 100);
846 TEST_ASSERT(statusVals[1] == -1);
847
848 status = U_ZERO_ERROR;
849 memset(statusVals, -1, sizeof(statusVals));
850 numStatuses = bi->getRuleStatusVec(statusVals, 2, status);
851 TEST_ASSERT_SUCCESS(status);
852 TEST_ASSERT(numStatuses == 2);
853 TEST_ASSERT(statusVals[0] == 100);
854 TEST_ASSERT(statusVals[1] == 300);
855 TEST_ASSERT(statusVals[2] == -1);
856 }
857 delete bi;
858
859 }
860
861 //
862 // Bug 2190 Regression test. Builder crash on rule consisting of only a
863 // $variable reference
TestBug2190()864 void RBBIAPITest::TestBug2190() {
865 UnicodeString rulesString1 = "$aaa = abcd;\n"
866 "$bbb = $aaa;\n"
867 "$bbb;\n";
868 UnicodeString testString1 = "abcdabcd";
869 // 01234567890
870 int32_t bounds1[] = {0, 4, 8};
871 UErrorCode status=U_ZERO_ERROR;
872 UParseError parseError;
873
874 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
875 if(U_FAILURE(status)) {
876 dataerrln("Fail : in construction - %s", u_errorName(status));
877 } else {
878 bi->setText(testString1);
879 doBoundaryTest(*bi, testString1, bounds1);
880 }
881 delete bi;
882 }
883
TestBug22580()884 void RBBIAPITest::TestBug22580() {
885 UParseError parseError;
886 // Test single ' will not cause infinity loop
887 {
888 UnicodeString rulesString = u"'";
889 UErrorCode status=U_ZERO_ERROR;
890 RuleBasedBreakIterator(rulesString, parseError, status);
891 }
892 if (quick) {
893 return;
894 }
895 // Test any 1 or 2 ASCII chars as rule will not cause infinity loop.
896 // only in exhaust mode
897 for (char16_t u1 = u' '; u1 <= u'~'; u1++) {
898 {
899 UnicodeString rule = u1;
900 UErrorCode status=U_ZERO_ERROR;
901 RuleBasedBreakIterator bi (rule, parseError, status);
902 }
903 for (char16_t u2 = u' '; u2 <= u'~'; u2++) {
904 {
905 UnicodeString rule;
906 rule.append(u1).append(u2);
907 UErrorCode status=U_ZERO_ERROR;
908 RuleBasedBreakIterator bi (rule, parseError, status);
909 }
910 }
911 }
912 }
913
TestRegistration()914 void RBBIAPITest::TestRegistration() {
915 #if !UCONFIG_NO_SERVICE
916 UErrorCode status = U_ZERO_ERROR;
917 BreakIterator* ja_word = BreakIterator::createWordInstance("ja_JP", status);
918 // ok to not delete these if we exit because of error?
919 BreakIterator* ja_char = BreakIterator::createCharacterInstance("ja_JP", status);
920 BreakIterator* root_word = BreakIterator::createWordInstance("", status);
921 BreakIterator* root_char = BreakIterator::createCharacterInstance("", status);
922
923 if (status == U_MISSING_RESOURCE_ERROR || status == U_FILE_ACCESS_ERROR) {
924 dataerrln("Error creating instances of break interactors - %s", u_errorName(status));
925
926 delete ja_word;
927 delete ja_char;
928 delete root_word;
929 delete root_char;
930
931 return;
932 }
933
934 URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status);
935 {
936 #if 0 // With a dictionary based word breaking, ja_word is identical to root.
937 if (ja_word && *ja_word == *root_word) {
938 errln("japan not different from root");
939 }
940 #endif
941 }
942
943 {
944 BreakIterator* result = BreakIterator::createWordInstance("xx_XX", status);
945 UBool fail = true;
946 if(result){
947 fail = *result != *ja_word;
948 }
949 delete result;
950 if (fail) {
951 errln("bad result for xx_XX/word");
952 }
953 }
954
955 {
956 BreakIterator* result = BreakIterator::createCharacterInstance("ja_JP", status);
957 UBool fail = true;
958 if(result){
959 fail = *result != *ja_char;
960 }
961 delete result;
962 if (fail) {
963 errln("bad result for ja_JP/char");
964 }
965 }
966
967 {
968 BreakIterator* result = BreakIterator::createCharacterInstance("xx_XX", status);
969 UBool fail = true;
970 if(result){
971 fail = *result != *root_char;
972 }
973 delete result;
974 if (fail) {
975 errln("bad result for xx_XX/char");
976 }
977 }
978
979 {
980 StringEnumeration* avail = BreakIterator::getAvailableLocales();
981 UBool found = false;
982 const UnicodeString* p;
983 while ((p = avail->snext(status))) {
984 if (p->compare("xx") == 0) {
985 found = true;
986 break;
987 }
988 }
989 delete avail;
990 if (!found) {
991 errln("did not find test locale");
992 }
993 }
994
995 {
996 UBool unreg = BreakIterator::unregister(key, status);
997 if (!unreg) {
998 errln("unable to unregister");
999 }
1000 }
1001
1002 {
1003 BreakIterator* result = BreakIterator::createWordInstance("en_US", status);
1004 BreakIterator* root = BreakIterator::createWordInstance("", status);
1005 UBool fail = true;
1006 if(root){
1007 fail = *root != *result;
1008 }
1009 delete root;
1010 delete result;
1011 if (fail) {
1012 errln("did not get root break");
1013 }
1014 }
1015
1016 {
1017 StringEnumeration* avail = BreakIterator::getAvailableLocales();
1018 UBool found = false;
1019 const UnicodeString* p;
1020 while ((p = avail->snext(status))) {
1021 if (p->compare("xx") == 0) {
1022 found = true;
1023 break;
1024 }
1025 }
1026 delete avail;
1027 if (found) {
1028 errln("found test locale");
1029 }
1030 }
1031
1032 {
1033 int32_t count;
1034 UBool foundLocale = false;
1035 const Locale *avail = BreakIterator::getAvailableLocales(count);
1036 for (int i=0; i<count; i++) {
1037 if (avail[i] == Locale::getEnglish()) {
1038 foundLocale = true;
1039 break;
1040 }
1041 }
1042 if (foundLocale == false) {
1043 errln("BreakIterator::getAvailableLocales(&count), failed to find EN.");
1044 }
1045 }
1046
1047
1048 // ja_word was adopted by factory
1049 delete ja_char;
1050 delete root_word;
1051 delete root_char;
1052 #endif
1053 }
1054
RoundtripRule(const char * dataFile)1055 void RBBIAPITest::RoundtripRule(const char *dataFile) {
1056 UErrorCode status = U_ZERO_ERROR;
1057 UParseError parseError;
1058 parseError.line = 0;
1059 parseError.offset = 0;
1060 LocalUDataMemoryPointer data(udata_open(U_ICUDATA_BRKITR, "brk", dataFile, &status));
1061 uint32_t length;
1062 UnicodeString builtSource;
1063 const uint8_t *rbbiRules;
1064 const uint8_t *builtRules;
1065
1066 if (U_FAILURE(status)) {
1067 errcheckln(status, "%s:%d Can't open \"%s\" - %s", __FILE__, __LINE__, dataFile, u_errorName(status));
1068 return;
1069 }
1070
1071 builtRules = (const uint8_t *)udata_getMemory(data.getAlias());
1072 builtSource = UnicodeString::fromUTF8(
1073 (const char *)(builtRules + ((RBBIDataHeader *)builtRules)->fRuleSource));
1074 LocalPointer<RuleBasedBreakIterator> brkItr (new RuleBasedBreakIterator(builtSource, parseError, status));
1075 if (U_FAILURE(status)) {
1076 errln("%s:%d createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
1077 __FILE__, __LINE__, u_errorName(status), parseError.line, parseError.offset);
1078 errln(builtSource);
1079 return;
1080 }
1081 rbbiRules = brkItr->getBinaryRules(length);
1082 logln("Comparing \"%s\" len=%d", dataFile, length);
1083 if (memcmp(builtRules, rbbiRules, (int32_t)length) != 0) {
1084 errln("%s:%d Built rules and rebuilt rules are different %s", __FILE__, __LINE__, dataFile);
1085 return;
1086 }
1087 }
1088
TestRoundtripRules()1089 void RBBIAPITest::TestRoundtripRules() {
1090 RoundtripRule("word");
1091 RoundtripRule("title");
1092 RoundtripRule("sent");
1093 RoundtripRule("line");
1094 RoundtripRule("char");
1095 if (!quick) {
1096 RoundtripRule("word_POSIX");
1097 }
1098 }
1099
1100
1101 // Check getBinaryRules() and construction of a break iterator from those rules.
1102
TestGetBinaryRules()1103 void RBBIAPITest::TestGetBinaryRules() {
1104 UErrorCode status=U_ZERO_ERROR;
1105 LocalPointer<BreakIterator> bi(BreakIterator::createLineInstance(Locale::getEnglish(), status));
1106 if (U_FAILURE(status)) {
1107 dataerrln("FAIL: BreakIterator::createLineInstance for Locale::getEnglish(): %s", u_errorName(status));
1108 return;
1109 }
1110 RuleBasedBreakIterator *rbbi = dynamic_cast<RuleBasedBreakIterator *>(bi.getAlias());
1111 if (rbbi == nullptr) {
1112 dataerrln("FAIL: RuleBasedBreakIterator is nullptr");
1113 return;
1114 }
1115
1116 // Check that the new line break iterator is nominally functional.
1117 UnicodeString helloWorld("Hello, World!");
1118 rbbi->setText(helloWorld);
1119 int n = 0;
1120 while (bi->next() != UBRK_DONE) {
1121 ++n;
1122 }
1123 TEST_ASSERT(n == 2);
1124
1125 // Extract the binary rules as a uint8_t blob.
1126 uint32_t ruleLength;
1127 const uint8_t *binRules = rbbi->getBinaryRules(ruleLength);
1128 TEST_ASSERT(ruleLength > 0);
1129 TEST_ASSERT(binRules != nullptr);
1130
1131 // Clone the binary rules, and create a break iterator from that.
1132 // The break iterator does not adopt the rules; we must delete when we are finished with the iterator.
1133 uint8_t *clonedRules = new uint8_t[ruleLength];
1134 memcpy(clonedRules, binRules, ruleLength);
1135 RuleBasedBreakIterator clonedBI(clonedRules, ruleLength, status);
1136 TEST_ASSERT_SUCCESS(status);
1137
1138 // Check that the cloned line break iterator is nominally alive.
1139 clonedBI.setText(helloWorld);
1140 n = 0;
1141 while (clonedBI.next() != UBRK_DONE) {
1142 ++n;
1143 }
1144 TEST_ASSERT(n == 2);
1145
1146 delete[] clonedRules;
1147 }
1148
1149
TestRefreshInputText()1150 void RBBIAPITest::TestRefreshInputText() {
1151 /*
1152 * RefreshInput changes out the input of a Break Iterator without
1153 * changing anything else in the iterator's state. Used with Java JNI,
1154 * when Java moves the underlying string storage. This test
1155 * runs BreakIterator::next() repeatedly, moving the text in the middle of the sequence.
1156 * The right set of boundaries should still be found.
1157 */
1158 char16_t testStr[] = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0}; /* = " A B C D" */
1159 char16_t movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0};
1160 UErrorCode status = U_ZERO_ERROR;
1161 UText ut1 = UTEXT_INITIALIZER;
1162 UText ut2 = UTEXT_INITIALIZER;
1163 RuleBasedBreakIterator *bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(Locale::getEnglish(), status));
1164 TEST_ASSERT_SUCCESS(status);
1165
1166 utext_openUChars(&ut1, testStr, -1, &status);
1167 TEST_ASSERT_SUCCESS(status);
1168
1169 if (U_SUCCESS(status)) {
1170 bi->setText(&ut1, status);
1171 TEST_ASSERT_SUCCESS(status);
1172
1173 /* Line boundaries will occur before each letter in the original string */
1174 TEST_ASSERT(1 == bi->next());
1175 TEST_ASSERT(3 == bi->next());
1176
1177 /* Move the string, kill the original string. */
1178 u_strcpy(movedStr, testStr);
1179 u_memset(testStr, 0x20, u_strlen(testStr));
1180 utext_openUChars(&ut2, movedStr, -1, &status);
1181 TEST_ASSERT_SUCCESS(status);
1182 RuleBasedBreakIterator *returnedBI = &bi->refreshInputText(&ut2, status);
1183 TEST_ASSERT_SUCCESS(status);
1184 TEST_ASSERT(bi == returnedBI);
1185
1186 /* Find the following matches, now working in the moved string. */
1187 TEST_ASSERT(5 == bi->next());
1188 TEST_ASSERT(7 == bi->next());
1189 TEST_ASSERT(8 == bi->next());
1190 TEST_ASSERT(UBRK_DONE == bi->next());
1191
1192 utext_close(&ut1);
1193 utext_close(&ut2);
1194 }
1195 delete bi;
1196
1197 }
1198
1199 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
prtbrks(BreakIterator * brk,const UnicodeString & ustr,IntlTest & it)1200 static void prtbrks(BreakIterator* brk, const UnicodeString &ustr, IntlTest &it) {
1201 static const char16_t PILCROW=0x00B6, CHSTR=0x3010, CHEND=0x3011; // lenticular brackets
1202 it.logln(UnicodeString("String:'")+ustr+UnicodeString("'"));
1203
1204 int32_t *pos = new int32_t[ustr.length()];
1205 int32_t posCount = 0;
1206
1207 // calculate breaks up front, so we can print out
1208 // sans any debugging
1209 for(int32_t n = 0; (n=brk->next())!=UBRK_DONE; ) {
1210 pos[posCount++] = n;
1211 if(posCount>=ustr.length()) {
1212 it.errln("brk count exceeds string length!");
1213 return;
1214 }
1215 }
1216 UnicodeString out;
1217 out.append((char16_t)CHSTR);
1218 int32_t prev = 0;
1219 for(int32_t i=0;i<posCount;i++) {
1220 int32_t n=pos[i];
1221 out.append(ustr.tempSubString(prev,n-prev));
1222 out.append((char16_t)PILCROW);
1223 prev=n;
1224 }
1225 out.append(ustr.tempSubString(prev,ustr.length()-prev));
1226 out.append((char16_t)CHEND);
1227 it.logln(out);
1228
1229 out.remove();
1230 for(int32_t i=0;i<posCount;i++) {
1231 char tmp[100];
1232 snprintf(tmp, sizeof(tmp), "%d ",pos[i]);
1233 out.append(UnicodeString(tmp));
1234 }
1235 it.logln(out);
1236 delete [] pos;
1237 }
1238 #endif
1239
TestFilteredBreakIteratorBuilder()1240 void RBBIAPITest::TestFilteredBreakIteratorBuilder() {
1241 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
1242 UErrorCode status = U_ZERO_ERROR;
1243 LocalPointer<FilteredBreakIteratorBuilder> builder;
1244 LocalPointer<BreakIterator> baseBI;
1245 LocalPointer<BreakIterator> filteredBI;
1246 LocalPointer<BreakIterator> frenchBI;
1247
1248 const UnicodeString text("In the meantime Mr. Weston arrived with his small ship, which he had now recovered. Capt. Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. Weston, took this opportunity to call him to account for some abuses he had to lay to his charge."); // (William Bradford, public domain. http://catalog.hathitrust.org/Record/008651224 ) - edited.
1249 const UnicodeString ABBR_MR("Mr.");
1250 const UnicodeString ABBR_CAPT("Capt.");
1251
1252 {
1253 logln("Constructing empty builder\n");
1254 builder.adoptInstead(FilteredBreakIteratorBuilder::createInstance(status));
1255 TEST_ASSERT_SUCCESS(status);
1256
1257 logln("Constructing base BI\n");
1258 baseBI.adoptInstead(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
1259 TEST_ASSERT_SUCCESS(status);
1260
1261 logln("Building new BI\n");
1262 filteredBI.adoptInstead(builder->build(baseBI.orphan(), status));
1263 TEST_ASSERT_SUCCESS(status);
1264
1265 if (U_SUCCESS(status)) {
1266 logln("Testing:");
1267 filteredBI->setText(text);
1268 TEST_ASSERT(20 == filteredBI->next()); // Mr.
1269 TEST_ASSERT(84 == filteredBI->next()); // recovered.
1270 TEST_ASSERT(90 == filteredBI->next()); // Capt.
1271 TEST_ASSERT(181 == filteredBI->next()); // Mr.
1272 TEST_ASSERT(278 == filteredBI->next()); // charge.
1273 filteredBI->first();
1274 prtbrks(filteredBI.getAlias(), text, *this);
1275 }
1276 }
1277
1278 {
1279 logln("Constructing empty builder\n");
1280 builder.adoptInstead(FilteredBreakIteratorBuilder::createInstance(status));
1281 TEST_ASSERT_SUCCESS(status);
1282
1283 if (U_SUCCESS(status)) {
1284 logln("Adding Mr. as an exception\n");
1285 TEST_ASSERT(true == builder->suppressBreakAfter(ABBR_MR, status));
1286 TEST_ASSERT(false == builder->suppressBreakAfter(ABBR_MR, status)); // already have it
1287 TEST_ASSERT(true == builder->unsuppressBreakAfter(ABBR_MR, status));
1288 TEST_ASSERT(false == builder->unsuppressBreakAfter(ABBR_MR, status)); // already removed it
1289 TEST_ASSERT(true == builder->suppressBreakAfter(ABBR_MR, status));
1290 TEST_ASSERT_SUCCESS(status);
1291
1292 logln("Constructing base BI\n");
1293 baseBI.adoptInstead(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
1294 TEST_ASSERT_SUCCESS(status);
1295
1296 logln("Building new BI\n");
1297 filteredBI.adoptInstead(builder->build(baseBI.orphan(), status));
1298 TEST_ASSERT_SUCCESS(status);
1299
1300 logln("Testing:");
1301 filteredBI->setText(text);
1302 TEST_ASSERT(84 == filteredBI->next());
1303 TEST_ASSERT(90 == filteredBI->next());// Capt.
1304 TEST_ASSERT(278 == filteredBI->next());
1305 filteredBI->first();
1306 prtbrks(filteredBI.getAlias(), text, *this);
1307 }
1308 }
1309
1310
1311 {
1312 logln("Constructing empty builder\n");
1313 builder.adoptInstead(FilteredBreakIteratorBuilder::createInstance(status));
1314 TEST_ASSERT_SUCCESS(status);
1315
1316 if (U_SUCCESS(status)) {
1317 logln("Adding Mr. and Capt as an exception\n");
1318 TEST_ASSERT(true == builder->suppressBreakAfter(ABBR_MR, status));
1319 TEST_ASSERT(true == builder->suppressBreakAfter(ABBR_CAPT, status));
1320 TEST_ASSERT_SUCCESS(status);
1321
1322 logln("Constructing base BI\n");
1323 baseBI.adoptInstead(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
1324 TEST_ASSERT_SUCCESS(status);
1325
1326 logln("Building new BI\n");
1327 filteredBI.adoptInstead(builder->build(baseBI.orphan(), status));
1328 TEST_ASSERT_SUCCESS(status);
1329
1330 logln("Testing:");
1331 filteredBI->setText(text);
1332 TEST_ASSERT(84 == filteredBI->next());
1333 TEST_ASSERT(278 == filteredBI->next());
1334 filteredBI->first();
1335 prtbrks(filteredBI.getAlias(), text, *this);
1336 }
1337 }
1338
1339
1340 {
1341 logln("Constructing English builder\n");
1342 builder.adoptInstead(FilteredBreakIteratorBuilder::createInstance(Locale::getEnglish(), status));
1343 TEST_ASSERT_SUCCESS(status);
1344
1345 logln("Constructing base BI\n");
1346 baseBI.adoptInstead(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
1347 TEST_ASSERT_SUCCESS(status);
1348
1349 if (U_SUCCESS(status)) {
1350 logln("unsuppressing 'Capt'");
1351 TEST_ASSERT(true == builder->unsuppressBreakAfter(ABBR_CAPT, status));
1352
1353 logln("Building new BI\n");
1354 filteredBI.adoptInstead(builder->build(baseBI.orphan(), status));
1355 TEST_ASSERT_SUCCESS(status);
1356
1357 if(filteredBI.isValid()) {
1358 logln("Testing:");
1359 filteredBI->setText(text);
1360 TEST_ASSERT(84 == filteredBI->next());
1361 TEST_ASSERT(90 == filteredBI->next());
1362 TEST_ASSERT(278 == filteredBI->next());
1363 filteredBI->first();
1364 prtbrks(filteredBI.getAlias(), text, *this);
1365 }
1366 }
1367 }
1368
1369
1370 {
1371 logln("Constructing English builder\n");
1372 builder.adoptInstead(FilteredBreakIteratorBuilder::createInstance(Locale::getEnglish(), status));
1373 TEST_ASSERT_SUCCESS(status);
1374
1375 logln("Constructing base BI\n");
1376 baseBI.adoptInstead(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
1377 TEST_ASSERT_SUCCESS(status);
1378
1379 if (U_SUCCESS(status)) {
1380 logln("Building new BI\n");
1381 filteredBI.adoptInstead(builder->build(baseBI.orphan(), status));
1382 TEST_ASSERT_SUCCESS(status);
1383
1384 if(filteredBI.isValid()) {
1385 logln("Testing:");
1386 filteredBI->setText(text);
1387 TEST_ASSERT(84 == filteredBI->next());
1388 TEST_ASSERT(278 == filteredBI->next());
1389 filteredBI->first();
1390 prtbrks(filteredBI.getAlias(), text, *this);
1391 }
1392 }
1393 }
1394
1395 // reenable once french is in
1396 {
1397 logln("Constructing French builder");
1398 builder.adoptInstead(FilteredBreakIteratorBuilder::createInstance(Locale::getFrench(), status));
1399 TEST_ASSERT_SUCCESS(status);
1400
1401 logln("Constructing base BI\n");
1402 baseBI.adoptInstead(BreakIterator::createSentenceInstance(Locale::getFrench(), status));
1403 TEST_ASSERT_SUCCESS(status);
1404
1405 if (U_SUCCESS(status)) {
1406 logln("Building new BI\n");
1407 frenchBI.adoptInstead(builder->build(baseBI.orphan(), status));
1408 TEST_ASSERT_SUCCESS(status);
1409 }
1410
1411 if(frenchBI.isValid()) {
1412 logln("Testing:");
1413 UnicodeString frText("C'est MM. Duval.");
1414 frenchBI->setText(frText);
1415 TEST_ASSERT(16 == frenchBI->next());
1416 TEST_ASSERT(BreakIterator::DONE == frenchBI->next());
1417 frenchBI->first();
1418 prtbrks(frenchBI.getAlias(), frText, *this);
1419 logln("Testing against English:");
1420 filteredBI->setText(frText);
1421 TEST_ASSERT(10 == filteredBI->next()); // wrong for french, but filterBI is english.
1422 TEST_ASSERT(16 == filteredBI->next());
1423 TEST_ASSERT(BreakIterator::DONE == filteredBI->next());
1424 filteredBI->first();
1425 prtbrks(filteredBI.getAlias(), frText, *this);
1426
1427 // Verify ==
1428 assertTrue(WHERE, *frenchBI == *frenchBI);
1429 assertTrue(WHERE, *filteredBI != *frenchBI);
1430 assertTrue(WHERE, *frenchBI != *filteredBI);
1431 } else {
1432 dataerrln("French BI: not valid.");
1433 }
1434 }
1435
1436 #else
1437 logln("Skipped- not: !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION");
1438 #endif
1439 }
1440
1441 //---------------------------------------------
1442 // runIndexedTest
1443 //---------------------------------------------
1444
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)1445 void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
1446 {
1447 if (exec) logln((UnicodeString)"TestSuite RuleBasedBreakIterator API ");
1448 TESTCASE_AUTO_BEGIN;
1449 #if !UCONFIG_NO_FILE_IO
1450 TESTCASE_AUTO(TestCloneEquals);
1451 TESTCASE_AUTO(TestgetRules);
1452 TESTCASE_AUTO(TestHashCode);
1453 TESTCASE_AUTO(TestGetSetAdoptText);
1454 TESTCASE_AUTO(TestIteration);
1455 #endif
1456 TESTCASE_AUTO(TestBuilder);
1457 TESTCASE_AUTO(TestQuoteGrouping);
1458 TESTCASE_AUTO(TestRuleStatusVec);
1459 TESTCASE_AUTO(TestBug2190);
1460 TESTCASE_AUTO(TestBug22580);
1461 #if !UCONFIG_NO_FILE_IO
1462 TESTCASE_AUTO(TestRegistration);
1463 TESTCASE_AUTO(TestBoilerPlate);
1464 TESTCASE_AUTO(TestRuleStatus);
1465 TESTCASE_AUTO(TestRoundtripRules);
1466 TESTCASE_AUTO(TestGetBinaryRules);
1467 #endif
1468 TESTCASE_AUTO(TestRefreshInputText);
1469 #if !UCONFIG_NO_BREAK_ITERATION
1470 TESTCASE_AUTO(TestFilteredBreakIteratorBuilder);
1471 #endif
1472 TESTCASE_AUTO_END;
1473 }
1474
1475
1476 //---------------------------------------------
1477 //Internal subroutines
1478 //---------------------------------------------
1479
doBoundaryTest(BreakIterator & bi,UnicodeString & text,int32_t * boundaries)1480 void RBBIAPITest::doBoundaryTest(BreakIterator& bi, UnicodeString& text, int32_t *boundaries){
1481 logln((UnicodeString)"testIsBoundary():");
1482 int32_t p = 0;
1483 UBool isB;
1484 for (int32_t i = 0; i < text.length(); i++) {
1485 isB = bi.isBoundary(i);
1486 logln((UnicodeString)"bi.isBoundary(" + i + ") -> " + isB);
1487
1488 if (i == boundaries[p]) {
1489 if (!isB)
1490 errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected true, got false");
1491 p++;
1492 }
1493 else {
1494 if (isB)
1495 errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected false, got true");
1496 }
1497 }
1498 }
doTest(UnicodeString & testString,int32_t start,int32_t gotoffset,int32_t expectedOffset,const char * expectedString)1499 void RBBIAPITest::doTest(UnicodeString& testString, int32_t start, int32_t gotoffset, int32_t expectedOffset, const char* expectedString){
1500 UnicodeString selected;
1501 UnicodeString expected=CharsToUnicodeString(expectedString);
1502
1503 if(gotoffset != expectedOffset)
1504 errln((UnicodeString)"ERROR:****returned #" + gotoffset + (UnicodeString)" instead of #" + expectedOffset);
1505 if(start <= gotoffset){
1506 testString.extractBetween(start, gotoffset, selected);
1507 }
1508 else{
1509 testString.extractBetween(gotoffset, start, selected);
1510 }
1511 if(selected.compare(expected) != 0)
1512 errln(prettify((UnicodeString)"ERROR:****selected \"" + selected + "\" instead of \"" + expected + "\""));
1513 else
1514 logln(prettify("****selected \"" + selected + "\""));
1515 }
1516
1517 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1518