xref: /aosp_15_r20/external/icu/icu4c/source/test/intltest/rbbiapts.cpp (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * Copyright (c) 1999-2016, International Business Machines
5  * Corporation and others. All Rights Reserved.
6  ********************************************************************
7  *   Date        Name        Description
8  *   12/14/99    Madhu        Creation.
9  *   01/12/2000  Madhu        updated for changed API
10  ********************************************************************/
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_BREAK_ITERATION
15 
16 #include "unicode/uchar.h"
17 #include "intltest.h"
18 #include "unicode/rbbi.h"
19 #include "unicode/schriter.h"
20 #include "rbbiapts.h"
21 #include "rbbidata.h"
22 #include "cstring.h"
23 #include "ubrkimpl.h"
24 #include "unicode/locid.h"
25 #include "unicode/ustring.h"
26 #include "unicode/utext.h"
27 #include "cmemory.h"
28 #if !UCONFIG_NO_BREAK_ITERATION
29 #include "unicode/filteredbrk.h"
30 #include <stdio.h> // for snprintf
31 #endif
32 /**
33  * API Test the RuleBasedBreakIterator class
34  */
35 
36 
// Calls dataerrln() with the call site's file name, line number and the ICU
// error name when `status` is a failure code. Does nothing on success and
// never returns from the enclosing function.
#define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
    if (!U_SUCCESS(status)) { \
        dataerrln("Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status)); \
    } \
} UPRV_BLOCK_MACRO_END
42 
// Calls errln() with the call site's file name, line number and the
// stringized expression when `expr` evaluates to false. Execution continues
// after the report; the macro does not return from the enclosing function.
#define TEST_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(expr)) { \
        errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr); \
    } \
} UPRV_BLOCK_MACRO_END
48 
TestCloneEquals()49 void RBBIAPITest::TestCloneEquals()
50 {
51 
52     UErrorCode status=U_ZERO_ERROR;
53     RuleBasedBreakIterator* bi1     = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status));
54     RuleBasedBreakIterator* biequal = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status));
55     RuleBasedBreakIterator* bi3     = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status));
56     RuleBasedBreakIterator* bi2     = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status));
57     if(U_FAILURE(status)){
58         errcheckln(status, "Fail : in construction - %s", u_errorName(status));
59         return;
60     }
61 
62 
63     UnicodeString testString="Testing word break iterators's clone() and equals()";
64     bi1->setText(testString);
65     bi2->setText(testString);
66     biequal->setText(testString);
67 
68     bi3->setText("hello");
69 
70     logln((UnicodeString)"Testing equals()");
71 
72     logln((UnicodeString)"Testing == and !=");
73     bool b = (*bi1 != *biequal);
74     b |= *bi1 == *bi2;
75     b |= *bi1 == *bi3;
76     if (b) {
77         errln("%s:%d ERROR:1 RBBI's == and != operator failed.", __FILE__, __LINE__);
78     }
79 
80     if(*bi2 == *biequal || *bi2 == *bi1  || *biequal == *bi3)
81         errln("%s:%d ERROR:2 RBBI's == and != operator  failed.", __FILE__, __LINE__);
82 
83 
84     // Quick test of RulesBasedBreakIterator assignment -
85     // Check that
86     //    two different iterators are !=
87     //    they are == after assignment
88     //    source and dest iterator produce the same next() after assignment.
89     //    deleting one doesn't disable the other.
90     logln("Testing assignment");
91     RuleBasedBreakIterator *bix = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(Locale::getDefault(), status));
92     if(U_FAILURE(status)){
93         errcheckln(status, "Fail : in construction - %s", u_errorName(status));
94         return;
95     }
96 
97     RuleBasedBreakIterator biDefault, biDefault2;
98     if(U_FAILURE(status)){
99         errln("%s:%d FAIL : in construction of default iterator", __FILE__, __LINE__);
100         return;
101     }
102     if (biDefault == *bix) {
103         errln("%s:%d ERROR: iterators should not compare ==", __FILE__, __LINE__);
104         return;
105     }
106     if (biDefault != biDefault2) {
107         errln("%s:%d ERROR: iterators should compare ==", __FILE__, __LINE__);
108         return;
109     }
110 
111 
112     UnicodeString   HelloString("Hello Kitty");
113     bix->setText(HelloString);
114     if (*bix == *bi2) {
115         errln("%s:%d ERROR: strings should not be equal before assignment.", __FILE__, __LINE__);
116     }
117     *bix = *bi2;
118     if (*bix != *bi2) {
119         errln("%s:%d ERROR: strings should be equal before assignment.", __FILE__, __LINE__);
120     }
121 
122     int bixnext = bix->next();
123     int bi2next = bi2->next();
124     if (! (bixnext == bi2next && bixnext == 7)) {
125         errln("%s:%d ERROR: iterators behaved differently after assignment.", __FILE__, __LINE__);
126     }
127     delete bix;
128     if (bi2->next() != 8) {
129         errln("%s:%d ERROR: iterator.next() failed after deleting copy.", __FILE__, __LINE__);
130     }
131 
132 
133 
134     logln((UnicodeString)"Testing clone()");
135     RuleBasedBreakIterator* bi1clone = bi1->clone();
136     RuleBasedBreakIterator* bi2clone = bi2->clone();
137 
138     if(*bi1clone != *bi1 || *bi1clone  != *biequal  ||
139       *bi1clone == *bi3 || *bi1clone == *bi2)
140         errln("%s:%d ERROR:1 RBBI's clone() method failed", __FILE__, __LINE__);
141 
142     if(*bi2clone == *bi1 || *bi2clone == *biequal ||
143        *bi2clone == *bi3 || *bi2clone != *bi2)
144         errln("%s:%d ERROR:2 RBBI's clone() method failed", __FILE__, __LINE__);
145 
146     if(bi1->getText() != bi1clone->getText()   ||
147        bi2clone->getText() != bi2->getText()   ||
148        *bi2clone == *bi1clone )
149         errln("%s:%d ERROR: RBBI's clone() method failed", __FILE__, __LINE__);
150 
151     delete bi1clone;
152     delete bi2clone;
153     delete bi1;
154     delete bi3;
155     delete bi2;
156     delete biequal;
157 }
158 
TestBoilerPlate()159 void RBBIAPITest::TestBoilerPlate()
160 {
161     UErrorCode status = U_ZERO_ERROR;
162     BreakIterator* a = BreakIterator::createWordInstance(Locale("hi"), status);
163     BreakIterator* b = BreakIterator::createWordInstance(Locale("hi_IN"),status);
164     if (U_FAILURE(status)) {
165         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
166         return;
167     }
168     if(*a!=*b){
169         errln("Failed: boilerplate method operator!= does not return correct results");
170     }
171     // Japanese word break iterators are identical to root with
172     // a dictionary-based break iterator
173     BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),status);
174     BreakIterator* d = BreakIterator::createCharacterInstance(Locale("root"),status);
175     if(c && d){
176         if(*c!=*d){
177             errln("Failed: boilerplate method operator== does not return correct results");
178         }
179     }else{
180         errln("creation of break iterator failed");
181     }
182     delete a;
183     delete b;
184     delete c;
185     delete d;
186 }
187 
TestgetRules()188 void RBBIAPITest::TestgetRules()
189 {
190     UErrorCode status=U_ZERO_ERROR;
191 
192     LocalPointer<RuleBasedBreakIterator> bi1(
193             dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status)), status);
194     LocalPointer<RuleBasedBreakIterator> bi2(
195             dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status)), status);
196     if(U_FAILURE(status)){
197         errcheckln(status, "%s:%d, FAIL: in construction - %s", __FILE__, __LINE__, u_errorName(status));
198         return;
199     }
200 
201     logln((UnicodeString)"Testing getRules()");
202 
203     UnicodeString text(u"Hello there");
204     bi1->setText(text);
205 
206     LocalPointer <RuleBasedBreakIterator> bi3(bi1->clone());
207 
208     UnicodeString temp=bi1->getRules();
209     UnicodeString temp2=bi2->getRules();
210     UnicodeString temp3=bi3->getRules();
211     if( temp2.compare(temp3) ==0 || temp.compare(temp2) == 0 || temp.compare(temp3) != 0)
212         errln("%s:%d ERROR: error in getRules() method", __FILE__, __LINE__);
213 
214     RuleBasedBreakIterator bi4;   // Default RuleBasedBreakIterator constructor gives empty shell with empty rules.
215     if (!bi4.getRules().isEmpty()) {
216         errln("%s:%d Empty string expected.", __FILE__, __LINE__);
217     }
218 }
219 
TestHashCode()220 void RBBIAPITest::TestHashCode()
221 {
222     UErrorCode status=U_ZERO_ERROR;
223     RuleBasedBreakIterator* bi1     = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status));
224     RuleBasedBreakIterator* bi3     = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status));
225     RuleBasedBreakIterator* bi2     = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status));
226     if(U_FAILURE(status)){
227         errcheckln(status, "Fail : in construction - %s", u_errorName(status));
228         delete bi1;
229         delete bi2;
230         delete bi3;
231         return;
232     }
233 
234 
235     logln((UnicodeString)"Testing hashCode()");
236 
237     bi1->setText((UnicodeString)"Hash code");
238     bi2->setText((UnicodeString)"Hash code");
239     bi3->setText((UnicodeString)"Hash code");
240 
241     RuleBasedBreakIterator* bi1clone= bi1->clone();
242     RuleBasedBreakIterator* bi2clone= bi2->clone();
243 
244     if(bi1->hashCode() != bi1clone->hashCode() ||  bi1->hashCode() != bi3->hashCode() ||
245         bi1clone->hashCode() != bi3->hashCode() || bi2->hashCode() != bi2clone->hashCode())
246         errln((UnicodeString)"ERROR: identical objects have different hashcodes");
247 
248     if(bi1->hashCode() == bi2->hashCode() ||  bi2->hashCode() == bi3->hashCode() ||
249         bi1clone->hashCode() == bi2clone->hashCode() || bi1clone->hashCode() == bi2->hashCode())
250         errln((UnicodeString)"ERROR: different objects have same hashcodes");
251 
252     delete bi1clone;
253     delete bi2clone;
254     delete bi1;
255     delete bi2;
256     delete bi3;
257 
258 }
TestGetSetAdoptText()259 void RBBIAPITest::TestGetSetAdoptText()
260 {
261     logln((UnicodeString)"Testing getText setText ");
262     IcuTestErrorCode status(*this, "TestGetSetAdoptText");
263     UnicodeString str1="first string.";
264     UnicodeString str2="Second string.";
265     LocalPointer<RuleBasedBreakIterator> charIter1(dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status)));
266     LocalPointer<RuleBasedBreakIterator> wordIter1(dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status)));
267     if(status.isFailure()){
268         errcheckln(status, "Fail : in construction - %s", status.errorName());
269             return;
270     }
271 
272 
273     CharacterIterator* text1= new StringCharacterIterator(str1);
274     CharacterIterator* text1Clone = text1->clone();
275     CharacterIterator* text2= new StringCharacterIterator(str2);
276     CharacterIterator* text3= new StringCharacterIterator(str2, 3, 10, 3); //  "ond str"
277 
278     wordIter1->setText(str1);
279     CharacterIterator *tci = &wordIter1->getText();
280     UnicodeString      tstr;
281     tci->getText(tstr);
282     TEST_ASSERT(tstr == str1);
283     if(wordIter1->current() != 0)
284         errln((UnicodeString)"ERROR:1 setText did not set the iteration position to the beginning of the text, it is" + wordIter1->current() + (UnicodeString)"\n");
285 
286     wordIter1->next(2);
287 
288     wordIter1->setText(str2);
289     if(wordIter1->current() != 0)
290         errln((UnicodeString)"ERROR:2 setText did not reset the iteration position to the beginning of the text, it is" + wordIter1->current() + (UnicodeString)"\n");
291 
292 
293     charIter1->adoptText(text1Clone);
294     TEST_ASSERT(wordIter1->getText() != charIter1->getText());
295     tci = &wordIter1->getText();
296     tci->getText(tstr);
297     TEST_ASSERT(tstr == str2);
298     tci = &charIter1->getText();
299     tci->getText(tstr);
300     TEST_ASSERT(tstr == str1);
301 
302 
303     LocalPointer<RuleBasedBreakIterator> rb(wordIter1->clone());
304     rb->adoptText(text1);
305     if(rb->getText() != *text1)
306         errln((UnicodeString)"ERROR:1 error in adoptText ");
307     rb->adoptText(text2);
308     if(rb->getText() != *text2)
309         errln((UnicodeString)"ERROR:2 error in adoptText ");
310 
311     // Adopt where iterator range is less than the entire original source string.
312     //   (With the change of the break engine to working with UText internally,
313     //    CharacterIterators starting at positions other than zero are not supported)
314     rb->adoptText(text3);
315     TEST_ASSERT(rb->preceding(2) == 0);
316     TEST_ASSERT(rb->following(11) == BreakIterator::DONE);
317     //if(rb->preceding(2) != 3) {
318     //    errln((UnicodeString)"ERROR:3 error in adoptText ");
319     //}
320     //if(rb->following(11) != BreakIterator::DONE) {
321     //    errln((UnicodeString)"ERROR:4 error in adoptText ");
322     //}
323 
324     // UText API
325     //
326     //   Quick test to see if UText is working at all.
327     //
328     const char *s1 = "\x68\x65\x6C\x6C\x6F\x20\x77\x6F\x72\x6C\x64"; /* "hello world" in UTF-8 */
329     const char *s2 = "\x73\x65\x65\x20\x79\x61"; /* "see ya" in UTF-8 */
330     //                012345678901
331 
332     status.reset();
333     LocalUTextPointer ut(utext_openUTF8(nullptr, s1, -1, status));
334     wordIter1->setText(ut.getAlias(), status);
335     TEST_ASSERT_SUCCESS(status);
336 
337     int32_t pos;
338     pos = wordIter1->first();
339     TEST_ASSERT(pos==0);
340     pos = wordIter1->next();
341     TEST_ASSERT(pos==5);
342     pos = wordIter1->next();
343     TEST_ASSERT(pos==6);
344     pos = wordIter1->next();
345     TEST_ASSERT(pos==11);
346     pos = wordIter1->next();
347     TEST_ASSERT(pos==UBRK_DONE);
348 
349     status.reset();
350     LocalUTextPointer ut2(utext_openUTF8(nullptr, s2, -1, status));
351     TEST_ASSERT_SUCCESS(status);
352     wordIter1->setText(ut2.getAlias(), status);
353     TEST_ASSERT_SUCCESS(status);
354 
355     pos = wordIter1->first();
356     TEST_ASSERT(pos==0);
357     pos = wordIter1->next();
358     TEST_ASSERT(pos==3);
359     pos = wordIter1->next();
360     TEST_ASSERT(pos==4);
361 
362     pos = wordIter1->last();
363     TEST_ASSERT(pos==6);
364     pos = wordIter1->previous();
365     TEST_ASSERT(pos==4);
366     pos = wordIter1->previous();
367     TEST_ASSERT(pos==3);
368     pos = wordIter1->previous();
369     TEST_ASSERT(pos==0);
370     pos = wordIter1->previous();
371     TEST_ASSERT(pos==UBRK_DONE);
372 
373     status.reset();
374     UnicodeString sEmpty;
375     LocalUTextPointer gut2(utext_openUnicodeString(nullptr, &sEmpty, status));
376     wordIter1->getUText(gut2.getAlias(), status);
377     TEST_ASSERT_SUCCESS(status);
378     status.reset();
379 }
380 
381 
TestIteration()382 void RBBIAPITest::TestIteration()
383 {
384     // This test just verifies that the API is present.
385     // Testing for correct operation of the break rules happens elsewhere.
386 
387     UErrorCode status=U_ZERO_ERROR;
388     RuleBasedBreakIterator* bi  = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status));
389     if (U_FAILURE(status) || bi == nullptr)  {
390         errcheckln(status, "Failure creating character break iterator.  Status = %s", u_errorName(status));
391     }
392     delete bi;
393 
394     status=U_ZERO_ERROR;
395     bi  = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status));
396     if (U_FAILURE(status) || bi == nullptr)  {
397         errcheckln(status, "Failure creating Word break iterator.  Status = %s", u_errorName(status));
398     }
399     delete bi;
400 
401     status=U_ZERO_ERROR;
402     bi  = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createLineInstance(Locale::getDefault(), status));
403     if (U_FAILURE(status) || bi == nullptr)  {
404         errcheckln(status, "Failure creating Line break iterator.  Status = %s", u_errorName(status));
405     }
406     delete bi;
407 
408     status=U_ZERO_ERROR;
409     bi  = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createSentenceInstance(Locale::getDefault(), status));
410     if (U_FAILURE(status) || bi == nullptr)  {
411         errcheckln(status, "Failure creating Sentence break iterator.  Status = %s", u_errorName(status));
412     }
413     delete bi;
414 
415     status=U_ZERO_ERROR;
416     bi  = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createTitleInstance(Locale::getDefault(), status));
417     if (U_FAILURE(status) || bi == nullptr)  {
418         errcheckln(status, "Failure creating Title break iterator.  Status = %s", u_errorName(status));
419     }
420     delete bi;
421 
422     status=U_ZERO_ERROR;
423     bi  = dynamic_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status));
424     if (U_FAILURE(status) || bi == nullptr)  {
425         errcheckln(status, "Failure creating character break iterator.  Status = %s", u_errorName(status));
426         return;   // Skip the rest of these tests.
427     }
428 
429 
430     UnicodeString testString="0123456789";
431     bi->setText(testString);
432 
433     int32_t i;
434     i = bi->first();
435     if (i != 0) {
436         errln("%s:%d Incorrect value from bi->first().  Expected 0, got %d.", __FILE__, __LINE__, i);
437     }
438 
439     i = bi->last();
440     if (i != 10) {
441         errln("%s:%d Incorrect value from bi->last().  Expected 10, got %d", __FILE__, __LINE__, i);
442     }
443 
444     //
445     // Previous
446     //
447     bi->last();
448     i = bi->previous();
449     if (i != 9) {
450         errln("%s:%d Incorrect value from bi->last().  Expected 9, got %d", __FILE__, __LINE__, i);
451     }
452 
453 
454     bi->first();
455     i = bi->previous();
456     if (i != BreakIterator::DONE) {
457         errln("%s:%d Incorrect value from bi->previous().  Expected DONE, got %d", __FILE__, __LINE__, i);
458     }
459 
460     //
461     // next()
462     //
463     bi->first();
464     i = bi->next();
465     if (i != 1) {
466         errln("%s:%d Incorrect value from bi->next().  Expected 1, got %d", __FILE__, __LINE__, i);
467     }
468 
469     bi->last();
470     i = bi->next();
471     if (i != BreakIterator::DONE) {
472         errln("%s:%d Incorrect value from bi->next().  Expected DONE, got %d", __FILE__, __LINE__, i);
473     }
474 
475 
476     //
477     //  current()
478     //
479     bi->first();
480     i = bi->current();
481     if (i != 0) {
482         errln("%s:%d Incorrect value from bi->current().  Expected 0, got %d", __FILE__, __LINE__, i);
483     }
484 
485     bi->next();
486     i = bi->current();
487     if (i != 1) {
488         errln("%s:%d Incorrect value from bi->current().  Expected 1, got %d", __FILE__, __LINE__, i);
489     }
490 
491     bi->last();
492     bi->next();
493     i = bi->current();
494     if (i != 10) {
495         errln("%s:%d Incorrect value from bi->current().  Expected 10, got %d", __FILE__, __LINE__, i);
496     }
497 
498     bi->first();
499     bi->previous();
500     i = bi->current();
501     if (i != 0) {
502         errln("%s:%d Incorrect value from bi->current().  Expected 0, got %d", __FILE__, __LINE__, i);
503     }
504 
505 
506     //
507     // Following()
508     //
509     i = bi->following(4);
510     if (i != 5) {
511         errln("%s:%d Incorrect value from bi->following().  Expected 5, got %d", __FILE__, __LINE__, i);
512     }
513 
514     i = bi->following(9);
515     if (i != 10) {
516         errln("%s:%d Incorrect value from bi->following().  Expected 10, got %d", __FILE__, __LINE__, i);
517     }
518 
519     i = bi->following(10);
520     if (i != BreakIterator::DONE) {
521         errln("%s:%d Incorrect value from bi->following().  Expected DONE, got %d", __FILE__, __LINE__, i);
522     }
523 
524 
525     //
526     // Preceding
527     //
528     i = bi->preceding(4);
529     if (i != 3) {
530         errln("%s:%d Incorrect value from bi->preceding().  Expected 3, got %d", __FILE__, __LINE__, i);
531     }
532 
533     i = bi->preceding(10);
534     if (i != 9) {
535         errln("%s:%d Incorrect value from bi->preceding().  Expected 9, got %d", __FILE__, __LINE__, i);
536     }
537 
538     i = bi->preceding(1);
539     if (i != 0) {
540         errln("%s:%d Incorrect value from bi->preceding().  Expected 0, got %d", __FILE__, __LINE__, i);
541     }
542 
543     i = bi->preceding(0);
544     if (i != BreakIterator::DONE) {
545         errln("%s:%d Incorrect value from bi->preceding().  Expected DONE, got %d", __FILE__, __LINE__, i);
546     }
547 
548 
549     //
550     // isBoundary()
551     //
552     bi->first();
553     if (bi->isBoundary(3) != true) {
554         errln("%s:%d Incorrect value from bi->isBoundary().  Expected true, got false", __FILE__, __LINE__, i);
555     }
556     i = bi->current();
557     if (i != 3) {
558         errln("%s:%d Incorrect value from bi->current().  Expected 3, got %d", __FILE__, __LINE__, i);
559     }
560 
561 
562     if (bi->isBoundary(11) != false) {
563         errln("%s:%d Incorrect value from bi->isBoundary().  Expected false, got true", __FILE__, __LINE__, i);
564     }
565     i = bi->current();
566     if (i != 10) {
567         errln("%s:%d Incorrect value from bi->current().  Expected 10, got %d", __FILE__, __LINE__, i);
568     }
569 
570     //
571     // next(n)
572     //
573     bi->first();
574     i = bi->next(4);
575     if (i != 4) {
576         errln("%s:%d Incorrect value from bi->next().  Expected 4, got %d", __FILE__, __LINE__, i);
577     }
578 
579     i = bi->next(6);
580     if (i != 10) {
581         errln("%s:%d Incorrect value from bi->next().  Expected 10, got %d", __FILE__, __LINE__, i);
582     }
583 
584     bi->first();
585     i = bi->next(11);
586     if (i != BreakIterator::DONE) {
587         errln("%s:%d Incorrect value from bi->next().  Expected BreakIterator::DONE, got %d", __FILE__, __LINE__, i);
588     }
589 
590     delete bi;
591 
592 }
593 
594 
595 
596 
597 
598 
TestBuilder()599 void RBBIAPITest::TestBuilder() {
600      UnicodeString rulesString1 = "$Letters = [:L:];\n"
601                                   "$Numbers = [:N:];\n"
602                                   "$Letters+;\n"
603                                   "$Numbers+;\n"
604                                   "[^$Letters $Numbers];\n"
605                                   "!.*;\n";
606      UnicodeString testString1  = "abc123..abc";
607                                 // 01234567890
608      int32_t bounds1[] = {0, 3, 6, 7, 8, 11};
609      UErrorCode status=U_ZERO_ERROR;
610      UParseError    parseError;
611 
612      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
613      if(U_FAILURE(status)) {
614          dataerrln("Fail : in construction - %s", u_errorName(status));
615      } else {
616          bi->setText(testString1);
617          doBoundaryTest(*bi, testString1, bounds1);
618      }
619      delete bi;
620 }
621 
622 
623 //
624 //  TestQuoteGrouping
625 //       Single quotes within rules imply a grouping, so that a modifier
626 //       following the quoted text (* or +) applies to all of the quoted chars.
627 //
TestQuoteGrouping()628 void RBBIAPITest::TestQuoteGrouping() {
629      UnicodeString rulesString1 = "#Here comes the rule...\n"
630                                   "'$@!'*;\n"   //  (\$\@\!)*
631                                   ".;\n";
632 
633      UnicodeString testString1  = "$@!$@!X$@!!X";
634                                 // 0123456789012
635      int32_t bounds1[] = {0, 6, 7, 10, 11, 12};
636      UErrorCode status=U_ZERO_ERROR;
637      UParseError    parseError;
638 
639      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
640      if(U_FAILURE(status)) {
641          dataerrln("Fail : in construction - %s", u_errorName(status));
642      } else {
643          bi->setText(testString1);
644          doBoundaryTest(*bi, testString1, bounds1);
645      }
646      delete bi;
647 }
648 
649 //
650 //  TestRuleStatus
651 //      Test word break rule status constants.
652 //
TestRuleStatus()653 void RBBIAPITest::TestRuleStatus() {
654      char16_t str[30];
655      //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing
656      // changed UBRK_WORD_KANA to UBRK_WORD_IDEO
657      u_unescape("plain word 123.45 \\u30a1\\u30a2 ",
658               // 012345678901234567  8      9    0
659               //                     Katakana
660                 str, 30);
661      UnicodeString testString1(str);
662      int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21};
663      int32_t tag_lo[]  = {UBRK_WORD_NONE,     UBRK_WORD_LETTER, UBRK_WORD_NONE,    UBRK_WORD_LETTER,
664                           UBRK_WORD_NONE,     UBRK_WORD_NUMBER, UBRK_WORD_NONE,
665                           UBRK_WORD_IDEO,     UBRK_WORD_NONE};
666 
667      int32_t tag_hi[]  = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
668                           UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
669                           UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT};
670 
671      UErrorCode status=U_ZERO_ERROR;
672 
673      BreakIterator *bi = BreakIterator::createWordInstance(Locale::getEnglish(), status);
674      if(U_FAILURE(status)) {
675          errcheckln(status, "%s:%d Fail in construction - %s", __FILE__, __LINE__, u_errorName(status));
676      } else {
677          bi->setText(testString1);
678          // First test that the breaks are in the right spots.
679          doBoundaryTest(*bi, testString1, bounds1);
680 
681          // Then go back and check tag values
682          int32_t i = 0;
683          int32_t pos, tag;
684          for (pos = bi->first(); pos != BreakIterator::DONE; pos = bi->next(), i++) {
685              if (pos != bounds1[i]) {
686                  errln("%s:%d FAIL: unexpected word break at position %d", __FILE__, __LINE__, pos);
687                  break;
688              }
689              tag = bi->getRuleStatus();
690              if (tag < tag_lo[i] || tag >= tag_hi[i]) {
691                  errln("%s:%d FAIL: incorrect tag value %d at position %d", __FILE__, __LINE__, tag, pos);
692                  break;
693              }
694 
695              // Check that we get the same tag values from getRuleStatusVec()
696              int32_t vec[10];
697              int t = bi->getRuleStatusVec(vec, 10, status);
698              TEST_ASSERT_SUCCESS(status);
699              TEST_ASSERT(t==1);
700              TEST_ASSERT(vec[0] == tag);
701          }
702      }
703      delete bi;
704 
705      // Now test line break status.  This test mostly is to confirm that the status constants
706      //                              are correctly declared in the header.
707      testString1 =   "test line. \n";
708      // break type    s    s     h
709 
710      bi = BreakIterator::createLineInstance(Locale::getEnglish(), status);
711      if(U_FAILURE(status)) {
712          errcheckln(status, "%s:%d failed to create line break iterator. - %s", __FILE__, __LINE__, u_errorName(status));
713      } else {
714          int32_t i = 0;
715          int32_t pos, tag;
716          UBool   success;
717 
718          bi->setText(testString1);
719          pos = bi->current();
720          tag = bi->getRuleStatus();
721          for (i=0; i<3; i++) {
722              switch (i) {
723              case 0:
724                  success = pos==0  && tag==UBRK_LINE_SOFT; break;
725              case 1:
726                  success = pos==5  && tag==UBRK_LINE_SOFT; break;
727              case 2:
728                  success = pos==12 && tag==UBRK_LINE_HARD; break;
729              default:
730                  success = false; break;
731              }
732              if (success == false) {
733                  errln("%s:%d: incorrect line break status or position.  i=%d, pos=%d, tag=%d",
734                      __FILE__, __LINE__, i, pos, tag);
735                  break;
736              }
737              pos = bi->next();
738              tag = bi->getRuleStatus();
739          }
740          if (UBRK_LINE_SOFT >= UBRK_LINE_SOFT_LIMIT ||
741              UBRK_LINE_HARD >= UBRK_LINE_HARD_LIMIT ||
742              (UBRK_LINE_HARD > UBRK_LINE_SOFT && UBRK_LINE_HARD < UBRK_LINE_SOFT_LIMIT)) {
743              errln("%s:%d UBRK_LINE_* constants from header are inconsistent.", __FILE__, __LINE__);
744          }
745      }
746      delete bi;
747 
748 }
749 
750 
751 //
752 //  TestRuleStatusVec
753 //      Test the vector form of  break rule status.
754 //
TestRuleStatusVec()755 void RBBIAPITest::TestRuleStatusVec() {
756     UnicodeString rulesString(   "[A-N]{100}; \n"
757                                  "[a-w]{200}; \n"
758                                  "[\\p{L}]{300}; \n"
759                                  "[\\p{N}]{400}; \n"
760                                  "[0-5]{500}; \n"
761                                   "!.*;\n", -1, US_INV);
762      UnicodeString testString1  = "Aapz5?";
763      int32_t  statusVals[10];
764      int32_t  numStatuses;
765      int32_t  pos;
766 
767      UErrorCode status=U_ZERO_ERROR;
768      UParseError    parseError;
769 
770      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString, parseError, status);
771      if (U_FAILURE(status)) {
772          dataerrln("Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));
773      } else {
774          bi->setText(testString1);
775 
776          // A
777          pos = bi->next();
778          TEST_ASSERT(pos==1);
779          numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
780          TEST_ASSERT_SUCCESS(status);
781          TEST_ASSERT(numStatuses == 2);
782          TEST_ASSERT(statusVals[0] == 100);
783          TEST_ASSERT(statusVals[1] == 300);
784 
785          // a
786          pos = bi->next();
787          TEST_ASSERT(pos==2);
788          numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
789          TEST_ASSERT_SUCCESS(status);
790          TEST_ASSERT(numStatuses == 2);
791          TEST_ASSERT(statusVals[0] == 200);
792          TEST_ASSERT(statusVals[1] == 300);
793 
794          // p
795          pos = bi->next();
796          TEST_ASSERT(pos==3);
797          numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
798          TEST_ASSERT_SUCCESS(status);
799          TEST_ASSERT(numStatuses == 2);
800          TEST_ASSERT(statusVals[0] == 200);
801          TEST_ASSERT(statusVals[1] == 300);
802 
803          // z
804          pos = bi->next();
805          TEST_ASSERT(pos==4);
806          numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
807          TEST_ASSERT_SUCCESS(status);
808          TEST_ASSERT(numStatuses == 1);
809          TEST_ASSERT(statusVals[0] == 300);
810 
811          // 5
812          pos = bi->next();
813          TEST_ASSERT(pos==5);
814          numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
815          TEST_ASSERT_SUCCESS(status);
816          TEST_ASSERT(numStatuses == 2);
817          TEST_ASSERT(statusVals[0] == 400);
818          TEST_ASSERT(statusVals[1] == 500);
819 
820          // ?
821          pos = bi->next();
822          TEST_ASSERT(pos==6);
823          numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
824          TEST_ASSERT_SUCCESS(status);
825          TEST_ASSERT(numStatuses == 1);
826          TEST_ASSERT(statusVals[0] == 0);
827 
828          //
829          //  Check buffer overflow error handling.   Char == A
830          //
831          bi->first();
832          pos = bi->next();
833          TEST_ASSERT(pos==1);
834          memset(statusVals, -1, sizeof(statusVals));
835          numStatuses = bi->getRuleStatusVec(statusVals, 0, status);
836          TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
837          TEST_ASSERT(numStatuses == 2);
838          TEST_ASSERT(statusVals[0] == -1);
839 
840          status = U_ZERO_ERROR;
841          memset(statusVals, -1, sizeof(statusVals));
842          numStatuses = bi->getRuleStatusVec(statusVals, 1, status);
843          TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
844          TEST_ASSERT(numStatuses == 2);
845          TEST_ASSERT(statusVals[0] == 100);
846          TEST_ASSERT(statusVals[1] == -1);
847 
848          status = U_ZERO_ERROR;
849          memset(statusVals, -1, sizeof(statusVals));
850          numStatuses = bi->getRuleStatusVec(statusVals, 2, status);
851          TEST_ASSERT_SUCCESS(status);
852          TEST_ASSERT(numStatuses == 2);
853          TEST_ASSERT(statusVals[0] == 100);
854          TEST_ASSERT(statusVals[1] == 300);
855          TEST_ASSERT(statusVals[2] == -1);
856      }
857      delete bi;
858 
859 }
860 
861 //
862 //   Bug 2190 Regression test.   Builder crash on rule consisting of only a
863 //                               $variable reference
TestBug2190()864 void RBBIAPITest::TestBug2190() {
865      UnicodeString rulesString1 = "$aaa = abcd;\n"
866                                   "$bbb = $aaa;\n"
867                                   "$bbb;\n";
868      UnicodeString testString1  = "abcdabcd";
869                                 // 01234567890
870      int32_t bounds1[] = {0, 4, 8};
871      UErrorCode status=U_ZERO_ERROR;
872      UParseError    parseError;
873 
874      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
875      if(U_FAILURE(status)) {
876          dataerrln("Fail : in construction - %s", u_errorName(status));
877      } else {
878          bi->setText(testString1);
879          doBoundaryTest(*bi, testString1, bounds1);
880      }
881      delete bi;
882 }
883 
TestBug22580()884 void RBBIAPITest::TestBug22580() {
885      UParseError    parseError;
886      // Test single ' will not cause infinity loop
887      {
888          UnicodeString rulesString = u"'";
889          UErrorCode status=U_ZERO_ERROR;
890          RuleBasedBreakIterator(rulesString, parseError, status);
891      }
892      if (quick) {
893          return;
894      }
895      // Test any 1 or 2 ASCII chars as rule will not cause infinity loop.
896      // only in exhaust mode
897      for (char16_t u1 = u' '; u1 <= u'~'; u1++) {
898          {
899              UnicodeString rule = u1;
900              UErrorCode status=U_ZERO_ERROR;
901              RuleBasedBreakIterator bi (rule, parseError, status);
902          }
903          for (char16_t u2 = u' '; u2 <= u'~'; u2++) {
904              {
905                  UnicodeString rule;
906                  rule.append(u1).append(u2);
907                  UErrorCode status=U_ZERO_ERROR;
908                  RuleBasedBreakIterator bi (rule, parseError, status);
909              }
910          }
911      }
912 }
913 
TestRegistration()914 void RBBIAPITest::TestRegistration() {
915 #if !UCONFIG_NO_SERVICE
916     UErrorCode status = U_ZERO_ERROR;
917     BreakIterator* ja_word = BreakIterator::createWordInstance("ja_JP", status);
918     // ok to not delete these if we exit because of error?
919     BreakIterator* ja_char = BreakIterator::createCharacterInstance("ja_JP", status);
920     BreakIterator* root_word = BreakIterator::createWordInstance("", status);
921     BreakIterator* root_char = BreakIterator::createCharacterInstance("", status);
922 
923     if (status == U_MISSING_RESOURCE_ERROR || status == U_FILE_ACCESS_ERROR) {
924         dataerrln("Error creating instances of break interactors - %s", u_errorName(status));
925 
926         delete ja_word;
927         delete ja_char;
928         delete root_word;
929         delete root_char;
930 
931         return;
932     }
933 
934     URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status);
935     {
936 #if 0 // With a dictionary based word breaking, ja_word is identical to root.
937         if (ja_word && *ja_word == *root_word) {
938             errln("japan not different from root");
939         }
940 #endif
941     }
942 
943     {
944         BreakIterator* result = BreakIterator::createWordInstance("xx_XX", status);
945         UBool fail = true;
946         if(result){
947             fail = *result != *ja_word;
948         }
949         delete result;
950         if (fail) {
951             errln("bad result for xx_XX/word");
952         }
953     }
954 
955     {
956         BreakIterator* result = BreakIterator::createCharacterInstance("ja_JP", status);
957         UBool fail = true;
958         if(result){
959             fail = *result != *ja_char;
960         }
961         delete result;
962         if (fail) {
963             errln("bad result for ja_JP/char");
964         }
965     }
966 
967     {
968         BreakIterator* result = BreakIterator::createCharacterInstance("xx_XX", status);
969         UBool fail = true;
970         if(result){
971             fail = *result != *root_char;
972         }
973         delete result;
974         if (fail) {
975             errln("bad result for xx_XX/char");
976         }
977     }
978 
979     {
980         StringEnumeration* avail = BreakIterator::getAvailableLocales();
981         UBool found = false;
982         const UnicodeString* p;
983         while ((p = avail->snext(status))) {
984             if (p->compare("xx") == 0) {
985                 found = true;
986                 break;
987             }
988         }
989         delete avail;
990         if (!found) {
991             errln("did not find test locale");
992         }
993     }
994 
995     {
996         UBool unreg = BreakIterator::unregister(key, status);
997         if (!unreg) {
998             errln("unable to unregister");
999         }
1000     }
1001 
1002     {
1003         BreakIterator* result = BreakIterator::createWordInstance("en_US", status);
1004         BreakIterator* root = BreakIterator::createWordInstance("", status);
1005         UBool fail = true;
1006         if(root){
1007           fail = *root != *result;
1008         }
1009         delete root;
1010         delete result;
1011         if (fail) {
1012             errln("did not get root break");
1013         }
1014     }
1015 
1016     {
1017         StringEnumeration* avail = BreakIterator::getAvailableLocales();
1018         UBool found = false;
1019         const UnicodeString* p;
1020         while ((p = avail->snext(status))) {
1021             if (p->compare("xx") == 0) {
1022                 found = true;
1023                 break;
1024             }
1025         }
1026         delete avail;
1027         if (found) {
1028             errln("found test locale");
1029         }
1030     }
1031 
1032     {
1033         int32_t count;
1034         UBool   foundLocale = false;
1035         const Locale *avail = BreakIterator::getAvailableLocales(count);
1036         for (int i=0; i<count; i++) {
1037             if (avail[i] == Locale::getEnglish()) {
1038                 foundLocale = true;
1039                 break;
1040             }
1041         }
1042         if (foundLocale == false) {
1043             errln("BreakIterator::getAvailableLocales(&count), failed to find EN.");
1044         }
1045     }
1046 
1047 
1048     // ja_word was adopted by factory
1049     delete ja_char;
1050     delete root_word;
1051     delete root_char;
1052 #endif
1053 }
1054 
RoundtripRule(const char * dataFile)1055 void RBBIAPITest::RoundtripRule(const char *dataFile) {
1056     UErrorCode status = U_ZERO_ERROR;
1057     UParseError parseError;
1058     parseError.line = 0;
1059     parseError.offset = 0;
1060     LocalUDataMemoryPointer data(udata_open(U_ICUDATA_BRKITR, "brk", dataFile, &status));
1061     uint32_t length;
1062     UnicodeString builtSource;
1063     const uint8_t *rbbiRules;
1064     const uint8_t *builtRules;
1065 
1066     if (U_FAILURE(status)) {
1067         errcheckln(status, "%s:%d Can't open \"%s\" - %s", __FILE__, __LINE__, dataFile, u_errorName(status));
1068         return;
1069     }
1070 
1071     builtRules = (const uint8_t *)udata_getMemory(data.getAlias());
1072     builtSource = UnicodeString::fromUTF8(
1073         (const char *)(builtRules + ((RBBIDataHeader *)builtRules)->fRuleSource));
1074     LocalPointer<RuleBasedBreakIterator> brkItr (new RuleBasedBreakIterator(builtSource, parseError, status));
1075     if (U_FAILURE(status)) {
1076         errln("%s:%d createRuleBasedBreakIterator: ICU Error \"%s\"  at line %d, column %d\n",
1077                 __FILE__, __LINE__, u_errorName(status), parseError.line, parseError.offset);
1078         errln(builtSource);
1079         return;
1080     }
1081     rbbiRules = brkItr->getBinaryRules(length);
1082     logln("Comparing \"%s\" len=%d", dataFile, length);
1083     if (memcmp(builtRules, rbbiRules, (int32_t)length) != 0) {
1084         errln("%s:%d Built rules and rebuilt rules are different %s", __FILE__, __LINE__, dataFile);
1085         return;
1086     }
1087 }
1088 
TestRoundtripRules()1089 void RBBIAPITest::TestRoundtripRules() {
1090     RoundtripRule("word");
1091     RoundtripRule("title");
1092     RoundtripRule("sent");
1093     RoundtripRule("line");
1094     RoundtripRule("char");
1095     if (!quick) {
1096         RoundtripRule("word_POSIX");
1097     }
1098 }
1099 
1100 
1101 // Check getBinaryRules() and construction of a break iterator from those rules.
1102 
TestGetBinaryRules()1103 void RBBIAPITest::TestGetBinaryRules() {
1104     UErrorCode status=U_ZERO_ERROR;
1105     LocalPointer<BreakIterator> bi(BreakIterator::createLineInstance(Locale::getEnglish(), status));
1106     if (U_FAILURE(status)) {
1107         dataerrln("FAIL: BreakIterator::createLineInstance for Locale::getEnglish(): %s", u_errorName(status));
1108         return;
1109     }
1110     RuleBasedBreakIterator *rbbi = dynamic_cast<RuleBasedBreakIterator *>(bi.getAlias());
1111     if (rbbi == nullptr) {
1112         dataerrln("FAIL: RuleBasedBreakIterator is nullptr");
1113         return;
1114     }
1115 
1116     // Check that the new line break iterator is nominally functional.
1117     UnicodeString helloWorld("Hello, World!");
1118     rbbi->setText(helloWorld);
1119     int n = 0;
1120     while (bi->next() != UBRK_DONE) {
1121         ++n;
1122     }
1123     TEST_ASSERT(n == 2);
1124 
1125     // Extract the binary rules as a uint8_t blob.
1126     uint32_t ruleLength;
1127     const uint8_t *binRules = rbbi->getBinaryRules(ruleLength);
1128     TEST_ASSERT(ruleLength > 0);
1129     TEST_ASSERT(binRules != nullptr);
1130 
1131     // Clone the binary rules, and create a break iterator from that.
1132     // The break iterator does not adopt the rules; we must delete when we are finished with the iterator.
1133     uint8_t *clonedRules = new uint8_t[ruleLength];
1134     memcpy(clonedRules, binRules, ruleLength);
1135     RuleBasedBreakIterator clonedBI(clonedRules, ruleLength, status);
1136     TEST_ASSERT_SUCCESS(status);
1137 
1138     // Check that the cloned line break iterator is nominally alive.
1139     clonedBI.setText(helloWorld);
1140     n = 0;
1141     while (clonedBI.next() != UBRK_DONE) {
1142         ++n;
1143     }
1144     TEST_ASSERT(n == 2);
1145 
1146     delete[] clonedRules;
1147 }
1148 
1149 
TestRefreshInputText()1150 void RBBIAPITest::TestRefreshInputText() {
1151     /*
1152      *  RefreshInput changes out the input of a Break Iterator without
1153      *    changing anything else in the iterator's state.  Used with Java JNI,
1154      *    when Java moves the underlying string storage.   This test
1155      *    runs BreakIterator::next() repeatedly, moving the text in the middle of the sequence.
1156      *    The right set of boundaries should still be found.
1157      */
1158     char16_t testStr[]  = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0};  /* = " A B C D"  */
1159     char16_t movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,  0};
1160     UErrorCode status = U_ZERO_ERROR;
1161     UText ut1 = UTEXT_INITIALIZER;
1162     UText ut2 = UTEXT_INITIALIZER;
1163     RuleBasedBreakIterator *bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(Locale::getEnglish(), status));
1164     TEST_ASSERT_SUCCESS(status);
1165 
1166     utext_openUChars(&ut1, testStr, -1, &status);
1167     TEST_ASSERT_SUCCESS(status);
1168 
1169     if (U_SUCCESS(status)) {
1170         bi->setText(&ut1, status);
1171         TEST_ASSERT_SUCCESS(status);
1172 
1173         /* Line boundaries will occur before each letter in the original string */
1174         TEST_ASSERT(1 == bi->next());
1175         TEST_ASSERT(3 == bi->next());
1176 
1177         /* Move the string, kill the original string.  */
1178         u_strcpy(movedStr, testStr);
1179         u_memset(testStr, 0x20, u_strlen(testStr));
1180         utext_openUChars(&ut2, movedStr, -1, &status);
1181         TEST_ASSERT_SUCCESS(status);
1182         RuleBasedBreakIterator *returnedBI = &bi->refreshInputText(&ut2, status);
1183         TEST_ASSERT_SUCCESS(status);
1184         TEST_ASSERT(bi == returnedBI);
1185 
1186         /* Find the following matches, now working in the moved string. */
1187         TEST_ASSERT(5 == bi->next());
1188         TEST_ASSERT(7 == bi->next());
1189         TEST_ASSERT(8 == bi->next());
1190         TEST_ASSERT(UBRK_DONE == bi->next());
1191 
1192         utext_close(&ut1);
1193         utext_close(&ut2);
1194     }
1195     delete bi;
1196 
1197 }
1198 
1199 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
prtbrks(BreakIterator * brk,const UnicodeString & ustr,IntlTest & it)1200 static void prtbrks(BreakIterator* brk, const UnicodeString &ustr, IntlTest &it) {
1201   static const char16_t PILCROW=0x00B6, CHSTR=0x3010, CHEND=0x3011; // lenticular brackets
1202   it.logln(UnicodeString("String:'")+ustr+UnicodeString("'"));
1203 
1204   int32_t *pos = new int32_t[ustr.length()];
1205   int32_t posCount = 0;
1206 
1207   // calculate breaks up front, so we can print out
1208   // sans any debugging
1209   for(int32_t n = 0; (n=brk->next())!=UBRK_DONE; ) {
1210     pos[posCount++] = n;
1211     if(posCount>=ustr.length()) {
1212       it.errln("brk count exceeds string length!");
1213       return;
1214     }
1215   }
1216   UnicodeString out;
1217   out.append((char16_t)CHSTR);
1218   int32_t prev = 0;
1219   for(int32_t i=0;i<posCount;i++) {
1220     int32_t n=pos[i];
1221     out.append(ustr.tempSubString(prev,n-prev));
1222     out.append((char16_t)PILCROW);
1223     prev=n;
1224   }
1225   out.append(ustr.tempSubString(prev,ustr.length()-prev));
1226   out.append((char16_t)CHEND);
1227   it.logln(out);
1228 
1229   out.remove();
1230   for(int32_t i=0;i<posCount;i++) {
1231     char tmp[100];
1232     snprintf(tmp, sizeof(tmp), "%d ",pos[i]);
1233     out.append(UnicodeString(tmp));
1234   }
1235   it.logln(out);
1236   delete [] pos;
1237 }
1238 #endif
1239 
TestFilteredBreakIteratorBuilder()1240 void RBBIAPITest::TestFilteredBreakIteratorBuilder() {
1241 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
1242   UErrorCode status = U_ZERO_ERROR;
1243   LocalPointer<FilteredBreakIteratorBuilder> builder;
1244   LocalPointer<BreakIterator> baseBI;
1245   LocalPointer<BreakIterator> filteredBI;
1246   LocalPointer<BreakIterator> frenchBI;
1247 
1248   const UnicodeString text("In the meantime Mr. Weston arrived with his small ship, which he had now recovered. Capt. Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. Weston, took this opportunity to call him to account for some abuses he had to lay to his charge."); // (William Bradford, public domain. http://catalog.hathitrust.org/Record/008651224 ) - edited.
1249   const UnicodeString ABBR_MR("Mr.");
1250   const UnicodeString ABBR_CAPT("Capt.");
1251 
1252   {
1253     logln("Constructing empty builder\n");
1254     builder.adoptInstead(FilteredBreakIteratorBuilder::createInstance(status));
1255     TEST_ASSERT_SUCCESS(status);
1256 
1257     logln("Constructing base BI\n");
1258     baseBI.adoptInstead(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
1259     TEST_ASSERT_SUCCESS(status);
1260 
1261 	logln("Building new BI\n");
1262     filteredBI.adoptInstead(builder->build(baseBI.orphan(), status));
1263     TEST_ASSERT_SUCCESS(status);
1264 
1265 	if (U_SUCCESS(status)) {
1266         logln("Testing:");
1267         filteredBI->setText(text);
1268         TEST_ASSERT(20 == filteredBI->next()); // Mr.
1269         TEST_ASSERT(84 == filteredBI->next()); // recovered.
1270         TEST_ASSERT(90 == filteredBI->next()); // Capt.
1271         TEST_ASSERT(181 == filteredBI->next()); // Mr.
1272         TEST_ASSERT(278 == filteredBI->next()); // charge.
1273         filteredBI->first();
1274         prtbrks(filteredBI.getAlias(), text, *this);
1275     }
1276   }
1277 
1278   {
1279     logln("Constructing empty builder\n");
1280     builder.adoptInstead(FilteredBreakIteratorBuilder::createInstance(status));
1281     TEST_ASSERT_SUCCESS(status);
1282 
1283     if (U_SUCCESS(status)) {
1284         logln("Adding Mr. as an exception\n");
1285         TEST_ASSERT(true == builder->suppressBreakAfter(ABBR_MR, status));
1286         TEST_ASSERT(false == builder->suppressBreakAfter(ABBR_MR, status)); // already have it
1287         TEST_ASSERT(true == builder->unsuppressBreakAfter(ABBR_MR, status));
1288         TEST_ASSERT(false == builder->unsuppressBreakAfter(ABBR_MR, status)); // already removed it
1289         TEST_ASSERT(true == builder->suppressBreakAfter(ABBR_MR, status));
1290         TEST_ASSERT_SUCCESS(status);
1291 
1292         logln("Constructing base BI\n");
1293         baseBI.adoptInstead(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
1294         TEST_ASSERT_SUCCESS(status);
1295 
1296         logln("Building new BI\n");
1297         filteredBI.adoptInstead(builder->build(baseBI.orphan(), status));
1298         TEST_ASSERT_SUCCESS(status);
1299 
1300         logln("Testing:");
1301         filteredBI->setText(text);
1302         TEST_ASSERT(84 == filteredBI->next());
1303         TEST_ASSERT(90 == filteredBI->next());// Capt.
1304         TEST_ASSERT(278 == filteredBI->next());
1305         filteredBI->first();
1306         prtbrks(filteredBI.getAlias(), text, *this);
1307     }
1308   }
1309 
1310 
1311   {
1312     logln("Constructing empty builder\n");
1313     builder.adoptInstead(FilteredBreakIteratorBuilder::createInstance(status));
1314     TEST_ASSERT_SUCCESS(status);
1315 
1316     if (U_SUCCESS(status)) {
1317         logln("Adding Mr. and Capt as an exception\n");
1318         TEST_ASSERT(true == builder->suppressBreakAfter(ABBR_MR, status));
1319         TEST_ASSERT(true == builder->suppressBreakAfter(ABBR_CAPT, status));
1320         TEST_ASSERT_SUCCESS(status);
1321 
1322         logln("Constructing base BI\n");
1323         baseBI.adoptInstead(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
1324         TEST_ASSERT_SUCCESS(status);
1325 
1326         logln("Building new BI\n");
1327         filteredBI.adoptInstead(builder->build(baseBI.orphan(), status));
1328         TEST_ASSERT_SUCCESS(status);
1329 
1330         logln("Testing:");
1331         filteredBI->setText(text);
1332         TEST_ASSERT(84 == filteredBI->next());
1333         TEST_ASSERT(278 == filteredBI->next());
1334         filteredBI->first();
1335         prtbrks(filteredBI.getAlias(), text, *this);
1336     }
1337   }
1338 
1339 
1340   {
1341     logln("Constructing English builder\n");
1342     builder.adoptInstead(FilteredBreakIteratorBuilder::createInstance(Locale::getEnglish(), status));
1343     TEST_ASSERT_SUCCESS(status);
1344 
1345     logln("Constructing base BI\n");
1346     baseBI.adoptInstead(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
1347     TEST_ASSERT_SUCCESS(status);
1348 
1349     if (U_SUCCESS(status)) {
1350         logln("unsuppressing 'Capt'");
1351         TEST_ASSERT(true == builder->unsuppressBreakAfter(ABBR_CAPT, status));
1352 
1353         logln("Building new BI\n");
1354         filteredBI.adoptInstead(builder->build(baseBI.orphan(), status));
1355         TEST_ASSERT_SUCCESS(status);
1356 
1357         if(filteredBI.isValid()) {
1358           logln("Testing:");
1359           filteredBI->setText(text);
1360           TEST_ASSERT(84 == filteredBI->next());
1361           TEST_ASSERT(90 == filteredBI->next());
1362           TEST_ASSERT(278 == filteredBI->next());
1363           filteredBI->first();
1364           prtbrks(filteredBI.getAlias(), text, *this);
1365         }
1366     }
1367   }
1368 
1369 
1370   {
1371     logln("Constructing English builder\n");
1372     builder.adoptInstead(FilteredBreakIteratorBuilder::createInstance(Locale::getEnglish(), status));
1373     TEST_ASSERT_SUCCESS(status);
1374 
1375     logln("Constructing base BI\n");
1376     baseBI.adoptInstead(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
1377     TEST_ASSERT_SUCCESS(status);
1378 
1379     if (U_SUCCESS(status)) {
1380         logln("Building new BI\n");
1381         filteredBI.adoptInstead(builder->build(baseBI.orphan(), status));
1382         TEST_ASSERT_SUCCESS(status);
1383 
1384         if(filteredBI.isValid()) {
1385           logln("Testing:");
1386           filteredBI->setText(text);
1387           TEST_ASSERT(84 == filteredBI->next());
1388           TEST_ASSERT(278 == filteredBI->next());
1389           filteredBI->first();
1390           prtbrks(filteredBI.getAlias(), text, *this);
1391         }
1392     }
1393   }
1394 
1395   // reenable once french is in
1396   {
1397     logln("Constructing French builder");
1398     builder.adoptInstead(FilteredBreakIteratorBuilder::createInstance(Locale::getFrench(), status));
1399     TEST_ASSERT_SUCCESS(status);
1400 
1401     logln("Constructing base BI\n");
1402     baseBI.adoptInstead(BreakIterator::createSentenceInstance(Locale::getFrench(), status));
1403     TEST_ASSERT_SUCCESS(status);
1404 
1405     if (U_SUCCESS(status)) {
1406         logln("Building new BI\n");
1407         frenchBI.adoptInstead(builder->build(baseBI.orphan(), status));
1408         TEST_ASSERT_SUCCESS(status);
1409     }
1410 
1411     if(frenchBI.isValid()) {
1412       logln("Testing:");
1413       UnicodeString frText("C'est MM. Duval.");
1414       frenchBI->setText(frText);
1415       TEST_ASSERT(16 == frenchBI->next());
1416       TEST_ASSERT(BreakIterator::DONE == frenchBI->next());
1417       frenchBI->first();
1418       prtbrks(frenchBI.getAlias(), frText, *this);
1419       logln("Testing against English:");
1420       filteredBI->setText(frText);
1421       TEST_ASSERT(10 == filteredBI->next()); // wrong for french, but filterBI is english.
1422       TEST_ASSERT(16 == filteredBI->next());
1423       TEST_ASSERT(BreakIterator::DONE == filteredBI->next());
1424       filteredBI->first();
1425       prtbrks(filteredBI.getAlias(), frText, *this);
1426 
1427       // Verify ==
1428       assertTrue(WHERE, *frenchBI   == *frenchBI);
1429       assertTrue(WHERE, *filteredBI != *frenchBI);
1430       assertTrue(WHERE, *frenchBI   != *filteredBI);
1431     } else {
1432       dataerrln("French BI: not valid.");
1433 	}
1434   }
1435 
1436 #else
1437   logln("Skipped- not: !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION");
1438 #endif
1439 }
1440 
1441 //---------------------------------------------
1442 // runIndexedTest
1443 //---------------------------------------------
1444 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)1445 void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
1446 {
1447     if (exec) logln((UnicodeString)"TestSuite RuleBasedBreakIterator API ");
1448     TESTCASE_AUTO_BEGIN;
1449 #if !UCONFIG_NO_FILE_IO
1450     TESTCASE_AUTO(TestCloneEquals);
1451     TESTCASE_AUTO(TestgetRules);
1452     TESTCASE_AUTO(TestHashCode);
1453     TESTCASE_AUTO(TestGetSetAdoptText);
1454     TESTCASE_AUTO(TestIteration);
1455 #endif
1456     TESTCASE_AUTO(TestBuilder);
1457     TESTCASE_AUTO(TestQuoteGrouping);
1458     TESTCASE_AUTO(TestRuleStatusVec);
1459     TESTCASE_AUTO(TestBug2190);
1460     TESTCASE_AUTO(TestBug22580);
1461 #if !UCONFIG_NO_FILE_IO
1462     TESTCASE_AUTO(TestRegistration);
1463     TESTCASE_AUTO(TestBoilerPlate);
1464     TESTCASE_AUTO(TestRuleStatus);
1465     TESTCASE_AUTO(TestRoundtripRules);
1466     TESTCASE_AUTO(TestGetBinaryRules);
1467 #endif
1468     TESTCASE_AUTO(TestRefreshInputText);
1469 #if !UCONFIG_NO_BREAK_ITERATION
1470     TESTCASE_AUTO(TestFilteredBreakIteratorBuilder);
1471 #endif
1472     TESTCASE_AUTO_END;
1473 }
1474 
1475 
1476 //---------------------------------------------
1477 //Internal subroutines
1478 //---------------------------------------------
1479 
doBoundaryTest(BreakIterator & bi,UnicodeString & text,int32_t * boundaries)1480 void RBBIAPITest::doBoundaryTest(BreakIterator& bi, UnicodeString& text, int32_t *boundaries){
1481      logln((UnicodeString)"testIsBoundary():");
1482         int32_t p = 0;
1483         UBool isB;
1484         for (int32_t i = 0; i < text.length(); i++) {
1485             isB = bi.isBoundary(i);
1486             logln((UnicodeString)"bi.isBoundary(" + i + ") -> " + isB);
1487 
1488             if (i == boundaries[p]) {
1489                 if (!isB)
1490                     errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected true, got false");
1491                 p++;
1492             }
1493             else {
1494                 if (isB)
1495                     errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected false, got true");
1496             }
1497         }
1498 }
doTest(UnicodeString & testString,int32_t start,int32_t gotoffset,int32_t expectedOffset,const char * expectedString)1499 void RBBIAPITest::doTest(UnicodeString& testString, int32_t start, int32_t gotoffset, int32_t expectedOffset, const char* expectedString){
1500     UnicodeString selected;
1501     UnicodeString expected=CharsToUnicodeString(expectedString);
1502 
1503     if(gotoffset != expectedOffset)
1504          errln((UnicodeString)"ERROR:****returned #" + gotoffset + (UnicodeString)" instead of #" + expectedOffset);
1505     if(start <= gotoffset){
1506         testString.extractBetween(start, gotoffset, selected);
1507     }
1508     else{
1509         testString.extractBetween(gotoffset, start, selected);
1510     }
1511     if(selected.compare(expected) != 0)
1512          errln(prettify((UnicodeString)"ERROR:****selected \"" + selected + "\" instead of \"" + expected + "\""));
1513     else
1514         logln(prettify("****selected \"" + selected + "\""));
1515 }
1516 
1517 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1518