xref: /aosp_15_r20/external/libtextclassifier/native/annotator/number/number_test-include.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "annotator/number/number_test-include.h"
18 
19 #include <set>
20 #include <string>
21 #include <vector>
22 
23 #include "annotator/collections.h"
24 #include "annotator/model_generated.h"
25 #include "annotator/types-test-util.h"
26 #include "annotator/types.h"
27 #include "utils/tokenizer-utils.h"
28 #include "utils/utf8/unicodetext.h"
29 #include "gmock/gmock.h"
30 #include "gtest/gtest.h"
31 
32 namespace libtextclassifier3 {
33 namespace test_internal {
34 
35 using ::testing::AllOf;
36 using ::testing::ElementsAre;
37 using ::testing::Field;
38 using ::testing::IsEmpty;
39 using ::testing::Matcher;
40 using ::testing::UnorderedElementsAre;
41 
42 namespace {
CreateOptionsData(ModeFlag enabled_modes)43 const flatbuffers::DetachedBuffer* CreateOptionsData(ModeFlag enabled_modes) {
44   NumberAnnotatorOptionsT options;
45   options.enabled = true;
46   options.priority_score = -10.0;
47   options.float_number_priority_score = 1.0;
48   options.enabled_annotation_usecases =
49       1 << AnnotationUsecase_ANNOTATION_USECASE_RAW;
50   options.max_number_of_digits = 20;
51   options.enabled_modes = enabled_modes;
52 
53   options.percentage_priority_score = 1.0;
54   options.percentage_annotation_usecases =
55       (1 << AnnotationUsecase_ANNOTATION_USECASE_RAW) +
56       (1 << AnnotationUsecase_ANNOTATION_USECASE_SMART);
57   std::set<std::string> percent_suffixes(
58       {"パーセント", "percent", "pércént", "pc", "pct", "%", "٪", "﹪", "%"});
59   for (const std::string& string_value : percent_suffixes) {
60     options.percentage_pieces_string.append(string_value);
61     options.percentage_pieces_string.push_back('\0');
62   }
63 
64   flatbuffers::FlatBufferBuilder builder;
65   builder.Finish(NumberAnnotatorOptions::Pack(builder, &options));
66   return new flatbuffers::DetachedBuffer(builder.Release());
67 }
68 }  // namespace
69 
70 const NumberAnnotatorOptions*
TestingNumberAnnotatorOptions(ModeFlag enabled_modes)71 NumberAnnotatorTest::TestingNumberAnnotatorOptions(ModeFlag enabled_modes) {
72   static const flatbuffers::DetachedBuffer* options_data_selection =
73       CreateOptionsData(ModeFlag_SELECTION);
74   static const flatbuffers::DetachedBuffer* options_data_no_selection =
75       CreateOptionsData(ModeFlag_ANNOTATION_AND_CLASSIFICATION);
76   static const flatbuffers::DetachedBuffer* options_data_all =
77       CreateOptionsData(ModeFlag_ALL);
78 
79   if (enabled_modes == ModeFlag_SELECTION) {
80     return flatbuffers::GetRoot<NumberAnnotatorOptions>(
81         options_data_selection->data());
82   } else if (enabled_modes == ModeFlag_ANNOTATION_AND_CLASSIFICATION) {
83     return flatbuffers::GetRoot<NumberAnnotatorOptions>(
84         options_data_no_selection->data());
85   } else {
86     return flatbuffers::GetRoot<NumberAnnotatorOptions>(
87         options_data_all->data());
88   }
89 }
90 
91 MATCHER_P(IsCorrectCollection, collection, "collection is " + collection) {
92   return arg.collection == collection;
93 }
94 
95 MATCHER_P(IsCorrectNumericValue, numeric_value,
96           "numeric value is " + std::to_string(numeric_value)) {
97   return arg.numeric_value == numeric_value;
98 }
99 
100 MATCHER_P(IsCorrectNumericDoubleValue, numeric_double_value,
101           "numeric double value is " + std::to_string(numeric_double_value)) {
102   return arg.numeric_double_value == numeric_double_value;
103 }
104 
105 MATCHER_P(IsCorrectScore, score, "score is " + std::to_string(score)) {
106   return arg.score == score;
107 }
108 
109 MATCHER_P(IsCorrectPriortyScore, priority_score,
110           "priority score is " + std::to_string(priority_score)) {
111   return arg.priority_score == priority_score;
112 }
113 
114 MATCHER_P(IsCorrectSpan, span,
115           "span is (" + std::to_string(span.first) + "," +
116               std::to_string(span.second) + ")") {
117   return arg.span == span;
118 }
119 
120 MATCHER_P(Classification, inner, "") {
121   return testing::ExplainMatchResult(inner, arg.classification,
122                                      result_listener);
123 }
124 
IsAnnotatedSpan(const CodepointSpan & codepoint_span,const std::string & collection,const int int_value,const double double_value,const float priority_score=-10,const float score=1)125 static Matcher<AnnotatedSpan> IsAnnotatedSpan(
126     const CodepointSpan& codepoint_span, const std::string& collection,
127     const int int_value, const double double_value,
128     const float priority_score = -10, const float score = 1) {
129   return AllOf(
130       IsCorrectSpan(codepoint_span),
131       Classification(ElementsAre(AllOf(
132           IsCorrectCollection(collection), IsCorrectNumericValue(int_value),
133           IsCorrectNumericDoubleValue(double_value), IsCorrectScore(score),
134           IsCorrectPriortyScore(priority_score)))));
135 }
136 
// Selecting exactly the digits of an integer yields a "number" classification
// with matching integer and floating-point values.
TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberCorrectly) {
  const UnicodeText context = UTF8ToUnicodeText("... 12345 ...");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, CodepointSpan(4, 9), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_FLOAT_EQ(classification.numeric_double_value, 12345);
}
147 
// With the selection fixture (its configuration is not visible in this file;
// the test name implies classification is disabled), ClassifyText must
// return false even for a valid number.
TEST_F(NumberAnnotatorForSelectionTest,
       ClassifyTextDisabledClassificationReturnsFalse) {
  const UnicodeText context = UTF8ToUnicodeText("... 12345 ...");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, CodepointSpan(4, 9), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
155 
// A decimal number is classified as "number"; the integer value is the part
// before the decimal point and the double value is the full number.
TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberAsFloatCorrectly) {
  const UnicodeText context = UTF8ToUnicodeText("... 12345.12345 ...");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, CodepointSpan(4, 15), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_FLOAT_EQ(classification.numeric_double_value, 12345.12345);
}
166 
// A dot directly after the digits is treated as punctuation rather than as a
// decimal separator: selecting only the digits classifies as a number, while
// selecting the digits plus the dot does not.
TEST_F(NumberAnnotatorTest,
       ClassifiesAndParsesNumberAsFloatCorrectlyWithoutDecimals) {
  const UnicodeText context = UTF8ToUnicodeText("... 12345. ...");
  const CodepointSpan digits_only(4, 9);
  const CodepointSpan digits_and_dot(4, 10);
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, digits_only, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, digits_and_dot, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  // These checks run after the rejected selection, on the same result object
  // that the first (accepted) call filled in.
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_FLOAT_EQ(classification.numeric_double_value, 12345);

  // Classifying the digits again gives the same result.
  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, digits_only, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_FLOAT_EQ(classification.numeric_double_value, 12345);
}
190 
// FindAll reports integers, floats and percentages in mixed text.
TEST_F(NumberAnnotatorTest, FindsAllIntegerAndFloatNumbersInText) {
  // "68.9#" yields the number 68.9 because a single punctuation sign may
  // follow a number; "68.9#?" yields nothing because two punctuation signs
  // follow it.
  const UnicodeText context = UTF8ToUnicodeText(
      "how much is 2 plus 5 divided by 7% minus 3.14 "
      "what about 68.9# or 68.9#?");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(12, 13), "number",
                                  /*int_value=*/2, /*double_value=*/2.0),
                  IsAnnotatedSpan(CodepointSpan(19, 20), "number",
                                  /*int_value=*/5, /*double_value=*/5.0),
                  IsAnnotatedSpan(CodepointSpan(32, 33), "number",
                                  /*int_value=*/7, /*double_value=*/7.0),
                  IsAnnotatedSpan(CodepointSpan(32, 34), "percentage",
                                  /*int_value=*/7, /*double_value=*/7.0,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(41, 45), "number",
                                  /*int_value=*/3, /*double_value=*/3.14,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(57, 61), "number",
                                  /*int_value=*/68, /*double_value=*/68.9,
                                  /*priority_score=*/1)));
}
219 
// Tokens that merely contain digits (embedded letters, doubled dots, trailing
// letters) must not be classified as numbers. Each selection covers exactly
// the offending token so that a rejection is caused by the token itself and
// not by extra characters in the selection.
TEST_F(NumberAnnotatorTest, ClassifiesNonNumberCorrectly) {
  ClassificationResult classification_result;

  // Letter in the middle of the digits.
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 123a45 ..."), {4, 10},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));

  // Two consecutive dots between digit groups.
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345..12345 ..."), {4, 16},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));

  // Trailing letter. "12345a" occupies codepoints [4, 10); the previous end
  // index of 11 also selected the following space.
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345a ..."), {4, 10},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));
}
232 
// Punctuation following a number is not part of the number: the selection
// must stop before the comma to be classified.
TEST_F(NumberAnnotatorTest, ClassifiesNumberSelectionCorrectly) {
  const UnicodeText context = UTF8ToUnicodeText("... 14, ...");
  ClassificationResult classification;

  // Digits only.
  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, CodepointSpan(4, 6), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 14);
  EXPECT_EQ(classification.numeric_double_value, 14);

  // Digits plus the trailing comma.
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, CodepointSpan(4, 7), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
247 
// A number immediately followed by '%' is classified as a percentage.
TEST_F(NumberAnnotatorTest, ClassifiesPercentageSignCorrectly) {
  const UnicodeText context = UTF8ToUnicodeText("... 99% ...");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, CodepointSpan(4, 7), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 99);
  EXPECT_EQ(classification.numeric_double_value, 99);
}
258 
// A number followed by the word "percent" is classified as a percentage.
TEST_F(NumberAnnotatorTest, ClassifiesPercentageWordCorrectly) {
  const UnicodeText context = UTF8ToUnicodeText("... 15 percent ...");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, CodepointSpan(4, 14), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 15);
  EXPECT_EQ(classification.numeric_double_value, 15);
}
269 
// A number followed by a non-ASCII word that is not a percentage suffix is
// rejected when the whole text is selected.
TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiPercentageIncorrectSuffix) {
  const UnicodeText context = UTF8ToUnicodeText("15 café");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, CodepointSpan(0, 7), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
276 
// The accented suffix "pércént" is accepted as a percentage suffix.
TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiFrPercentageCorrectSuffix) {
  const UnicodeText context = UTF8ToUnicodeText("25 pércént");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, CodepointSpan(0, 10), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 25);
  EXPECT_EQ(classification.numeric_double_value, 25);
}
287 
// The Japanese suffix "パーセント" directly after a number (no separating space)
// is accepted, both by ClassifyText and by FindAll inside a longer sentence.
TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiJaPercentageCorrectSuffix) {
  // Classification of an exact selection.
  const UnicodeText short_context = UTF8ToUnicodeText("10パーセント");
  ClassificationResult classification;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      short_context, CodepointSpan(0, 7),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification));
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 10);
  EXPECT_EQ(classification.numeric_double_value, 10);

  // Annotation of a full sentence.
  const UnicodeText sentence =
      UTF8ToUnicodeText("明日の降水確率は10パーセント  音量を12にセット");
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(number_annotator_.FindAll(
      sentence, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      ModeFlag_CLASSIFICATION, &annotations));
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(8, 10), "number",
                                  /*int_value=*/10, /*double_value=*/10.0),
                  IsAnnotatedSpan(CodepointSpan(8, 15), "percentage",
                                  /*int_value=*/10, /*double_value=*/10.0,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(20, 22), "number",
                                  /*int_value=*/12, /*double_value=*/12.0)));
}
312 
// FindAll reports every valid number and percentage in the text; "$99." is
// expected to produce no annotation.
TEST_F(NumberAnnotatorTest, FindsAllNumbersInText) {
  const UnicodeText context = UTF8ToUnicodeText(
      "... 12345 ... 9 is my number and 27% or 68# #38 #39 "
      "but not $99.");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations));

  EXPECT_THAT(
      annotations,
      UnorderedElementsAre(
          IsAnnotatedSpan(CodepointSpan(4, 9), "number",
                          /*int_value=*/12345, /*double_value=*/12345.0),
          IsAnnotatedSpan(CodepointSpan(14, 15), "number",
                          /*int_value=*/9, /*double_value=*/9.0),
          IsAnnotatedSpan(CodepointSpan(33, 35), "number",
                          /*int_value=*/27, /*double_value=*/27.0),
          IsAnnotatedSpan(CodepointSpan(33, 36), "percentage",
                          /*int_value=*/27, /*double_value=*/27.0,
                          /*priority_score=*/1),
          IsAnnotatedSpan(CodepointSpan(40, 42), "number",
                          /*int_value=*/68, /*double_value=*/68.0),
          IsAnnotatedSpan(CodepointSpan(45, 47), "number",
                          /*int_value=*/38, /*double_value=*/38.0),
          IsAnnotatedSpan(CodepointSpan(49, 51), "number",
                          /*int_value=*/39, /*double_value=*/39.0)));
}
339 
// Requesting ModeFlag_SELECTION from the annotation-and-classification
// fixture still succeeds, but produces no annotations.
TEST_F(NumberAnnotatorForAnnotationAndClassificationTest,
       FindsAllDisabledModeReturnsNoResults) {
  const UnicodeText context = UTF8ToUnicodeText(
      "... 12345 ... 9 is my number and 27% or 68# #38 #39 "
      "but not $99.");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_SELECTION,
      &annotations));

  EXPECT_THAT(annotations, IsEmpty());
}
350 
// Text that only contains malformed number-like tokens must produce no
// annotations, while FindAll itself still reports success.
TEST_F(NumberAnnotatorTest, FindsNoNumberInText) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... 12345a ... 12345..12345 and 123a45 are not valid. "
                        "And -#5% is also bad."),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_SELECTION, &result));
  // IsEmpty() avoids comparing size_t against a signed literal and prints the
  // unexpected elements on failure, like the other emptiness checks here.
  EXPECT_THAT(result, IsEmpty());
}
359 
// Numbers followed by a single punctuation sign are found; a leading minus
// sign is part of the number.
TEST_F(NumberAnnotatorTest, FindsNumberWithPunctuation) {
  // A number may be followed by at most one punctuation sign, so "15???" is
  // not annotated.
  const UnicodeText context = UTF8ToUnicodeText(
      "It's 12, 13, 14! Or 15??? For sure 16: 17; 18. and -19");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      ModeFlag_CLASSIFICATION, &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(5, 7), "number",
                                  /*int_value=*/12, /*double_value=*/12.0),
                  IsAnnotatedSpan(CodepointSpan(9, 11), "number",
                                  /*int_value=*/13, /*double_value=*/13.0),
                  IsAnnotatedSpan(CodepointSpan(13, 15), "number",
                                  /*int_value=*/14, /*double_value=*/14.0),
                  IsAnnotatedSpan(CodepointSpan(35, 37), "number",
                                  /*int_value=*/16, /*double_value=*/16.0),
                  IsAnnotatedSpan(CodepointSpan(39, 41), "number",
                                  /*int_value=*/17, /*double_value=*/17.0),
                  IsAnnotatedSpan(CodepointSpan(43, 45), "number",
                                  /*int_value=*/18, /*double_value=*/18.0),
                  IsAnnotatedSpan(CodepointSpan(51, 54), "number",
                                  /*int_value=*/-19, /*double_value=*/-19.0)));
}
387 
// Decimal numbers followed by a single punctuation sign are found, and each
// is expected with a priority score of 1.
TEST_F(NumberAnnotatorTest, FindsFloatNumberWithPunctuation) {
  const UnicodeText context = UTF8ToUnicodeText(
      "It's 12.123, 13.45, 14.54321! Or 15.1? Maybe 16.33: "
      "17.21; but for sure 18.90.");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(5, 11), "number",
                                  /*int_value=*/12, /*double_value=*/12.123,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(13, 18), "number",
                                  /*int_value=*/13, /*double_value=*/13.45,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(20, 28), "number",
                                  /*int_value=*/14, /*double_value=*/14.54321,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(33, 37), "number",
                                  /*int_value=*/15, /*double_value=*/15.1,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(45, 50), "number",
                                  /*int_value=*/16, /*double_value=*/16.33,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(52, 57), "number",
                                  /*int_value=*/17, /*double_value=*/17.21,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(72, 77), "number",
                                  /*int_value=*/18, /*double_value=*/18.9,
                                  /*priority_score=*/1)));
}
419 
// A negative number that starts at the very first codepoint is annotated
// with the minus sign included in its span.
TEST_F(NumberAnnotatorTest, HandlesNumbersAtBeginning) {
  const UnicodeText context = UTF8ToUnicodeText("-5");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_SELECTION,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(IsAnnotatedSpan(
                  CodepointSpan(0, 2), "number",
                  /*int_value=*/-5, /*double_value=*/-5)));
}
430 
// A single leading minus sign yields a negative number or percentage; a
// doubled minus sign ("--5%") yields no annotation.
TEST_F(NumberAnnotatorTest, HandlesNegativeNumbers) {
  const UnicodeText context =
      UTF8ToUnicodeText("Number -5 and -5% and not number --5%");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(7, 9), "number",
                                  /*int_value=*/-5, /*double_value=*/-5),
                  IsAnnotatedSpan(CodepointSpan(14, 16), "number",
                                  /*int_value=*/-5, /*double_value=*/-5),
                  IsAnnotatedSpan(CodepointSpan(14, 17), "percentage",
                                  /*int_value=*/-5, /*double_value=*/-5,
                                  /*priority_score=*/1)));
}
447 
// Every supported percentage spelling ("percent", "pct", "pc", "%") yields
// both a "number" annotation for the digits and a "percentage" annotation
// for the digits plus suffix.
TEST_F(NumberAnnotatorTest, FindGoodPercentageContexts) {
  const UnicodeText context = UTF8ToUnicodeText(
      "5 percent, 10 pct, 25 pc and 17%, -5 percent, 10% are percentages");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_SELECTION,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  // "5 percent"
                  IsAnnotatedSpan(CodepointSpan(0, 1), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(0, 9), "percentage",
                                  /*int_value=*/5, /*double_value=*/5,
                                  /*priority_score=*/1),
                  // "10 pct"
                  IsAnnotatedSpan(CodepointSpan(11, 13), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(11, 17), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1),
                  // "25 pc"
                  IsAnnotatedSpan(CodepointSpan(19, 21), "number",
                                  /*int_value=*/25, /*double_value=*/25),
                  IsAnnotatedSpan(CodepointSpan(19, 24), "percentage",
                                  /*int_value=*/25, /*double_value=*/25,
                                  /*priority_score=*/1),
                  // "17%"
                  IsAnnotatedSpan(CodepointSpan(29, 31), "number",
                                  /*int_value=*/17, /*double_value=*/17),
                  IsAnnotatedSpan(CodepointSpan(29, 32), "percentage",
                                  /*int_value=*/17, /*double_value=*/17,
                                  /*priority_score=*/1),
                  // "-5 percent"
                  IsAnnotatedSpan(CodepointSpan(34, 36), "number",
                                  /*int_value=*/-5, /*double_value=*/-5),
                  IsAnnotatedSpan(CodepointSpan(34, 44), "percentage",
                                  /*int_value=*/-5, /*double_value=*/-5,
                                  /*priority_score=*/1),
                  // "10%"
                  IsAnnotatedSpan(CodepointSpan(46, 48), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(46, 49), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1)));
}
488 
// A text consisting only of "5%" yields both the number and the percentage.
TEST_F(NumberAnnotatorTest, FindSinglePercentageInContext) {
  const UnicodeText context = UTF8ToUnicodeText("5%");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 1), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(0, 2), "percentage",
                                  /*int_value=*/5, /*double_value=*/5,
                                  /*priority_score=*/1)));
}
502 
// Percentage suffixes separated from the number by punctuation, misspelled
// suffixes, and numbers followed by several punctuation signs produce no
// percentage annotations; only the plain numbers are reported.
TEST_F(NumberAnnotatorTest, IgnoreBadPercentageContexts) {
  // A valid number is followed by at most one punctuation element, so "5#:"
  // is not annotated at all.
  const UnicodeText context = UTF8ToUnicodeText(
      "10, pct, 25 prc, 5#: percentage are not percentages");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 2), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(9, 11), "number",
                                  /*int_value=*/25, /*double_value=*/25)));
}
517 
// Numbers preceded by several punctuation signs are neither numbers nor
// percentages, so FindAll must succeed without producing any annotation.
TEST_F(NumberAnnotatorTest, IgnoreBadPercentagePunctuationContexts) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(
          "#!24% or :?33 percent are not valid percentages, nor numbers."),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION, &result));

  // IsEmpty() reports the unexpected elements on failure, which a bare
  // EXPECT_TRUE(result.empty()) cannot do.
  EXPECT_THAT(result, IsEmpty());
}
527 
// Spans are expressed in codepoints, so a non-ASCII character earlier in the
// text ("café") must not shift the reported positions.
TEST_F(NumberAnnotatorTest, FindPercentageInNonAsciiContext) {
  const UnicodeText context = UTF8ToUnicodeText(
      "At the café 10% or 25 percent of people are nice. Only 10%!");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  // "10%"
                  IsAnnotatedSpan(CodepointSpan(12, 14), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(12, 15), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1),
                  // "25 percent"
                  IsAnnotatedSpan(CodepointSpan(19, 21), "number",
                                  /*int_value=*/25, /*double_value=*/25),
                  IsAnnotatedSpan(CodepointSpan(19, 29), "percentage",
                                  /*int_value=*/25, /*double_value=*/25,
                                  /*priority_score=*/1),
                  // "10%!"
                  IsAnnotatedSpan(CodepointSpan(55, 57), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(55, 58), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1)));
}
553 
// Several punctuation signs between the number and the percentage suffix
// make the whole selection unclassifiable.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalIgnoredCharactersDoesNotParseIt) {
  const UnicodeText context = UTF8ToUnicodeText("23#!? percent");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, CodepointSpan(0, 13), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
561 
// A selection spanning unrelated tokens around a valid percentage is not
// classified, even though it contains "3.14 pct".
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalRandomTokensDoesNotParseIt) {
  const UnicodeText context = UTF8ToUnicodeText("23 asdf 3.14 pct asdf");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, CodepointSpan(0, 21), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
569 
// Letters glued to the front of the number and to the end of the suffix
// prevent classification.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalRandomPrefixSuffixDoesNotParseIt) {
  const UnicodeText context = UTF8ToUnicodeText("abdf23 percentabdf");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, CodepointSpan(0, 18), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
577 
// Runs of punctuation before the number and after the suffix prevent
// classification of the whole selection.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalRandomStringsDoesNotParsesIt) {
  const UnicodeText context = UTF8ToUnicodeText("#?!23 percent#!?");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, CodepointSpan(0, 16), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
585 
// A number followed by both the '%' sign and the word "percent" is not a
// valid selection.
TEST_F(NumberAnnotatorTest, WhenBothPercentSymbolAndSuffixDoesNotParseIt) {
  const UnicodeText context = UTF8ToUnicodeText("23% percent");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, CodepointSpan(0, 11), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
592 
// Punctuation in front of an otherwise valid percentage prevents
// classification of the whole selection.
TEST_F(NumberAnnotatorTest,
       WhenPercentSymbolWithAdditionalPrefixCharactersDoesNotParsesIt) {
  const UnicodeText context = UTF8ToUnicodeText("#?23%");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, CodepointSpan(0, 5), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
600 
// A number followed by several punctuation signs is not classified when the
// selection includes them.
TEST_F(NumberAnnotatorTest, WhenNumberWithAdditionalCharactersDoesNotParsesIt) {
  const UnicodeText context = UTF8ToUnicodeText("23#!?");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, CodepointSpan(0, 5), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
607 
// Checks "23%!": selecting only "23%" yields a percentage with value 23, while
// a selection that also covers the trailing "!" is rejected.
TEST_F(NumberAnnotatorTest,
       WhenPercentSymbolWithAdditionalCharactersDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("23%!");
  ClassificationResult classification;

  // ! does not belong to the percentage annotation
  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 3), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_THAT(classification,
              AllOf(Field(&ClassificationResult::collection, "percentage"),
                    Field(&ClassificationResult::numeric_value, 23),
                    Field(&ClassificationResult::numeric_double_value, 23)));

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 4), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
623 
// Verifies that a number separated from "%" by other punctuation characters is
// not classified when the entire string is selected.
TEST_F(NumberAnnotatorTest,
       WhenAdditionalCharactersWithMisplacedPercentSymbolDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("23.:;%");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 6), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
631 
// Checks "--11": the selection starting at the second minus classifies as the
// number -11, while the selection that also covers the first minus is
// rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("--11");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(1, 4), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, -11);
  EXPECT_EQ(classification.numeric_double_value, -11);

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 4), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
646 
// Checks "--11%": the selection starting at the second minus classifies as the
// percentage -11, while the selection that also covers the first minus is
// rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsPercentSignDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("--11%");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(1, 5), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, -11);
  EXPECT_EQ(classification.numeric_double_value, -11);

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 5), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
661 
// Checks "+-11": the selection starting at the minus classifies as the number
// -11, while the selection that also covers the leading plus is rejected.
TEST_F(NumberAnnotatorTest, WhenPlusMinusSignsDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("+-11");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(1, 4), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, -11);
  EXPECT_EQ(classification.numeric_double_value, -11);

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 4), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
676 
// Checks "-+11": neither the selection starting at the plus nor the selection
// covering the whole string is classified.
TEST_F(NumberAnnotatorTest, WhenMinusPlusSignsDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("-+11");
  ClassificationResult classification;

  // + right before a number is not included in the number annotation
  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(1, 4), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 4), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
687 
// Verifies that a number with a trailing minus sign is not classified when the
// entire string is selected.
TEST_F(NumberAnnotatorTest, WhenMinusSignSuffixDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("10-");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 3), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
694 
// Checks "10**": selecting only the digits classifies as the number 10, while
// selections that include one or both trailing asterisks are rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleCharSuffixDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("10**");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 2), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 10);
  EXPECT_EQ(classification.numeric_double_value, 10);

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 3), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 4), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
712 
// Checks "**10": selections that include one or both leading asterisks are
// rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleCharPrefixDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("**10");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(1, 4), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 4), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
722 
// Verifies that "-1000000000", the value this test treats as the lowest
// supported integer, is classified as a number with matching values.
TEST_F(NumberAnnotatorTest, WhenLowestSupportedNumberParsesIt) {
  const auto text = UTF8ToUnicodeText("-1000000000");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 11), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, -1000000000);
  EXPECT_EQ(classification.numeric_double_value, -1000000000);
}
735 
// Verifies that "1000000000", the value this test treats as the largest
// supported integer, is classified as a number with matching values.
TEST_F(NumberAnnotatorTest, WhenLargestSupportedNumberParsesIt) {
  const auto text = UTF8ToUnicodeText("1000000000");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 10), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 1000000000);
  EXPECT_EQ(classification.numeric_double_value, 1000000000);
}
748 
// Verifies that "-999999999.999999999" is classified as a number.
//
// The expected integer value is -1000000000 (not -999999999). The double value
// is checked with EXPECT_DOUBLE_EQ instead of bit-exact equality: the literal
// carries more decimal digits than a double can represent, so the test should
// not depend on how the last bits of the parsed value are rounded.
TEST_F(NumberAnnotatorTest, WhenLowestSupportedFloatNumberParsesIt) {
  ClassificationResult classification_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("-999999999.999999999"), {0, 20},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));

  EXPECT_EQ(classification_result.collection, "number");
  EXPECT_EQ(classification_result.numeric_value, -1000000000);
  EXPECT_DOUBLE_EQ(classification_result.numeric_double_value,
                   -999999999.999999999);
}
761 
// Verifies that "999999999.999999999" is classified as a number.
//
// The expected integer value is 1000000000 (not 999999999). The double value
// is checked with EXPECT_DOUBLE_EQ instead of bit-exact equality: the literal
// carries more decimal digits than a double can represent, so the test should
// not depend on how the last bits of the parsed value are rounded.
TEST_F(NumberAnnotatorTest, WhenLargestFloatSupportedNumberParsesIt) {
  ClassificationResult classification_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("999999999.999999999"), {0, 19},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));

  EXPECT_EQ(classification_result.collection, "number");
  EXPECT_EQ(classification_result.numeric_value, 1000000000);
  EXPECT_DOUBLE_EQ(classification_result.numeric_double_value,
                   999999999.999999999);
}
774 
// Verifies that a 40-digit integer is not classified when the entire string
// is selected.
TEST_F(NumberAnnotatorTest, WhenLargeNumberDoesNotParseIt) {
  const auto text =
      UTF8ToUnicodeText("1234567890123456789012345678901234567890");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 40), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
781 
// Verifies that two digit groups joined by a minus sign are not classified
// when the entire string is selected.
TEST_F(NumberAnnotatorTest, WhenMinusInTheMiddleDoesNotParseIt) {
  const auto text = UTF8ToUnicodeText("2016-2017");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 9), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
788 
// Verifies that FindAll succeeds but reports no spans for text that contains a
// "%" sign without any number.
TEST_F(NumberAnnotatorTest, WhenSuffixWithoutNumberDoesNotParseIt) {
  const auto text = UTF8ToUnicodeText("... % ...");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  ASSERT_THAT(spans, IsEmpty());
}
797 
// Verifies that FindAll succeeds but reports no spans for text that contains a
// "$" sign without any number.
TEST_F(NumberAnnotatorTest, WhenPrefixWithoutNumberDoesNotParseIt) {
  const auto text = UTF8ToUnicodeText("... $ ...");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  ASSERT_THAT(spans, IsEmpty());
}
806 
// Verifies that FindAll succeeds but reports no spans for text that contains
// "$%" without any number in between.
TEST_F(NumberAnnotatorTest, WhenPrefixAndSuffixWithoutNumberDoesNotParseIt) {
  const auto text = UTF8ToUnicodeText("... $% ...");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  ASSERT_THAT(spans, IsEmpty());
}
815 
// Verifies the values and scores reported for plain integer numbers.
//
// ClassifyText on "12345" must produce a "number" with score 1 and priority
// score -10; FindAll must report both integers in a sentence.
TEST_F(NumberAnnotatorTest, ForNumberAnnotationsSetsScoreAndPriorityScore) {
  const auto selection_text = UTF8ToUnicodeText("... 12345 ...");
  ClassificationResult classification;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      selection_text, CodepointSpan(4, 9),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification));

  EXPECT_THAT(classification,
              AllOf(Field(&ClassificationResult::collection, "number"),
                    Field(&ClassificationResult::numeric_value, 12345),
                    Field(&ClassificationResult::numeric_double_value, 12345)));
  EXPECT_EQ(classification.score, 1);
  EXPECT_EQ(classification.priority_score, -10);

  const auto sentence = UTF8ToUnicodeText("Come at 9 or 10 ok?");
  std::vector<AnnotatedSpan> spans;
  EXPECT_TRUE(number_annotator_.FindAll(
      sentence, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(8, 9), "number",
                                  /*int_value=*/9, /*double_value=*/9),
                  IsAnnotatedSpan(CodepointSpan(13, 15), "number",
                                  /*int_value=*/10, /*double_value=*/10)));
}
840 
// Verifies the values and scores reported for fractional numbers.
//
// ClassifyText on "12345.12345" must produce a "number" with integer value
// 12345, score 1 and priority score 1; FindAll must report 12.5 and 13.5, each
// with priority score 1.
TEST_F(NumberAnnotatorTest,
       ForFloatNumberAnnotationsSetsScoreAndPriorityScore) {
  ClassificationResult classification_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345.12345 ..."), {4, 15},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));
  EXPECT_EQ(classification_result.collection, "number");
  EXPECT_EQ(classification_result.numeric_value, 12345);
  // 12345.12345 has no exact binary representation, so compare within
  // GoogleTest's ULP tolerance instead of requiring bit-exact equality.
  EXPECT_DOUBLE_EQ(classification_result.numeric_double_value, 12345.12345);
  EXPECT_EQ(classification_result.score, 1);
  EXPECT_EQ(classification_result.priority_score, 1);

  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("Results are between 12.5 and 13.5, right?"),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION, &result));
  EXPECT_THAT(result,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(20, 24), "number",
                                  /*int_value=*/12, /*double_value=*/12.5,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(29, 33), "number",
                                  /*int_value=*/13, /*double_value=*/13.5,
                                  /*priority_score=*/1)));
}
866 
// Verifies the values and scores reported for percentages.
//
// Both the "%" symbol and the "percent" word must classify as "percentage"
// with score 1 and priority score 1. FindAll must report, for each percentage,
// the bare number span as well as the wider percentage span.
TEST_F(NumberAnnotatorTest, ForPercentageAnnotationsSetsScoreAndPriorityScore) {
  ClassificationResult classification;

  // Percentage written with the "%" symbol.
  const auto symbol_text = UTF8ToUnicodeText("... 12345% ...");
  EXPECT_TRUE(number_annotator_.ClassifyText(
      symbol_text, CodepointSpan(4, 10),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification));
  EXPECT_THAT(classification,
              AllOf(Field(&ClassificationResult::collection, "percentage"),
                    Field(&ClassificationResult::numeric_value, 12345),
                    Field(&ClassificationResult::numeric_double_value, 12345)));
  EXPECT_EQ(classification.score, 1);
  EXPECT_EQ(classification.priority_score, 1);

  // Percentage written with the "percent" word.
  const auto word_text = UTF8ToUnicodeText("... 12345 percent ...");
  EXPECT_TRUE(number_annotator_.ClassifyText(
      word_text, CodepointSpan(4, 17),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification));
  EXPECT_THAT(classification,
              AllOf(Field(&ClassificationResult::collection, "percentage"),
                    Field(&ClassificationResult::numeric_value, 12345),
                    Field(&ClassificationResult::numeric_double_value, 12345)));
  EXPECT_EQ(classification.score, 1);
  EXPECT_EQ(classification.priority_score, 1);

  const auto sentence =
      UTF8ToUnicodeText("Results are between 9% and 10 percent.");
  std::vector<AnnotatedSpan> spans;
  EXPECT_TRUE(number_annotator_.FindAll(
      sentence, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));
  EXPECT_THAT(spans,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(20, 21), "number",
                                  /*int_value=*/9, /*double_value=*/9),
                  IsAnnotatedSpan(CodepointSpan(20, 22), "percentage",
                                  /*int_value=*/9, /*double_value=*/9,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(27, 29), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(27, 37), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1)));
}
904 
// Verifies the behaviour under the SMART annotation usecase: a plain number is
// rejected, percentages (with "%" or a glued "percent" suffix) are classified,
// and FindAll reports only the percentage span of a sentence that also
// contains a bare number.
TEST_F(NumberAnnotatorTest, NumberDisabledPercentageEnabledForSmartUsecase) {
  ClassificationResult classification;

  const auto plain_number_text = UTF8ToUnicodeText("... 12345 ...");
  EXPECT_FALSE(number_annotator_.ClassifyText(
      plain_number_text, CodepointSpan(4, 9),
      AnnotationUsecase_ANNOTATION_USECASE_SMART, &classification));

  const auto symbol_text = UTF8ToUnicodeText("... 12345% ...");
  EXPECT_TRUE(number_annotator_.ClassifyText(
      symbol_text, CodepointSpan(4, 10),
      AnnotationUsecase_ANNOTATION_USECASE_SMART, &classification));
  EXPECT_THAT(
      classification,
      AllOf(Field(&ClassificationResult::collection, "percentage"),
            Field(&ClassificationResult::numeric_value, 12345),
            Field(&ClassificationResult::numeric_double_value, 12345.0)));
  EXPECT_EQ(classification.score, 1);
  EXPECT_EQ(classification.priority_score, 1);

  const auto word_text = UTF8ToUnicodeText("... 12345percent ...");
  EXPECT_TRUE(number_annotator_.ClassifyText(
      word_text, CodepointSpan(4, 16),
      AnnotationUsecase_ANNOTATION_USECASE_SMART, &classification));
  EXPECT_THAT(classification,
              AllOf(Field(&ClassificationResult::collection, "percentage"),
                    Field(&ClassificationResult::numeric_value, 12345),
                    Field(&ClassificationResult::numeric_double_value, 12345)));
  EXPECT_EQ(classification.score, 1);
  EXPECT_EQ(classification.priority_score, 1);

  const auto sentence = UTF8ToUnicodeText("Accuracy for experiment 3 is 9%.");
  std::vector<AnnotatedSpan> spans;
  EXPECT_TRUE(number_annotator_.FindAll(
      sentence, AnnotationUsecase_ANNOTATION_USECASE_SMART,
      ModeFlag_ANNOTATION, &spans));
  EXPECT_THAT(spans, UnorderedElementsAre(
                         IsAnnotatedSpan(CodepointSpan(29, 31), "percentage",
                                         /*int_value=*/9, /*double_value=*/9.0,
                                         /*priority_score=*/1)));
}
939 
// Verifies that FindAll reports only the operands of an arithmetic expression;
// every expected span covers digits only, never a "+", "-" or "*" operator.
TEST_F(NumberAnnotatorTest, MathOperatorsNotAnnotatedAsNumbersFindAll) {
  const auto text = UTF8ToUnicodeText("how much is 2 + 2 or 5 - 96 * 89");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(12, 13), "number",
                                  /*int_value=*/2, /*double_value=*/2),
                  IsAnnotatedSpan(CodepointSpan(16, 17), "number",
                                  /*int_value=*/2, /*double_value=*/2),
                  IsAnnotatedSpan(CodepointSpan(21, 22), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(25, 27), "number",
                                  /*int_value=*/96, /*double_value=*/96),
                  IsAnnotatedSpan(CodepointSpan(30, 32), "number",
                                  /*int_value=*/89, /*double_value=*/89)));
}
959 
// Verifies that selecting just the operator character ("+" or "-") of an
// arithmetic expression is not classified.
TEST_F(NumberAnnotatorTest, MathOperatorsNotAnnotatedAsNumbersClassifyText) {
  ClassificationResult classification;

  const auto plus_text = UTF8ToUnicodeText("2 + 2");
  EXPECT_FALSE(number_annotator_.ClassifyText(
      plus_text, CodepointSpan(2, 3),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification));

  const auto minus_text = UTF8ToUnicodeText("2 - 96 * 89");
  EXPECT_FALSE(number_annotator_.ClassifyText(
      minus_text, CodepointSpan(2, 3),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification));
}
969 
// Verifies that FindAll reports the numbers on each side of a "/" as separate
// spans, both when the slash is glued to the digits and when it is surrounded
// by spaces.
TEST_F(NumberAnnotatorTest, SlashSeparatesTwoNumbersFindAll) {
  const auto text = UTF8ToUnicodeText("what's 1 + 2/3 * 4/5 * 6 / 7");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(7, 8), "number",
                                  /*int_value=*/1, /*double_value=*/1),
                  IsAnnotatedSpan(CodepointSpan(11, 12), "number",
                                  /*int_value=*/2, /*double_value=*/2),
                  IsAnnotatedSpan(CodepointSpan(13, 14), "number",
                                  /*int_value=*/3, /*double_value=*/3),
                  IsAnnotatedSpan(CodepointSpan(17, 18), "number",
                                  /*int_value=*/4, /*double_value=*/4),
                  IsAnnotatedSpan(CodepointSpan(19, 20), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(23, 24), "number",
                                  /*int_value=*/6, /*double_value=*/6),
                  IsAnnotatedSpan(CodepointSpan(27, 28), "number",
                                  /*int_value=*/7, /*double_value=*/7)));
}
993 
// Verifies that each side of "2/3" can be selected on its own and classifies
// as a separate number with score 1.
TEST_F(NumberAnnotatorTest, SlashSeparatesTwoNumbersClassifyText) {
  const auto text = UTF8ToUnicodeText("what's 1 + 2/3 * 4");
  ClassificationResult classification;

  // Left operand of the slash.
  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(11, 12), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_THAT(classification,
              AllOf(Field(&ClassificationResult::collection, "number"),
                    Field(&ClassificationResult::numeric_value, 2),
                    Field(&ClassificationResult::numeric_double_value, 2)));
  EXPECT_EQ(classification.score, 1);

  // Right operand of the slash.
  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(13, 14), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_THAT(classification,
              AllOf(Field(&ClassificationResult::collection, "number"),
                    Field(&ClassificationResult::numeric_value, 3),
                    Field(&ClassificationResult::numeric_double_value, 3)));
  EXPECT_EQ(classification.score, 1);
}
1012 
// Verifies that, among several slash-adjacent tokens mixed with letters or
// repeated slashes, FindAll reports only the "2" of the standalone "2/" token.
TEST_F(NumberAnnotatorTest, SlashDoesNotSeparatesTwoNumbersFindAll) {
  const auto text =
      UTF8ToUnicodeText("what's 2a2/3 or 2/s4 or 2/ or /3 or //3 or 2//");
  std::vector<AnnotatedSpan> spans;

  // 2 in the "2/" context is a number because / is punctuation
  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans, UnorderedElementsAre(IsAnnotatedSpan(
                         CodepointSpan(24, 25), "number",
                         /*int_value=*/2, /*double_value=*/2)));
}
1024 
// Verifies that FindAll reports numbers written inside round and square
// brackets, including negative and fractional values.
TEST_F(NumberAnnotatorTest, BracketsContextAnnotatedFindAll) {
  const auto text =
      UTF8ToUnicodeText("The interval is: (12, 13) or [-12, -4.5)");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(18, 20), "number",
                                  /*int_value=*/12, /*double_value=*/12),
                  IsAnnotatedSpan(CodepointSpan(22, 24), "number",
                                  /*int_value=*/13, /*double_value=*/13),
                  IsAnnotatedSpan(CodepointSpan(30, 33), "number",
                                  /*int_value=*/-12, /*double_value=*/-12),
                  IsAnnotatedSpan(CodepointSpan(35, 39), "number",
                                  /*int_value=*/-4, /*double_value=*/-4.5,
                                  /*priority_score=*/1)));
}
1043 
// Verifies that FindAll reports nothing for a bracketed pair where the opening
// bracket is preceded by "-" and the last number is followed by "*".
TEST_F(NumberAnnotatorTest, BracketsContextNotAnnotatedFindAll) {
  const auto text = UTF8ToUnicodeText("The interval is: -(12, 138*)");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans, IsEmpty());
}
1052 
// Verifies that FindAll parses fractional numbers whose decimal separator is
// one of several dot characters; all three are expected with priority score 1.
TEST_F(NumberAnnotatorTest, FractionalNumberDotsFindAll) {
  // Dots source: https://unicode-search.net/unicode-namesearch.pl?term=period
  const auto text = UTF8ToUnicodeText("3.1 3﹒2 3.3");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans, UnorderedElementsAre(
                         IsAnnotatedSpan(CodepointSpan(0, 3), "number",
                                         /*int_value=*/3, /*double_value=*/3.1,
                                         /*priority_score=*/1),
                         IsAnnotatedSpan(CodepointSpan(4, 7), "number",
                                         /*int_value=*/3, /*double_value=*/3.2,
                                         /*priority_score=*/1),
                         IsAnnotatedSpan(CodepointSpan(8, 11), "number",
                                         /*int_value=*/3, /*double_value=*/3.3,
                                         /*priority_score=*/1)));
}
1071 
// Verifies that FindAll reports an integer, two fractional numbers and a
// percentage; the last number is expected both as a bare number span and as
// the wider percentage span.
//
// NOTE(review): the digits in the literal below appear as plain ASCII in this
// copy of the file although the test name refers to non-ASCII digits - confirm
// the literal's encoding against the upstream source.
TEST_F(NumberAnnotatorTest, NonAsciiDigitsFindAll) {
  // Dots source: https://unicode-search.net/unicode-namesearch.pl?term=period
  // Digits source: https://unicode-search.net/unicode-namesearch.pl?term=digit
  const auto text = UTF8ToUnicodeText("3 3﹒2 3.3%");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans, UnorderedElementsAre(
                         IsAnnotatedSpan(CodepointSpan(0, 1), "number",
                                         /*int_value=*/3, /*double_value=*/3),
                         IsAnnotatedSpan(CodepointSpan(2, 5), "number",
                                         /*int_value=*/3, /*double_value=*/3.2,
                                         /*priority_score=*/1),
                         IsAnnotatedSpan(CodepointSpan(6, 9), "number",
                                         /*int_value=*/3, /*double_value=*/3.3,
                                         /*priority_score=*/1),
                         IsAnnotatedSpan(CodepointSpan(6, 10), "percentage",
                                         /*int_value=*/3, /*double_value=*/3.3,
                                         /*priority_score=*/1)));
}
1093 
// Verifies that FindAll reports numbers written with leading zeros; the
// expected values ignore the zero padding (e.g. "09" -> 9, "032310" -> 32310).
TEST_F(NumberAnnotatorTest, AnnotatedZeroPrecededNumbersFindAll) {
  const auto text = UTF8ToUnicodeText("Numbers: 0.9 or 09 or 09.9 or 032310");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans, UnorderedElementsAre(
                         IsAnnotatedSpan(CodepointSpan(9, 12), "number",
                                         /*int_value=*/0, /*double_value=*/0.9,
                                         /*priority_score=*/1),
                         IsAnnotatedSpan(CodepointSpan(16, 18), "number",
                                         /*int_value=*/9, /*double_value=*/9),
                         IsAnnotatedSpan(CodepointSpan(22, 26), "number",
                                         /*int_value=*/9, /*double_value=*/9.9,
                                         /*priority_score=*/1),
                         IsAnnotatedSpan(CodepointSpan(30, 36), "number",
                                         /*int_value=*/32310,
                                         /*double_value=*/32310)));
}
1113 
// Verifies that FindAll reports "15.0" and "16.00" with the values 15 and 16.
// Unlike the fractional cases in the other tests, no explicit priority_score
// is passed to IsAnnotatedSpan here.
TEST_F(NumberAnnotatorTest, ZeroAfterDotFindAll) {
  const auto text = UTF8ToUnicodeText("15.0 16.00");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 4), "number",
                                  /*int_value=*/15, /*double_value=*/15),
                  IsAnnotatedSpan(CodepointSpan(5, 10), "number",
                                  /*int_value=*/16, /*double_value=*/16)));
}
1127 
// Verifies that FindAll reports fractional numbers made up solely of nines,
// with one to four fractional digits, keeping the expected integer part at 9
// or 99.
TEST_F(NumberAnnotatorTest, NineDotNineFindAll) {
  const auto text = UTF8ToUnicodeText("9.9 9.99 99.99 99.999 99.9999");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 3), "number",
                                  /*int_value=*/9, /*double_value=*/9.9,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(4, 8), "number",
                                  /*int_value=*/9, /*double_value=*/9.99,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(9, 14), "number",
                                  /*int_value=*/99, /*double_value=*/99.99,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(15, 21), "number",
                                  /*int_value=*/99, /*double_value=*/99.999,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(22, 29), "number",
                                  /*int_value=*/99, /*double_value=*/99.9999,
                                  /*priority_score=*/1)));
}
1152 
1153 }  // namespace test_internal
1154 }  // namespace libtextclassifier3
1155