1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "annotator/number/number_test-include.h"
18
19 #include <set>
20 #include <string>
21 #include <vector>
22
23 #include "annotator/collections.h"
24 #include "annotator/model_generated.h"
25 #include "annotator/types-test-util.h"
26 #include "annotator/types.h"
27 #include "utils/tokenizer-utils.h"
28 #include "utils/utf8/unicodetext.h"
29 #include "gmock/gmock.h"
30 #include "gtest/gtest.h"
31
32 namespace libtextclassifier3 {
33 namespace test_internal {
34
35 using ::testing::AllOf;
36 using ::testing::ElementsAre;
37 using ::testing::Field;
38 using ::testing::IsEmpty;
39 using ::testing::Matcher;
40 using ::testing::UnorderedElementsAre;
41
42 namespace {
CreateOptionsData(ModeFlag enabled_modes)43 const flatbuffers::DetachedBuffer* CreateOptionsData(ModeFlag enabled_modes) {
44 NumberAnnotatorOptionsT options;
45 options.enabled = true;
46 options.priority_score = -10.0;
47 options.float_number_priority_score = 1.0;
48 options.enabled_annotation_usecases =
49 1 << AnnotationUsecase_ANNOTATION_USECASE_RAW;
50 options.max_number_of_digits = 20;
51 options.enabled_modes = enabled_modes;
52
53 options.percentage_priority_score = 1.0;
54 options.percentage_annotation_usecases =
55 (1 << AnnotationUsecase_ANNOTATION_USECASE_RAW) +
56 (1 << AnnotationUsecase_ANNOTATION_USECASE_SMART);
57 std::set<std::string> percent_suffixes(
58 {"パーセント", "percent", "pércént", "pc", "pct", "%", "٪", "﹪", "%"});
59 for (const std::string& string_value : percent_suffixes) {
60 options.percentage_pieces_string.append(string_value);
61 options.percentage_pieces_string.push_back('\0');
62 }
63
64 flatbuffers::FlatBufferBuilder builder;
65 builder.Finish(NumberAnnotatorOptions::Pack(builder, &options));
66 return new flatbuffers::DetachedBuffer(builder.Release());
67 }
68 } // namespace
69
70 const NumberAnnotatorOptions*
TestingNumberAnnotatorOptions(ModeFlag enabled_modes)71 NumberAnnotatorTest::TestingNumberAnnotatorOptions(ModeFlag enabled_modes) {
72 static const flatbuffers::DetachedBuffer* options_data_selection =
73 CreateOptionsData(ModeFlag_SELECTION);
74 static const flatbuffers::DetachedBuffer* options_data_no_selection =
75 CreateOptionsData(ModeFlag_ANNOTATION_AND_CLASSIFICATION);
76 static const flatbuffers::DetachedBuffer* options_data_all =
77 CreateOptionsData(ModeFlag_ALL);
78
79 if (enabled_modes == ModeFlag_SELECTION) {
80 return flatbuffers::GetRoot<NumberAnnotatorOptions>(
81 options_data_selection->data());
82 } else if (enabled_modes == ModeFlag_ANNOTATION_AND_CLASSIFICATION) {
83 return flatbuffers::GetRoot<NumberAnnotatorOptions>(
84 options_data_no_selection->data());
85 } else {
86 return flatbuffers::GetRoot<NumberAnnotatorOptions>(
87 options_data_all->data());
88 }
89 }
90
91 MATCHER_P(IsCorrectCollection, collection, "collection is " + collection) {
92 return arg.collection == collection;
93 }
94
95 MATCHER_P(IsCorrectNumericValue, numeric_value,
96 "numeric value is " + std::to_string(numeric_value)) {
97 return arg.numeric_value == numeric_value;
98 }
99
100 MATCHER_P(IsCorrectNumericDoubleValue, numeric_double_value,
101 "numeric double value is " + std::to_string(numeric_double_value)) {
102 return arg.numeric_double_value == numeric_double_value;
103 }
104
105 MATCHER_P(IsCorrectScore, score, "score is " + std::to_string(score)) {
106 return arg.score == score;
107 }
108
109 MATCHER_P(IsCorrectPriortyScore, priority_score,
110 "priority score is " + std::to_string(priority_score)) {
111 return arg.priority_score == priority_score;
112 }
113
114 MATCHER_P(IsCorrectSpan, span,
115 "span is (" + std::to_string(span.first) + "," +
116 std::to_string(span.second) + ")") {
117 return arg.span == span;
118 }
119
120 MATCHER_P(Classification, inner, "") {
121 return testing::ExplainMatchResult(inner, arg.classification,
122 result_listener);
123 }
124
IsAnnotatedSpan(const CodepointSpan & codepoint_span,const std::string & collection,const int int_value,const double double_value,const float priority_score=-10,const float score=1)125 static Matcher<AnnotatedSpan> IsAnnotatedSpan(
126 const CodepointSpan& codepoint_span, const std::string& collection,
127 const int int_value, const double double_value,
128 const float priority_score = -10, const float score = 1) {
129 return AllOf(
130 IsCorrectSpan(codepoint_span),
131 Classification(ElementsAre(AllOf(
132 IsCorrectCollection(collection), IsCorrectNumericValue(int_value),
133 IsCorrectNumericDoubleValue(double_value), IsCorrectScore(score),
134 IsCorrectPriortyScore(priority_score)))));
135 }
136
// An integer selection is classified as "number" with matching integer and
// double values.
TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberCorrectly) {
  const auto text = UTF8ToUnicodeText("... 12345 ...");
  const CodepointSpan selection(4, 9);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_TRUE(classified);
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_FLOAT_EQ(classification.numeric_double_value, 12345);
}
147
// With the selection-only fixture, ClassifyText is expected to return false
// even for a valid number.
TEST_F(NumberAnnotatorForSelectionTest,
       ClassifyTextDisabledClassificationReturnsFalse) {
  const auto text = UTF8ToUnicodeText("... 12345 ...");
  const CodepointSpan selection(4, 9);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_FALSE(classified);
}
155
// A decimal selection is classified as "number"; the integer value holds the
// integral part and the double value the full number.
TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberAsFloatCorrectly) {
  const auto text = UTF8ToUnicodeText("... 12345.12345 ...");
  const CodepointSpan selection(4, 15);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_TRUE(classified);
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_FLOAT_EQ(classification.numeric_double_value, 12345.12345);
}
166
// A trailing dot is treated as punctuation rather than as part of a floating
// point number: selecting only the digits succeeds, while a selection that
// also includes the dot is rejected.
TEST_F(NumberAnnotatorTest,
       ClassifiesAndParsesNumberAsFloatCorrectlyWithoutDecimals) {
  const auto text = UTF8ToUnicodeText("... 12345. ...");
  ClassificationResult classification;
  const auto classify = [&](const CodepointSpan& selection) {
    return number_annotator_.ClassifyText(
        text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
        &classification);
  };

  EXPECT_TRUE(classify(CodepointSpan(4, 9)));
  EXPECT_FALSE(classify(CodepointSpan(4, 10)));

  // Checked after the rejected call, using the same result object.
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_FLOAT_EQ(classification.numeric_double_value, 12345);

  // Classifying the digits again yields the same values.
  EXPECT_TRUE(classify(CodepointSpan(4, 9)));
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_FLOAT_EQ(classification.numeric_double_value, 12345);
}
190
// FindAll reports integers, floats and percentages found in the text.
// "68.9#" yields 68.9 because it is followed by a single punctuation sign;
// "68.9#?" yields nothing because two punctuation signs follow the number.
TEST_F(NumberAnnotatorTest, FindsAllIntegerAndFloatNumbersInText) {
  const auto text = UTF8ToUnicodeText(
      "how much is 2 plus 5 divided by 7% minus 3.14 "
      "what about 68.9# or 68.9#?");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(12, 13), "number",
                                  /*int_value=*/2, /*double_value=*/2.0),
                  IsAnnotatedSpan(CodepointSpan(19, 20), "number",
                                  /*int_value=*/5, /*double_value=*/5.0),
                  IsAnnotatedSpan(CodepointSpan(32, 33), "number",
                                  /*int_value=*/7, /*double_value=*/7.0),
                  IsAnnotatedSpan(CodepointSpan(32, 34), "percentage",
                                  /*int_value=*/7, /*double_value=*/7.0,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(41, 45), "number",
                                  /*int_value=*/3, /*double_value=*/3.14,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(57, 61), "number",
                                  /*int_value=*/68, /*double_value=*/68.9,
                                  /*priority_score=*/1)));
}
219
// Selections containing letters or a doubled dot are not classified as
// numbers.
TEST_F(NumberAnnotatorTest, ClassifiesNonNumberCorrectly) {
  ClassificationResult classification;
  const auto classify = [&](const char* utf8_text,
                            const CodepointSpan& selection) {
    return number_annotator_.ClassifyText(
        UTF8ToUnicodeText(utf8_text), selection,
        AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification);
  };

  EXPECT_FALSE(classify("... 123a45 ...", CodepointSpan(4, 10)));
  EXPECT_FALSE(classify("... 12345..12345 ...", CodepointSpan(4, 16)));
  EXPECT_FALSE(classify("... 12345a ...", CodepointSpan(4, 11)));
}
232
// Punctuation directly after a number does not belong to the number: the
// digits alone classify, the digits plus the comma do not.
TEST_F(NumberAnnotatorTest, ClassifiesNumberSelectionCorrectly) {
  const auto text = UTF8ToUnicodeText("... 14, ...");
  ClassificationResult classification;
  const auto classify = [&](const CodepointSpan& selection) {
    return number_annotator_.ClassifyText(
        text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
        &classification);
  };

  EXPECT_TRUE(classify(CodepointSpan(4, 6)));
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 14);
  EXPECT_EQ(classification.numeric_double_value, 14);

  EXPECT_FALSE(classify(CodepointSpan(4, 7)));
}
247
// A number followed by '%' is classified as "percentage".
TEST_F(NumberAnnotatorTest, ClassifiesPercentageSignCorrectly) {
  const auto text = UTF8ToUnicodeText("... 99% ...");
  const CodepointSpan selection(4, 7);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_TRUE(classified);
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 99);
  EXPECT_EQ(classification.numeric_double_value, 99);
}
258
// A number followed by the word "percent" is classified as "percentage".
TEST_F(NumberAnnotatorTest, ClassifiesPercentageWordCorrectly) {
  const auto text = UTF8ToUnicodeText("... 15 percent ...");
  const CodepointSpan selection(4, 14);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_TRUE(classified);
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 15);
  EXPECT_EQ(classification.numeric_double_value, 15);
}
269
// A number followed by a non-ASCII word that is not a percent suffix is
// rejected.
TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiPercentageIncorrectSuffix) {
  const auto text = UTF8ToUnicodeText("15 café");
  const CodepointSpan selection(0, 7);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_FALSE(classified);
}
276
// A number followed by an accented percent suffix is classified as
// "percentage".
TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiFrPercentageCorrectSuffix) {
  const auto text = UTF8ToUnicodeText("25 pércént");
  const CodepointSpan selection(0, 10);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_TRUE(classified);
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 25);
  EXPECT_EQ(classification.numeric_double_value, 25);
}
287
// A Japanese percent suffix attached directly to a number is recognised both
// by ClassifyText and by FindAll.
TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiJaPercentageCorrectSuffix) {
  // Classification of a single selection.
  const auto short_text = UTF8ToUnicodeText("10パーセント");
  ClassificationResult classification;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      short_text, CodepointSpan(0, 7),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification));
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 10);
  EXPECT_EQ(classification.numeric_double_value, 10);

  // Finding every number in a longer sentence.
  const auto sentence =
      UTF8ToUnicodeText("明日の降水確率は10パーセント 音量を12にセット");
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(number_annotator_.FindAll(
      sentence, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      ModeFlag_CLASSIFICATION, &annotations));
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(8, 10), "number",
                                  /*int_value=*/10, /*double_value=*/10.0),
                  IsAnnotatedSpan(CodepointSpan(8, 15), "percentage",
                                  /*int_value=*/10, /*double_value=*/10.0,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(20, 22), "number",
                                  /*int_value=*/12, /*double_value=*/12.0)));
}
312
// FindAll reports every integer and percentage in the text; the "$99." at the
// end is not expected among the results.
TEST_F(NumberAnnotatorTest, FindsAllNumbersInText) {
  const auto text = UTF8ToUnicodeText(
      "... 12345 ... 9 is my number and 27% or 68# #38 #39 "
      "but not $99.");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(
      annotations,
      UnorderedElementsAre(
          IsAnnotatedSpan(CodepointSpan(4, 9), "number",
                          /*int_value=*/12345, /*double_value=*/12345.0),
          IsAnnotatedSpan(CodepointSpan(14, 15), "number",
                          /*int_value=*/9, /*double_value=*/9.0),
          IsAnnotatedSpan(CodepointSpan(33, 35), "number",
                          /*int_value=*/27, /*double_value=*/27.0),
          IsAnnotatedSpan(CodepointSpan(33, 36), "percentage",
                          /*int_value=*/27, /*double_value=*/27.0,
                          /*priority_score=*/1),
          IsAnnotatedSpan(CodepointSpan(40, 42), "number",
                          /*int_value=*/68, /*double_value=*/68.0),
          IsAnnotatedSpan(CodepointSpan(45, 47), "number",
                          /*int_value=*/38, /*double_value=*/38.0),
          IsAnnotatedSpan(CodepointSpan(49, 51), "number",
                          /*int_value=*/39, /*double_value=*/39.0)));
}
339
// With the annotation-and-classification fixture, a FindAll request in
// selection mode succeeds but produces no annotations.
TEST_F(NumberAnnotatorForAnnotationAndClassificationTest,
       FindsAllDisabledModeReturnsNoResults) {
  const auto text = UTF8ToUnicodeText(
      "... 12345 ... 9 is my number and 27% or 68# #38 #39 "
      "but not $99.");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_SELECTION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations, IsEmpty());
}
350
// None of the malformed candidates in the text (digits mixed with letters, a
// doubled dot, "-#5%") is expected to be reported by FindAll.
TEST_F(NumberAnnotatorTest, FindsNoNumberInText) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... 12345a ... 12345..12345 and 123a45 are not valid. "
                        "And -#5% is also bad."),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_SELECTION, &result));
  // The container matcher avoids comparing size_t against a signed literal
  // and prints the unexpected elements on failure.
  ASSERT_THAT(result, IsEmpty());
}
359
// A number may be followed by at most one punctuation sign, so "15???" is not
// expected among the results while the other numbers are.
TEST_F(NumberAnnotatorTest, FindsNumberWithPunctuation) {
  const auto text = UTF8ToUnicodeText(
      "It's 12, 13, 14! Or 15??? For sure 16: 17; 18. and -19");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_CLASSIFICATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(5, 7), "number",
                                  /*int_value=*/12, /*double_value=*/12.0),
                  IsAnnotatedSpan(CodepointSpan(9, 11), "number",
                                  /*int_value=*/13, /*double_value=*/13.0),
                  IsAnnotatedSpan(CodepointSpan(13, 15), "number",
                                  /*int_value=*/14, /*double_value=*/14.0),
                  IsAnnotatedSpan(CodepointSpan(35, 37), "number",
                                  /*int_value=*/16, /*double_value=*/16.0),
                  IsAnnotatedSpan(CodepointSpan(39, 41), "number",
                                  /*int_value=*/17, /*double_value=*/17.0),
                  IsAnnotatedSpan(CodepointSpan(43, 45), "number",
                                  /*int_value=*/18, /*double_value=*/18.0),
                  IsAnnotatedSpan(CodepointSpan(51, 54), "number",
                                  /*int_value=*/-19, /*double_value=*/-19.0)));
}
387
// Floating point numbers followed by a single punctuation sign are found and
// carry the float priority score.
TEST_F(NumberAnnotatorTest, FindsFloatNumberWithPunctuation) {
  const auto text = UTF8ToUnicodeText(
      "It's 12.123, 13.45, 14.54321! Or 15.1? Maybe 16.33: "
      "17.21; but for sure 18.90.");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(5, 11), "number",
                                  /*int_value=*/12, /*double_value=*/12.123,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(13, 18), "number",
                                  /*int_value=*/13, /*double_value=*/13.45,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(20, 28), "number",
                                  /*int_value=*/14, /*double_value=*/14.54321,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(33, 37), "number",
                                  /*int_value=*/15, /*double_value=*/15.1,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(45, 50), "number",
                                  /*int_value=*/16, /*double_value=*/16.33,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(52, 57), "number",
                                  /*int_value=*/17, /*double_value=*/17.21,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(72, 77), "number",
                                  /*int_value=*/18, /*double_value=*/18.9,
                                  /*priority_score=*/1)));
}
419
// A negative number that makes up the entire text is found, sign included.
TEST_F(NumberAnnotatorTest, HandlesNumbersAtBeginning) {
  const auto text = UTF8ToUnicodeText("-5");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_SELECTION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(IsAnnotatedSpan(
                  CodepointSpan(0, 2), "number",
                  /*int_value=*/-5, /*double_value=*/-5)));
}
430
// A single leading minus is part of the number or percentage; "--5%" is not
// expected to produce any annotation.
TEST_F(NumberAnnotatorTest, HandlesNegativeNumbers) {
  const auto text = UTF8ToUnicodeText("Number -5 and -5% and not number --5%");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(7, 9), "number",
                                  /*int_value=*/-5, /*double_value=*/-5),
                  IsAnnotatedSpan(CodepointSpan(14, 16), "number",
                                  /*int_value=*/-5, /*double_value=*/-5),
                  IsAnnotatedSpan(CodepointSpan(14, 17), "percentage",
                                  /*int_value=*/-5, /*double_value=*/-5,
                                  /*priority_score=*/1)));
}
447
// Every supported percent suffix ("percent", "pct", "pc", "%") yields both a
// "number" annotation for the digits and a "percentage" annotation for the
// digits plus suffix.
TEST_F(NumberAnnotatorTest, FindGoodPercentageContexts) {
  const auto text = UTF8ToUnicodeText(
      "5 percent, 10 pct, 25 pc and 17%, -5 percent, 10% are percentages");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_SELECTION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 1), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(0, 9), "percentage",
                                  /*int_value=*/5, /*double_value=*/5,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(11, 13), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(11, 17), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(19, 21), "number",
                                  /*int_value=*/25, /*double_value=*/25),
                  IsAnnotatedSpan(CodepointSpan(19, 24), "percentage",
                                  /*int_value=*/25, /*double_value=*/25,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(29, 31), "number",
                                  /*int_value=*/17, /*double_value=*/17),
                  IsAnnotatedSpan(CodepointSpan(29, 32), "percentage",
                                  /*int_value=*/17, /*double_value=*/17,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(34, 36), "number",
                                  /*int_value=*/-5, /*double_value=*/-5),
                  IsAnnotatedSpan(CodepointSpan(34, 44), "percentage",
                                  /*int_value=*/-5, /*double_value=*/-5,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(46, 48), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(46, 49), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1)));
}
488
// A text consisting only of "5%" yields a number and a percentage annotation.
TEST_F(NumberAnnotatorTest, FindSinglePercentageInContext) {
  const auto text = UTF8ToUnicodeText("5%");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 1), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(0, 2), "percentage",
                                  /*int_value=*/5, /*double_value=*/5,
                                  /*priority_score=*/1)));
}
502
// A valid number is followed by at most one punctuation element, and a
// percent suffix must directly follow the number: only the plain numbers 10
// and 25 are expected, with no percentages.
TEST_F(NumberAnnotatorTest, IgnoreBadPercentageContexts) {
  const auto text = UTF8ToUnicodeText(
      "10, pct, 25 prc, 5#: percentage are not percentages");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 2), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(9, 11), "number",
                                  /*int_value=*/25, /*double_value=*/25)));
}
517
// Numbers preceded by two punctuation signs ("#!24%", ":?33 percent") are
// expected to yield neither number nor percentage annotations.
TEST_F(NumberAnnotatorTest, IgnoreBadPercentagePunctuationContexts) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(
          "#!24% or :?33 percent are not valid percentages, nor numbers."),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION, &result));

  // The container matcher prints the unexpected elements on failure, unlike a
  // bare boolean check on empty().
  EXPECT_THAT(result, IsEmpty());
}
527
// Percentages are still found, at the expected codepoint spans, when the
// surrounding text contains a non-ASCII character.
TEST_F(NumberAnnotatorTest, FindPercentageInNonAsciiContext) {
  const auto text = UTF8ToUnicodeText(
      "At the café 10% or 25 percent of people are nice. Only 10%!");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(12, 14), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(12, 15), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(19, 21), "number",
                                  /*int_value=*/25, /*double_value=*/25),
                  IsAnnotatedSpan(CodepointSpan(19, 29), "percentage",
                                  /*int_value=*/25, /*double_value=*/25,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(55, 57), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(55, 58), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1)));
}
553
// Several punctuation signs between the number and the percent suffix make
// the selection unclassifiable.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalIgnoredCharactersDoesNotParseIt) {
  const auto text = UTF8ToUnicodeText("23#!? percent");
  const CodepointSpan selection(0, 13);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_FALSE(classified);
}
561
// A selection containing unrelated tokens around the numbers and the percent
// suffix is not classified.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalRandomTokensDoesNotParseIt) {
  const auto text = UTF8ToUnicodeText("23 asdf 3.14 pct asdf");
  const CodepointSpan selection(0, 21);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_FALSE(classified);
}
569
// Letters glued to the front of the number and to the end of the percent
// suffix prevent classification.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalRandomPrefixSuffixDoesNotParseIt) {
  const auto text = UTF8ToUnicodeText("abdf23 percentabdf");
  const CodepointSpan selection(0, 18);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_FALSE(classified);
}
577
// Runs of punctuation before the number and after the percent suffix prevent
// classification.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalRandomStringsDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("#?!23 percent#!?");
  const CodepointSpan selection(0, 16);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_FALSE(classified);
}
585
// A number followed by both '%' and the word "percent" is not classified.
TEST_F(NumberAnnotatorTest, WhenBothPercentSymbolAndSuffixDoesNotParseIt) {
  const auto text = UTF8ToUnicodeText("23% percent");
  const CodepointSpan selection(0, 11);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_FALSE(classified);
}
592
// Punctuation in front of a percentage makes the selection unclassifiable.
TEST_F(NumberAnnotatorTest,
       WhenPercentSymbolWithAdditionalPrefixCharactersDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("#?23%");
  const CodepointSpan selection(0, 5);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_FALSE(classified);
}
600
// A number followed by several punctuation signs is not classified when the
// selection includes them.
TEST_F(NumberAnnotatorTest, WhenNumberWithAdditionalCharactersDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("23#!?");
  const CodepointSpan selection(0, 5);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_FALSE(classified);
}
607
// The '!' after "23%" is not part of the percentage: the selection without it
// classifies as a percentage, the selection including it does not.
TEST_F(NumberAnnotatorTest,
       WhenPercentSymbolWithAdditionalCharactersDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("23%!");
  ClassificationResult classification;
  const auto classify = [&](const CodepointSpan& selection) {
    return number_annotator_.ClassifyText(
        text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
        &classification);
  };

  EXPECT_TRUE(classify(CodepointSpan(0, 3)));
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 23);
  EXPECT_EQ(classification.numeric_double_value, 23);

  EXPECT_FALSE(classify(CodepointSpan(0, 4)));
}
623
// A percent sign separated from the number by punctuation does not form a
// percentage.
TEST_F(NumberAnnotatorTest,
       WhenAdditionalCharactersWithMisplacedPercentSymbolDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("23.:;%");
  const CodepointSpan selection(0, 6);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_FALSE(classified);
}
631
// In "--11" the selection starting at the second minus classifies as -11,
// while the selection covering both minus signs is rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("--11");
  ClassificationResult classification;
  const auto classify = [&](const CodepointSpan& selection) {
    return number_annotator_.ClassifyText(
        text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
        &classification);
  };

  EXPECT_TRUE(classify(CodepointSpan(1, 4)));
  EXPECT_THAT(classification,
              AllOf(Field(&ClassificationResult::collection, "number"),
                    Field(&ClassificationResult::numeric_value, -11),
                    Field(&ClassificationResult::numeric_double_value, -11)));

  EXPECT_FALSE(classify(CodepointSpan(0, 4)));
}
646
// In "--11%" the selection starting at the second minus classifies as a
// percentage of -11, while the selection covering both minus signs is
// rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsPercentSignDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("--11%");
  ClassificationResult classification;
  const auto classify = [&](const CodepointSpan& selection) {
    return number_annotator_.ClassifyText(
        text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
        &classification);
  };

  EXPECT_TRUE(classify(CodepointSpan(1, 5)));
  EXPECT_THAT(classification,
              AllOf(Field(&ClassificationResult::collection, "percentage"),
                    Field(&ClassificationResult::numeric_value, -11),
                    Field(&ClassificationResult::numeric_double_value, -11)));

  EXPECT_FALSE(classify(CodepointSpan(0, 5)));
}
661
// In "+-11" the selection starting at the minus classifies as -11, while the
// selection that also includes the leading plus is rejected.
TEST_F(NumberAnnotatorTest, WhenPlusMinusSignsDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("+-11");
  ClassificationResult classification;
  const auto classify = [&](const CodepointSpan& selection) {
    return number_annotator_.ClassifyText(
        text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
        &classification);
  };

  EXPECT_TRUE(classify(CodepointSpan(1, 4)));
  EXPECT_THAT(classification,
              AllOf(Field(&ClassificationResult::collection, "number"),
                    Field(&ClassificationResult::numeric_value, -11),
                    Field(&ClassificationResult::numeric_double_value, -11)));

  EXPECT_FALSE(classify(CodepointSpan(0, 4)));
}
676
// A '+' directly in front of a number is not included in the number
// annotation, so neither selection of "-+11" is classified.
TEST_F(NumberAnnotatorTest, WhenMinusPlusSignsDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("-+11");
  ClassificationResult classification;
  const auto classify = [&](const CodepointSpan& selection) {
    return number_annotator_.ClassifyText(
        text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
        &classification);
  };

  EXPECT_FALSE(classify(CodepointSpan(1, 4)));
  EXPECT_FALSE(classify(CodepointSpan(0, 4)));
}
687
// A selection with a minus sign trailing the number is not classified.
TEST_F(NumberAnnotatorTest, WhenMinusSignSuffixDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("10-");
  const CodepointSpan selection(0, 3);
  ClassificationResult classification;

  const bool classified = number_annotator_.ClassifyText(
      text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);

  EXPECT_FALSE(classified);
}
694
// In "10**" only the selection covering just the digits classifies; adding
// one or both trailing asterisks causes rejection.
TEST_F(NumberAnnotatorTest, WhenMultipleCharSuffixDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("10**");
  ClassificationResult classification;
  const auto classify = [&](const CodepointSpan& selection) {
    return number_annotator_.ClassifyText(
        text, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW,
        &classification);
  };

  EXPECT_TRUE(classify(CodepointSpan(0, 2)));
  EXPECT_THAT(classification,
              AllOf(Field(&ClassificationResult::collection, "number"),
                    Field(&ClassificationResult::numeric_value, 10),
                    Field(&ClassificationResult::numeric_double_value, 10)));

  EXPECT_FALSE(classify(CodepointSpan(0, 3)));
  EXPECT_FALSE(classify(CodepointSpan(0, 4)));
}
712
// For "**10" selections that include one or both leading '*' characters are
// expected to be rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleCharPrefixDoesNotParsesIt) {
  const UnicodeText text = UTF8ToUnicodeText("**10");
  ClassificationResult classified;

  // Codepoints [1, 4) cover "*10".
  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(1, 4), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classified));

  // Codepoints [0, 4) cover "**10".
  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 4), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classified));
}
722
// "-1000000000" is expected to classify as the number -1000000000.
TEST_F(NumberAnnotatorTest, WhenLowestSupportedNumberParsesIt) {
  const UnicodeText text = UTF8ToUnicodeText("-1000000000");
  ClassificationResult classified;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 11), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classified));

  EXPECT_THAT(
      classified,
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, -1000000000),
            Field(&ClassificationResult::numeric_double_value, -1000000000)));
}
735
// "1000000000" is expected to classify as the number 1000000000.
TEST_F(NumberAnnotatorTest, WhenLargestSupportedNumberParsesIt) {
  const UnicodeText text = UTF8ToUnicodeText("1000000000");
  ClassificationResult classified;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 10), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classified));

  EXPECT_THAT(
      classified,
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, 1000000000),
            Field(&ClassificationResult::numeric_double_value, 1000000000)));
}
748
// "-999999999.999999999" is expected to classify as a number whose integer
// value is -1000000000 and whose double value equals the literal
// -999999999.999999999.
TEST_F(NumberAnnotatorTest, WhenLowestSupportedFloatNumberParsesIt) {
  const UnicodeText text = UTF8ToUnicodeText("-999999999.999999999");
  ClassificationResult classified;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 20), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classified));

  EXPECT_THAT(classified,
              AllOf(Field(&ClassificationResult::collection, "number"),
                    Field(&ClassificationResult::numeric_value, -1000000000),
                    Field(&ClassificationResult::numeric_double_value,
                          -999999999.999999999)));
}
761
// "999999999.999999999" is expected to classify as a number whose integer
// value is 1000000000 and whose double value equals the literal
// 999999999.999999999.
TEST_F(NumberAnnotatorTest, WhenLargestFloatSupportedNumberParsesIt) {
  const UnicodeText text = UTF8ToUnicodeText("999999999.999999999");
  ClassificationResult classified;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 19), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classified));

  EXPECT_THAT(classified,
              AllOf(Field(&ClassificationResult::collection, "number"),
                    Field(&ClassificationResult::numeric_value, 1000000000),
                    Field(&ClassificationResult::numeric_double_value,
                          999999999.999999999)));
}
774
// A 40-digit run of digits is expected to be rejected by ClassifyText.
TEST_F(NumberAnnotatorTest, WhenLargeNumberDoesNotParseIt) {
  const UnicodeText text =
      UTF8ToUnicodeText("1234567890123456789012345678901234567890");
  ClassificationResult classified;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 40), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classified));
}
781
// "2016-2017" selected as a whole is expected not to classify as a number.
TEST_F(NumberAnnotatorTest, WhenMinusInTheMiddleDoesNotParseIt) {
  const UnicodeText text = UTF8ToUnicodeText("2016-2017");
  ClassificationResult classified;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 9), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classified));
}
788
// A lone '%' with no digits next to it must not produce any annotation:
// FindAll is expected to succeed and leave the output vector empty.
TEST_F(NumberAnnotatorTest, WhenSuffixWithoutNumberDoesNotParseIt) {
  const UnicodeText text = UTF8ToUnicodeText("... % ...");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  // The container matcher avoids comparing size_t against a signed literal
  // and prints the unexpected spans if the assertion fails.
  ASSERT_THAT(spans, IsEmpty());
}
797
// A lone '$' with no digits next to it must not produce any annotation:
// FindAll is expected to succeed and leave the output vector empty.
TEST_F(NumberAnnotatorTest, WhenPrefixWithoutNumberDoesNotParseIt) {
  const UnicodeText text = UTF8ToUnicodeText("... $ ...");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  // The container matcher avoids comparing size_t against a signed literal
  // and prints the unexpected spans if the assertion fails.
  ASSERT_THAT(spans, IsEmpty());
}
806
// "$%" with no digits in between must not produce any annotation: FindAll is
// expected to succeed and leave the output vector empty.
TEST_F(NumberAnnotatorTest, WhenPrefixAndSuffixWithoutNumberDoesNotParseIt) {
  const UnicodeText text = UTF8ToUnicodeText("... $% ...");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  // The container matcher avoids comparing size_t against a signed literal
  // and prints the unexpected spans if the assertion fails.
  ASSERT_THAT(spans, IsEmpty());
}
815
// Integer numbers: checks the collection, values, score and priority score
// reported by ClassifyText, then the spans reported by FindAll.
TEST_F(NumberAnnotatorTest, ForNumberAnnotationsSetsScoreAndPriorityScore) {
  // Single selection via ClassifyText.
  const UnicodeText classify_text = UTF8ToUnicodeText("... 12345 ...");
  ClassificationResult classified;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      classify_text, CodepointSpan(4, 9),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classified));

  EXPECT_EQ(classified.collection, "number");
  EXPECT_EQ(classified.numeric_value, 12345);
  EXPECT_EQ(classified.numeric_double_value, 12345);
  EXPECT_EQ(classified.score, 1);
  EXPECT_EQ(classified.priority_score, -10);

  // Whole-text scan via FindAll.
  const UnicodeText scan_text = UTF8ToUnicodeText("Come at 9 or 10 ok?");
  std::vector<AnnotatedSpan> spans;
  EXPECT_TRUE(number_annotator_.FindAll(
      scan_text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(8, 9), "number",
                                  /*int_value=*/9, /*double_value=*/9),
                  IsAnnotatedSpan(CodepointSpan(13, 15), "number",
                                  /*int_value=*/10, /*double_value=*/10)));
}
840
// Floating-point numbers: checks the collection, values, score and priority
// score reported by ClassifyText, then the spans reported by FindAll.
TEST_F(NumberAnnotatorTest,
       ForFloatNumberAnnotationsSetsScoreAndPriorityScore) {
  // Single selection via ClassifyText.
  const UnicodeText classify_text = UTF8ToUnicodeText("... 12345.12345 ...");
  ClassificationResult classified;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      classify_text, CodepointSpan(4, 15),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classified));
  EXPECT_EQ(classified.collection, "number");
  EXPECT_EQ(classified.numeric_value, 12345);
  // 12345.12345 is not exactly representable in binary floating point, so
  // compare with ULP tolerance instead of bitwise equality.
  EXPECT_DOUBLE_EQ(classified.numeric_double_value, 12345.12345);
  EXPECT_EQ(classified.score, 1);
  EXPECT_EQ(classified.priority_score, 1);

  // Whole-text scan via FindAll.
  const UnicodeText scan_text =
      UTF8ToUnicodeText("Results are between 12.5 and 13.5, right?");
  std::vector<AnnotatedSpan> spans;
  EXPECT_TRUE(number_annotator_.FindAll(
      scan_text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));
  EXPECT_THAT(spans,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(20, 24), "number",
                                  /*int_value=*/12, /*double_value=*/12.5,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(29, 33), "number",
                                  /*int_value=*/13, /*double_value=*/13.5,
                                  /*priority_score=*/1)));
}
866
// Percentages: checks ClassifyText for both the "%" symbol and the "percent"
// word suffix, then the number and percentage spans reported by FindAll.
TEST_F(NumberAnnotatorTest, ForPercentageAnnotationsSetsScoreAndPriorityScore) {
  ClassificationResult classified;

  // Number directly followed by the '%' symbol.
  const UnicodeText symbol_text = UTF8ToUnicodeText("... 12345% ...");
  EXPECT_TRUE(number_annotator_.ClassifyText(
      symbol_text, CodepointSpan(4, 10),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classified));
  EXPECT_EQ(classified.collection, "percentage");
  EXPECT_EQ(classified.numeric_value, 12345);
  EXPECT_EQ(classified.numeric_double_value, 12345);
  EXPECT_EQ(classified.score, 1);
  EXPECT_EQ(classified.priority_score, 1);

  // Number followed by a space and the word "percent".
  const UnicodeText word_text = UTF8ToUnicodeText("... 12345 percent ...");
  EXPECT_TRUE(number_annotator_.ClassifyText(
      word_text, CodepointSpan(4, 17),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classified));
  EXPECT_EQ(classified.collection, "percentage");
  EXPECT_EQ(classified.numeric_value, 12345);
  EXPECT_EQ(classified.numeric_double_value, 12345);
  EXPECT_EQ(classified.score, 1);
  EXPECT_EQ(classified.priority_score, 1);

  // FindAll is expected to report each value twice: once as a bare number and
  // once as a percentage that also covers the suffix.
  const UnicodeText scan_text =
      UTF8ToUnicodeText("Results are between 9% and 10 percent.");
  std::vector<AnnotatedSpan> spans;
  EXPECT_TRUE(number_annotator_.FindAll(
      scan_text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));
  EXPECT_THAT(spans,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(20, 21), "number",
                                  /*int_value=*/9, /*double_value=*/9),
                  IsAnnotatedSpan(CodepointSpan(20, 22), "percentage",
                                  /*int_value=*/9, /*double_value=*/9,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(27, 29), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(27, 37), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1)));
}
904
// With the SMART usecase the test expects bare numbers to be rejected while
// percentages are still classified and annotated.
TEST_F(NumberAnnotatorTest, NumberDisabledPercentageEnabledForSmartUsecase) {
  ClassificationResult classified;

  // A bare number is not classified under SMART.
  const UnicodeText number_text = UTF8ToUnicodeText("... 12345 ...");
  EXPECT_FALSE(number_annotator_.ClassifyText(
      number_text, CodepointSpan(4, 9),
      AnnotationUsecase_ANNOTATION_USECASE_SMART, &classified));

  // Number followed by the '%' symbol.
  const UnicodeText symbol_text = UTF8ToUnicodeText("... 12345% ...");
  EXPECT_TRUE(number_annotator_.ClassifyText(
      symbol_text, CodepointSpan(4, 10),
      AnnotationUsecase_ANNOTATION_USECASE_SMART, &classified));
  EXPECT_EQ(classified.collection, "percentage");
  EXPECT_EQ(classified.numeric_value, 12345);
  EXPECT_EQ(classified.numeric_double_value, 12345.0);
  EXPECT_EQ(classified.score, 1);
  EXPECT_EQ(classified.priority_score, 1);

  // Number immediately followed by the word "percent" (no space).
  const UnicodeText word_text = UTF8ToUnicodeText("... 12345percent ...");
  EXPECT_TRUE(number_annotator_.ClassifyText(
      word_text, CodepointSpan(4, 16),
      AnnotationUsecase_ANNOTATION_USECASE_SMART, &classified));
  EXPECT_EQ(classified.collection, "percentage");
  EXPECT_EQ(classified.numeric_value, 12345);
  EXPECT_EQ(classified.numeric_double_value, 12345);
  EXPECT_EQ(classified.score, 1);
  EXPECT_EQ(classified.priority_score, 1);

  // FindAll under SMART is expected to report only the percentage, not the
  // standalone "3".
  const UnicodeText scan_text =
      UTF8ToUnicodeText("Accuracy for experiment 3 is 9%.");
  std::vector<AnnotatedSpan> spans;
  EXPECT_TRUE(number_annotator_.FindAll(
      scan_text, AnnotationUsecase_ANNOTATION_USECASE_SMART,
      ModeFlag_ANNOTATION, &spans));
  EXPECT_THAT(spans, UnorderedElementsAre(
                         IsAnnotatedSpan(CodepointSpan(29, 31), "percentage",
                                         /*int_value=*/9, /*double_value=*/9.0,
                                         /*priority_score=*/1)));
}
939
// In a spaced arithmetic expression FindAll is expected to annotate only the
// operands; every expected span covers digits only, never an operator.
TEST_F(NumberAnnotatorTest, MathOperatorsNotAnnotatedAsNumbersFindAll) {
  const UnicodeText text =
      UTF8ToUnicodeText("how much is 2 + 2 or 5 - 96 * 89");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(12, 13), "number",
                                  /*int_value=*/2, /*double_value=*/2),
                  IsAnnotatedSpan(CodepointSpan(16, 17), "number",
                                  /*int_value=*/2, /*double_value=*/2),
                  IsAnnotatedSpan(CodepointSpan(21, 22), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(25, 27), "number",
                                  /*int_value=*/96, /*double_value=*/96),
                  IsAnnotatedSpan(CodepointSpan(30, 32), "number",
                                  /*int_value=*/89, /*double_value=*/89)));
}
959
// Selecting only the operator character at codepoints [2, 3) is expected to
// fail classification.
TEST_F(NumberAnnotatorTest, MathOperatorsNotAnnotatedAsNumbersClassifyText) {
  ClassificationResult classified;

  // The selection is the '+' in "2 + 2".
  const UnicodeText plus_text = UTF8ToUnicodeText("2 + 2");
  EXPECT_FALSE(number_annotator_.ClassifyText(
      plus_text, CodepointSpan(2, 3), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classified));

  // The selection is the '-' in "2 - 96 * 89".
  const UnicodeText minus_text = UTF8ToUnicodeText("2 - 96 * 89");
  EXPECT_FALSE(number_annotator_.ClassifyText(
      minus_text, CodepointSpan(2, 3),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classified));
}
969
// Numbers on either side of a '/' (with or without surrounding spaces) are
// expected to be annotated as separate numbers by FindAll.
TEST_F(NumberAnnotatorTest, SlashSeparatesTwoNumbersFindAll) {
  const UnicodeText text = UTF8ToUnicodeText("what's 1 + 2/3 * 4/5 * 6 / 7");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(7, 8), "number",
                                  /*int_value=*/1, /*double_value=*/1),
                  IsAnnotatedSpan(CodepointSpan(11, 12), "number",
                                  /*int_value=*/2, /*double_value=*/2),
                  IsAnnotatedSpan(CodepointSpan(13, 14), "number",
                                  /*int_value=*/3, /*double_value=*/3),
                  IsAnnotatedSpan(CodepointSpan(17, 18), "number",
                                  /*int_value=*/4, /*double_value=*/4),
                  IsAnnotatedSpan(CodepointSpan(19, 20), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(23, 24), "number",
                                  /*int_value=*/6, /*double_value=*/6),
                  IsAnnotatedSpan(CodepointSpan(27, 28), "number",
                                  /*int_value=*/7, /*double_value=*/7)));
}
993
// In "2/3" each side of the slash is expected to classify as its own number.
TEST_F(NumberAnnotatorTest, SlashSeparatesTwoNumbersClassifyText) {
  const UnicodeText text = UTF8ToUnicodeText("what's 1 + 2/3 * 4");
  ClassificationResult classified;

  // Codepoints [11, 12) cover the "2" before the slash.
  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(11, 12), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classified));
  EXPECT_EQ(classified.collection, "number");
  EXPECT_EQ(classified.numeric_value, 2);
  EXPECT_EQ(classified.numeric_double_value, 2);
  EXPECT_EQ(classified.score, 1);

  // Codepoints [13, 14) cover the "3" after the slash.
  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(13, 14), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classified));
  EXPECT_EQ(classified.collection, "number");
  EXPECT_EQ(classified.numeric_value, 3);
  EXPECT_EQ(classified.numeric_double_value, 3);
  EXPECT_EQ(classified.score, 1);
}
1012
// Of all the slash-adjacent tokens in the text, only the "2" in "2/" is
// expected to be annotated.
TEST_F(NumberAnnotatorTest, SlashDoesNotSeparatesTwoNumbersFindAll) {
  const UnicodeText text =
      UTF8ToUnicodeText("what's 2a2/3 or 2/s4 or 2/ or /3 or //3 or 2//");
  std::vector<AnnotatedSpan> spans;

  // 2 in the "2/" context is a number because / is punctuation
  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans, UnorderedElementsAre(IsAnnotatedSpan(
                         CodepointSpan(24, 25), "number",
                         /*int_value=*/2, /*double_value=*/2)));
}
1024
// Numbers enclosed in round or square brackets are expected to be annotated,
// including negative and fractional values.
TEST_F(NumberAnnotatorTest, BracketsContextAnnotatedFindAll) {
  const UnicodeText text =
      UTF8ToUnicodeText("The interval is: (12, 13) or [-12, -4.5)");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(18, 20), "number",
                                  /*int_value=*/12, /*double_value=*/12),
                  IsAnnotatedSpan(CodepointSpan(22, 24), "number",
                                  /*int_value=*/13, /*double_value=*/13),
                  IsAnnotatedSpan(CodepointSpan(30, 33), "number",
                                  /*int_value=*/-12, /*double_value=*/-12),
                  IsAnnotatedSpan(CodepointSpan(35, 39), "number",
                                  /*int_value=*/-4, /*double_value=*/-4.5,
                                  /*priority_score=*/1)));
}
1043
// For "-(12, 138*)" FindAll is expected to succeed without producing any
// annotation.
TEST_F(NumberAnnotatorTest, BracketsContextNotAnnotatedFindAll) {
  const UnicodeText text = UTF8ToUnicodeText("The interval is: -(12, 138*)");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans, IsEmpty());
}
1052
// Fractional numbers written with different dot characters are all expected
// to be annotated as floating-point numbers.
TEST_F(NumberAnnotatorTest, FractionalNumberDotsFindAll) {
  // Dots source: https://unicode-search.net/unicode-namesearch.pl?term=period
  const UnicodeText text = UTF8ToUnicodeText("3.1 3﹒2 3.3");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans, UnorderedElementsAre(
                         IsAnnotatedSpan(CodepointSpan(0, 3), "number",
                                         /*int_value=*/3, /*double_value=*/3.1,
                                         /*priority_score=*/1),
                         IsAnnotatedSpan(CodepointSpan(4, 7), "number",
                                         /*int_value=*/3, /*double_value=*/3.2,
                                         /*priority_score=*/1),
                         IsAnnotatedSpan(CodepointSpan(8, 11), "number",
                                         /*int_value=*/3, /*double_value=*/3.3,
                                         /*priority_score=*/1)));
}
1071
// Integer, fractional and percentage values in one text; the fractional value
// followed by '%' is expected both as a number and as a percentage.
// NOTE(review): the literal below appears to contain only ASCII digits next to
// a non-ASCII dot; confirm the intended non-ASCII digits are present in the
// checked-in file.
TEST_F(NumberAnnotatorTest, NonAsciiDigitsFindAll) {
  // Dots source: https://unicode-search.net/unicode-namesearch.pl?term=period
  // Digits source: https://unicode-search.net/unicode-namesearch.pl?term=digit
  const UnicodeText text = UTF8ToUnicodeText("3 3﹒2 3.3%");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans, UnorderedElementsAre(
                         IsAnnotatedSpan(CodepointSpan(0, 1), "number",
                                         /*int_value=*/3, /*double_value=*/3),
                         IsAnnotatedSpan(CodepointSpan(2, 5), "number",
                                         /*int_value=*/3, /*double_value=*/3.2,
                                         /*priority_score=*/1),
                         IsAnnotatedSpan(CodepointSpan(6, 9), "number",
                                         /*int_value=*/3, /*double_value=*/3.3,
                                         /*priority_score=*/1),
                         IsAnnotatedSpan(CodepointSpan(6, 10), "percentage",
                                         /*int_value=*/3, /*double_value=*/3.3,
                                         /*priority_score=*/1)));
}
1093
// Numbers with leading zeros are expected to be annotated, with the leading
// zeros included in the span but not affecting the parsed value.
TEST_F(NumberAnnotatorTest, AnnotatedZeroPrecededNumbersFindAll) {
  const UnicodeText text =
      UTF8ToUnicodeText("Numbers: 0.9 or 09 or 09.9 or 032310");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans, UnorderedElementsAre(
                         IsAnnotatedSpan(CodepointSpan(9, 12), "number",
                                         /*int_value=*/0, /*double_value=*/0.9,
                                         /*priority_score=*/1),
                         IsAnnotatedSpan(CodepointSpan(16, 18), "number",
                                         /*int_value=*/9, /*double_value=*/9),
                         IsAnnotatedSpan(CodepointSpan(22, 26), "number",
                                         /*int_value=*/9, /*double_value=*/9.9,
                                         /*priority_score=*/1),
                         IsAnnotatedSpan(CodepointSpan(30, 36), "number",
                                         /*int_value=*/32310,
                                         /*double_value=*/32310)));
}
1113
// "15.0" and "16.00" are expected to be annotated over their full spans with
// whole-number values and no explicit priority_score argument passed to the
// matcher.
TEST_F(NumberAnnotatorTest, ZeroAfterDotFindAll) {
  const UnicodeText text = UTF8ToUnicodeText("15.0 16.00");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 4), "number",
                                  /*int_value=*/15, /*double_value=*/15),
                  IsAnnotatedSpan(CodepointSpan(5, 10), "number",
                                  /*int_value=*/16, /*double_value=*/16)));
}
1127
// Values made of repeated nines with one to four fractional digits are
// expected to be annotated with their exact double value and a truncated
// integer value.
TEST_F(NumberAnnotatorTest, NineDotNineFindAll) {
  const UnicodeText text = UTF8ToUnicodeText("9.9 9.99 99.99 99.999 99.9999");
  std::vector<AnnotatedSpan> spans;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &spans));

  EXPECT_THAT(spans,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 3), "number",
                                  /*int_value=*/9, /*double_value=*/9.9,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(4, 8), "number",
                                  /*int_value=*/9, /*double_value=*/9.99,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(9, 14), "number",
                                  /*int_value=*/99, /*double_value=*/99.99,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(15, 21), "number",
                                  /*int_value=*/99, /*double_value=*/99.999,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(22, 29), "number",
                                  /*int_value=*/99, /*double_value=*/99.9999,
                                  /*priority_score=*/1)));
}
1152
1153 } // namespace test_internal
1154 } // namespace libtextclassifier3
1155