1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "annotator/feature-processor.h"
18
19 #include "annotator/model-executor.h"
20 #include "utils/tensor-view.h"
21 #include "utils/utf8/unicodetext.h"
22 #include "gmock/gmock.h"
23 #include "gtest/gtest.h"
24
25 namespace libtextclassifier3 {
26 namespace {
27
28 using testing::ElementsAreArray;
29 using testing::FloatEq;
30 using testing::Matcher;
31
PackFeatureProcessorOptions(const FeatureProcessorOptionsT & options)32 flatbuffers::DetachedBuffer PackFeatureProcessorOptions(
33 const FeatureProcessorOptionsT& options) {
34 flatbuffers::FlatBufferBuilder builder;
35 builder.Finish(CreateFeatureProcessorOptions(builder, &options));
36 return builder.Release();
37 }
38
// Returns a copy of the elements of `vector` at positions [start, end).
// The caller must pass 0 <= start <= end <= vector.size(); nothing is checked.
template <typename T>
std::vector<T> Subvector(const std::vector<T>& vector, int start, int end) {
  const auto first = vector.cbegin() + start;
  const auto last = vector.cbegin() + end;
  std::vector<T> slice(first, last);
  return slice;
}
43
ElementsAreFloat(const std::vector<float> & values)44 Matcher<std::vector<float>> ElementsAreFloat(const std::vector<float>& values) {
45 std::vector<Matcher<float>> matchers;
46 for (const float value : values) {
47 matchers.push_back(FloatEq(value));
48 }
49 return ElementsAreArray(matchers);
50 }
51
52 class TestingFeatureProcessor : public FeatureProcessor {
53 public:
54 using FeatureProcessor::CountIgnoredSpanBoundaryCodepoints;
55 using FeatureProcessor::FeatureProcessor;
56 using FeatureProcessor::SpanToLabel;
57 using FeatureProcessor::StripTokensFromOtherLines;
58 using FeatureProcessor::supported_codepoint_ranges_;
59 using FeatureProcessor::SupportedCodepointsRatio;
60 };
61
62 // EmbeddingExecutor that always returns features based on
63 class FakeEmbeddingExecutor : public EmbeddingExecutor {
64 public:
AddEmbedding(const TensorView<int> & sparse_features,float * dest,int dest_size) const65 bool AddEmbedding(const TensorView<int>& sparse_features, float* dest,
66 int dest_size) const override {
67 TC3_CHECK_GE(dest_size, 4);
68 EXPECT_EQ(sparse_features.size(), 1);
69 dest[0] = sparse_features.data()[0];
70 dest[1] = sparse_features.data()[0];
71 dest[2] = -sparse_features.data()[0];
72 dest[3] = -sparse_features.data()[0];
73 return true;
74 }
75
76 private:
77 std::vector<float> storage_;
78 };
79
80 class AnnotatorFeatureProcessorTest : public ::testing::Test {
81 protected:
AnnotatorFeatureProcessorTest()82 AnnotatorFeatureProcessorTest() : INIT_UNILIB_FOR_TESTING(unilib_) {}
83 UniLib unilib_;
84 };
85
// A selection lying strictly inside one token cuts that token into three.
TEST_F(AnnotatorFeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};

  // Both boundaries (9 and 12) fall inside the second token.
  internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("Hělló", 0, 5),
      Token("fěě", 6, 9),
      Token("bař", 9, 12),
      Token("@google.com", 12, 23),
      Token("heře!", 24, 29)};
  // clang-format on
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}
102
// A selection starting exactly at a token start only splits at its end.
TEST_F(AnnotatorFeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};

  // Start (6) coincides with the second token's start; end (12) is inside it.
  internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař", 6, 12),
      Token("@google.com", 12, 23),
      Token("heře!", 24, 29)};
  // clang-format on
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}
118
// A selection ending exactly at a token end only splits at its start.
TEST_F(AnnotatorFeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};

  // Start (9) is inside the second token; end (23) coincides with its end.
  internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("Hělló", 0, 5),
      Token("fěě", 6, 9),
      Token("bař@google.com", 9, 23),
      Token("heře!", 24, 29)};
  // clang-format on
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}
134
// A selection that covers one token exactly leaves the token list untouched.
TEST_F(AnnotatorFeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};

  // The selection {6, 23} is exactly the span of the second token.
  internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29)};
  // clang-format on
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}
149
// A selection that starts in one token and ends in the next splits both.
TEST_F(AnnotatorFeatureProcessorTest,
       SplitTokensOnSelectionBoundariesCrossToken) {
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};

  // Start (2) is inside the first token, end (9) is inside the second one.
  internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("Hě", 0, 2),
      Token("lló", 2, 5),
      Token("fěě", 6, 9),
      Token("bař@google.com", 9, 23),
      Token("heře!", 24, 29)};
  // clang-format on
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}
167
// With only_use_line_with_click set, a click on the first line keeps only the
// tokens of the first line.
TEST_F(AnnotatorFeatureProcessorTest, KeepLineWithClickFirst) {
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib_);

  const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  // The click covers the first token of the first line.
  const CodepointSpan click = {0, 5};
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17),
                               Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  // clang-format on

  processor.StripTokensFromOtherLines(context, click, &tokens);

  // Only the first line's tokens survive.
  const std::vector<Token> first_line_tokens = {Token("Fiřst", 0, 5),
                                                Token("Lině", 6, 10)};
  EXPECT_THAT(tokens, ElementsAreArray(first_line_tokens));
}
192
// With only_use_line_with_click set, a click on the second line keeps only the
// tokens of the second line.
TEST_F(AnnotatorFeatureProcessorTest, KeepLineWithClickSecond) {
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib_);

  const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  // The click covers the last token of the second line.
  const CodepointSpan click = {18, 22};
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17),
                               Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  // clang-format on

  processor.StripTokensFromOtherLines(context, click, &tokens);

  // Only the second line's tokens survive.
  const std::vector<Token> second_line_tokens = {Token("Sěcond", 11, 17),
                                                 Token("Lině", 18, 22)};
  EXPECT_THAT(tokens, ElementsAreArray(second_line_tokens));
}
217
// With only_use_line_with_click set, a click on the third line keeps only the
// tokens of the third line.
TEST_F(AnnotatorFeatureProcessorTest, KeepLineWithClickThird) {
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib_);

  const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  // The click starts inside "Thiřd" and runs to the end of the third line.
  const CodepointSpan click = {24, 33};
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17),
                               Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  // clang-format on

  processor.StripTokensFromOtherLines(context, click, &tokens);

  // Only the third line's tokens survive.
  const std::vector<Token> third_line_tokens = {Token("Thiřd", 23, 28),
                                                Token("Lině", 29, 33)};
  EXPECT_THAT(tokens, ElementsAreArray(third_line_tokens));
}
242
// Same as the second-line case, but the first two "lines" are separated by a
// pipe character. use_pipe_character_for_newline is left at its default, and
// the expectation shows '|' acting as a line break.
TEST_F(AnnotatorFeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib_);

  const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
  // The click covers the "Lině" that follows "Sěcond".
  const CodepointSpan click = {18, 22};
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17),
                               Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  // clang-format on

  processor.StripTokensFromOtherLines(context, click, &tokens);

  // Only the tokens between the pipe and the newline survive.
  const std::vector<Token> second_line_tokens = {Token("Sěcond", 11, 17),
                                                 Token("Lině", 18, 22)};
  EXPECT_THAT(tokens, ElementsAreArray(second_line_tokens));
}
267
// With use_pipe_character_for_newline disabled, the pipe does not end a line:
// everything up to the '\n' counts as the clicked line and is kept.
TEST_F(AnnotatorFeatureProcessorTest,
       KeepLineWithClickAndDoNotUsePipeAsNewLineCharacter) {
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  options.use_pipe_character_for_newline = false;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib_);

  const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
  const CodepointSpan click = {18, 22};
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině|Sěcond", 6, 17),
                               Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  // clang-format on

  processor.StripTokensFromOtherLines(context, click, &tokens);

  // All tokens before the newline survive, including the one with the pipe.
  const std::vector<Token> first_line_tokens = {Token("Fiřst", 0, 5),
                                                Token("Lině|Sěcond", 6, 17),
                                                Token("Lině", 18, 22)};
  EXPECT_THAT(tokens, ElementsAreArray(first_line_tokens));
}
294
// SplitContext treats '|' as a line separator (in addition to '\n') when it is
// asked to use the pipe character for newlines.
TEST_F(AnnotatorFeatureProcessorTest, ShouldSplitLinesOnPipe) {
  FeatureProcessorOptionsT options;
  options.use_pipe_character_for_newline = true;
  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib_);

  const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
  const UnicodeText context_unicode = UTF8ToUnicodeText(context,
                                                        /*do_copy=*/false);

  const std::vector<UnicodeTextRange>& lines = feature_processor.SplitContext(
      context_unicode, options.use_pipe_character_for_newline);
  // Fatal on mismatch: the checks below index lines[0..2], which would read
  // out of bounds if fewer than three lines were returned.
  ASSERT_EQ(lines.size(), 3);
  EXPECT_EQ(UnicodeText::UTF8Substring(lines[0].first, lines[0].second),
            "Fiřst Lině");
  EXPECT_EQ(UnicodeText::UTF8Substring(lines[1].first, lines[1].second),
            "Sěcond Lině");
  EXPECT_EQ(UnicodeText::UTF8Substring(lines[2].first, lines[2].second),
            "Thiřd Lině");
}
317
// SplitContext only splits on '\n' when it is asked not to use the pipe
// character for newlines; the '|' stays inside the first line.
TEST_F(AnnotatorFeatureProcessorTest, ShouldNotSplitLinesOnPipe) {
  FeatureProcessorOptionsT options;
  options.use_pipe_character_for_newline = false;
  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib_);

  const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
  const UnicodeText context_unicode = UTF8ToUnicodeText(context,
                                                        /*do_copy=*/false);

  const std::vector<UnicodeTextRange>& lines = feature_processor.SplitContext(
      context_unicode, options.use_pipe_character_for_newline);
  // Fatal on mismatch: the checks below index lines[0..1], which would read
  // out of bounds if fewer than two lines were returned.
  ASSERT_EQ(lines.size(), 2);
  EXPECT_EQ(UnicodeText::UTF8Substring(lines[0].first, lines[0].second),
            "Fiřst Lině|Sěcond Lině");
  EXPECT_EQ(UnicodeText::UTF8Substring(lines[1].first, lines[1].second),
            "Thiřd Lině");
}
338
// A click that spans more than one line must not strip anything: the token
// list is expected to come back unchanged.
TEST_F(AnnotatorFeatureProcessorTest, KeepLineWithCrosslineClick) {
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib_);

  const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  // The click starts on the first line and ends past the second newline.
  const CodepointSpan click = {5, 23};
  // NOTE(review): the offsets of "Sěcond" (18, 23) and the following "Lině"
  // (19, 23) do not match their positions in `context` (the sibling tests use
  // (11, 17) and (18, 22)). Kept as-is here; confirm whether this is intended.
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině", 6, 10),
                               Token("Sěcond", 18, 23),
                               Token("Lině", 19, 23),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  // clang-format on

  processor.StripTokensFromOtherLines(context, click, &tokens);

  // Every token is still present, in the original order.
  const std::vector<Token> all_tokens = {
      Token("Fiřst", 0, 5),    Token("Lině", 6, 10),
      Token("Sěcond", 18, 23), Token("Lině", 19, 23),
      Token("Thiřd", 23, 28),  Token("Lině", 29, 33)};
  EXPECT_THAT(tokens, ElementsAreArray(all_tokens));
}
365
// Exercises SpanToLabel / LabelToTokenSpan in three configurations: exact
// boundary matching, snapping to containing tokens, and two-token selections.
TEST_F(AnnotatorFeatureProcessorTest, SpanToLabel) {
  FeatureProcessorOptionsT options;
  options.context_size = 1;
  options.max_selection_span = 1;
  options.snap_label_span_boundaries_to_containing_tokens = false;

  // Tokenize on whitespace only: the configured range starts at codepoint 32
  // (the space character).
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  auto& whitespace_range = options.tokenization_codepoint_config.back();
  whitespace_range->start = 32;
  whitespace_range->end = 33;
  whitespace_range->role =
      TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  // --- Part 1: no snapping; spans must line up with token boundaries. ---
  flatbuffers::DetachedBuffer exact_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor exact_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(exact_fb.data()),
      &unilib_);
  std::vector<Token> tokens = exact_processor.Tokenize("one, two, three");
  ASSERT_EQ(3, tokens.size());

  int exact_label;
  // {5, 8} is "two" without its trailing comma: no valid label.
  ASSERT_TRUE(exact_processor.SpanToLabel({5, 8}, tokens, &exact_label));
  EXPECT_EQ(kInvalidLabel, exact_label);
  // {5, 9} is "two," including the comma: a valid label.
  ASSERT_TRUE(exact_processor.SpanToLabel({5, 9}, tokens, &exact_label));
  EXPECT_NE(kInvalidLabel, exact_label);
  TokenSpan token_span;
  exact_processor.LabelToTokenSpan(exact_label, &token_span);
  EXPECT_EQ(0, token_span.first);
  EXPECT_EQ(0, token_span.second);

  // --- Part 2: snapping enabled; sub-token spans map to the same label. ---
  options.snap_label_span_boundaries_to_containing_tokens = true;
  flatbuffers::DetachedBuffer snapping_fb =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor snapping_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(snapping_fb.data()),
      &unilib_);
  int snapped_label;
  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 8}, tokens, &snapped_label));
  EXPECT_EQ(exact_label, snapped_label);
  ASSERT_TRUE(snapping_processor.SpanToLabel({6, 9}, tokens, &snapped_label));
  EXPECT_EQ(exact_label, snapped_label);
  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 9}, tokens, &snapped_label));
  EXPECT_EQ(exact_label, snapped_label);

  // Spans that reach one codepoint beyond the token on either side are
  // rejected even with snapping.
  ASSERT_TRUE(snapping_processor.SpanToLabel({4, 9}, tokens, &snapped_label));
  EXPECT_EQ(kInvalidLabel, snapped_label);
  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 10}, tokens, &snapped_label));
  EXPECT_EQ(kInvalidLabel, snapped_label);

  // --- Part 3: wider context and selections spanning two tokens. ---
  options.context_size = 2;
  options.max_selection_span = 2;
  flatbuffers::DetachedBuffer multi_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor multi_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(multi_fb.data()),
      &unilib_);
  tokens = multi_processor.Tokenize("zero, one, two, three, four");
  int two_token_label;
  // {6, 15} covers "one, two," in full.
  ASSERT_TRUE(multi_processor.SpanToLabel({6, 15}, tokens, &two_token_label));
  EXPECT_NE(kInvalidLabel, two_token_label);
  multi_processor.LabelToTokenSpan(two_token_label, &token_span);
  EXPECT_EQ(1, token_span.first);
  EXPECT_EQ(0, token_span.second);

  // Slightly shrunken variants of the same selection yield the same label.
  int shrunken_label;
  ASSERT_TRUE(multi_processor.SpanToLabel({6, 14}, tokens, &shrunken_label));
  EXPECT_EQ(two_token_label, shrunken_label);
  ASSERT_TRUE(multi_processor.SpanToLabel({6, 13}, tokens, &shrunken_label));
  EXPECT_EQ(two_token_label, shrunken_label);
  ASSERT_TRUE(multi_processor.SpanToLabel({7, 13}, tokens, &shrunken_label));
  EXPECT_EQ(two_token_label, shrunken_label);
}
439
// Exercises SpanToLabel / LabelToTokenSpan with and without boundary snapping.
// NOTE(review): this body performs the same steps, with the same inputs and
// expectations, as the SpanToLabel test, and sets no punctuation-specific
// option - confirm whether a distinct scenario was intended here.
TEST_F(AnnotatorFeatureProcessorTest, SpanToLabelIgnoresPunctuation) {
  FeatureProcessorOptionsT options;
  options.context_size = 1;
  options.max_selection_span = 1;
  options.snap_label_span_boundaries_to_containing_tokens = false;

  // Tokenize on whitespace only: the configured range starts at codepoint 32
  // (the space character).
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  auto& separator_config = options.tokenization_codepoint_config.back();
  separator_config->start = 32;
  separator_config->end = 33;
  separator_config->role =
      TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  // Snapping disabled: only spans matching token boundaries get a label.
  flatbuffers::DetachedBuffer strict_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor strict_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(strict_fb.data()),
      &unilib_);
  std::vector<Token> tokens = strict_processor.Tokenize("one, two, three");
  ASSERT_EQ(3, tokens.size());

  int strict_label;
  // "two" without the trailing comma.
  ASSERT_TRUE(strict_processor.SpanToLabel({5, 8}, tokens, &strict_label));
  EXPECT_EQ(kInvalidLabel, strict_label);
  // "two," with the trailing comma.
  ASSERT_TRUE(strict_processor.SpanToLabel({5, 9}, tokens, &strict_label));
  EXPECT_NE(kInvalidLabel, strict_label);
  TokenSpan token_span;
  strict_processor.LabelToTokenSpan(strict_label, &token_span);
  EXPECT_EQ(0, token_span.first);
  EXPECT_EQ(0, token_span.second);

  // Snapping enabled: spans inside the token resolve to the same label.
  options.snap_label_span_boundaries_to_containing_tokens = true;
  flatbuffers::DetachedBuffer lenient_fb =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor lenient_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(lenient_fb.data()),
      &unilib_);
  int lenient_label;
  ASSERT_TRUE(lenient_processor.SpanToLabel({5, 8}, tokens, &lenient_label));
  EXPECT_EQ(strict_label, lenient_label);
  ASSERT_TRUE(lenient_processor.SpanToLabel({6, 9}, tokens, &lenient_label));
  EXPECT_EQ(strict_label, lenient_label);
  ASSERT_TRUE(lenient_processor.SpanToLabel({5, 9}, tokens, &lenient_label));
  EXPECT_EQ(strict_label, lenient_label);

  // Spans that reach one codepoint beyond the token on either side are
  // rejected.
  ASSERT_TRUE(lenient_processor.SpanToLabel({4, 9}, tokens, &lenient_label));
  EXPECT_EQ(kInvalidLabel, lenient_label);
  ASSERT_TRUE(lenient_processor.SpanToLabel({5, 10}, tokens, &lenient_label));
  EXPECT_EQ(kInvalidLabel, lenient_label);

  // Two-token selections with a context of two tokens on each side.
  options.context_size = 2;
  options.max_selection_span = 2;
  flatbuffers::DetachedBuffer wide_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor wide_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(wide_fb.data()),
      &unilib_);
  tokens = wide_processor.Tokenize("zero, one, two, three, four");
  int reference_label;
  ASSERT_TRUE(wide_processor.SpanToLabel({6, 15}, tokens, &reference_label));
  EXPECT_NE(kInvalidLabel, reference_label);
  wide_processor.LabelToTokenSpan(reference_label, &token_span);
  EXPECT_EQ(1, token_span.first);
  EXPECT_EQ(0, token_span.second);

  // Variants of the same selection with trimmed boundaries give the same
  // label as the full one.
  int trimmed_label;
  ASSERT_TRUE(wide_processor.SpanToLabel({6, 14}, tokens, &trimmed_label));
  EXPECT_EQ(reference_label, trimmed_label);
  ASSERT_TRUE(wide_processor.SpanToLabel({6, 13}, tokens, &trimmed_label));
  EXPECT_EQ(reference_label, trimmed_label);
  ASSERT_TRUE(wide_processor.SpanToLabel({7, 13}, tokens, &trimmed_label));
  EXPECT_EQ(reference_label, trimmed_label);
}
513
// CenterTokenFromClick returns the index of the single token that contains
// the click, or kInvalidIndex when the click covers more than one token.
TEST_F(AnnotatorFeatureProcessorTest, CenterTokenFromClick) {
  // The click coincides exactly with the second token.
  const int aligned_index = internal::CenterTokenFromClick(
      {6, 11},
      {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
  EXPECT_EQ(aligned_index, 1);

  // The click lies inside the third token without covering all of it.
  const int contained_index = internal::CenterTokenFromClick(
      {13, 17},
      {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
  EXPECT_EQ(contained_index, 2);

  // The click stretches over the second and third tokens.
  const int spanning_index = internal::CenterTokenFromClick(
      {6, 17},
      {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
  EXPECT_EQ(spanning_index, kInvalidIndex);
}
535
// CenterTokenFromMiddleOfSelection picks a center token for a selection over
// a fixed list of five tokens, or reports an invalid index.
TEST_F(AnnotatorFeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
  // Three whole tokens selected (Token2..Token4), boundaries aligned.
  const int three_token_center = internal::CenterTokenFromMiddleOfSelection(
      {7, 27},
      {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
       Token("Token4", 21, 27), Token("Token5", 28, 34)});
  EXPECT_EQ(three_token_center, 2);

  // Exactly one whole token selected (Token4), boundaries aligned.
  const int single_token_center = internal::CenterTokenFromMiddleOfSelection(
      {21, 27},
      {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
       Token("Token4", 21, 27), Token("Token5", 28, 34)});
  EXPECT_EQ(single_token_center, 3);

  // Selection strictly inside Token5; it contains no complete token.
  const int inside_token_center = internal::CenterTokenFromMiddleOfSelection(
      {29, 33},
      {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
       Token("Token4", 21, 27), Token("Token5", 28, 34)});
  EXPECT_EQ(inside_token_center, kInvalidIndex);

  // Both boundaries inside tokens; two complete tokens lie in between.
  const int ragged_pair_center = internal::CenterTokenFromMiddleOfSelection(
      {3, 25},
      {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
       Token("Token4", 21, 27), Token("Token5", 28, 34)});
  EXPECT_EQ(ragged_pair_center, 1);

  // Start inside Token4, end aligned with Token5: one complete token.
  const int ragged_single_center = internal::CenterTokenFromMiddleOfSelection(
      {22, 34},
      {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
       Token("Token4", 21, 27), Token("Token5", 28, 34)});
  EXPECT_EQ(ragged_single_center, 4);

  // An empty token list has no center token.
  const int empty_center =
      internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
  EXPECT_EQ(empty_center, -1);
}
578
// Checks three things that depend on supported_codepoint_ranges:
//  1. SupportedCodepointsRatio over tokenized text,
//  2. IsCodepointInRanges at the edges of each configured range,
//  3. HasEnoughSupportedCodepoints for several minimum ratios.
TEST_F(AnnotatorFeatureProcessorTest, SupportedCodepointsRatio) {
  FeatureProcessorOptionsT options;
  options.context_size = 2;
  options.max_selection_span = 2;
  options.snap_label_span_boundaries_to_containing_tokens = false;
  options.feature_version = 2;
  options.embedding_size = 4;
  options.bounds_sensitive_features.reset(
      new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());
  options.bounds_sensitive_features->enabled = true;
  options.bounds_sensitive_features->num_tokens_before = 5;
  options.bounds_sensitive_features->num_tokens_inside_left = 3;
  options.bounds_sensitive_features->num_tokens_inside_right = 3;
  options.bounds_sensitive_features->num_tokens_after = 5;
  options.bounds_sensitive_features->include_inside_bag = true;
  options.bounds_sensitive_features->include_inside_length = true;

  // Tokenize on whitespace only (range starting at codepoint 32, the space).
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  auto& whitespace_range = options.tokenization_codepoint_config.back();
  whitespace_range->start = 32;
  whitespace_range->end = 33;
  whitespace_range->role =
      TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  // Appends one supported codepoint range with the given bounds.
  const auto add_supported_range = [&options](int start, int end) {
    options.supported_codepoint_ranges.emplace_back(new CodepointRangeT());
    auto& range = options.supported_codepoint_ranges.back();
    range->start = start;
    range->end = end;
  };
  add_supported_range(0, 128);
  add_supported_range(10000, 10001);
  add_supported_range(20000, 30000);

  flatbuffers::DetachedBuffer base_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor base_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(base_fb.data()),
      &unilib_);

  // 1. Ratio of supported codepoints within the token span {0, 3}.
  EXPECT_THAT(base_processor.SupportedCodepointsRatio(
                  {0, 3}, base_processor.Tokenize("aaa bbb ccc")),
              FloatEq(1.0));
  EXPECT_THAT(base_processor.SupportedCodepointsRatio(
                  {0, 3}, base_processor.Tokenize("aaa bbb ěěě")),
              FloatEq(2.0 / 3));
  EXPECT_THAT(base_processor.SupportedCodepointsRatio(
                  {0, 3}, base_processor.Tokenize("ěěě řřř ěěě")),
              FloatEq(0.0));
  EXPECT_THAT(base_processor.SupportedCodepointsRatio(
                  {0, 0}, base_processor.Tokenize("")),
              FloatEq(0.0));

  // 2. Range membership: the start of a range is included, its end is not.
  const auto& supported_ranges = base_processor.supported_codepoint_ranges_;
  EXPECT_FALSE(IsCodepointInRanges(-1, supported_ranges));
  EXPECT_TRUE(IsCodepointInRanges(0, supported_ranges));
  EXPECT_TRUE(IsCodepointInRanges(10, supported_ranges));
  EXPECT_TRUE(IsCodepointInRanges(127, supported_ranges));
  EXPECT_FALSE(IsCodepointInRanges(128, supported_ranges));
  EXPECT_FALSE(IsCodepointInRanges(9999, supported_ranges));
  EXPECT_TRUE(IsCodepointInRanges(10000, supported_ranges));
  EXPECT_FALSE(IsCodepointInRanges(10001, supported_ranges));
  EXPECT_TRUE(IsCodepointInRanges(25000, supported_ranges));

  // 3. Minimum-ratio thresholds. Only the last of the three tokens consists
  // of codepoints from a supported range.
  const std::vector<Token> tokens = {Token("ěěě", 0, 3), Token("řřř", 4, 7),
                                     Token("eee", 8, 11)};

  options.min_supported_codepoint_ratio = 0.0;
  flatbuffers::DetachedBuffer zero_ratio_fb =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor zero_ratio_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(zero_ratio_fb.data()),
      &unilib_);
  EXPECT_TRUE(zero_ratio_processor.HasEnoughSupportedCodepoints(
      tokens, /*token_span=*/{0, 3}));

  options.min_supported_codepoint_ratio = 0.2;
  flatbuffers::DetachedBuffer low_ratio_fb =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor low_ratio_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(low_ratio_fb.data()),
      &unilib_);
  EXPECT_TRUE(low_ratio_processor.HasEnoughSupportedCodepoints(
      tokens, /*token_span=*/{0, 3}));

  options.min_supported_codepoint_ratio = 0.5;
  flatbuffers::DetachedBuffer high_ratio_fb =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor high_ratio_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(high_ratio_fb.data()),
      &unilib_);
  EXPECT_FALSE(high_ratio_processor.HasEnoughSupportedCodepoints(
      tokens, /*token_span=*/{0, 3}));
}
689
// With extract_selection_mask_feature enabled, checks the last float of each
// 5-float group in the click-context features: 1.0 for the two positions
// whose tokens lie inside the selection span {4, 11}, 0.0 elsewhere.
TEST_F(AnnotatorFeatureProcessorTest, InSpanFeature) {
  FeatureProcessorOptionsT options;
  options.context_size = 2;
  options.max_selection_span = 2;
  options.snap_label_span_boundaries_to_containing_tokens = false;
  options.feature_version = 2;
  options.embedding_size = 4;
  options.extract_selection_mask_feature = true;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib_);

  std::unique_ptr<CachedFeatures> cached_features;

  FakeEmbeddingExecutor embedding_executor;

  const std::vector<Token> tokens = {Token("aaa", 0, 3), Token("bbb", 4, 7),
                                     Token("ccc", 8, 11), Token("ddd", 12, 15)};

  // Fatal on failure: cached_features is dereferenced right below, so the
  // test must stop here if extraction did not succeed.
  ASSERT_TRUE(feature_processor.ExtractFeatures(
      tokens, /*token_span=*/{0, 4},
      /*selection_span_for_feature=*/{4, 11}, &embedding_executor,
      /*embedding_cache=*/nullptr, /*feature_vector_size=*/5,
      &cached_features));
  std::vector<float> features;
  cached_features->AppendClickContextFeaturesForClick(1, &features);
  // 25 floats in total; with feature_vector_size 5, indices 4, 9, 14, 19 and
  // 24 are the last entry of each group of five.
  ASSERT_EQ(features.size(), 25);
  EXPECT_THAT(features[4], FloatEq(0.0));
  EXPECT_THAT(features[9], FloatEq(0.0));
  EXPECT_THAT(features[14], FloatEq(1.0));
  EXPECT_THAT(features[19], FloatEq(1.0));
  EXPECT_THAT(features[24], FloatEq(0.0));
}
725
// Verifies that ExtractFeatures both reads from and writes to the embedding
// cache: pre-populated entries must show up verbatim in the feature vector,
// and embeddings computed for the remaining tokens must be stored in it.
TEST_F(AnnotatorFeatureProcessorTest, EmbeddingCache) {
  FeatureProcessorOptionsT options;
  options.context_size = 2;
  options.max_selection_span = 2;
  options.snap_label_span_boundaries_to_containing_tokens = false;
  options.feature_version = 2;
  options.embedding_size = 4;
  options.bounds_sensitive_features.reset(
      new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());
  options.bounds_sensitive_features->enabled = true;
  options.bounds_sensitive_features->num_tokens_before = 3;
  options.bounds_sensitive_features->num_tokens_inside_left = 2;
  options.bounds_sensitive_features->num_tokens_inside_right = 2;
  options.bounds_sensitive_features->num_tokens_after = 3;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib_);

  std::unique_ptr<CachedFeatures> cached_features;

  FakeEmbeddingExecutor embedding_executor;

  const std::vector<Token> tokens = {
      Token("aaa", 0, 3),   Token("bbb", 4, 7),   Token("ccc", 8, 11),
      Token("ddd", 12, 15), Token("eee", 16, 19), Token("fff", 20, 23)};

  // We pre-populate the cache with dummy embeddings, to make sure they are
  // used when populating the features vector. The entries are keyed by the
  // padding span and by the spans of tokens "bbb" and "ddd".
  const std::vector<float> cached_padding_features = {10.0, -10.0, 10.0, -10.0};
  const std::vector<float> cached_features1 = {1.0, 2.0, 3.0, 4.0};
  const std::vector<float> cached_features2 = {5.0, 6.0, 7.0, 8.0};
  FeatureProcessor::EmbeddingCache embedding_cache = {
      {{kInvalidIndex, kInvalidIndex}, cached_padding_features},
      {{4, 7}, cached_features1},
      {{12, 15}, cached_features2},
  };

  // Fatal on failure: cached_features is dereferenced right below, so the
  // test must stop here if extraction did not succeed.
  ASSERT_TRUE(feature_processor.ExtractFeatures(
      tokens, /*token_span=*/{0, 6},
      /*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},
      &embedding_executor, &embedding_cache, /*feature_vector_size=*/4,
      &cached_features));
  std::vector<float> features;
  cached_features->AppendBoundsSensitiveFeaturesForSpan({2, 4}, &features);
  // 40 floats = 10 slots (3 before + 2 + 2 inside + 3 after) of 4 floats each.
  ASSERT_EQ(features.size(), 40);
  // Check that the dummy embeddings were used.
  EXPECT_THAT(Subvector(features, 0, 4),
              ElementsAreFloat(cached_padding_features));
  EXPECT_THAT(Subvector(features, 8, 12), ElementsAreFloat(cached_features1));
  EXPECT_THAT(Subvector(features, 16, 20), ElementsAreFloat(cached_features2));
  EXPECT_THAT(Subvector(features, 24, 28), ElementsAreFloat(cached_features2));
  EXPECT_THAT(Subvector(features, 36, 40),
              ElementsAreFloat(cached_padding_features));
  // Check that the real embeddings were cached: the 3 pre-populated entries
  // plus one new entry for each of the 4 tokens that had none.
  EXPECT_EQ(embedding_cache.size(), 7);
  EXPECT_THAT(Subvector(features, 4, 8),
              ElementsAreFloat(embedding_cache.at({0, 3})));
  EXPECT_THAT(Subvector(features, 12, 16),
              ElementsAreFloat(embedding_cache.at({8, 11})));
  EXPECT_THAT(Subvector(features, 20, 24),
              ElementsAreFloat(embedding_cache.at({8, 11})));
  EXPECT_THAT(Subvector(features, 28, 32),
              ElementsAreFloat(embedding_cache.at({16, 19})));
  EXPECT_THAT(Subvector(features, 32, 36),
              ElementsAreFloat(embedding_cache.at({20, 23})));
}
794
// Exercises internal::StripOrPadTokens with a {0, 0} relative click span and a
// context size of 2. In every scenario below the output is a five-token window
// with the clicked token at index 2; positions that fall outside the input are
// filled with default-constructed tokens.
TEST_F(AnnotatorFeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
  // Thirteen tokens whose text is their own position in the sequence.
  const std::vector<Token> all_tokens = {
      Token("0", 0, 0),  Token("1", 0, 0),  Token("2", 0, 0),
      Token("3", 0, 0),  Token("4", 0, 0),  Token("5", 0, 0),
      Token("6", 0, 0),  Token("7", 0, 0),  Token("8", 0, 0),
      Token("9", 0, 0),  Token("10", 0, 0), Token("11", 0, 0),
      Token("12", 0, 0)};

  // Click on the very first token: two padding tokens appear on the left.
  {
    std::vector<Token> window = all_tokens;
    int click_pos = 0;
    internal::StripOrPadTokens({0, 0}, 2, &window, &click_pos);
    // clang-format off
    const std::vector<Token> expected = {
        Token(),
        Token(),
        Token("0", 0, 0),
        Token("1", 0, 0),
        Token("2", 0, 0)};
    // clang-format on
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click_pos, 2);
  }

  // Click on the third token (index 2): there is enough context on both
  // sides, so no padding is needed.
  {
    std::vector<Token> window = all_tokens;
    int click_pos = 2;
    internal::StripOrPadTokens({0, 0}, 2, &window, &click_pos);
    // clang-format off
    const std::vector<Token> expected = {
        Token("0", 0, 0),
        Token("1", 0, 0),
        Token("2", 0, 0),
        Token("3", 0, 0),
        Token("4", 0, 0)};
    // clang-format on
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click_pos, 2);
  }

  // Click on the last token: two padding tokens appear on the right.
  {
    std::vector<Token> window = all_tokens;
    int click_pos = 12;
    internal::StripOrPadTokens({0, 0}, 2, &window, &click_pos);
    // clang-format off
    const std::vector<Token> expected = {
        Token("10", 0, 0),
        Token("11", 0, 0),
        Token("12", 0, 0),
        Token(),
        Token()};
    // clang-format on
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click_pos, 2);
  }
}
844
// Exercises internal::StripOrPadTokens with non-zero relative click spans and
// a context size of 2. The expectations below show the window growing by the
// relative click span on each side, with default-constructed tokens used as
// padding where the input runs out.
TEST_F(AnnotatorFeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
  // Thirteen tokens whose text is their own position in the sequence.
  const std::vector<Token> all_tokens = {
      Token("0", 0, 0),  Token("1", 0, 0),  Token("2", 0, 0),
      Token("3", 0, 0),  Token("4", 0, 0),  Token("5", 0, 0),
      Token("6", 0, 0),  Token("7", 0, 0),  Token("8", 0, 0),
      Token("9", 0, 0),  Token("10", 0, 0), Token("11", 0, 0),
      Token("12", 0, 0)};

  // Click on the first token with a {2, 3} relative span: the left side is
  // padded only up to context_size (two padding tokens).
  {
    std::vector<Token> window = all_tokens;
    int click_pos = 0;
    internal::StripOrPadTokens({2, 3}, 2, &window, &click_pos);
    // clang-format off
    const std::vector<Token> expected = {
        Token(),
        Token(),
        Token("0", 0, 0),
        Token("1", 0, 0),
        Token("2", 0, 0),
        Token("3", 0, 0),
        Token("4", 0, 0),
        Token("5", 0, 0)};
    // clang-format on
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click_pos, 2);
  }

  // Click in the middle (index 6) with a {3, 1} relative span: enough real
  // tokens exist on both sides, so the window contains no padding.
  {
    std::vector<Token> window = all_tokens;
    int click_pos = 6;
    internal::StripOrPadTokens({3, 1}, 2, &window, &click_pos);
    // clang-format off
    const std::vector<Token> expected = {
        Token("1", 0, 0),
        Token("2", 0, 0),
        Token("3", 0, 0),
        Token("4", 0, 0),
        Token("5", 0, 0),
        Token("6", 0, 0),
        Token("7", 0, 0),
        Token("8", 0, 0),
        Token("9", 0, 0)};
    // clang-format on
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click_pos, 5);
  }

  // Click near the end (index 11, the second-to-last token) with a {3, 1}
  // relative span: the right side is padded up to context_size.
  {
    std::vector<Token> window = all_tokens;
    int click_pos = 11;
    internal::StripOrPadTokens({3, 1}, 2, &window, &click_pos);
    // clang-format off
    const std::vector<Token> expected = {
        Token("6", 0, 0),
        Token("7", 0, 0),
        Token("8", 0, 0),
        Token("9", 0, 0),
        Token("10", 0, 0),
        Token("11", 0, 0),
        Token("12", 0, 0),
        Token(),
        Token()};
    // clang-format on
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click_pos, 5);
  }
}
906
// Configures '.', ',', '[' and ']' as ignored span boundary codepoints, then
// checks (a) how many such codepoints CountIgnoredSpanBoundaryCodepoints
// reports at the start / end of various iterator ranges and (b) the spans
// returned by StripBoundaryCodepoints.
TEST_F(AnnotatorFeatureProcessorTest, IgnoredSpanBoundaryCodepoints) {
  FeatureProcessorOptionsT options;
  for (const char ignored : {'.', ',', '[', ']'}) {
    options.ignored_span_boundary_codepoints.push_back(ignored);
  }

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib_);

  // Shorthands for counting from the beginning / from the end of a range.
  const auto leading_ignored =
      [&feature_processor](const UnicodeText::const_iterator& first,
                           const UnicodeText::const_iterator& last) {
        return feature_processor.CountIgnoredSpanBoundaryCodepoints(
            first, last, /*count_from_beginning=*/true);
      };
  const auto trailing_ignored =
      [&feature_processor](const UnicodeText::const_iterator& first,
                           const UnicodeText::const_iterator& last) {
        return feature_processor.CountIgnoredSpanBoundaryCodepoints(
            first, last, /*count_from_beginning=*/false);
      };

  // The UnicodeText objects below are created with do_copy=false, so each
  // backing std::string is kept alive as a named local.

  // No ignored codepoints anywhere.
  const std::string plain_utf8 = "ěščř";
  const UnicodeText plain = UTF8ToUnicodeText(plain_utf8, /*do_copy=*/false);
  EXPECT_EQ(leading_ignored(plain.begin(), plain.end()), 0);
  EXPECT_EQ(trailing_ignored(plain.begin(), plain.end()), 0);

  // Ignored codepoints only at the front.
  const std::string prefixed_utf8 = ".,abčd";
  const UnicodeText prefixed =
      UTF8ToUnicodeText(prefixed_utf8, /*do_copy=*/false);
  EXPECT_EQ(leading_ignored(prefixed.begin(), prefixed.end()), 2);
  EXPECT_EQ(trailing_ignored(prefixed.begin(), prefixed.end()), 0);

  // Two ignored codepoints on each side.
  const std::string wrapped_utf8 = ".,abčd[]";
  const UnicodeText wrapped =
      UTF8ToUnicodeText(wrapped_utf8, /*do_copy=*/false);
  EXPECT_EQ(leading_ignored(wrapped.begin(), wrapped.end()), 2);
  EXPECT_EQ(trailing_ignored(wrapped.begin(), wrapped.end()), 2);

  // One ignored codepoint on each side.
  const std::string bracketed_utf8 = "[abčd]";
  const UnicodeText bracketed =
      UTF8ToUnicodeText(bracketed_utf8, /*do_copy=*/false);
  EXPECT_EQ(leading_ignored(bracketed.begin(), bracketed.end()), 1);
  EXPECT_EQ(trailing_ignored(bracketed.begin(), bracketed.end()), 1);

  // Empty text.
  const std::string empty_utf8 = "";
  const UnicodeText empty = UTF8ToUnicodeText(empty_utf8, /*do_copy=*/false);
  EXPECT_EQ(leading_ignored(empty.begin(), empty.end()), 0);
  EXPECT_EQ(trailing_ignored(empty.begin(), empty.end()), 0);

  // Range that starts six codepoints into the text, nothing to ignore.
  const std::string offset_plain_utf8 = "012345ěščř";
  const UnicodeText offset_plain =
      UTF8ToUnicodeText(offset_plain_utf8, /*do_copy=*/false);
  UnicodeText::const_iterator offset_plain_start = offset_plain.begin();
  std::advance(offset_plain_start, 6);
  EXPECT_EQ(leading_ignored(offset_plain_start, offset_plain.end()), 0);
  EXPECT_EQ(trailing_ignored(offset_plain_start, offset_plain.end()), 0);

  // Sub-ranges of a text with ".," in the middle: one range starts right at
  // the punctuation, the other ends right after it.
  const std::string offset_punct_utf8 = "012345.,ěščř";
  const UnicodeText offset_punct =
      UTF8ToUnicodeText(offset_punct_utf8, /*do_copy=*/false);
  UnicodeText::const_iterator punct_start = offset_punct.begin();
  std::advance(punct_start, 6);
  EXPECT_EQ(leading_ignored(punct_start, offset_punct.end()), 2);
  UnicodeText::const_iterator punct_stop = offset_punct.begin();
  std::advance(punct_stop, 8);
  EXPECT_EQ(trailing_ignored(offset_punct.begin(), punct_stop), 2);

  const std::string world_context = "Hello [[[Wořld]] or not?";
  // Span covering the whole text: nothing is stripped.
  EXPECT_EQ(feature_processor.StripBoundaryCodepoints(world_context, {0, 24}),
            CodepointSpan(0, 24));
  // Span covering the bracketed word: the brackets are stripped.
  EXPECT_EQ(feature_processor.StripBoundaryCodepoints(world_context, {6, 16}),
            CodepointSpan(9, 14));
  // Span consisting solely of ignored codepoints: everything is stripped.
  const std::string brackets_only_context = "Hello [[[]] or not?";
  EXPECT_EQ(
      feature_processor.StripBoundaryCodepoints(brackets_only_context, {6, 11}),
      CodepointSpan(6, 6));
  // Empty context and empty span.
  const std::string empty_context = "";
  EXPECT_EQ(feature_processor.StripBoundaryCodepoints(empty_context, {0, 0}),
            CodepointSpan(0, 0));
}
1018
// Checks the token spans that CodepointSpanToTokenSpan produces for codepoint
// spans over three tokens, both with the default behaviour and with the
// optional third argument set to true (snapping to containing tokens).
TEST_F(AnnotatorFeatureProcessorTest, CodepointSpanToTokenSpan) {
  const std::vector<Token> tokens{Token("Hělló", 0, 5),
                                  Token("fěěbař@google.com", 6, 23),
                                  Token("heře!", 24, 29)};

  // Shorthands for the two call variants under test.
  const auto plain = [&tokens](const CodepointSpan& span) {
    return CodepointSpanToTokenSpan(tokens, span);
  };
  const auto snapped = [&tokens](const CodepointSpan& span) {
    return CodepointSpanToTokenSpan(tokens, span, true);
  };

  // Codepoint spans whose boundaries coincide with token boundaries.
  EXPECT_EQ(TokenSpan(0, 1), plain({0, 5}));
  EXPECT_EQ(TokenSpan(1, 2), plain({6, 23}));
  EXPECT_EQ(TokenSpan(2, 3), plain({24, 29}));
  EXPECT_EQ(TokenSpan(0, 2), plain({0, 23}));
  EXPECT_EQ(TokenSpan(1, 3), plain({6, 29}));
  EXPECT_EQ(TokenSpan(0, 3), plain({0, 29}));

  // For those same spans, snapping to containing tokens changes nothing.
  EXPECT_EQ(TokenSpan(0, 1), snapped({0, 5}));
  EXPECT_EQ(TokenSpan(1, 2), snapped({6, 23}));
  EXPECT_EQ(TokenSpan(2, 3), snapped({24, 29}));
  EXPECT_EQ(TokenSpan(0, 2), snapped({0, 23}));
  EXPECT_EQ(TokenSpan(1, 3), snapped({6, 29}));
  EXPECT_EQ(TokenSpan(0, 3), snapped({0, 29}));

  // Boundaries that fall inside the first and last tokens: without snapping
  // only the middle token is returned, with snapping all three are.
  EXPECT_EQ(TokenSpan(1, 2), plain({1, 28}));
  EXPECT_EQ(TokenSpan(0, 3), snapped({1, 28}));

  // The span touches the neighbouring tokens without overlapping them, so
  // only the middle token is returned in both variants.
  EXPECT_EQ(TokenSpan(1, 2), plain({5, 24}));
  EXPECT_EQ(TokenSpan(1, 2), snapped({5, 24}));
}
1048
1049 } // namespace
1050 } // namespace libtextclassifier3
1051