xref: /aosp_15_r20/external/libtextclassifier/native/annotator/feature-processor_test.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "annotator/feature-processor.h"
18 
19 #include "annotator/model-executor.h"
20 #include "utils/tensor-view.h"
21 #include "utils/utf8/unicodetext.h"
22 #include "gmock/gmock.h"
23 #include "gtest/gtest.h"
24 
25 namespace libtextclassifier3 {
26 namespace {
27 
28 using testing::ElementsAreArray;
29 using testing::FloatEq;
30 using testing::Matcher;
31 
PackFeatureProcessorOptions(const FeatureProcessorOptionsT & options)32 flatbuffers::DetachedBuffer PackFeatureProcessorOptions(
33     const FeatureProcessorOptionsT& options) {
34   flatbuffers::FlatBufferBuilder builder;
35   builder.Finish(CreateFeatureProcessorOptions(builder, &options));
36   return builder.Release();
37 }
38 
// Returns a copy of the elements of `vector` at positions [start, end).
// No bounds checking is performed; the caller must pass offsets that lie
// within the vector.
template <typename T>
std::vector<T> Subvector(const std::vector<T>& vector, int start, int end) {
  const auto first = vector.begin() + start;
  const auto last = vector.begin() + end;
  return std::vector<T>(first, last);
}
43 
ElementsAreFloat(const std::vector<float> & values)44 Matcher<std::vector<float>> ElementsAreFloat(const std::vector<float>& values) {
45   std::vector<Matcher<float>> matchers;
46   for (const float value : values) {
47     matchers.push_back(FloatEq(value));
48   }
49   return ElementsAreArray(matchers);
50 }
51 
52 class TestingFeatureProcessor : public FeatureProcessor {
53  public:
54   using FeatureProcessor::CountIgnoredSpanBoundaryCodepoints;
55   using FeatureProcessor::FeatureProcessor;
56   using FeatureProcessor::SpanToLabel;
57   using FeatureProcessor::StripTokensFromOtherLines;
58   using FeatureProcessor::supported_codepoint_ranges_;
59   using FeatureProcessor::SupportedCodepointsRatio;
60 };
61 
62 // EmbeddingExecutor that always returns features based on
63 class FakeEmbeddingExecutor : public EmbeddingExecutor {
64  public:
AddEmbedding(const TensorView<int> & sparse_features,float * dest,int dest_size) const65   bool AddEmbedding(const TensorView<int>& sparse_features, float* dest,
66                     int dest_size) const override {
67     TC3_CHECK_GE(dest_size, 4);
68     EXPECT_EQ(sparse_features.size(), 1);
69     dest[0] = sparse_features.data()[0];
70     dest[1] = sparse_features.data()[0];
71     dest[2] = -sparse_features.data()[0];
72     dest[3] = -sparse_features.data()[0];
73     return true;
74   }
75 
76  private:
77   std::vector<float> storage_;
78 };
79 
80 class AnnotatorFeatureProcessorTest : public ::testing::Test {
81  protected:
AnnotatorFeatureProcessorTest()82   AnnotatorFeatureProcessorTest() : INIT_UNILIB_FOR_TESTING(unilib_) {}
83   UniLib unilib_;
84 };
85 
// A selection lying strictly inside one token splits that token into three
// pieces: the part before, the selected part, and the part after.
TEST_F(AnnotatorFeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
  // clang-format off
  std::vector<Token> actual = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};
  const std::vector<Token> expected = {Token("Hělló", 0, 5),
                                       Token("fěě", 6, 9),
                                       Token("bař", 9, 12),
                                       Token("@google.com", 12, 23),
                                       Token("heře!", 24, 29)};
  // clang-format on

  internal::SplitTokensOnSelectionBoundaries({9, 12}, &actual);

  EXPECT_THAT(actual, ElementsAreArray(expected));
}
102 
// A selection that starts at a token's first codepoint and ends inside it
// splits the token once, at the selection end.
TEST_F(AnnotatorFeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
  // clang-format off
  std::vector<Token> actual = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};
  const std::vector<Token> expected = {Token("Hělló", 0, 5),
                                       Token("fěěbař", 6, 12),
                                       Token("@google.com", 12, 23),
                                       Token("heře!", 24, 29)};
  // clang-format on

  internal::SplitTokensOnSelectionBoundaries({6, 12}, &actual);

  EXPECT_THAT(actual, ElementsAreArray(expected));
}
118 
// A selection that starts inside a token and ends at its last codepoint
// splits the token once, at the selection start.
TEST_F(AnnotatorFeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
  // clang-format off
  std::vector<Token> actual = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};
  const std::vector<Token> expected = {Token("Hělló", 0, 5),
                                       Token("fěě", 6, 9),
                                       Token("bař@google.com", 9, 23),
                                       Token("heře!", 24, 29)};
  // clang-format on

  internal::SplitTokensOnSelectionBoundaries({9, 23}, &actual);

  EXPECT_THAT(actual, ElementsAreArray(expected));
}
134 
// A selection that coincides exactly with one token leaves the token list
// unchanged.
TEST_F(AnnotatorFeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
  // clang-format off
  std::vector<Token> actual = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};
  const std::vector<Token> expected = {Token("Hělló", 0, 5),
                                       Token("fěěbař@google.com", 6, 23),
                                       Token("heře!", 24, 29)};
  // clang-format on

  internal::SplitTokensOnSelectionBoundaries({6, 23}, &actual);

  EXPECT_THAT(actual, ElementsAreArray(expected));
}
149 
// A selection that starts inside one token and ends inside the next splits
// both tokens, each at the boundary that falls within it.
TEST_F(AnnotatorFeatureProcessorTest,
       SplitTokensOnSelectionBoundariesCrossToken) {
  // clang-format off
  std::vector<Token> actual = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};
  const std::vector<Token> expected = {Token("Hě", 0, 2),
                                       Token("lló", 2, 5),
                                       Token("fěě", 6, 9),
                                       Token("bař@google.com", 9, 23),
                                       Token("heře!", 24, 29)};
  // clang-format on

  internal::SplitTokensOnSelectionBoundaries({2, 9}, &actual);

  EXPECT_THAT(actual, ElementsAreArray(expected));
}
167 
// With only_use_line_with_click enabled, a click on the first line keeps only
// the tokens of the first line.
TEST_F(AnnotatorFeatureProcessorTest, KeepLineWithClickFirst) {
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib_);

  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {0, 5};
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17),
                               Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  const std::vector<Token> first_line = {Token("Fiřst", 0, 5),
                                         Token("Lině", 6, 10)};
  // clang-format on

  // Only the first line survives.
  processor.StripTokensFromOtherLines(text, click, &tokens);
  EXPECT_THAT(tokens, ElementsAreArray(first_line));
}
192 
// With only_use_line_with_click enabled, a click on the second line keeps only
// the tokens of the second line.
TEST_F(AnnotatorFeatureProcessorTest, KeepLineWithClickSecond) {
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib_);

  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {18, 22};
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17),
                               Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  const std::vector<Token> second_line = {Token("Sěcond", 11, 17),
                                          Token("Lině", 18, 22)};
  // clang-format on

  // Only the second line survives.
  processor.StripTokensFromOtherLines(text, click, &tokens);
  EXPECT_THAT(tokens, ElementsAreArray(second_line));
}
217 
// With only_use_line_with_click enabled, a click on the third line keeps only
// the tokens of the third line.
TEST_F(AnnotatorFeatureProcessorTest, KeepLineWithClickThird) {
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib_);

  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {24, 33};
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17),
                               Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  const std::vector<Token> third_line = {Token("Thiřd", 23, 28),
                                         Token("Lině", 29, 33)};
  // clang-format on

  // Only the third line survives.
  processor.StripTokensFromOtherLines(text, click, &tokens);
  EXPECT_THAT(tokens, ElementsAreArray(third_line));
}
242 
// The first two segments are separated by '|' instead of '\n'. The option that
// controls pipe handling is left at its default here; the expectation shows
// the pipe acting as a line break, so only the middle segment is kept.
TEST_F(AnnotatorFeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib_);

  const std::string text = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
  const CodepointSpan click = {18, 22};
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17),
                               Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  const std::vector<Token> middle_segment = {Token("Sěcond", 11, 17),
                                             Token("Lině", 18, 22)};
  // clang-format on

  // Only the segment between the pipe and the newline survives.
  processor.StripTokensFromOtherLines(text, click, &tokens);
  EXPECT_THAT(tokens, ElementsAreArray(middle_segment));
}
267 
// With use_pipe_character_for_newline disabled, '|' is not a line break, so
// everything up to the '\n' counts as the clicked line and is kept.
TEST_F(AnnotatorFeatureProcessorTest,
       KeepLineWithClickAndDoNotUsePipeAsNewLineCharacter) {
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  options.use_pipe_character_for_newline = false;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib_);

  const std::string text = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
  const CodepointSpan click = {18, 22};
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině|Sěcond", 6, 17),
                               Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  const std::vector<Token> up_to_newline = {Token("Fiřst", 0, 5),
                                            Token("Lině|Sěcond", 6, 17),
                                            Token("Lině", 18, 22)};
  // clang-format on

  // Every token before the newline survives, including the one with the pipe.
  processor.StripTokensFromOtherLines(text, click, &tokens);
  EXPECT_THAT(tokens, ElementsAreArray(up_to_newline));
}
294 
// With use_pipe_character_for_newline enabled, SplitContext treats both '|'
// and '\n' as line separators, yielding three lines for the sample text.
TEST_F(AnnotatorFeatureProcessorTest, ShouldSplitLinesOnPipe) {
  FeatureProcessorOptionsT options;
  options.use_pipe_character_for_newline = true;
  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib_);

  const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
  // do_copy=false: the UnicodeText refers to `context`, which stays alive for
  // the rest of the test.
  const UnicodeText context_unicode = UTF8ToUnicodeText(context,
                                                        /*do_copy=*/false);

  const std::vector<UnicodeTextRange>& lines = feature_processor.SplitContext(
      context_unicode, options.use_pipe_character_for_newline);
  // Fatal assertion: the element accesses below would be out of bounds if
  // fewer lines were returned.
  ASSERT_EQ(lines.size(), 3);
  EXPECT_EQ(UnicodeText::UTF8Substring(lines[0].first, lines[0].second),
            "Fiřst Lině");
  EXPECT_EQ(UnicodeText::UTF8Substring(lines[1].first, lines[1].second),
            "Sěcond Lině");
  EXPECT_EQ(UnicodeText::UTF8Substring(lines[2].first, lines[2].second),
            "Thiřd Lině");
}
317 
// With use_pipe_character_for_newline disabled, SplitContext splits on '\n'
// only, so the '|' stays inside the first of the two resulting lines.
TEST_F(AnnotatorFeatureProcessorTest, ShouldNotSplitLinesOnPipe) {
  FeatureProcessorOptionsT options;
  options.use_pipe_character_for_newline = false;
  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib_);

  const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
  // do_copy=false: the UnicodeText refers to `context`, which stays alive for
  // the rest of the test.
  const UnicodeText context_unicode = UTF8ToUnicodeText(context,
                                                        /*do_copy=*/false);

  const std::vector<UnicodeTextRange>& lines = feature_processor.SplitContext(
      context_unicode, options.use_pipe_character_for_newline);
  // Fatal assertion: the element accesses below would be out of bounds if
  // fewer lines were returned.
  ASSERT_EQ(lines.size(), 2);
  EXPECT_EQ(UnicodeText::UTF8Substring(lines[0].first, lines[0].second),
            "Fiřst Lině|Sěcond Lině");
  EXPECT_EQ(UnicodeText::UTF8Substring(lines[1].first, lines[1].second),
            "Thiřd Lině");
}
338 
// When the click span reaches across line breaks, no tokens are stripped even
// though only_use_line_with_click is enabled.
TEST_F(AnnotatorFeatureProcessorTest, KeepLineWithCrosslineClick) {
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib_);

  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {5, 23};
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině", 6, 10),
                               Token("Sěcond", 18, 23),
                               Token("Lině", 19, 23),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  // clang-format on

  // Snapshot of the input; the call below is expected to leave it untouched.
  const std::vector<Token> untouched = tokens;

  processor.StripTokensFromOtherLines(text, click, &tokens);
  EXPECT_THAT(tokens, ElementsAreArray(untouched));
}
365 
TEST_F(AnnotatorFeatureProcessorTest,SpanToLabel)366 TEST_F(AnnotatorFeatureProcessorTest, SpanToLabel) {
367   FeatureProcessorOptionsT options;
368   options.context_size = 1;
369   options.max_selection_span = 1;
370   options.snap_label_span_boundaries_to_containing_tokens = false;
371 
372   options.tokenization_codepoint_config.emplace_back(
373       new TokenizationCodepointRangeT());
374   auto& config = options.tokenization_codepoint_config.back();
375   config->start = 32;
376   config->end = 33;
377   config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
378 
379   flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
380   TestingFeatureProcessor feature_processor(
381       flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
382       &unilib_);
383   std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
384   ASSERT_EQ(3, tokens.size());
385   int label;
386   ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
387   EXPECT_EQ(kInvalidLabel, label);
388   ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
389   EXPECT_NE(kInvalidLabel, label);
390   TokenSpan token_span;
391   feature_processor.LabelToTokenSpan(label, &token_span);
392   EXPECT_EQ(0, token_span.first);
393   EXPECT_EQ(0, token_span.second);
394 
395   // Reconfigure with snapping enabled.
396   options.snap_label_span_boundaries_to_containing_tokens = true;
397   flatbuffers::DetachedBuffer options2_fb =
398       PackFeatureProcessorOptions(options);
399   TestingFeatureProcessor feature_processor2(
400       flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),
401       &unilib_);
402   int label2;
403   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
404   EXPECT_EQ(label, label2);
405   ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
406   EXPECT_EQ(label, label2);
407   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
408   EXPECT_EQ(label, label2);
409 
410   // Cross a token boundary.
411   ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
412   EXPECT_EQ(kInvalidLabel, label2);
413   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
414   EXPECT_EQ(kInvalidLabel, label2);
415 
416   // Multiple tokens.
417   options.context_size = 2;
418   options.max_selection_span = 2;
419   flatbuffers::DetachedBuffer options3_fb =
420       PackFeatureProcessorOptions(options);
421   TestingFeatureProcessor feature_processor3(
422       flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),
423       &unilib_);
424   tokens = feature_processor3.Tokenize("zero, one, two, three, four");
425   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
426   EXPECT_NE(kInvalidLabel, label2);
427   feature_processor3.LabelToTokenSpan(label2, &token_span);
428   EXPECT_EQ(1, token_span.first);
429   EXPECT_EQ(0, token_span.second);
430 
431   int label3;
432   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
433   EXPECT_EQ(label2, label3);
434   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
435   EXPECT_EQ(label2, label3);
436   ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
437   EXPECT_EQ(label2, label3);
438 }
439 
TEST_F(AnnotatorFeatureProcessorTest,SpanToLabelIgnoresPunctuation)440 TEST_F(AnnotatorFeatureProcessorTest, SpanToLabelIgnoresPunctuation) {
441   FeatureProcessorOptionsT options;
442   options.context_size = 1;
443   options.max_selection_span = 1;
444   options.snap_label_span_boundaries_to_containing_tokens = false;
445 
446   options.tokenization_codepoint_config.emplace_back(
447       new TokenizationCodepointRangeT());
448   auto& config = options.tokenization_codepoint_config.back();
449   config->start = 32;
450   config->end = 33;
451   config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
452 
453   flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
454   TestingFeatureProcessor feature_processor(
455       flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
456       &unilib_);
457   std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
458   ASSERT_EQ(3, tokens.size());
459   int label;
460   ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
461   EXPECT_EQ(kInvalidLabel, label);
462   ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
463   EXPECT_NE(kInvalidLabel, label);
464   TokenSpan token_span;
465   feature_processor.LabelToTokenSpan(label, &token_span);
466   EXPECT_EQ(0, token_span.first);
467   EXPECT_EQ(0, token_span.second);
468 
469   // Reconfigure with snapping enabled.
470   options.snap_label_span_boundaries_to_containing_tokens = true;
471   flatbuffers::DetachedBuffer options2_fb =
472       PackFeatureProcessorOptions(options);
473   TestingFeatureProcessor feature_processor2(
474       flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),
475       &unilib_);
476   int label2;
477   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
478   EXPECT_EQ(label, label2);
479   ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
480   EXPECT_EQ(label, label2);
481   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
482   EXPECT_EQ(label, label2);
483 
484   // Cross a token boundary.
485   ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
486   EXPECT_EQ(kInvalidLabel, label2);
487   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
488   EXPECT_EQ(kInvalidLabel, label2);
489 
490   // Multiple tokens.
491   options.context_size = 2;
492   options.max_selection_span = 2;
493   flatbuffers::DetachedBuffer options3_fb =
494       PackFeatureProcessorOptions(options);
495   TestingFeatureProcessor feature_processor3(
496       flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),
497       &unilib_);
498   tokens = feature_processor3.Tokenize("zero, one, two, three, four");
499   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
500   EXPECT_NE(kInvalidLabel, label2);
501   feature_processor3.LabelToTokenSpan(label2, &token_span);
502   EXPECT_EQ(1, token_span.first);
503   EXPECT_EQ(0, token_span.second);
504 
505   int label3;
506   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
507   EXPECT_EQ(label2, label3);
508   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
509   EXPECT_EQ(label2, label3);
510   ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
511   EXPECT_EQ(label2, label3);
512 }
513 
// Checks which token index internal::CenterTokenFromClick reports for click
// spans that match a token exactly, lie inside a token, or cover two tokens.
TEST_F(AnnotatorFeatureProcessorTest, CenterTokenFromClick) {
  // The same three tokens are used for every case below.
  const std::vector<Token> tokens = {
      Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)};

  // Exactly aligned indices.
  const int aligned_index = internal::CenterTokenFromClick({6, 11}, tokens);
  EXPECT_EQ(aligned_index, 1);

  // Click is contained in a token.
  const int contained_index = internal::CenterTokenFromClick({13, 17}, tokens);
  EXPECT_EQ(contained_index, 2);

  // Click spans two tokens.
  const int spanning_index = internal::CenterTokenFromClick({6, 17}, tokens);
  EXPECT_EQ(spanning_index, kInvalidIndex);
}
535 
// Checks which token index internal::CenterTokenFromMiddleOfSelection reports
// for selections of various lengths and alignments over five tokens, and for
// an empty token list.
TEST_F(AnnotatorFeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
  // The same five tokens are used for every non-empty case below.
  const std::vector<Token> tokens = {
      Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
      Token("Token4", 21, 27), Token("Token5", 28, 34)};

  // Selection of length 3. Exactly aligned indices.
  const int three_token_index =
      internal::CenterTokenFromMiddleOfSelection({7, 27}, tokens);
  EXPECT_EQ(three_token_index, 2);

  // Selection of length 1 token. Exactly aligned indices.
  const int one_token_index =
      internal::CenterTokenFromMiddleOfSelection({21, 27}, tokens);
  EXPECT_EQ(one_token_index, 3);

  // Selection marks sub-token range, with no tokens in it.
  const int sub_token_index =
      internal::CenterTokenFromMiddleOfSelection({29, 33}, tokens);
  EXPECT_EQ(sub_token_index, kInvalidIndex);

  // Selection of length 2. Sub-token indices.
  const int unaligned_two_index =
      internal::CenterTokenFromMiddleOfSelection({3, 25}, tokens);
  EXPECT_EQ(unaligned_two_index, 1);

  // Selection of length 1. Sub-token indices.
  const int unaligned_one_index =
      internal::CenterTokenFromMiddleOfSelection({22, 34}, tokens);
  EXPECT_EQ(unaligned_one_index, 4);

  // Some invalid ones.
  const int empty_list_index =
      internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
  EXPECT_EQ(empty_list_index, -1);
}
578 
TEST_F(AnnotatorFeatureProcessorTest,SupportedCodepointsRatio)579 TEST_F(AnnotatorFeatureProcessorTest, SupportedCodepointsRatio) {
580   FeatureProcessorOptionsT options;
581   options.context_size = 2;
582   options.max_selection_span = 2;
583   options.snap_label_span_boundaries_to_containing_tokens = false;
584   options.feature_version = 2;
585   options.embedding_size = 4;
586   options.bounds_sensitive_features.reset(
587       new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());
588   options.bounds_sensitive_features->enabled = true;
589   options.bounds_sensitive_features->num_tokens_before = 5;
590   options.bounds_sensitive_features->num_tokens_inside_left = 3;
591   options.bounds_sensitive_features->num_tokens_inside_right = 3;
592   options.bounds_sensitive_features->num_tokens_after = 5;
593   options.bounds_sensitive_features->include_inside_bag = true;
594   options.bounds_sensitive_features->include_inside_length = true;
595 
596   options.tokenization_codepoint_config.emplace_back(
597       new TokenizationCodepointRangeT());
598   auto& config = options.tokenization_codepoint_config.back();
599   config->start = 32;
600   config->end = 33;
601   config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
602 
603   {
604     options.supported_codepoint_ranges.emplace_back(new CodepointRangeT());
605     auto& range = options.supported_codepoint_ranges.back();
606     range->start = 0;
607     range->end = 128;
608   }
609 
610   {
611     options.supported_codepoint_ranges.emplace_back(new CodepointRangeT());
612     auto& range = options.supported_codepoint_ranges.back();
613     range->start = 10000;
614     range->end = 10001;
615   }
616 
617   {
618     options.supported_codepoint_ranges.emplace_back(new CodepointRangeT());
619     auto& range = options.supported_codepoint_ranges.back();
620     range->start = 20000;
621     range->end = 30000;
622   }
623 
624   flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
625   TestingFeatureProcessor feature_processor(
626       flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
627       &unilib_);
628   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
629                   {0, 3}, feature_processor.Tokenize("aaa bbb ccc")),
630               FloatEq(1.0));
631   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
632                   {0, 3}, feature_processor.Tokenize("aaa bbb ěěě")),
633               FloatEq(2.0 / 3));
634   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
635                   {0, 3}, feature_processor.Tokenize("ěěě řřř ěěě")),
636               FloatEq(0.0));
637   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
638                   {0, 0}, feature_processor.Tokenize("")),
639               FloatEq(0.0));
640   EXPECT_FALSE(
641       IsCodepointInRanges(-1, feature_processor.supported_codepoint_ranges_));
642   EXPECT_TRUE(
643       IsCodepointInRanges(0, feature_processor.supported_codepoint_ranges_));
644   EXPECT_TRUE(
645       IsCodepointInRanges(10, feature_processor.supported_codepoint_ranges_));
646   EXPECT_TRUE(
647       IsCodepointInRanges(127, feature_processor.supported_codepoint_ranges_));
648   EXPECT_FALSE(
649       IsCodepointInRanges(128, feature_processor.supported_codepoint_ranges_));
650   EXPECT_FALSE(
651       IsCodepointInRanges(9999, feature_processor.supported_codepoint_ranges_));
652   EXPECT_TRUE(IsCodepointInRanges(
653       10000, feature_processor.supported_codepoint_ranges_));
654   EXPECT_FALSE(IsCodepointInRanges(
655       10001, feature_processor.supported_codepoint_ranges_));
656   EXPECT_TRUE(IsCodepointInRanges(
657       25000, feature_processor.supported_codepoint_ranges_));
658 
659   const std::vector<Token> tokens = {Token("ěěě", 0, 3), Token("řřř", 4, 7),
660                                      Token("eee", 8, 11)};
661 
662   options.min_supported_codepoint_ratio = 0.0;
663   flatbuffers::DetachedBuffer options2_fb =
664       PackFeatureProcessorOptions(options);
665   TestingFeatureProcessor feature_processor2(
666       flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),
667       &unilib_);
668   EXPECT_TRUE(feature_processor2.HasEnoughSupportedCodepoints(
669       tokens, /*token_span=*/{0, 3}));
670 
671   options.min_supported_codepoint_ratio = 0.2;
672   flatbuffers::DetachedBuffer options3_fb =
673       PackFeatureProcessorOptions(options);
674   TestingFeatureProcessor feature_processor3(
675       flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),
676       &unilib_);
677   EXPECT_TRUE(feature_processor3.HasEnoughSupportedCodepoints(
678       tokens, /*token_span=*/{0, 3}));
679 
680   options.min_supported_codepoint_ratio = 0.5;
681   flatbuffers::DetachedBuffer options4_fb =
682       PackFeatureProcessorOptions(options);
683   TestingFeatureProcessor feature_processor4(
684       flatbuffers::GetRoot<FeatureProcessorOptions>(options4_fb.data()),
685       &unilib_);
686   EXPECT_FALSE(feature_processor4.HasEnoughSupportedCodepoints(
687       tokens, /*token_span=*/{0, 3}));
688 }
689 
// With extract_selection_mask_feature enabled, checks every fifth element of
// the click-context features for the token at index 1 against the selection
// span {4, 11}.
TEST_F(AnnotatorFeatureProcessorTest, InSpanFeature) {
  FeatureProcessorOptionsT options;
  options.context_size = 2;
  options.max_selection_span = 2;
  options.snap_label_span_boundaries_to_containing_tokens = false;
  options.feature_version = 2;
  options.embedding_size = 4;
  options.extract_selection_mask_feature = true;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib_);

  std::unique_ptr<CachedFeatures> cached_features;

  FakeEmbeddingExecutor embedding_executor;

  const std::vector<Token> tokens = {Token("aaa", 0, 3), Token("bbb", 4, 7),
                                     Token("ccc", 8, 11), Token("ddd", 12, 15)};

  // Fatal assertions: cached_features is dereferenced below, so the test must
  // stop here if extraction failed or produced no object.
  ASSERT_TRUE(feature_processor.ExtractFeatures(
      tokens, /*token_span=*/{0, 4},
      /*selection_span_for_feature=*/{4, 11}, &embedding_executor,
      /*embedding_cache=*/nullptr, /*feature_vector_size=*/5,
      &cached_features));
  ASSERT_TRUE(cached_features != nullptr);

  std::vector<float> features;
  cached_features->AppendClickContextFeaturesForClick(1, &features);
  ASSERT_EQ(features.size(), 25);
  // Presumably the last slot of each 5-float chunk holds the selection mask:
  // 1 for the tokens covered by {4, 11} ("bbb", "ccc"), 0 elsewhere.
  EXPECT_THAT(features[4], FloatEq(0.0));
  EXPECT_THAT(features[9], FloatEq(0.0));
  EXPECT_THAT(features[14], FloatEq(1.0));
  EXPECT_THAT(features[19], FloatEq(1.0));
  EXPECT_THAT(features[24], FloatEq(0.0));
}
725 
TEST_F(AnnotatorFeatureProcessorTest,EmbeddingCache)726 TEST_F(AnnotatorFeatureProcessorTest, EmbeddingCache) {
727   FeatureProcessorOptionsT options;
728   options.context_size = 2;
729   options.max_selection_span = 2;
730   options.snap_label_span_boundaries_to_containing_tokens = false;
731   options.feature_version = 2;
732   options.embedding_size = 4;
733   options.bounds_sensitive_features.reset(
734       new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());
735   options.bounds_sensitive_features->enabled = true;
736   options.bounds_sensitive_features->num_tokens_before = 3;
737   options.bounds_sensitive_features->num_tokens_inside_left = 2;
738   options.bounds_sensitive_features->num_tokens_inside_right = 2;
739   options.bounds_sensitive_features->num_tokens_after = 3;
740 
741   flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
742   TestingFeatureProcessor feature_processor(
743       flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
744       &unilib_);
745 
746   std::unique_ptr<CachedFeatures> cached_features;
747 
748   FakeEmbeddingExecutor embedding_executor;
749 
750   const std::vector<Token> tokens = {
751       Token("aaa", 0, 3),   Token("bbb", 4, 7),   Token("ccc", 8, 11),
752       Token("ddd", 12, 15), Token("eee", 16, 19), Token("fff", 20, 23)};
753 
754   // We pre-populate the cache with dummy embeddings, to make sure they are
755   // used when populating the features vector.
756   const std::vector<float> cached_padding_features = {10.0, -10.0, 10.0, -10.0};
757   const std::vector<float> cached_features1 = {1.0, 2.0, 3.0, 4.0};
758   const std::vector<float> cached_features2 = {5.0, 6.0, 7.0, 8.0};
759   FeatureProcessor::EmbeddingCache embedding_cache = {
760       {{kInvalidIndex, kInvalidIndex}, cached_padding_features},
761       {{4, 7}, cached_features1},
762       {{12, 15}, cached_features2},
763   };
764 
765   EXPECT_TRUE(feature_processor.ExtractFeatures(
766       tokens, /*token_span=*/{0, 6},
767       /*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},
768       &embedding_executor, &embedding_cache, /*feature_vector_size=*/4,
769       &cached_features));
770   std::vector<float> features;
771   cached_features->AppendBoundsSensitiveFeaturesForSpan({2, 4}, &features);
772   ASSERT_EQ(features.size(), 40);
773   // Check that the dummy embeddings were used.
774   EXPECT_THAT(Subvector(features, 0, 4),
775               ElementsAreFloat(cached_padding_features));
776   EXPECT_THAT(Subvector(features, 8, 12), ElementsAreFloat(cached_features1));
777   EXPECT_THAT(Subvector(features, 16, 20), ElementsAreFloat(cached_features2));
778   EXPECT_THAT(Subvector(features, 24, 28), ElementsAreFloat(cached_features2));
779   EXPECT_THAT(Subvector(features, 36, 40),
780               ElementsAreFloat(cached_padding_features));
781   // Check that the real embeddings were cached.
782   EXPECT_EQ(embedding_cache.size(), 7);
783   EXPECT_THAT(Subvector(features, 4, 8),
784               ElementsAreFloat(embedding_cache.at({0, 3})));
785   EXPECT_THAT(Subvector(features, 12, 16),
786               ElementsAreFloat(embedding_cache.at({8, 11})));
787   EXPECT_THAT(Subvector(features, 20, 24),
788               ElementsAreFloat(embedding_cache.at({8, 11})));
789   EXPECT_THAT(Subvector(features, 28, 32),
790               ElementsAreFloat(embedding_cache.at({16, 19})));
791   EXPECT_THAT(Subvector(features, 32, 36),
792               ElementsAreFloat(embedding_cache.at({20, 23})));
793 }
794 
// Exercises internal::StripOrPadTokens with an empty relative click span
// ({0, 0}) and a context size of 2: the result is always the clicked token
// plus two tokens on each side, padded with default tokens where the input
// runs out, and the click index is rewritten to point into the new vector.
TEST_F(AnnotatorFeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
  const std::vector<Token> all_tokens{
      Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0),  Token("3", 0, 0),
      Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0),  Token("7", 0, 0),
      Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
      Token("12", 0, 0)};

  // Clicking the very first token: two padding tokens are added on the left.
  {
    std::vector<Token> window = all_tokens;
    int click = 0;
    internal::StripOrPadTokens({0, 0}, 2, &window, &click);

    const std::vector<Token> expected = {
        Token(), Token(), Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0)};
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click, 2);
  }

  // Clicking the token at index 2: enough context on both sides, no padding.
  {
    std::vector<Token> window = all_tokens;
    int click = 2;
    internal::StripOrPadTokens({0, 0}, 2, &window, &click);

    const std::vector<Token> expected = {Token("0", 0, 0), Token("1", 0, 0),
                                         Token("2", 0, 0), Token("3", 0, 0),
                                         Token("4", 0, 0)};
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click, 2);
  }

  // Clicking the very last token: two padding tokens are added on the right.
  {
    std::vector<Token> window = all_tokens;
    int click = 12;
    internal::StripOrPadTokens({0, 0}, 2, &window, &click);

    const std::vector<Token> expected = {Token("10", 0, 0), Token("11", 0, 0),
                                         Token("12", 0, 0), Token(), Token()};
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click, 2);
  }
}
844 
TEST_F(AnnotatorFeatureProcessorTest,StripUnusedTokensWithRelativeClick)845 TEST_F(AnnotatorFeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
846   std::vector<Token> tokens_orig{
847       Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0),  Token("3", 0, 0),
848       Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0),  Token("7", 0, 0),
849       Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
850       Token("12", 0, 0)};
851 
852   std::vector<Token> tokens;
853   int click_index;
854 
855   // Try to click first token and see if it gets padded from left to maximum
856   // context_size.
857   tokens = tokens_orig;
858   click_index = 0;
859   internal::StripOrPadTokens({2, 3}, 2, &tokens, &click_index);
860   // clang-format off
861   EXPECT_EQ(tokens, std::vector<Token>({Token(),
862                                         Token(),
863                                         Token("0", 0, 0),
864                                         Token("1", 0, 0),
865                                         Token("2", 0, 0),
866                                         Token("3", 0, 0),
867                                         Token("4", 0, 0),
868                                         Token("5", 0, 0)}));
869   // clang-format on
870   EXPECT_EQ(click_index, 2);
871 
872   // Clicking to the middle with enough context should not produce any padding.
873   tokens = tokens_orig;
874   click_index = 6;
875   internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
876   // clang-format off
877   EXPECT_EQ(tokens, std::vector<Token>({Token("1", 0, 0),
878                                         Token("2", 0, 0),
879                                         Token("3", 0, 0),
880                                         Token("4", 0, 0),
881                                         Token("5", 0, 0),
882                                         Token("6", 0, 0),
883                                         Token("7", 0, 0),
884                                         Token("8", 0, 0),
885                                         Token("9", 0, 0)}));
886   // clang-format on
887   EXPECT_EQ(click_index, 5);
888 
889   // Clicking at the end should pad right to maximum context_size.
890   tokens = tokens_orig;
891   click_index = 11;
892   internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
893   // clang-format off
894   EXPECT_EQ(tokens, std::vector<Token>({Token("6", 0, 0),
895                                         Token("7", 0, 0),
896                                         Token("8", 0, 0),
897                                         Token("9", 0, 0),
898                                         Token("10", 0, 0),
899                                         Token("11", 0, 0),
900                                         Token("12", 0, 0),
901                                         Token(),
902                                         Token()}));
903   // clang-format on
904   EXPECT_EQ(click_index, 5);
905 }
906 
TEST_F(AnnotatorFeatureProcessorTest,IgnoredSpanBoundaryCodepoints)907 TEST_F(AnnotatorFeatureProcessorTest, IgnoredSpanBoundaryCodepoints) {
908   FeatureProcessorOptionsT options;
909   options.ignored_span_boundary_codepoints.push_back('.');
910   options.ignored_span_boundary_codepoints.push_back(',');
911   options.ignored_span_boundary_codepoints.push_back('[');
912   options.ignored_span_boundary_codepoints.push_back(']');
913 
914   flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
915   TestingFeatureProcessor feature_processor(
916       flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
917       &unilib_);
918 
919   const std::string text1_utf8 = "ěščř";
920   const UnicodeText text1 = UTF8ToUnicodeText(text1_utf8, /*do_copy=*/false);
921   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
922                 text1.begin(), text1.end(),
923                 /*count_from_beginning=*/true),
924             0);
925   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
926                 text1.begin(), text1.end(),
927                 /*count_from_beginning=*/false),
928             0);
929 
930   const std::string text2_utf8 = ".,abčd";
931   const UnicodeText text2 = UTF8ToUnicodeText(text2_utf8, /*do_copy=*/false);
932   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
933                 text2.begin(), text2.end(),
934                 /*count_from_beginning=*/true),
935             2);
936   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
937                 text2.begin(), text2.end(),
938                 /*count_from_beginning=*/false),
939             0);
940 
941   const std::string text3_utf8 = ".,abčd[]";
942   const UnicodeText text3 = UTF8ToUnicodeText(text3_utf8, /*do_copy=*/false);
943   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
944                 text3.begin(), text3.end(),
945                 /*count_from_beginning=*/true),
946             2);
947   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
948                 text3.begin(), text3.end(),
949                 /*count_from_beginning=*/false),
950             2);
951 
952   const std::string text4_utf8 = "[abčd]";
953   const UnicodeText text4 = UTF8ToUnicodeText(text4_utf8, /*do_copy=*/false);
954   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
955                 text4.begin(), text4.end(),
956                 /*count_from_beginning=*/true),
957             1);
958   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
959                 text4.begin(), text4.end(),
960                 /*count_from_beginning=*/false),
961             1);
962 
963   const std::string text5_utf8 = "";
964   const UnicodeText text5 = UTF8ToUnicodeText(text5_utf8, /*do_copy=*/false);
965   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
966                 text5.begin(), text5.end(),
967                 /*count_from_beginning=*/true),
968             0);
969   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
970                 text5.begin(), text5.end(),
971                 /*count_from_beginning=*/false),
972             0);
973 
974   const std::string text6_utf8 = "012345ěščř";
975   const UnicodeText text6 = UTF8ToUnicodeText(text6_utf8, /*do_copy=*/false);
976   UnicodeText::const_iterator text6_begin = text6.begin();
977   std::advance(text6_begin, 6);
978   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
979                 text6_begin, text6.end(),
980                 /*count_from_beginning=*/true),
981             0);
982   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
983                 text6_begin, text6.end(),
984                 /*count_from_beginning=*/false),
985             0);
986 
987   const std::string text7_utf8 = "012345.,ěščř";
988   const UnicodeText text7 = UTF8ToUnicodeText(text7_utf8, /*do_copy=*/false);
989   UnicodeText::const_iterator text7_begin = text7.begin();
990   std::advance(text7_begin, 6);
991   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
992                 text7_begin, text7.end(),
993                 /*count_from_beginning=*/true),
994             2);
995   UnicodeText::const_iterator text7_end = text7.begin();
996   std::advance(text7_end, 8);
997   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
998                 text7.begin(), text7_end,
999                 /*count_from_beginning=*/false),
1000             2);
1001 
1002   // Test not stripping.
1003   EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
1004                 "Hello [[[Wořld]] or not?", {0, 24}),
1005             CodepointSpan(0, 24));
1006   // Test basic stripping.
1007   EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
1008                 "Hello [[[Wořld]] or not?", {6, 16}),
1009             CodepointSpan(9, 14));
1010   // Test stripping when everything is stripped.
1011   EXPECT_EQ(
1012       feature_processor.StripBoundaryCodepoints("Hello [[[]] or not?", {6, 11}),
1013       CodepointSpan(6, 6));
1014   // Test stripping empty string.
1015   EXPECT_EQ(feature_processor.StripBoundaryCodepoints("", {0, 0}),
1016             CodepointSpan(0, 0));
1017 }
1018 
// Checks the mapping from codepoint spans to token spans, both with the
// default behaviour and with snapping of the span boundaries to the tokens
// that contain them (third argument set to true).
TEST_F(AnnotatorFeatureProcessorTest, CodepointSpanToTokenSpan) {
  const std::vector<Token> tokens{Token("Hělló", 0, 5),
                                  Token("fěěbař@google.com", 6, 23),
                                  Token("heře!", 24, 29)};

  // Lookup with the default (two-argument) call.
  const auto without_snapping = [&tokens](CodepointSpan codepoint_span) {
    return CodepointSpanToTokenSpan(tokens, codepoint_span);
  };
  // Lookup with snapping to the containing tokens enabled.
  const auto with_snapping = [&tokens](CodepointSpan codepoint_span) {
    return CodepointSpanToTokenSpan(tokens, codepoint_span, true);
  };

  // Spans matching the tokens exactly.
  EXPECT_EQ(TokenSpan(0, 1), without_snapping({0, 5}));
  EXPECT_EQ(TokenSpan(1, 2), without_snapping({6, 23}));
  EXPECT_EQ(TokenSpan(2, 3), without_snapping({24, 29}));
  EXPECT_EQ(TokenSpan(0, 2), without_snapping({0, 23}));
  EXPECT_EQ(TokenSpan(1, 3), without_snapping({6, 29}));
  EXPECT_EQ(TokenSpan(0, 3), without_snapping({0, 29}));

  // For exactly matching spans, snapping makes no difference.
  EXPECT_EQ(TokenSpan(0, 1), with_snapping({0, 5}));
  EXPECT_EQ(TokenSpan(1, 2), with_snapping({6, 23}));
  EXPECT_EQ(TokenSpan(2, 3), with_snapping({24, 29}));
  EXPECT_EQ(TokenSpan(0, 2), with_snapping({0, 23}));
  EXPECT_EQ(TokenSpan(1, 3), with_snapping({6, 29}));
  EXPECT_EQ(TokenSpan(0, 3), with_snapping({0, 29}));

  // Span boundaries inside tokens: only the fully covered token is returned
  // by default, while snapping also includes the partially covered tokens.
  EXPECT_EQ(TokenSpan(1, 2), without_snapping({1, 28}));
  EXPECT_EQ(TokenSpan(0, 3), with_snapping({1, 28}));

  // Tokens adjacent to the span, but not overlapping it, are never included.
  EXPECT_EQ(TokenSpan(1, 2), without_snapping({5, 24}));
  EXPECT_EQ(TokenSpan(1, 2), with_snapping({5, 24}));
}
1048 
1049 }  // namespace
1050 }  // namespace libtextclassifier3
1051