xref: /aosp_15_r20/external/libtextclassifier/native/utils/token-feature-extractor_test.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #include "utils/token-feature-extractor.h"
18*993b0882SAndroid Build Coastguard Worker 
19*993b0882SAndroid Build Coastguard Worker #include "gmock/gmock.h"
20*993b0882SAndroid Build Coastguard Worker #include "gtest/gtest.h"
21*993b0882SAndroid Build Coastguard Worker 
22*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
23*993b0882SAndroid Build Coastguard Worker namespace {
24*993b0882SAndroid Build Coastguard Worker 
25*993b0882SAndroid Build Coastguard Worker class TokenFeatureExtractorTest : public ::testing::Test {
26*993b0882SAndroid Build Coastguard Worker  protected:
TokenFeatureExtractorTest()27*993b0882SAndroid Build Coastguard Worker   explicit TokenFeatureExtractorTest() : INIT_UNILIB_FOR_TESTING(unilib_) {}
28*993b0882SAndroid Build Coastguard Worker   UniLib unilib_;
29*993b0882SAndroid Build Coastguard Worker };
30*993b0882SAndroid Build Coastguard Worker 
31*993b0882SAndroid Build Coastguard Worker class TestingTokenFeatureExtractor : public TokenFeatureExtractor {
32*993b0882SAndroid Build Coastguard Worker  public:
33*993b0882SAndroid Build Coastguard Worker   using TokenFeatureExtractor::HashToken;
34*993b0882SAndroid Build Coastguard Worker   using TokenFeatureExtractor::TokenFeatureExtractor;
35*993b0882SAndroid Build Coastguard Worker };
36*993b0882SAndroid Build Coastguard Worker 
TEST_F(TokenFeatureExtractorTest,ExtractAscii)37*993b0882SAndroid Build Coastguard Worker TEST_F(TokenFeatureExtractorTest, ExtractAscii) {
38*993b0882SAndroid Build Coastguard Worker   TokenFeatureExtractorOptions options;
39*993b0882SAndroid Build Coastguard Worker   options.num_buckets = 1000;
40*993b0882SAndroid Build Coastguard Worker   options.chargram_orders = std::vector<int>{1, 2, 3};
41*993b0882SAndroid Build Coastguard Worker   options.extract_case_feature = true;
42*993b0882SAndroid Build Coastguard Worker   options.unicode_aware_features = false;
43*993b0882SAndroid Build Coastguard Worker   options.extract_selection_mask_feature = true;
44*993b0882SAndroid Build Coastguard Worker   TestingTokenFeatureExtractor extractor(options, &unilib_);
45*993b0882SAndroid Build Coastguard Worker 
46*993b0882SAndroid Build Coastguard Worker   std::vector<int> sparse_features;
47*993b0882SAndroid Build Coastguard Worker   std::vector<float> dense_features;
48*993b0882SAndroid Build Coastguard Worker 
49*993b0882SAndroid Build Coastguard Worker   extractor.Extract(Token{"Hello", 0, 5}, true, &sparse_features,
50*993b0882SAndroid Build Coastguard Worker                     &dense_features);
51*993b0882SAndroid Build Coastguard Worker 
52*993b0882SAndroid Build Coastguard Worker   EXPECT_THAT(sparse_features,
53*993b0882SAndroid Build Coastguard Worker               testing::ElementsAreArray({
54*993b0882SAndroid Build Coastguard Worker                   // clang-format off
55*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("H"),
56*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("e"),
57*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("l"),
58*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("l"),
59*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("o"),
60*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("^H"),
61*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("He"),
62*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("el"),
63*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("ll"),
64*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("lo"),
65*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("o$"),
66*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("^He"),
67*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("Hel"),
68*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("ell"),
69*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("llo"),
70*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("lo$")
71*993b0882SAndroid Build Coastguard Worker                   // clang-format on
72*993b0882SAndroid Build Coastguard Worker               }));
73*993b0882SAndroid Build Coastguard Worker   EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0, 1.0}));
74*993b0882SAndroid Build Coastguard Worker 
75*993b0882SAndroid Build Coastguard Worker   sparse_features.clear();
76*993b0882SAndroid Build Coastguard Worker   dense_features.clear();
77*993b0882SAndroid Build Coastguard Worker   extractor.Extract(Token{"world!", 23, 29}, false, &sparse_features,
78*993b0882SAndroid Build Coastguard Worker                     &dense_features);
79*993b0882SAndroid Build Coastguard Worker 
80*993b0882SAndroid Build Coastguard Worker   EXPECT_THAT(sparse_features,
81*993b0882SAndroid Build Coastguard Worker               testing::ElementsAreArray({
82*993b0882SAndroid Build Coastguard Worker                   // clang-format off
83*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("w"),
84*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("o"),
85*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("r"),
86*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("l"),
87*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("d"),
88*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("!"),
89*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("^w"),
90*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("wo"),
91*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("or"),
92*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("rl"),
93*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("ld"),
94*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("d!"),
95*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("!$"),
96*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("^wo"),
97*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("wor"),
98*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("orl"),
99*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("rld"),
100*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("ld!"),
101*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("d!$"),
102*993b0882SAndroid Build Coastguard Worker                   // clang-format on
103*993b0882SAndroid Build Coastguard Worker               }));
104*993b0882SAndroid Build Coastguard Worker   EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, 0.0}));
105*993b0882SAndroid Build Coastguard Worker }
106*993b0882SAndroid Build Coastguard Worker 
// With an empty list of chargram orders and unicode-aware processing off, the
// only sparse feature expected per token is the hash of the whole token
// wrapped in '^' and '$'. The dense expectations match those of ExtractAscii.
TEST_F(TokenFeatureExtractorTest, ExtractAsciiNoChargrams) {
  using testing::ElementsAreArray;

  TokenFeatureExtractorOptions opts;
  opts.num_buckets = 1000;
  opts.chargram_orders = std::vector<int>{};
  opts.extract_case_feature = true;
  opts.unicode_aware_features = false;
  opts.extract_selection_mask_feature = true;
  TestingTokenFeatureExtractor extractor(opts, &unilib_);

  std::vector<int> sparse;
  std::vector<float> dense;

  // First token: capitalised, boolean argument true.
  extractor.Extract(Token{"Hello", 0, 5}, true, &sparse, &dense);
  EXPECT_THAT(sparse, ElementsAreArray({extractor.HashToken("^Hello$")}));
  EXPECT_THAT(dense, ElementsAreArray({1.0, 1.0}));

  // Second token: lowercase, boolean argument false.
  sparse.clear();
  dense.clear();
  extractor.Extract(Token{"world!", 23, 29}, false, &sparse, &dense);
  EXPECT_THAT(sparse, ElementsAreArray({extractor.HashToken("^world!$")}));
  EXPECT_THAT(dense, ElementsAreArray({-1.0, 0.0}));
}
135*993b0882SAndroid Build Coastguard Worker 
TEST_F(TokenFeatureExtractorTest,ExtractUnicode)136*993b0882SAndroid Build Coastguard Worker TEST_F(TokenFeatureExtractorTest, ExtractUnicode) {
137*993b0882SAndroid Build Coastguard Worker   TokenFeatureExtractorOptions options;
138*993b0882SAndroid Build Coastguard Worker   options.num_buckets = 1000;
139*993b0882SAndroid Build Coastguard Worker   options.chargram_orders = std::vector<int>{1, 2, 3};
140*993b0882SAndroid Build Coastguard Worker   options.extract_case_feature = true;
141*993b0882SAndroid Build Coastguard Worker   options.unicode_aware_features = true;
142*993b0882SAndroid Build Coastguard Worker   options.extract_selection_mask_feature = true;
143*993b0882SAndroid Build Coastguard Worker   TestingTokenFeatureExtractor extractor(options, &unilib_);
144*993b0882SAndroid Build Coastguard Worker 
145*993b0882SAndroid Build Coastguard Worker   std::vector<int> sparse_features;
146*993b0882SAndroid Build Coastguard Worker   std::vector<float> dense_features;
147*993b0882SAndroid Build Coastguard Worker 
148*993b0882SAndroid Build Coastguard Worker   extractor.Extract(Token{"Hělló", 0, 5}, true, &sparse_features,
149*993b0882SAndroid Build Coastguard Worker                     &dense_features);
150*993b0882SAndroid Build Coastguard Worker 
151*993b0882SAndroid Build Coastguard Worker   EXPECT_THAT(sparse_features,
152*993b0882SAndroid Build Coastguard Worker               testing::ElementsAreArray({
153*993b0882SAndroid Build Coastguard Worker                   // clang-format off
154*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("H"),
155*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("ě"),
156*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("l"),
157*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("l"),
158*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("ó"),
159*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("^H"),
160*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("Hě"),
161*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("ěl"),
162*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("ll"),
163*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("ló"),
164*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("ó$"),
165*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("^Hě"),
166*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("Hěl"),
167*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("ěll"),
168*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("lló"),
169*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("ló$")
170*993b0882SAndroid Build Coastguard Worker                   // clang-format on
171*993b0882SAndroid Build Coastguard Worker               }));
172*993b0882SAndroid Build Coastguard Worker   EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0, 1.0}));
173*993b0882SAndroid Build Coastguard Worker 
174*993b0882SAndroid Build Coastguard Worker   sparse_features.clear();
175*993b0882SAndroid Build Coastguard Worker   dense_features.clear();
176*993b0882SAndroid Build Coastguard Worker   extractor.Extract(Token{"world!", 23, 29}, false, &sparse_features,
177*993b0882SAndroid Build Coastguard Worker                     &dense_features);
178*993b0882SAndroid Build Coastguard Worker 
179*993b0882SAndroid Build Coastguard Worker   EXPECT_THAT(sparse_features,
180*993b0882SAndroid Build Coastguard Worker               testing::ElementsAreArray({
181*993b0882SAndroid Build Coastguard Worker                   // clang-format off
182*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("w"),
183*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("o"),
184*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("r"),
185*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("l"),
186*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("d"),
187*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("!"),
188*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("^w"),
189*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("wo"),
190*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("or"),
191*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("rl"),
192*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("ld"),
193*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("d!"),
194*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("!$"),
195*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("^wo"),
196*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("wor"),
197*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("orl"),
198*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("rld"),
199*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("ld!"),
200*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("d!$"),
201*993b0882SAndroid Build Coastguard Worker                   // clang-format on
202*993b0882SAndroid Build Coastguard Worker               }));
203*993b0882SAndroid Build Coastguard Worker   EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, -1.0}));
204*993b0882SAndroid Build Coastguard Worker }
205*993b0882SAndroid Build Coastguard Worker 
// Unicode-aware extraction with an empty list of chargram orders: the only
// sparse feature expected per token is the hash of the whole token wrapped in
// '^' and '$'. The dense expectations match those of ExtractUnicode.
TEST_F(TokenFeatureExtractorTest, ExtractUnicodeNoChargrams) {
  using testing::ElementsAreArray;

  TokenFeatureExtractorOptions opts;
  opts.num_buckets = 1000;
  opts.chargram_orders = std::vector<int>{};
  opts.extract_case_feature = true;
  opts.unicode_aware_features = true;
  opts.extract_selection_mask_feature = true;
  TestingTokenFeatureExtractor extractor(opts, &unilib_);

  std::vector<int> sparse;
  std::vector<float> dense;

  // First token: capitalised with accented letters, boolean argument true.
  extractor.Extract(Token{"Hělló", 0, 5}, true, &sparse, &dense);
  EXPECT_THAT(sparse, ElementsAreArray({extractor.HashToken("^Hělló$")}));
  EXPECT_THAT(dense, ElementsAreArray({1.0, 1.0}));

  // Second token: lowercase ASCII, boolean argument false.
  sparse.clear();
  dense.clear();
  extractor.Extract(Token{"world!", 23, 29}, false, &sparse, &dense);
  EXPECT_THAT(sparse, ElementsAreArray({
                          extractor.HashToken("^world!$"),
                      }));
  EXPECT_THAT(dense, ElementsAreArray({-1.0, -1.0}));
}
235*993b0882SAndroid Build Coastguard Worker 
#ifdef TC3_TEST_ICU
// Only compiled when TC3_TEST_ICU is defined. With the selection-mask feature
// disabled, the dense output holds a single value: the test expects 1.0 for
// tokens starting with an uppercase letter (including the non-ASCII "Ř") and
// -1.0 for tokens starting with a lowercase one.
TEST_F(TokenFeatureExtractorTest, ICUCaseFeature) {
  using testing::ElementsAreArray;

  TokenFeatureExtractorOptions opts;
  opts.num_buckets = 1000;
  opts.chargram_orders = std::vector<int>{1, 2};
  opts.extract_case_feature = true;
  opts.unicode_aware_features = true;
  opts.extract_selection_mask_feature = false;
  TestingTokenFeatureExtractor extractor(opts, &unilib_);

  std::vector<int> sparse;
  std::vector<float> dense;

  // Empties both output vectors, runs one extraction and hands back a copy of
  // the dense result for matching.
  const auto dense_for = [&](const Token& token, bool flag) {
    sparse.clear();
    dense.clear();
    extractor.Extract(token, flag, &sparse, &dense);
    return dense;
  };

  EXPECT_THAT(dense_for(Token{"Hělló", 0, 5}, true), ElementsAreArray({1.0}));
  EXPECT_THAT(dense_for(Token{"world!", 23, 29}, false),
              ElementsAreArray({-1.0}));
  EXPECT_THAT(dense_for(Token{"Ř", 23, 29}, false), ElementsAreArray({1.0}));
  EXPECT_THAT(dense_for(Token{"ř", 23, 29}, false), ElementsAreArray({-1.0}));
}
#endif
271*993b0882SAndroid Build Coastguard Worker 
// With remap_digits enabled (unicode-aware processing off), two tokens that
// differ only in which digits they contain must produce the same sparse
// features, while a token with a different number of digits must not.
TEST_F(TokenFeatureExtractorTest, DigitRemapping) {
  TokenFeatureExtractorOptions opts;
  opts.num_buckets = 1000;
  opts.chargram_orders = std::vector<int>{1, 2};
  opts.remap_digits = true;
  opts.unicode_aware_features = false;
  TestingTokenFeatureExtractor extractor(opts, &unilib_);

  // Dense output slot handed to every call; this test never inspects it.
  std::vector<float> unused_dense;

  std::vector<int> reference;
  extractor.Extract(Token{"9:30am", 0, 6}, true, &reference, &unused_dense);

  // Same shape, different digits: expected to match the reference.
  std::vector<int> candidate;
  extractor.Extract(Token{"5:32am", 0, 6}, true, &candidate, &unused_dense);
  EXPECT_THAT(reference, testing::ElementsAreArray(candidate));

  // One more digit: expected to differ. The candidate vector is reused as-is.
  extractor.Extract(Token{"10:32am", 0, 6}, true, &candidate, &unused_dense);
  EXPECT_THAT(reference,
              testing::Not(testing::ElementsAreArray(candidate)));
}
295*993b0882SAndroid Build Coastguard Worker 
// Same scenario as DigitRemapping, but with unicode-aware processing turned
// on: tokens differing only in digit values share sparse features, a token
// with an extra digit does not.
TEST_F(TokenFeatureExtractorTest, DigitRemappingUnicode) {
  TokenFeatureExtractorOptions opts;
  opts.num_buckets = 1000;
  opts.chargram_orders = std::vector<int>{1, 2};
  opts.remap_digits = true;
  opts.unicode_aware_features = true;
  TestingTokenFeatureExtractor extractor(opts, &unilib_);

  // Dense output slot handed to every call; this test never inspects it.
  std::vector<float> unused_dense;

  std::vector<int> reference;
  extractor.Extract(Token{"9:30am", 0, 6}, true, &reference, &unused_dense);

  // Same shape, different digits: expected to match the reference.
  std::vector<int> candidate;
  extractor.Extract(Token{"5:32am", 0, 6}, true, &candidate, &unused_dense);
  EXPECT_THAT(reference, testing::ElementsAreArray(candidate));

  // One more digit: expected to differ. The candidate vector is reused as-is.
  extractor.Extract(Token{"10:32am", 0, 6}, true, &candidate, &unused_dense);
  EXPECT_THAT(reference,
              testing::Not(testing::ElementsAreArray(candidate)));
}
319*993b0882SAndroid Build Coastguard Worker 
// With lowercase_tokens enabled (unicode-aware processing off), ASCII tokens
// that differ only in letter case must produce identical sparse features.
TEST_F(TokenFeatureExtractorTest, LowercaseAscii) {
  TokenFeatureExtractorOptions opts;
  opts.num_buckets = 1000;
  opts.chargram_orders = std::vector<int>{1, 2};
  opts.lowercase_tokens = true;
  opts.unicode_aware_features = false;
  TestingTokenFeatureExtractor extractor(opts, &unilib_);

  // Dense output slot handed to every call; this test never inspects it.
  std::vector<float> unused_dense;

  std::vector<int> reference;
  extractor.Extract(Token{"AABB", 0, 6}, true, &reference, &unused_dense);

  std::vector<int> variant;
  extractor.Extract(Token{"aaBB", 0, 6}, true, &variant, &unused_dense);
  EXPECT_THAT(reference, testing::ElementsAreArray(variant));

  // The variant vector is reused for the third spelling without clearing.
  extractor.Extract(Token{"aAbB", 0, 6}, true, &variant, &unused_dense);
  EXPECT_THAT(reference, testing::ElementsAreArray(variant));
}
342*993b0882SAndroid Build Coastguard Worker 
#ifdef TC3_TEST_ICU
// Only compiled when TC3_TEST_ICU is defined. With lowercase_tokens and
// unicode-aware processing both enabled, the uppercase and lowercase
// spellings of a non-ASCII token must produce identical sparse features.
TEST_F(TokenFeatureExtractorTest, LowercaseUnicode) {
  TokenFeatureExtractorOptions opts;
  opts.num_buckets = 1000;
  opts.chargram_orders = std::vector<int>{1, 2};
  opts.lowercase_tokens = true;
  opts.unicode_aware_features = true;
  TestingTokenFeatureExtractor extractor(opts, &unilib_);

  // Dense output slot handed to every call; this test never inspects it.
  std::vector<float> unused_dense;

  std::vector<int> from_upper;
  extractor.Extract(Token{"ŘŘ", 0, 6}, true, &from_upper, &unused_dense);

  std::vector<int> from_lower;
  extractor.Extract(Token{"řř", 0, 6}, true, &from_lower, &unused_dense);

  EXPECT_THAT(from_upper, testing::ElementsAreArray(from_lower));
}
#endif
362*993b0882SAndroid Build Coastguard Worker 
#ifdef TC3_TEST_ICU
// Only compiled when TC3_TEST_ICU is defined. Two regexp features are
// registered and no other dense-feature option is set by this test; the dense
// output is expected to hold one value per pattern, in registration order:
// 1.0 where the token matches the pattern and -1.0 where it does not.
TEST_F(TokenFeatureExtractorTest, RegexFeatures) {
  using testing::ElementsAreArray;

  TokenFeatureExtractorOptions opts;
  opts.num_buckets = 1000;
  opts.chargram_orders = std::vector<int>{1, 2};
  opts.remap_digits = false;
  opts.unicode_aware_features = false;
  opts.regexp_features.push_back("^[a-z]+$");  // all lower case.
  opts.regexp_features.push_back("^[0-9]+$");  // all digits.
  TestingTokenFeatureExtractor extractor(opts, &unilib_);

  std::vector<int> sparse;  // passed to every call, never inspected or cleared
  std::vector<float> dense;

  // Empties the dense vector, extracts a six-position token holding `text`
  // and hands back a copy of the dense result for matching.
  const auto dense_for = [&](const Token& token) {
    dense.clear();
    extractor.Extract(token, true, &sparse, &dense);
    return dense;
  };

  // Mixed case: matches neither pattern.
  EXPECT_THAT(dense_for(Token{"abCde", 0, 6}), ElementsAreArray({-1.0, -1.0}));
  // All lowercase letters: matches the first pattern only.
  EXPECT_THAT(dense_for(Token{"abcde", 0, 6}), ElementsAreArray({1.0, -1.0}));
  // Digits with one letter: matches neither pattern.
  EXPECT_THAT(dense_for(Token{"12c45", 0, 6}), ElementsAreArray({-1.0, -1.0}));
  // All digits: matches the second pattern only.
  EXPECT_THAT(dense_for(Token{"12345", 0, 6}), ElementsAreArray({-1.0, 1.0}));
}
#endif
396*993b0882SAndroid Build Coastguard Worker 
// Feeds a 26-character token to an extractor whose only chargram order is 22.
// This is mainly a "does it run" check (ASAN should catch problems). The two
// expected chargrams contain the token with its middle part replaced by a
// single "\1" byte.
TEST_F(TokenFeatureExtractorTest, ExtractTooLongWord) {
  TokenFeatureExtractorOptions opts;
  opts.num_buckets = 1000;
  opts.chargram_orders = std::vector<int>{22};
  opts.extract_case_feature = true;
  opts.unicode_aware_features = true;
  opts.extract_selection_mask_feature = true;
  TestingTokenFeatureExtractor extractor(opts, &unilib_);

  std::vector<int> sparse;
  std::vector<float> dense;
  extractor.Extract(Token{"abcdefghijklmnopqřstuvwxyz", 0, 0}, true, &sparse,
                    &dense);

  // clang-format off
  const auto head_gram_hash = extractor.HashToken("^abcdefghij\1qřstuvwxyz");
  const auto tail_gram_hash = extractor.HashToken("abcdefghij\1qřstuvwxyz$");
  // clang-format on
  EXPECT_THAT(sparse, testing::ElementsAre(head_gram_hash, tail_gram_hash));
}
420*993b0882SAndroid Build Coastguard Worker 
// For a set of inputs containing only ASCII characters (including the empty
// string), an extractor built with unicode_aware_features = true must produce
// exactly the same sparse and dense features as one built with it set to
// false.
TEST_F(TokenFeatureExtractorTest, ExtractAsciiUnicodeMatches) {
  TokenFeatureExtractorOptions opts;
  opts.num_buckets = 1000;
  opts.chargram_orders = std::vector<int>{1, 2, 3, 4, 5};
  opts.extract_case_feature = true;
  opts.extract_selection_mask_feature = true;

  // The two extractors differ only in the unicode_aware_features setting.
  opts.unicode_aware_features = true;
  TestingTokenFeatureExtractor extractor_unicode(opts, &unilib_);
  opts.unicode_aware_features = false;
  TestingTokenFeatureExtractor extractor_ascii(opts, &unilib_);

  const std::vector<std::string> inputs = {
      "https://www.abcdefgh.com/in/xxxkkkvayio",
      "https://www.fjsidofj.om/xx/abadfy/xxxx/?xfjiis=ffffiijiihil",
      "asdfhasdofjiasdofj#%()*%#*(aisdojfaosdifjiaofjdsiofjdi_fdis3w",
      "abcd",
      "x",
      "Hello",
      "Hey,",
      "Hi",
      ""};

  for (const std::string& input : inputs) {
    const Token token{input, 0, 0};

    std::vector<int> sparse_unicode;
    std::vector<float> dense_unicode;
    extractor_unicode.Extract(token, true, &sparse_unicode, &dense_unicode);

    std::vector<int> sparse_ascii;
    std::vector<float> dense_ascii;
    extractor_ascii.Extract(token, true, &sparse_ascii, &dense_ascii);

    // The offending input is appended to any failure message.
    EXPECT_THAT(sparse_unicode, testing::Eq(sparse_ascii)) << input;
    EXPECT_THAT(dense_unicode, testing::Eq(dense_ascii)) << input;
  }
}
454*993b0882SAndroid Build Coastguard Worker 
// A default-constructed Token, extracted with the boolean argument set to
// false, must yield exactly one sparse feature, the hash of "<PAD>", and the
// dense features {-1.0, 0.0}.
TEST_F(TokenFeatureExtractorTest, ExtractForPadToken) {
  TokenFeatureExtractorOptions opts;
  opts.num_buckets = 1000;
  opts.chargram_orders = std::vector<int>{1, 2};
  opts.extract_case_feature = true;
  opts.unicode_aware_features = false;
  opts.extract_selection_mask_feature = true;
  TestingTokenFeatureExtractor extractor(opts, &unilib_);

  std::vector<int> sparse;
  std::vector<float> dense;
  extractor.Extract(Token(), false, &sparse, &dense);

  const auto pad_hash = extractor.HashToken("<PAD>");
  EXPECT_THAT(sparse, testing::ElementsAre(pad_hash));
  // NOTE(review): the two dense values presumably correspond to the case
  // feature and the selection-mask feature enabled above; confirm against
  // the extractor implementation.
  EXPECT_THAT(dense, testing::ElementsAreArray({-1.0, 0.0}));
}
474*993b0882SAndroid Build Coastguard Worker 
TEST_F(TokenFeatureExtractorTest,ExtractFiltered)475*993b0882SAndroid Build Coastguard Worker TEST_F(TokenFeatureExtractorTest, ExtractFiltered) {
476*993b0882SAndroid Build Coastguard Worker   TokenFeatureExtractorOptions options;
477*993b0882SAndroid Build Coastguard Worker   options.num_buckets = 1000;
478*993b0882SAndroid Build Coastguard Worker   options.chargram_orders = std::vector<int>{1, 2, 3};
479*993b0882SAndroid Build Coastguard Worker   options.extract_case_feature = true;
480*993b0882SAndroid Build Coastguard Worker   options.unicode_aware_features = false;
481*993b0882SAndroid Build Coastguard Worker   options.extract_selection_mask_feature = true;
482*993b0882SAndroid Build Coastguard Worker   options.allowed_chargrams.insert("^H");
483*993b0882SAndroid Build Coastguard Worker   options.allowed_chargrams.insert("ll");
484*993b0882SAndroid Build Coastguard Worker   options.allowed_chargrams.insert("llo");
485*993b0882SAndroid Build Coastguard Worker   options.allowed_chargrams.insert("w");
486*993b0882SAndroid Build Coastguard Worker   options.allowed_chargrams.insert("!");
487*993b0882SAndroid Build Coastguard Worker   options.allowed_chargrams.insert("\xc4");  // UTF8 control character.
488*993b0882SAndroid Build Coastguard Worker 
489*993b0882SAndroid Build Coastguard Worker   TestingTokenFeatureExtractor extractor(options, &unilib_);
490*993b0882SAndroid Build Coastguard Worker 
491*993b0882SAndroid Build Coastguard Worker   std::vector<int> sparse_features;
492*993b0882SAndroid Build Coastguard Worker   std::vector<float> dense_features;
493*993b0882SAndroid Build Coastguard Worker 
494*993b0882SAndroid Build Coastguard Worker   extractor.Extract(Token{"Hěllo", 0, 5}, true, &sparse_features,
495*993b0882SAndroid Build Coastguard Worker                     &dense_features);
496*993b0882SAndroid Build Coastguard Worker 
497*993b0882SAndroid Build Coastguard Worker   EXPECT_THAT(sparse_features,
498*993b0882SAndroid Build Coastguard Worker               testing::ElementsAreArray({
499*993b0882SAndroid Build Coastguard Worker                   // clang-format off
500*993b0882SAndroid Build Coastguard Worker                   0,
501*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("\xc4"),
502*993b0882SAndroid Build Coastguard Worker                   0,
503*993b0882SAndroid Build Coastguard Worker                   0,
504*993b0882SAndroid Build Coastguard Worker                   0,
505*993b0882SAndroid Build Coastguard Worker                   0,
506*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("^H"),
507*993b0882SAndroid Build Coastguard Worker                   0,
508*993b0882SAndroid Build Coastguard Worker                   0,
509*993b0882SAndroid Build Coastguard Worker                   0,
510*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("ll"),
511*993b0882SAndroid Build Coastguard Worker                   0,
512*993b0882SAndroid Build Coastguard Worker                   0,
513*993b0882SAndroid Build Coastguard Worker                   0,
514*993b0882SAndroid Build Coastguard Worker                   0,
515*993b0882SAndroid Build Coastguard Worker                   0,
516*993b0882SAndroid Build Coastguard Worker                   0,
517*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("llo"),
518*993b0882SAndroid Build Coastguard Worker                   0
519*993b0882SAndroid Build Coastguard Worker                   // clang-format on
520*993b0882SAndroid Build Coastguard Worker               }));
521*993b0882SAndroid Build Coastguard Worker   EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0, 1.0}));
522*993b0882SAndroid Build Coastguard Worker 
523*993b0882SAndroid Build Coastguard Worker   sparse_features.clear();
524*993b0882SAndroid Build Coastguard Worker   dense_features.clear();
525*993b0882SAndroid Build Coastguard Worker   extractor.Extract(Token{"world!", 23, 29}, false, &sparse_features,
526*993b0882SAndroid Build Coastguard Worker                     &dense_features);
527*993b0882SAndroid Build Coastguard Worker 
528*993b0882SAndroid Build Coastguard Worker   EXPECT_THAT(sparse_features, testing::ElementsAreArray({
529*993b0882SAndroid Build Coastguard Worker                                    // clang-format off
530*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("w"),
531*993b0882SAndroid Build Coastguard Worker                   0,
532*993b0882SAndroid Build Coastguard Worker                   0,
533*993b0882SAndroid Build Coastguard Worker                   0,
534*993b0882SAndroid Build Coastguard Worker                   0,
535*993b0882SAndroid Build Coastguard Worker                   extractor.HashToken("!"),
536*993b0882SAndroid Build Coastguard Worker                   0,
537*993b0882SAndroid Build Coastguard Worker                   0,
538*993b0882SAndroid Build Coastguard Worker                   0,
539*993b0882SAndroid Build Coastguard Worker                   0,
540*993b0882SAndroid Build Coastguard Worker                   0,
541*993b0882SAndroid Build Coastguard Worker                   0,
542*993b0882SAndroid Build Coastguard Worker                   0,
543*993b0882SAndroid Build Coastguard Worker                   0,
544*993b0882SAndroid Build Coastguard Worker                   0,
545*993b0882SAndroid Build Coastguard Worker                   0,
546*993b0882SAndroid Build Coastguard Worker                   0,
547*993b0882SAndroid Build Coastguard Worker                   0,
548*993b0882SAndroid Build Coastguard Worker                   0,
549*993b0882SAndroid Build Coastguard Worker                                    // clang-format on
550*993b0882SAndroid Build Coastguard Worker                                }));
551*993b0882SAndroid Build Coastguard Worker   EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, 0.0}));
552*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(extractor.HashToken("<PAD>"), 1);
553*993b0882SAndroid Build Coastguard Worker }
554*993b0882SAndroid Build Coastguard Worker 
// Extracting a default-constructed Token with the boolean argument set to
// true must not crash; it yields the single sparse feature hash("<PAD>") and
// the dense features {-1.0, 1.0}.
TEST_F(TokenFeatureExtractorTest, ExtractEmptyToken) {
  TokenFeatureExtractorOptions opts;
  opts.num_buckets = 1000;
  opts.chargram_orders = std::vector<int>{1, 2, 3};
  opts.extract_case_feature = true;
  opts.unicode_aware_features = false;
  opts.extract_selection_mask_feature = true;
  TestingTokenFeatureExtractor extractor(opts, &unilib_);

  std::vector<int> sparse;
  std::vector<float> dense;

  // Should not crash.
  extractor.Extract(Token(), true, &sparse, &dense);

  const auto pad_hash = extractor.HashToken("<PAD>");
  EXPECT_THAT(sparse, testing::ElementsAre(pad_hash));
  EXPECT_THAT(dense, testing::ElementsAreArray({-1.0, 1.0}));
}
577*993b0882SAndroid Build Coastguard Worker 
578*993b0882SAndroid Build Coastguard Worker }  // namespace
579*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
580