xref: /aosp_15_r20/external/libtextclassifier/native/utils/sentencepiece/normalizer_test.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/sentencepiece/normalizer.h"
18 
19 #include <fstream>
20 #include <string>
21 
22 #include "utils/container/double-array-trie.h"
23 #include "utils/sentencepiece/test_utils.h"
24 #include "utils/strings/stringpiece.h"
25 #include "utils/test-data-test-utils.h"
26 #include "gmock/gmock.h"
27 #include "gtest/gtest.h"
28 
29 namespace libtextclassifier3 {
30 namespace {
31 
GetTestConfigPath()32 std::string GetTestConfigPath() {
33   return GetTestDataPath("utils/sentencepiece/test_data/nmt_nfkc_charsmap.bin");
34 }
35 
// Exercises the normalizer with all three options enabled (dummy prefix,
// extra-whitespace removal, whitespace escaping) and checks the output
// against fixed expected strings.
TEST(NormalizerTest, NormalizesAsReferenceNormalizer) {
  // The charsmap is a binary blob: read it in binary mode so that no newline
  // translation can alter its bytes, and fail fast if it cannot be opened
  // instead of continuing with an empty config.
  const std::string config_path = GetTestConfigPath();
  std::ifstream test_config_stream(config_path, std::ios::binary);
  ASSERT_TRUE(test_config_stream.is_open())
      << "Could not open normalizer config: " << config_path;
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/true,
                         /*remove_extra_whitespaces=*/true,
                         /*escape_whitespaces=*/true);

  // Plain ASCII input: a prefix marker is added and the space is escaped.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
    EXPECT_EQ(normalized, "▁hello▁there");
  }

  // Redundant whitespace.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
    EXPECT_EQ(normalized, "▁when▁is▁the▁world▁cup?");
  }

  // Different whitespace.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
    EXPECT_EQ(normalized, "▁general▁kenobi");
  }

  // NFKC char to multi-char normalization.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("㍿", &normalized));
    EXPECT_EQ(normalized, "▁株式会社");
  }

  // Half width katakana, character composition happens.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize(" グーグル ", &normalized));
    EXPECT_EQ(normalized, "▁グーグル");
  }

  // NFKC char to char normalization.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("①②③", &normalized));
    EXPECT_EQ(normalized, "▁123");
  }
}
85 
// Same inputs as the reference test, but with add_dummy_prefix disabled: the
// expected outputs carry no leading "▁" marker.
TEST(NormalizerTest, NoDummyPrefix) {
  // The charsmap is a binary blob: read it in binary mode so that no newline
  // translation can alter its bytes, and fail fast if it cannot be opened
  // instead of continuing with an empty config.
  const std::string config_path = GetTestConfigPath();
  std::ifstream test_config_stream(config_path, std::ios::binary);
  ASSERT_TRUE(test_config_stream.is_open())
      << "Could not open normalizer config: " << config_path;
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
                         /*remove_extra_whitespaces=*/true,
                         /*escape_whitespaces=*/true);

  // Plain ASCII input: the space is escaped, no prefix marker is added.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
    EXPECT_EQ(normalized, "hello▁there");
  }

  // Redundant whitespace.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
    EXPECT_EQ(normalized, "when▁is▁the▁world▁cup?");
  }

  // Different whitespace.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
    EXPECT_EQ(normalized, "general▁kenobi");
  }

  // NFKC char to multi-char normalization.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("㍿", &normalized));
    EXPECT_EQ(normalized, "株式会社");
  }

  // Half width katakana, character composition happens.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize(" グーグル ", &normalized));
    EXPECT_EQ(normalized, "グーグル");
  }

  // NFKC char to char normalization.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("①②③", &normalized));
    EXPECT_EQ(normalized, "123");
  }
}
137 
// With remove_extra_whitespaces disabled (and no dummy prefix), repeated
// spaces are expected to survive as repeated "▁" markers.
TEST(NormalizerTest, NoRemoveExtraWhitespace) {
  // The charsmap is a binary blob: read it in binary mode so that no newline
  // translation can alter its bytes, and fail fast if it cannot be opened
  // instead of continuing with an empty config.
  const std::string config_path = GetTestConfigPath();
  std::ifstream test_config_stream(config_path, std::ios::binary);
  ASSERT_TRUE(test_config_stream.is_open())
      << "Could not open normalizer config: " << config_path;
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
                         /*remove_extra_whitespaces=*/false,
                         /*escape_whitespaces=*/true);

  // Single spaces only: output matches the whitespace-removing configuration.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
    EXPECT_EQ(normalized, "hello▁there");
  }

  // Redundant whitespace.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
    EXPECT_EQ(normalized, "when▁is▁▁the▁▁world▁cup?");
  }

  // Different whitespace.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
    EXPECT_EQ(normalized, "general▁kenobi");
  }
}
167 
// With every option disabled, whitespace is expected to come out as plain
// spaces (no "▁" markers); a tab is still expected to become a space.
TEST(NormalizerTest, NoEscapeWhitespaces) {
  // The charsmap is a binary blob: read it in binary mode so that no newline
  // translation can alter its bytes, and fail fast if it cannot be opened
  // instead of continuing with an empty config.
  const std::string config_path = GetTestConfigPath();
  std::ifstream test_config_stream(config_path, std::ios::binary);
  ASSERT_TRUE(test_config_stream.is_open())
      << "Could not open normalizer config: " << config_path;
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
                         /*remove_extra_whitespaces=*/false,
                         /*escape_whitespaces=*/false);

  // Single spaces only: the input is expected back unchanged.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
    EXPECT_EQ(normalized, "hello there");
  }

  // Redundant whitespace.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
    EXPECT_EQ(normalized, "when is  the  world cup?");
  }

  // Different whitespace.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
    EXPECT_EQ(normalized, "general kenobi");
  }
}
197 
198 }  // namespace
199 }  // namespace libtextclassifier3
200