1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/sentencepiece/normalizer.h"
18
19 #include <fstream>
20 #include <string>
21
22 #include "utils/container/double-array-trie.h"
23 #include "utils/sentencepiece/test_utils.h"
24 #include "utils/strings/stringpiece.h"
25 #include "utils/test-data-test-utils.h"
26 #include "gmock/gmock.h"
27 #include "gtest/gtest.h"
28
29 namespace libtextclassifier3 {
30 namespace {
31
GetTestConfigPath()32 std::string GetTestConfigPath() {
33 return GetTestDataPath("utils/sentencepiece/test_data/nmt_nfkc_charsmap.bin");
34 }
35
// Exercises the normalizer with every option enabled: a dummy "▁" prefix is
// prepended, extra whitespace is removed, and whitespace is escaped as "▁".
TEST(NormalizerTest, NormalizesAsReferenceNormalizer) {
  // The charsmap is binary data, so read it without text-mode translation.
  std::ifstream test_config_stream(GetTestConfigPath(), std::ios::binary);
  // Fail early with a readable message instead of normalizing with an empty
  // config when the fixture is missing.
  ASSERT_TRUE(test_config_stream.is_open())
      << "Cannot open test config: " << GetTestConfigPath();
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/true,
                         /*remove_extra_whitespaces=*/true,
                         /*escape_whitespaces=*/true);

  // Plain ASCII input.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
    EXPECT_EQ(normalized, "▁hello▁there");
  }

  // Redundant whitespace: runs of spaces collapse to a single separator.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
    EXPECT_EQ(normalized, "▁when▁is▁the▁world▁cup?");
  }

  // Different whitespace: a tab is treated like a space.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
    EXPECT_EQ(normalized, "▁general▁kenobi");
  }

  // NFKC char to multi-char normalization.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("㍿", &normalized));
    EXPECT_EQ(normalized, "▁株式会社");
  }

  // Half width katakana, character composition happens. Leading and trailing
  // spaces are dropped as extra whitespace.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize(" ｸﾞｰｸﾞﾙ ", &normalized));
    EXPECT_EQ(normalized, "▁グーグル");
  }

  // NFKC char to char normalization.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("①②③", &normalized));
    EXPECT_EQ(normalized, "▁123");
  }
}
85
// Exercises the normalizer with add_dummy_prefix disabled: the output is the
// same as with all options on, minus the leading "▁".
TEST(NormalizerTest, NoDummyPrefix) {
  // The charsmap is binary data, so read it without text-mode translation.
  std::ifstream test_config_stream(GetTestConfigPath(), std::ios::binary);
  // Fail early with a readable message instead of normalizing with an empty
  // config when the fixture is missing.
  ASSERT_TRUE(test_config_stream.is_open())
      << "Cannot open test config: " << GetTestConfigPath();
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
                         /*remove_extra_whitespaces=*/true,
                         /*escape_whitespaces=*/true);

  // Plain ASCII input: no prefix is prepended.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
    EXPECT_EQ(normalized, "hello▁there");
  }

  // Redundant whitespace: runs of spaces collapse to a single separator.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
    EXPECT_EQ(normalized, "when▁is▁the▁world▁cup?");
  }

  // Different whitespace: a tab is treated like a space.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
    EXPECT_EQ(normalized, "general▁kenobi");
  }

  // NFKC char to multi-char normalization.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("㍿", &normalized));
    EXPECT_EQ(normalized, "株式会社");
  }

  // Half width katakana, character composition happens. Leading and trailing
  // spaces are dropped as extra whitespace.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize(" ｸﾞｰｸﾞﾙ ", &normalized));
    EXPECT_EQ(normalized, "グーグル");
  }

  // NFKC char to char normalization.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("①②③", &normalized));
    EXPECT_EQ(normalized, "123");
  }
}
137
// Exercises the normalizer with remove_extra_whitespaces disabled (and no
// dummy prefix): every whitespace character is kept and escaped as "▁".
TEST(NormalizerTest, NoRemoveExtraWhitespace) {
  // The charsmap is binary data, so read it without text-mode translation.
  std::ifstream test_config_stream(GetTestConfigPath(), std::ios::binary);
  // Fail early with a readable message instead of normalizing with an empty
  // config when the fixture is missing.
  ASSERT_TRUE(test_config_stream.is_open())
      << "Cannot open test config: " << GetTestConfigPath();
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
                         /*remove_extra_whitespaces=*/false,
                         /*escape_whitespaces=*/true);

  // Plain ASCII input.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
    EXPECT_EQ(normalized, "hello▁there");
  }

  // Redundant whitespace: each of the doubled spaces must survive as its own
  // "▁", so the input has to contain the doubled spaces.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
    EXPECT_EQ(normalized, "when▁is▁▁the▁▁world▁cup?");
  }

  // Different whitespace: a tab is treated like a space.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
    EXPECT_EQ(normalized, "general▁kenobi");
  }
}
167
// Exercises the normalizer with whitespace escaping, extra-whitespace removal
// and the dummy prefix all disabled: spaces stay plain spaces.
TEST(NormalizerTest, NoEscapeWhitespaces) {
  // The charsmap is binary data, so read it without text-mode translation.
  std::ifstream test_config_stream(GetTestConfigPath(), std::ios::binary);
  // Fail early with a readable message instead of normalizing with an empty
  // config when the fixture is missing.
  ASSERT_TRUE(test_config_stream.is_open())
      << "Cannot open test config: " << GetTestConfigPath();
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
                         /*remove_extra_whitespaces=*/false,
                         /*escape_whitespaces=*/false);

  // Plain ASCII input passes through unchanged.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
    EXPECT_EQ(normalized, "hello there");
  }

  // Redundant whitespace: doubled spaces are neither collapsed nor escaped.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
    EXPECT_EQ(normalized, "when is  the  world cup?");
  }

  // Different whitespace: a tab is normalized to a plain space.
  {
    std::string normalized;
    EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
    EXPECT_EQ(normalized, "general kenobi");
  }
}
197
198 } // namespace
199 } // namespace libtextclassifier3
200