1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "url/url_canon.h"
6
7 #include <errno.h>
8 #include <stddef.h>
9 #include <string_view>
10
11 #include "base/strings/string_number_conversions.h"
12 #include "base/strings/utf_string_conversions.h"
13 #include "base/test/gtest_util.h"
14 #include "base/test/scoped_feature_list.h"
15 #include "testing/gtest/include/gtest/gtest.h"
16 #include "url/third_party/mozilla/url_parse.h"
17 #include "url/url_canon_internal.h"
18 #include "url/url_canon_stdstring.h"
19 #include "url/url_features.h"
20 #include "url/url_test_utils.h"
21
22 namespace url {
23
24 namespace {
25
// One table-driven test case for a canonicalizer that is fed a single 8-bit
// input string (see the Scheme test for how the fields are consumed).
struct ComponentCase {
  // Raw text handed to the canonicalizer.
  const char* input;
  // Complete text the canonicalizer is expected to have written.
  const char* expected;
  // Expected begin/len of the component reported by the canonicalizer.
  Component expected_component;
  // Expected boolean result of the canonicalizer call.
  bool expected_success;
};
32
// ComponentCase but with dual 8-bit/16-bit input. Generally, the unit tests
// treat each input as optional, and will only try processing an input that is
// non-null. The output is always 8-bit.
struct DualComponentCase {
  // 8-bit input, or null when there is no 8-bit variant of the case.
  const char* input8;
  // Wide input, or null when there is no wide variant of the case.
  const wchar_t* input16;
  // Expected canonical output (8-bit regardless of which input was used).
  const char* expected;
  // Expected output component.
  Component expected_component;
  // Expected success/failure result.
  bool expected_success;
};
43
// Test cases for CanonicalizeIPAddress(). The inputs are identical to
// DualComponentCase, but the output has extra CanonHostInfo fields. The same
// struct is also used for the host canonicalization cases in the Host test.
struct IPAddressCase {
  // 8-bit input, or null to skip the 8-bit variant.
  const char* input8;
  // Wide input, or null to skip the wide variant.
  const wchar_t* input16;
  // Expected canonical output text.
  const char* expected;
  // Expected output component.
  Component expected_component;

  // CanonHostInfo fields, for verbose output.
  CanonHostInfo::Family expected_family;
  // The Host test only compares this when expected_family is IPV4.
  int expected_num_ipv4_components;
  const char* expected_address_hex;  // Two hex chars per IP address byte.
};
57
// Test case for component replacement: one string per URL component plus the
// starting and expected text.
// NOTE(review): the code consuming this struct is outside the part of the file
// visible here. Presumably each component string is passed to SetupReplComp(),
// so null would mean "leave unchanged" and kDeleteComp would mean "clear" --
// confirm against the replacement tests.
struct ReplaceCase {
  const char* base;  // Presumably the URL the replacements are applied to.
  const char* scheme;
  const char* username;
  const char* password;
  const char* host;
  const char* port;
  const char* path;
  const char* query;
  const char* ref;
  const char* expected;  // Presumably the resulting canonical URL.
};
70
// Sentinel used by the replacement tests: when a replacement string begins
// with this marker, SetupReplComp() clears the component instead of setting
// it.
constexpr char kDeleteComp[] = "|";
74
75 // Sets up a replacement for a single component. This is given pointers to
76 // the set and clear function for the component being replaced, and will
77 // either set the component (if it exists) or clear it (if the replacement
78 // string matches kDeleteComp).
79 //
80 // This template is currently used only for the 8-bit case, and the strlen
81 // causes it to fail in other cases. It is left a template in case we have
82 // tests for wide replacements.
83 template<typename CHAR>
SetupReplComp(void (Replacements<CHAR>::* set)(const CHAR *,const Component &),void (Replacements<CHAR>::* clear)(),Replacements<CHAR> * rep,const CHAR * str)84 void SetupReplComp(
85 void (Replacements<CHAR>::*set)(const CHAR*, const Component&),
86 void (Replacements<CHAR>::*clear)(),
87 Replacements<CHAR>* rep,
88 const CHAR* str) {
89 if (str && str[0] == kDeleteComp[0]) {
90 (rep->*clear)();
91 } else if (str) {
92 (rep->*set)(str, Component(0, static_cast<int>(strlen(str))));
93 }
94 }
95
CanonicalizeSpecialPath(const char * spec,const Component & path,CanonOutput * output,Component * out_path)96 bool CanonicalizeSpecialPath(const char* spec,
97 const Component& path,
98 CanonOutput* output,
99 Component* out_path) {
100 return CanonicalizePath(spec, path, CanonMode::kSpecialURL, output, out_path);
101 }
102
CanonicalizeSpecialPath(const char16_t * spec,const Component & path,CanonOutput * output,Component * out_path)103 bool CanonicalizeSpecialPath(const char16_t* spec,
104 const Component& path,
105 CanonOutput* output,
106 Component* out_path) {
107 return CanonicalizePath(spec, path, CanonMode::kSpecialURL, output, out_path);
108 }
109
CanonicalizeNonSpecialPath(const char * spec,const Component & path,CanonOutput * output,Component * out_path)110 bool CanonicalizeNonSpecialPath(const char* spec,
111 const Component& path,
112 CanonOutput* output,
113 Component* out_path) {
114 return CanonicalizePath(spec, path, CanonMode::kNonSpecialURL, output,
115 out_path);
116 }
117
CanonicalizeNonSpecialPath(const char16_t * spec,const Component & path,CanonOutput * output,Component * out_path)118 bool CanonicalizeNonSpecialPath(const char16_t* spec,
119 const Component& path,
120 CanonOutput* output,
121 Component* out_path) {
122 return CanonicalizePath(spec, path, CanonMode::kNonSpecialURL, output,
123 out_path);
124 }
125
126 } // namespace
127
// Checks that AppendUTF8Value() writes the expected UTF-8 byte sequence for
// valid code points whose encodings are one, two, three and four bytes long.
TEST(URLCanonTest, DoAppendUTF8) {
  struct UTF8Case {
    unsigned input;      // Code point to encode.
    const char* output;  // Expected UTF-8 bytes.
  };
  const UTF8Case kValidCases[] = {
      {0x24, "\x24"},
      {0xA2, "\xC2\xA2"},
      {0x20AC, "\xE2\x82\xAC"},
      {0x24B62, "\xF0\xA4\xAD\xA2"},
      {0x10FFFF, "\xF4\x8F\xBF\xBF"},
  };

  for (const UTF8Case& test : kValidCases) {
    std::string encoded;
    StdStringCanonOutput canon_output(&encoded);
    AppendUTF8Value(test.input, &canon_output);
    canon_output.Complete();
    EXPECT_EQ(test.output, encoded);
  }
}
149
// Checks that AppendUTF8Value() hits a DCHECK when given a value that is too
// large to be a code point.
TEST(URLCanonTest, DoAppendUTF8Invalid) {
  std::string encoded;
  StdStringCanonOutput canon_output(&encoded);
  // 0x110000 is one past 0x10FFFF, the largest value accepted in the
  // DoAppendUTF8 test.
  EXPECT_DCHECK_DEATH({
    AppendUTF8Value(0x110000, &canon_output);
    canon_output.Complete();
  });
}
159
// Low-level test that reading, canonicalizing and writing UTF-8/UTF-16
// strings works: every character of each input is run through
// AppendUTF8EscapedChar() and the escaped result and overall validity are
// compared with the table.
TEST(URLCanonTest, UTF) {
  struct UTFCase {
    const char* input8;       // 8-bit input, or null to skip.
    const wchar_t* input16;   // Wide input, or null to skip.
    bool expected_success;    // True if every character should be valid.
    const char* output;       // Expected escaped output.
  };
  const UTFCase kCases[] = {
      // Valid canonical input is passed through and escaped.
      {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"},
      // A character that needs more than 16 bits (U+10300 = old italic
      // letter A).
      {"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"},
      // Non-shortest-form UTF-8 is invalid. Each bad byte is replaced with the
      // replacement character (EF BF BD in UTF-8).
      {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", nullptr, false,
       "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%E5%A5%BD"},
      // Invalid UTF-8 sequences are marked as invalid (the first sequence is
      // truncated).
      {"\xe4\xa0\xe5\xa5\xbd", L"\xd800\x597d", false, "%EF%BF%BD%E5%A5%BD"},
      // A character running off the end of the input.
      {"\xe4\xbd\xa0\xe5\xa5", L"\x4f60\xd800", false, "%E4%BD%A0%EF%BF%BD"},
      // ...and a low surrogate with no preceding high surrogate.
      {nullptr, L"\xdc00", false, "%EF%BF%BD"},
      // A UTF-8 encoded surrogate value is invalid (ED A0 80 = U+D800)...
      {"\xed\xa0\x80", nullptr, false, "%EF%BF%BD%EF%BF%BD%EF%BF%BD"},
      // ...even when paired.
      {"\xed\xa0\x80\xed\xb0\x80", nullptr, false,
       "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD"},
  };

  // Escapes all |length| code units of |text| into |escaped| (which is reset
  // first) and returns whether every call reported success. Works for both
  // the 8-bit and the 16-bit overload of AppendUTF8EscapedChar().
  const auto escape_all = [](const auto* text, size_t length,
                             std::string* escaped) {
    escaped->clear();
    StdStringCanonOutput canon_output(escaped);
    bool all_valid = true;
    // The index is passed by address; presumably the callee advances it past
    // the extra units of a multi-unit character.
    for (size_t pos = 0; pos < length; ++pos) {
      all_valid &= AppendUTF8EscapedChar(text, &pos, length, &canon_output);
    }
    canon_output.Complete();
    return all_valid;
  };

  std::string escaped;
  for (const UTFCase& test : kCases) {
    if (test.input8) {
      const bool valid = escape_all(test.input8, strlen(test.input8), &escaped);
      EXPECT_EQ(test.expected_success, valid);
      EXPECT_EQ(test.output, escaped);
    }

    if (test.input16) {
      const std::u16string wide(
          test_utils::TruncateWStringToUTF16(test.input16));
      const bool valid = escape_all(wide.c_str(), wide.length(), &escaped);
      EXPECT_EQ(test.expected_success, valid);
      EXPECT_EQ(test.output, escaped);
    }

    // The remaining check only applies to valid cases that carry both forms.
    if (!test.input8 || !test.input16 || !test.expected_success) {
      continue;
    }

    // The UTF-8 and UTF-16 inputs must be equivalent in both directions.
    const std::string narrow(test.input8);
    const std::u16string wide(test_utils::TruncateWStringToUTF16(test.input16));
    EXPECT_EQ(narrow, base::UTF16ToUTF8(wide));  // UTF-16 -> UTF-8
    EXPECT_EQ(wide, base::UTF8ToUTF16(narrow));  // UTF-8 -> UTF-16
  }
}
239
// Exercises CanonicalizeScheme() with 8-bit and 16-bit input.
//
// This mostly checks that unusual characters are handled properly. The
// canonicalizer does no parsing or whitespace detection. On bad input it does
// its best, escaping odd sequences (the result is not a valid scheme and an
// error is returned).
//
// The canonicalizer appends a colon, which is not present in the input, to
// separate the scheme from the rest of the URL. The expected output range
// covers everything but that colon.
TEST(URLCanonTest, Scheme) {
  const ComponentCase kSchemeCases[] = {
      {"http", "http:", Component(0, 4), true},
      {"HTTP", "http:", Component(0, 4), true},
      {" HTTP ", "%20http%20:", Component(0, 10), false},
      {"htt: ", "htt%3A%20:", Component(0, 9), false},
      {"\xe4\xbd\xa0\xe5\xa5\xbdhttp", "%E4%BD%A0%E5%A5%BDhttp:",
       Component(0, 22), false},
      // Something already escaped is not re-escaped. The 'A' does get
      // "canonicalized" to 'a', but that's OK.
      {"ht%3Atp", "ht%3atp:", Component(0, 7), false},
      {"", ":", Component(0, 0), false},
  };

  std::string canonical;

  // Compares one canonicalization result against the table entry.
  const auto verify = [&canonical](const ComponentCase& test, bool succeeded,
                                   const Component& range) {
    EXPECT_EQ(test.expected_success, succeeded);
    EXPECT_EQ(test.expected, canonical);
    EXPECT_EQ(test.expected_component.begin, range.begin);
    EXPECT_EQ(test.expected_component.len, range.len);
  };

  for (const ComponentCase& test : kSchemeCases) {
    Component input_range(0, static_cast<int>(strlen(test.input)));
    Component output_range;

    // 8-bit input.
    canonical.clear();
    StdStringCanonOutput narrow_output(&canonical);
    bool succeeded = CanonicalizeScheme(test.input, input_range,
                                        &narrow_output, &output_range);
    narrow_output.Complete();
    verify(test, succeeded, output_range);

    // 16-bit input; the input length is recomputed in UTF-16 code units.
    canonical.clear();
    StdStringCanonOutput wide_output(&canonical);
    const std::u16string wide_input = base::UTF8ToUTF16(test.input);
    input_range.len = static_cast<int>(wide_input.length());
    succeeded = CanonicalizeScheme(wide_input.c_str(), input_range,
                                   &wide_output, &output_range);
    wide_output.Complete();
    verify(test, succeeded, output_range);
  }

  // A scheme declared nonexistent (length -1) must be converted into an empty
  // scheme.
  Component missing_range;
  canonical.clear();
  StdStringCanonOutput missing_output(&canonical);

  EXPECT_FALSE(CanonicalizeScheme("", Component(0, -1), &missing_output,
                                  &missing_range));
  missing_output.Complete();

  EXPECT_EQ(":", canonical);
  EXPECT_EQ(0, missing_range.begin);
  EXPECT_EQ(0, missing_range.len);
}
308
// Selects which IDNA processing mode the CanonHost tests run under.
enum class IDNAMode {
  kTransitional,
  kNonTransitional,
};
311
312 class URLCanonHostTest
313 : public ::testing::Test,
314 public ::testing::WithParamInterface<IDNAMode> {
315 public:
URLCanonHostTest()316 URLCanonHostTest() {
317 if (GetParam() == IDNAMode::kNonTransitional) {
318 scoped_feature_list_.InitAndEnableFeature(kUseIDNA2008NonTransitional);
319 } else {
320 scoped_feature_list_.InitAndDisableFeature(kUseIDNA2008NonTransitional);
321 }
322 }
323
324 private:
325 base::test::ScopedFeatureList scoped_feature_list_;
326 };
327
// Runs every URLCanonHostTest test once per IDNA mode: first transitional,
// then non-transitional.
INSTANTIATE_TEST_SUITE_P(All,
                         URLCanonHostTest,
                         ::testing::Values(IDNAMode::kTransitional,
                                           IDNAMode::kNonTransitional));
332
// Runs a table of host inputs through CanonicalizeHost() and
// CanonicalizeHostVerbose(), using the 8-bit and/or 16-bit form of each input
// (whichever is non-null). Several IDNA cases expect different output
// depending on IsUsingIDNA2008NonTransitional(), which is queried once up
// front.
TEST_P(URLCanonHostTest, Host) {
  bool use_idna_non_transitional = IsUsingIDNA2008NonTransitional();

  // clang-format off
  IPAddressCase host_cases[] = {
      // Basic canonicalization, uppercase should be converted to lowercase.
      {"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", Component(0, 10),
       CanonHostInfo::NEUTRAL, -1, ""},
      // TODO(https://crbug.com/1416013): Update the test after SPACE is
      // correctly handled.
      {"Goo%20 goo.com", L"Goo%20 goo.com", "goo%20%20goo.com",
       Component(0, 16), CanonHostInfo::NEUTRAL, -1, ""},
      // TODO(https://crbug.com/1416013): Update the test after ASTERISK is
      // correctly handled.
      {"Goo%2a*goo.com", L"Goo%2a*goo.com", "goo%2A%2Agoo.com",
       Component(0, 16), CanonHostInfo::NEUTRAL, -1, ""},
      // Exciting different types of spaces!
      {nullptr, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", Component(0, 16),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Other types of space (no-break, zero-width, zero-width-no-break) are
      // name-prepped away to nothing.
      {nullptr, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", Component(0, 10),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Ideographic full stop (full-width period for Chinese, etc.) should be
      // treated as a dot.
      {nullptr,
       L"www.foo\x3002"
       L"bar.com",
       "www.foo.bar.com", Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""},
      // Invalid unicode characters should fail...
      {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%B7%90zyx.com",
       Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
      // ...This is the same as the previous case but with the character
      // escaped.
      {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%B7%90zyx.com",
       Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
      // Test name prepping, fullwidth input should be converted to ASCII and
      // NOT IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16.
      {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com",
       Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""},
      // Test that fullwidth escaped values are properly name-prepped,
      // then converted or rejected.
      // ...%41 in fullwidth = 'A' (also as escaped UTF-8 input)
      {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com",
       "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
      {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com",
       "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
      // ...%00 in fullwidth should fail (also as escaped UTF-8 input)
      {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com",
       "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
      {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com",
       "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
      // ICU will convert weird percents into ASCII percents, but not unescape
      // further. A weird percent is U+FE6A (EF B9 AA in UTF-8) which is a
      // "small percent". At this point we should be within our rights to mark
      // anything as invalid since the URL is corrupt or malicious. The code
      // happens to allow ASCII characters (%41 = "A" -> 'a') to be unescaped
      // and kept as valid, so we validate that behavior here, but this level
      // of fixing the input shouldn't be seen as required. "%81" is invalid.
      {"\xef\xb9\xaa"
       "41.com",
       L"\xfe6a"
       L"41.com",
       "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
      {"%ef%b9%aa"
       "41.com",
       L"\xfe6a"
       L"41.com",
       "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
      {"\xef\xb9\xaa"
       "81.com",
       L"\xfe6a"
       L"81.com",
       "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
      {"%ef%b9%aa"
       "81.com",
       L"\xfe6a"
       L"81.com",
       "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
      // Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN
      {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd",
       L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", Component(0, 14),
       CanonHostInfo::NEUTRAL, -1, ""},
      // See http://unicode.org/cldr/utility/idna.jsp for other
      // examples/experiments and http://goo.gl/7yG11o
      // for the full list of characters handled differently by
      // IDNA 2003, UTS 46 (http://unicode.org/reports/tr46/ ) and IDNA 2008.

      // 4 Deviation characters are mapped/ignored in UTS 46 transitional
      // mechanism. UTS 46, table 4 row (g).
      // Sharp-s is mapped to 'ss' in IDNA 2003, not in IDNA 2008 or UTS 46
      // after transitional period.
      // Previously, it'd be "fussball.de".
      {"fu\xc3\x9f"
       "ball.de",
       L"fu\x00df"
       L"ball.de",
       use_idna_non_transitional ? "xn--fuball-cta.de" : "fussball.de",
       use_idna_non_transitional ? Component(0, 17) : Component(0, 11),
       CanonHostInfo::NEUTRAL, -1, ""},

      // Final-sigma (U+03C2) was mapped to regular sigma (U+03C3).
      // Previously, it'd be "xn--wxaikc9b".
      {"\xcf\x83\xcf\x8c\xce\xbb\xce\xbf\xcf\x82", L"\x3c3\x3cc\x3bb\x3bf\x3c2",
       use_idna_non_transitional ? "xn--wxaijb9b" : "xn--wxaikc6b",
       Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""},

      // ZWNJ (U+200C) and ZWJ (U+200D) are mapped away in UTS 46 transitional
      // handling as well as in IDNA 2003, but not thereafter.
      {"a\xe2\x80\x8c"
       "b\xe2\x80\x8d"
       "c",
       L"a\x200c"
       L"b\x200d"
       L"c",
       use_idna_non_transitional ? "xn--abc-9m0ag" : "abc",
       use_idna_non_transitional ? Component(0, 13) : Component(0, 3),
       CanonHostInfo::NEUTRAL, -1, ""},

      // ZWJ between Devanagari characters was still mapped away in UTS 46
      // transitional handling. IDNA 2008 gives xn--11bo0mv54g.
      // Previously "xn--11bo0m".
      {"\xe0\xa4\x95\xe0\xa5\x8d\xe2\x80\x8d\xe0\xa4\x9c",
       L"\x915\x94d\x200d\x91c",
       use_idna_non_transitional ? "xn--11bo0mv54g" : "xn--11bo0m",
       use_idna_non_transitional ? Component(0, 14) : Component(0, 10),
       CanonHostInfo::NEUTRAL, -1, ""},

      // Fullwidth exclamation mark is disallowed. UTS 46, table 4, row (b)
      // However, we do allow this at the moment because we don't use
      // STD3 rules and canonicalize full-width ASCII to ASCII.
      {"wow\xef\xbc\x81", L"wow\xff01", "wow!", Component(0, 4),
       CanonHostInfo::NEUTRAL, -1, ""},
      // U+2132 (turned capital F) is disallowed. UTS 46, table 4, row (c)
      // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
      {"\xe2\x84\xb2oo", L"\x2132oo", "%E2%84%B2oo", Component(0, 11),
       CanonHostInfo::BROKEN, -1, ""},
      // U+2F868 (CJK Comp) is disallowed. UTS 46, table 4, row (d)
      // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
      {"\xf0\xaf\xa1\xa8\xe5\xa7\xbb.cn", L"\xd87e\xdc68\x59fb.cn",
       "%F0%AF%A1%A8%E5%A7%BB.cn", Component(0, 24), CanonHostInfo::BROKEN, -1,
       ""},
      // Maps uppercase letters to lower case letters. UTS 46 table 4 row (e)
      {"M\xc3\x9cNCHEN", L"M\xdcNCHEN", "xn--mnchen-3ya", Component(0, 14),
       CanonHostInfo::NEUTRAL, -1, ""},
      // An already-IDNA host is not modified.
      {"xn--mnchen-3ya", L"xn--mnchen-3ya", "xn--mnchen-3ya", Component(0, 14),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Symbol/punctuations are allowed in IDNA 2003/UTS46.
      // Not allowed in IDNA 2008. UTS 46 table 4 row (f).
      {"\xe2\x99\xa5ny.us", L"\x2665ny.us", "xn--ny-s0x.us", Component(0, 13),
       CanonHostInfo::NEUTRAL, -1, ""},
      // U+11013 is new in Unicode 6.0 and is allowed. UTS 46 table 4, row (h)
      // We used to allow it because we passed through unassigned code points.
      {"\xf0\x91\x80\x93.com", L"\xd804\xdc13.com", "xn--n00d.com",
       Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""},
      // U+0602 is disallowed in UTS46/IDNA 2008. UTS 46 table 4, row(i)
      // Used to be allowed in IDNA 2003.
      {"\xd8\x82.eg", L"\x602.eg", "%D8%82.eg", Component(0, 9),
       CanonHostInfo::BROKEN, -1, ""},
      // U+20B7 is new in Unicode 5.2 (not a part of IDNA 2003 based
      // on Unicode 3.2). We did allow it in the past because we let unassigned
      // code point pass. We continue to allow it even though it's a
      // "punctuation and symbol" blocked in IDNA 2008.
      // UTS 46 table 4, row (j)
      {"\xe2\x82\xb7.com", L"\x20b7.com", "xn--wzg.com", Component(0, 11),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Maps uppercase letters to lower case letters.
      // In IDNA 2003, it's allowed without case-folding
      // ( xn--bc-7cb.com ) because it's not defined in Unicode 3.2
      // (added in Unicode 4.1). UTS 46 table 4 row (k)
      {"bc\xc8\xba.com", L"bc\x23a.com", "xn--bc-is1a.com", Component(0, 15),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Maps U+FF43 (Full Width Small Letter C) to 'c'.
      {"ab\xef\xbd\x83.xyz", L"ab\xff43.xyz", "abc.xyz", Component(0, 7),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Maps U+1D68C (Math Monospace Small C) to 'c'.
      // U+1D68C = \xD835\xDE8C in UTF-16
      {"ab\xf0\x9d\x9a\x8c.xyz", L"ab\xd835\xde8c.xyz", "abc.xyz",
       Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""},
      // BiDi check test
      // "Divehi" in Divehi (Thaana script) ends with BidiClass=NSM.
      // Disallowed in IDNA 2003 but now allowed in UTS 46/IDNA 2008.
      {"\xde\x8b\xde\xa8\xde\x88\xde\xac\xde\x80\xde\xa8",
       L"\x78b\x7a8\x788\x7ac\x780\x7a8", "xn--hqbpi0jcw", Component(0, 13),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Disallowed in both IDNA 2003 and 2008 with BiDi check.
      // Labels starting with a RTL character cannot end with a LTR character.
      {"\xd8\xac\xd8\xa7\xd8\xb1xyz", L"\x62c\x627\x631xyz",
       "%D8%AC%D8%A7%D8%B1xyz", Component(0, 21), CanonHostInfo::BROKEN, -1,
       ""},
      // Labels starting with a RTL character can end with BC=EN (European
      // number). Disallowed in IDNA 2003 but now allowed.
      {"\xd8\xac\xd8\xa7\xd8\xb1"
       "2",
       L"\x62c\x627\x631"
       L"2",
       "xn--2-ymcov", Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
      // Labels starting with a RTL character cannot have "L" characters
      // even if it ends with an BC=EN. Disallowed in both IDNA 2003/2008.
      {"\xd8\xac\xd8\xa7\xd8\xb1xy2", L"\x62c\x627\x631xy2",
       "%D8%AC%D8%A7%D8%B1xy2", Component(0, 21), CanonHostInfo::BROKEN, -1,
       ""},
      // Labels starting with a RTL character can end with BC=AN (Arabic number)
      // Disallowed in IDNA 2003, but now allowed.
      {"\xd8\xac\xd8\xa7\xd8\xb1\xd9\xa2", L"\x62c\x627\x631\x662",
       "xn--mgbjq0r", Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
      // Labels starting with a RTL character cannot have "L" characters
      // even if it ends with an BC=AN (Arabic number).
      // Disallowed in both IDNA 2003/2008.
      {"\xd8\xac\xd8\xa7\xd8\xb1xy\xd9\xa2", L"\x62c\x627\x631xy\x662",
       "%D8%AC%D8%A7%D8%B1xy%D9%A2", Component(0, 26), CanonHostInfo::BROKEN,
       -1, ""},
      // Labels starting with a RTL character cannot mix BC=EN and BC=AN
      {"\xd8\xac\xd8\xa7\xd8\xb1xy2\xd9\xa2", L"\x62c\x627\x631xy2\x662",
       "%D8%AC%D8%A7%D8%B1xy2%D9%A2", Component(0, 27), CanonHostInfo::BROKEN,
       -1, ""},
      // As of Unicode 6.2, U+20CF is not assigned. We do not allow it.
      {"\xe2\x83\x8f.com", L"\x20cf.com", "%E2%83%8F.com", Component(0, 13),
       CanonHostInfo::BROKEN, -1, ""},
      // U+0080 is not allowed.
      {"\xc2\x80.com", L"\x80.com", "%C2%80.com", Component(0, 10),
       CanonHostInfo::BROKEN, -1, ""},
      // Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped
      // UTF-8 (wide case). The output should be equivalent to the true wide
      // character input above.
      {"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd",
       L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba", Component(0, 14),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Invalid escaped characters should fail and the percents should be
      // escaped.
      {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", Component(0, 10),
       CanonHostInfo::BROKEN, -1, ""},
      // If we get an invalid character that has been escaped.
      {"%25", L"%25", "%25", Component(0, 3), CanonHostInfo::BROKEN, -1, ""},
      {"hello%00", L"hello%00", "hello%00", Component(0, 8),
       CanonHostInfo::BROKEN, -1, ""},
      // Escaped numbers should be treated like IP addresses if they are.
      {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01",
       "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
      {"%30%78%63%30%2e%30%32%35%30.01%2e",
       L"%30%78%63%30%2e%30%32%35%30.01%2e", "192.168.0.1", Component(0, 11),
       CanonHostInfo::IPV4, 3, "C0A80001"},
      // Invalid escaping should trigger the regular host error handling.
      {"%3g%78%63%30%2e%30%32%35%30%2E.01",
       L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01",
       Component(0, 17), CanonHostInfo::BROKEN, -1, ""},
      // Something that isn't exactly an IP should get treated as a host and
      // spaces escaped.
      {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello",
       Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
      // Fullwidth and escaped UTF-8 fullwidth should still be treated as IP.
      // These are "0Xc0.0250.01" in fullwidth.
      {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%"
       "8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%"
       "8E\xef\xbc\x90\xef\xbc\x91",
       L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10"
       L"\xff11",
       "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
      // Broken IP addresses get marked as such.
      {"192.168.0.257", L"192.168.0.257", "192.168.0.257", Component(0, 13),
       CanonHostInfo::BROKEN, -1, ""},
      {"[google.com]", L"[google.com]", "[google.com]", Component(0, 12),
       CanonHostInfo::BROKEN, -1, ""},
      // Cyrillic letter followed by '(' should return punycode for '(' escaped
      // before punycode string was created. I.e.
      // if '(' is escaped after punycode is created we would get xn--%28-8tb
      // (incorrect).
      {"\xd1\x82(", L"\x0442(", "xn--(-8tb", Component(0, 9),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Address with all hexadecimal characters with leading number of 1<<32
      // or greater and should return NEUTRAL rather than BROKEN if not all
      // components are numbers.
      {"12345678912345.de", L"12345678912345.de", "12345678912345.de",
       Component(0, 17), CanonHostInfo::NEUTRAL, -1, ""},
      {"1.12345678912345.de", L"1.12345678912345.de", "1.12345678912345.de",
       Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
      {"12345678912345.12345678912345.de", L"12345678912345.12345678912345.de",
       "12345678912345.12345678912345.de", Component(0, 32),
       CanonHostInfo::NEUTRAL, -1, ""},
      {"1.2.0xB3A73CE5B59.de", L"1.2.0xB3A73CE5B59.de", "1.2.0xb3a73ce5b59.de",
       Component(0, 20), CanonHostInfo::NEUTRAL, -1, ""},
      {"12345678912345.0xde", L"12345678912345.0xde", "12345678912345.0xde",
       Component(0, 19), CanonHostInfo::BROKEN, -1, ""},
      // A label that starts with "xn--" but contains non-ASCII characters
      // should be an error. Escape the invalid characters.
      {"xn--m\xc3\xbcnchen", L"xn--m\xfcnchen", "xn--m%C3%BCnchen",
       Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
  };
  // clang-format on

  // CanonicalizeHost() non-verbose. Success is expected exactly when the
  // case's expected family is not BROKEN.
  std::string out_str;
  for (const auto& host_case : host_cases) {
    // Narrow version.
    if (host_case.input8) {
      int host_len = static_cast<int>(strlen(host_case.input8));
      Component in_comp(0, host_len);
      Component out_comp;

      out_str.clear();
      StdStringCanonOutput output(&out_str);

      bool success =
          CanonicalizeHost(host_case.input8, in_comp, &output, &out_comp);
      output.Complete();

      EXPECT_EQ(host_case.expected_family != CanonHostInfo::BROKEN, success)
          << "for input: " << host_case.input8;
      EXPECT_EQ(host_case.expected, out_str)
          << "for input: " << host_case.input8;
      EXPECT_EQ(host_case.expected_component.begin, out_comp.begin)
          << "for input: " << host_case.input8;
      EXPECT_EQ(host_case.expected_component.len, out_comp.len)
          << "for input: " << host_case.input8;
    }

    // Wide version.
    if (host_case.input16) {
      std::u16string input16(
          test_utils::TruncateWStringToUTF16(host_case.input16));
      int host_len = static_cast<int>(input16.length());
      Component in_comp(0, host_len);
      Component out_comp;

      out_str.clear();
      StdStringCanonOutput output(&out_str);

      bool success = CanonicalizeHost(input16.c_str(), in_comp, &output,
                                      &out_comp);
      output.Complete();

      EXPECT_EQ(host_case.expected_family != CanonHostInfo::BROKEN, success);
      EXPECT_EQ(host_case.expected, out_str);
      EXPECT_EQ(host_case.expected_component.begin, out_comp.begin);
      EXPECT_EQ(host_case.expected_component.len, out_comp.len);
    }
  }

  // CanonicalizeHostVerbose(). In addition to the output text and component,
  // this checks the reported family, the hex-encoded address bytes and, for
  // IPv4 results only, the number of IPv4 components.
  for (const auto& host_case : host_cases) {
    // Narrow version.
    if (host_case.input8) {
      int host_len = static_cast<int>(strlen(host_case.input8));
      Component in_comp(0, host_len);

      out_str.clear();
      StdStringCanonOutput output(&out_str);
      CanonHostInfo host_info;

      CanonicalizeHostVerbose(host_case.input8, in_comp, &output, &host_info);
      output.Complete();

      EXPECT_EQ(host_case.expected_family, host_info.family);
      EXPECT_EQ(host_case.expected, out_str);
      EXPECT_EQ(host_case.expected_component.begin, host_info.out_host.begin);
      EXPECT_EQ(host_case.expected_component.len, host_info.out_host.len);
      EXPECT_EQ(
          host_case.expected_address_hex,
          base::HexEncode(host_info.address,
                          static_cast<size_t>(host_info.AddressLength())));
      if (host_case.expected_family == CanonHostInfo::IPV4) {
        EXPECT_EQ(host_case.expected_num_ipv4_components,
                  host_info.num_ipv4_components);
      }
    }

    // Wide version.
    if (host_case.input16) {
      std::u16string input16(
          test_utils::TruncateWStringToUTF16(host_case.input16));
      int host_len = static_cast<int>(input16.length());
      Component in_comp(0, host_len);

      out_str.clear();
      StdStringCanonOutput output(&out_str);
      CanonHostInfo host_info;

      CanonicalizeHostVerbose(input16.c_str(), in_comp, &output, &host_info);
      output.Complete();

      EXPECT_EQ(host_case.expected_family, host_info.family);
      EXPECT_EQ(host_case.expected, out_str);
      EXPECT_EQ(host_case.expected_component.begin, host_info.out_host.begin);
      EXPECT_EQ(host_case.expected_component.len, host_info.out_host.len);
      EXPECT_EQ(
          host_case.expected_address_hex,
          base::HexEncode(host_info.address,
                          static_cast<size_t>(host_info.AddressLength())));
      if (host_case.expected_family == CanonHostInfo::IPV4) {
        EXPECT_EQ(host_case.expected_num_ipv4_components,
                  host_info.num_ipv4_components);
      }
    }
  }
}
731
// Feeds single punctuation characters to CanonicalizeSpecialHost() and checks
// which are accepted unchanged, which are rejected, and which are accepted
// but percent-escaped.
TEST(URLCanonTest, SpecialHostPuncutationChar) {
  // '%' is not tested here. '%' is used for percent-escaping.
  const std::string_view kAllowedChars[] = {
      "!", "\"", "$", "&", "'", "(", ")", "+", ",",
      "-", ".",  ";", "=", "_", "`", "{", "}", "~",
  };

  const std::string_view kForbiddenChars[] = {
      "#", "/", ":", "<", ">", "?", "@", "[", "\\", "]", "^", "|",
  };

  // Characters that are escaped, which is not standard compliant. See
  // https://crbug.com/1416013.
  struct EscapedCharTestCase {
    std::string_view input;
    std::string_view expected;
  };
  const EscapedCharTestCase kEscapedChars[] = {{" ", "%20"}, {"*", "%2A"}};

  // Allowed characters canonicalize successfully to themselves.
  for (const std::string_view host : kAllowedChars) {
    std::string canonical;
    Component host_range(0, static_cast<int>(host.size()));
    Component out_range;
    StdStringCanonOutput canon_output(&canonical);
    const bool accepted = CanonicalizeSpecialHost(host.data(), host_range,
                                                  canon_output, out_range);
    EXPECT_TRUE(accepted) << "Input: " << host;
    canon_output.Complete();
    EXPECT_EQ(canonical, host) << "Input: " << host;
  }

  // Forbidden characters make canonicalization fail.
  for (const std::string_view host : kForbiddenChars) {
    std::string canonical;
    Component host_range(0, static_cast<int>(host.size()));
    Component out_range;
    StdStringCanonOutput canon_output(&canonical);
    EXPECT_FALSE(CanonicalizeSpecialHost(host.data(), host_range, canon_output,
                                         out_range))
        << "Input: " << host;
  }

  // Escaped characters canonicalize successfully to their escaped form.
  for (const EscapedCharTestCase& test : kEscapedChars) {
    std::string canonical;
    Component host_range(0, static_cast<int>(test.input.size()));
    Component out_range;
    StdStringCanonOutput canon_output(&canonical);
    const bool accepted = CanonicalizeSpecialHost(
        test.input.data(), host_range, canon_output, out_range);
    EXPECT_TRUE(accepted) << "Input: " << test.input;
    canon_output.Complete();
    EXPECT_EQ(canonical, test.expected) << "Input: " << test.input;
  }
}
784
// Checks that CanonicalizeNonSpecialHost() rejects every forbidden host code
// point. CanonicalizeSpecialHost() is not covered here because it is not
// standard compliant yet; see URLCanonTest::SpecialHostPuncutationChar.
TEST(URLCanonTest, ForbiddenHostCodePoint) {
  // https://url.spec.whatwg.org/#forbidden-host-code-point
  // U+0000 is handled separately below because it cannot be written as a
  // NUL-terminated literal of length one.
  const std::string_view kForbidden[] = {
      "\x09", "\x0A", "\x0D", " ",  "#", "/", ":", "<",
      ">",    "?",    "@",    "[",  "\\", "]", "^", "|",
  };

  for (const std::string_view code_point : kForbidden) {
    std::string canonical;
    StdStringCanonOutput sink(&canonical);
    Component host_range(0, code_point.size());
    Component canon_range;
    EXPECT_FALSE(CanonicalizeNonSpecialHost(code_point.data(), host_range,
                                            sink, canon_range))
        << "Input: " << code_point;
  }

  // Embedded NUL: the component length (3) is given explicitly so that the
  // whole "a\0b" buffer is treated as the host.
  const char kHostWithNul[] = "a\0b";
  std::string canonical;
  StdStringCanonOutput sink(&canonical);
  Component host_range(0, 3);
  Component canon_range;
  EXPECT_FALSE(
      CanonicalizeNonSpecialHost(kHostWithNul, host_range, sink, canon_range));
}
815
// Table-driven test of CanonicalizeIPAddress() for IPv4-style hosts. Every
// row is run twice, once with the 8-bit input and once with the 16-bit input.
// The expected text, component and component count are compared only when the
// canonicalizer reports the IPV4 family; the family and the hex-encoded
// address bytes are always compared.
TEST(URLCanonTest, IPv4) {
  // clang-format off
  IPAddressCase cases[] = {
    // Empty is not an IP address.
    {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {".", L".", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Regular IP addresses in different bases.
    {"192.168.0.1", L"192.168.0.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    {"0300.0250.00.01", L"0300.0250.00.01", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    {"0xC0.0Xa8.0x0.0x1", L"0xC0.0Xa8.0x0.0x1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    // Non-IP addresses due to invalid characters.
    {"192.168.9.com", L"192.168.9.com", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Hostnames with a numeric final component but other components that don't
    // parse as numbers should be considered broken.
    {"19a.168.0.1", L"19a.168.0.1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"19a.168.0.1.", L"19a.168.0.1.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0308.0250.00.01", L"0308.0250.00.01", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0308.0250.00.01.", L"0308.0250.00.01.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0xCG.0xA8.0x0.0x1", L"0xCG.0xA8.0x0.0x1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0xCG.0xA8.0x0.0x1.", L"0xCG.0xA8.0x0.0x1.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // A non-numeric terminal component means the host is not an IPv4
    // hostname, but it is still valid.
    {"19.168.0.1a", L"19.168.0.1a", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"0xC.0xA8.0x0.0x1G", L"0xC.0xA8.0x0.0x1G", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Hostnames that would be considered broken IPv4 hostnames should be
    // considered valid non-IPv4 hostnames if they end with two dots instead
    // of 0 or 1.
    {"19a.168.0.1..", L"19a.168.0.1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"0308.0250.00.01..", L"0308.0250.00.01..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"0xCG.0xA8.0x0.0x1..", L"0xCG.0xA8.0x0.0x1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Hosts with components that aren't considered valid IPv4 numbers but are
    // entirely numeric should be considered invalid.
    {"1.2.3.08", L"1.2.3.08", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"1.2.3.08.", L"1.2.3.08.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // If there are not enough components, the last one should fill them out.
    {"192", L"192", "0.0.0.192", Component(0, 9), CanonHostInfo::IPV4, 1, "000000C0"},
    {"0xC0a80001", L"0xC0a80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
    {"030052000001", L"030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
    {"000030052000001", L"000030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
    {"192.168", L"192.168", "192.0.0.168", Component(0, 11), CanonHostInfo::IPV4, 2, "C00000A8"},
    {"192.0x00A80001", L"192.0x000A80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
    {"0xc0.052000001", L"0xc0.052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
    {"192.168.1", L"192.168.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
    // Hostnames with too many components but a numeric final component are
    // invalid.
    {"192.168.0.0.1", L"192.168.0.0.1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // We allow a single trailing dot.
    {"192.168.0.1.", L"192.168.0.1.", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    {"192.168.0.1. hello", L"192.168.0.1. hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"192.168.0.1..", L"192.168.0.1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Hosts with two dots in a row with a final numeric component are
    // considered invalid.
    {"192.168..1", L"192.168..1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"192.168..1.", L"192.168..1.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Any numerical overflow should be marked as BROKEN.
    {"0x100.0", L"0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0x100.0.0", L"0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0x100.0.0.0", L"0x100.0.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0x100.0.0", L"0.0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0.0x100.0", L"0.0.0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0.0.0x100", L"0.0.0.0x100", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0.0x10000", L"0.0.0x10000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0x1000000", L"0.0x1000000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0x100000000", L"0x100000000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Repeat the previous tests, minus 1, to verify boundaries.
    {"0xFF.0", L"0xFF.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 2, "FF000000"},
    {"0xFF.0.0", L"0xFF.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 3, "FF000000"},
    {"0xFF.0.0.0", L"0xFF.0.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "FF000000"},
    {"0.0xFF.0.0", L"0.0xFF.0.0", "0.255.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "00FF0000"},
    {"0.0.0xFF.0", L"0.0.0xFF.0", "0.0.255.0", Component(0, 9), CanonHostInfo::IPV4, 4, "0000FF00"},
    {"0.0.0.0xFF", L"0.0.0.0xFF", "0.0.0.255", Component(0, 9), CanonHostInfo::IPV4, 4, "000000FF"},
    {"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", Component(0, 11), CanonHostInfo::IPV4, 3, "0000FFFF"},
    {"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", Component(0, 13), CanonHostInfo::IPV4, 2, "00FFFFFF"},
    {"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", Component(0, 15), CanonHostInfo::IPV4, 1, "FFFFFFFF"},
    // Old truncation tests. They're all "BROKEN" now.
    {"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"192.168.0.257", L"192.168.0.257", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"192.168.0xa20001", L"192.168.0xa20001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"192.015052000001", L"192.015052000001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0X12C0a80001", L"0X12C0a80001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"276.1.2", L"276.1.2", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Too many components should be rejected, in valid ranges or not.
    {"255.255.255.255.255", L"255.255.255.255.255", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"256.256.256.256.256", L"256.256.256.256.256", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Spaces should be rejected.
    {"192.168.0.1 hello", L"192.168.0.1 hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Very large numbers.
    {"0000000000000300.0x00000000000000fF.00000000000000001", L"0000000000000300.0x00000000000000fF.00000000000000001", "192.255.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0FF0001"},
    {"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", L"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", "", Component(0, 11), CanonHostInfo::BROKEN, -1, ""},
    // A number has no length limit, but long numbers can still overflow.
    {"00000000000000000001", L"00000000000000000001", "0.0.0.1", Component(0, 7), CanonHostInfo::IPV4, 1, "00000001"},
    {"0000000000000000100000000000000001", L"0000000000000000100000000000000001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // If a long component is non-numeric, it's a hostname, *not* a broken IP.
    {"0.0.0.000000000000000000z", L"0.0.0.000000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"0.0.0.100000000000000000z", L"0.0.0.100000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Truncation of all zeros should still result in 0.
    {"0.00.0x.0x0", L"0.00.0x.0x0", "0.0.0.0", Component(0, 7), CanonHostInfo::IPV4, 4, "00000000"},
    // Non-ASCII characters in final component should return NEUTRAL.
    {"1.2.3.\xF0\x9F\x92\xA9", L"1.2.3.\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"1.2.3.4\xF0\x9F\x92\xA9", L"1.2.3.4\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"1.2.3.0x\xF0\x9F\x92\xA9", L"1.2.3.0x\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"1.2.3.0\xF0\x9F\x92\xA9", L"1.2.3.0\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Non-ASCII characters in other components should result in broken IPs
    // when the final component is numeric.
    {"1.2.\xF0\x9F\x92\xA9.4", L"1.2.\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"1.2.3\xF0\x9F\x92\xA9.4", L"1.2.3\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"1.2.0x\xF0\x9F\x92\xA9.4", L"1.2.0x\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"1.2.0\xF0\x9F\x92\xA9.4", L"1.2.0\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"\xF0\x9F\x92\xA9.2.3.4", L"\xD83D\xDCA9.2.3.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
  };
  // clang-format on

  for (const IPAddressCase& ip_case : cases) {
    SCOPED_TRACE(ip_case.input8);

    // ---- Pass 1: narrow (8-bit) input. ----
    const Component range8(0, static_cast<int>(strlen(ip_case.input8)));

    std::string canon8;
    StdStringCanonOutput sink8(&canon8);
    // One CanonHostInfo is filled in by both passes.
    CanonHostInfo info;
    CanonicalizeIPAddress(ip_case.input8, range8, &sink8, &info);
    sink8.Complete();

    EXPECT_EQ(ip_case.expected_family, info.family);
    EXPECT_EQ(
        ip_case.expected_address_hex,
        base::HexEncode(info.address, static_cast<size_t>(info.AddressLength())));
    // The canonical text and component are only compared for IPv4 results.
    if (info.family == CanonHostInfo::IPV4) {
      EXPECT_STREQ(ip_case.expected, canon8.c_str());
      EXPECT_EQ(ip_case.expected_component.begin, info.out_host.begin);
      EXPECT_EQ(ip_case.expected_component.len, info.out_host.len);
      EXPECT_EQ(ip_case.expected_num_ipv4_components,
                info.num_ipv4_components);
    }

    // ---- Pass 2: wide (16-bit) input. ----
    const std::u16string wide_input(
        test_utils::TruncateWStringToUTF16(ip_case.input16));
    const Component range16(0, static_cast<int>(wide_input.length()));

    std::string canon16;
    StdStringCanonOutput sink16(&canon16);
    CanonicalizeIPAddress(wide_input.c_str(), range16, &sink16, &info);
    sink16.Complete();

    EXPECT_EQ(ip_case.expected_family, info.family);
    EXPECT_EQ(
        ip_case.expected_address_hex,
        base::HexEncode(info.address, static_cast<size_t>(info.AddressLength())));
    if (info.family == CanonHostInfo::IPV4) {
      EXPECT_STREQ(ip_case.expected, canon16.c_str());
      EXPECT_EQ(ip_case.expected_component.begin, info.out_host.begin);
      EXPECT_EQ(ip_case.expected_component.len, info.out_host.len);
      EXPECT_EQ(ip_case.expected_num_ipv4_components,
                info.num_ipv4_components);
    }
  }
}
968
// Table-driven test of CanonicalizeIPAddress() for IPv6 literals. Every row
// is run twice, once with the 8-bit input and once with the 16-bit input.
// The family and hex-encoded address bytes are always compared; the expected
// text and component are compared only when the canonicalizer reports the
// IPV6 family, so those fields are ignored for BROKEN/NEUTRAL rows.
TEST(URLCanonTest, IPv6) {
  IPAddressCase cases[] = {
      // Empty is not an IP address.
      {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
      // Non-IPs with [:] characters are marked BROKEN.
      {":", L":", "", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[", L"[", "", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[:", L"[:", "", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"]", L"]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
      {":]", L":]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[]", L"[]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[:]", L"[:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
      // Regular IP address is invalid without bounding '[' and ']'.
      {"2001:db8::1", L"2001:db8::1", "", Component(), CanonHostInfo::BROKEN,
       -1, ""},
      {"[2001:db8::1", L"[2001:db8::1", "", Component(), CanonHostInfo::BROKEN,
       -1, ""},
      {"2001:db8::1]", L"2001:db8::1]", "", Component(), CanonHostInfo::BROKEN,
       -1, ""},
      // Regular IP addresses.
      {"[::]", L"[::]", "[::]", Component(0, 4), CanonHostInfo::IPV6, -1,
       "00000000000000000000000000000000"},
      {"[::1]", L"[::1]", "[::1]", Component(0, 5), CanonHostInfo::IPV6, -1,
       "00000000000000000000000000000001"},
      {"[1::]", L"[1::]", "[1::]", Component(0, 5), CanonHostInfo::IPV6, -1,
       "00010000000000000000000000000000"},

      // Leading zeros should be stripped.
      {"[000:01:02:003:004:5:6:007]", L"[000:01:02:003:004:5:6:007]",
       "[0:1:2:3:4:5:6:7]", Component(0, 17), CanonHostInfo::IPV6, -1,
       "00000001000200030004000500060007"},

      // Upper case letters should be lowercased.
      {"[A:b:c:DE:fF:0:1:aC]", L"[A:b:c:DE:fF:0:1:aC]", "[a:b:c:de:ff:0:1:ac]",
       Component(0, 20), CanonHostInfo::IPV6, -1,
       "000A000B000C00DE00FF0000000100AC"},

      // The same address can be written with different contractions, but should
      // get canonicalized to the same thing.
      {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", Component(0, 14),
       CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},
      {"[1::2:0:0:3:0]", L"[1::2:0:0:3:0]", "[1::2:0:0:3:0]", Component(0, 14),
       CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},

      // Addresses with embedded IPv4.
      {"[::192.168.0.1]", L"[::192.168.0.1]", "[::c0a8:1]", Component(0, 10),
       CanonHostInfo::IPV6, -1, "000000000000000000000000C0A80001"},
      {"[::ffff:192.168.0.1]", L"[::ffff:192.168.0.1]", "[::ffff:c0a8:1]",
       Component(0, 15), CanonHostInfo::IPV6, -1,
       "00000000000000000000FFFFC0A80001"},
      {"[::eeee:192.168.0.1]", L"[::eeee:192.168.0.1]", "[::eeee:c0a8:1]",
       Component(0, 15), CanonHostInfo::IPV6, -1,
       "00000000000000000000EEEEC0A80001"},
      {"[2001::192.168.0.1]", L"[2001::192.168.0.1]", "[2001::c0a8:1]",
       Component(0, 14), CanonHostInfo::IPV6, -1,
       "200100000000000000000000C0A80001"},
      {"[1:2:192.168.0.1:5:6]", L"[1:2:192.168.0.1:5:6]", "", Component(),
       CanonHostInfo::BROKEN, -1, ""},

      // Embedded IPv4 addresses with a number of components other than four
      // are BROKEN. The expected text in these rows is not checked, because
      // the loop below only compares it for IPV6 results.
      {"[::ffff:192.1.2]", L"[::ffff:192.1.2]", "[::ffff:c001:2]", Component(),
       CanonHostInfo::BROKEN, -1, ""},
      {"[::ffff:192.1]", L"[::ffff:192.1]", "[::ffff:c000:1]", Component(),
       CanonHostInfo::BROKEN, -1, ""},
      {"[::ffff:192.1.2.3.4]", L"[::ffff:192.1.2.3.4]", "", Component(),
       CanonHostInfo::BROKEN, -1, ""},

      // IPv4 using hex.
      // TODO(eroman): Should this format be disallowed?
      {"[::ffff:0xC0.0Xa8.0x0.0x1]", L"[::ffff:0xC0.0Xa8.0x0.0x1]",
       "[::ffff:c0a8:1]", Component(0, 15), CanonHostInfo::IPV6, -1,
       "00000000000000000000FFFFC0A80001"},

      // There may be zeros surrounding the "::" contraction.
      {"[0:0::0:0:8]", L"[0:0::0:0:8]", "[::8]", Component(0, 5),
       CanonHostInfo::IPV6, -1, "00000000000000000000000000000008"},

      {"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", Component(0, 13),
       CanonHostInfo::IPV6, -1, "20010DB8000000000000000000000001"},

      // Can only have one "::" contraction in an IPv6 string literal.
      {"[2001::db8::1]", L"[2001::db8::1]", "", Component(),
       CanonHostInfo::BROKEN, -1, ""},
      // No more than 2 consecutive ':'s.
      {"[2001:db8:::1]", L"[2001:db8:::1]", "", Component(),
       CanonHostInfo::BROKEN, -1, ""},
      {"[:::]", L"[:::]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
      // Non-IP addresses due to invalid characters.
      {"[2001::.com]", L"[2001::.com]", "", Component(), CanonHostInfo::BROKEN,
       -1, ""},
      // If there are not enough components, the last one should fill them out.
      // ... omitted at this time ...
      // Too many components means not an IP address. Similarly, with too few
      // if using IPv4 compat or mapped addresses.
      {"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", Component(),
       CanonHostInfo::BROKEN, -1, ""},
      {"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", Component(),
       CanonHostInfo::BROKEN, -1, ""},
      {"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", Component(),
       CanonHostInfo::BROKEN, -1, ""},
      // Too many bits (even though 8 components, the last one holds 32 bits).
      {"[0:0:0:0:0:0:0:192.168.0.1]", L"[0:0:0:0:0:0:0:192.168.0.1]", "",
       Component(), CanonHostInfo::BROKEN, -1, ""},

      // Too many bits specified -- the contraction would have to be zero-length
      // to not exceed 128 bits.
      {"[1:2:3:4:5:6::192.168.0.1]", L"[1:2:3:4:5:6::192.168.0.1]", "",
       Component(), CanonHostInfo::BROKEN, -1, ""},

      // The contraction is for 16 bits of zero.
      {"[1:2:3:4:5:6::8]", L"[1:2:3:4:5:6::8]", "[1:2:3:4:5:6:0:8]",
       Component(0, 17), CanonHostInfo::IPV6, -1,
       "00010002000300040005000600000008"},

      // Cannot have a trailing colon.
      {"[1:2:3:4:5:6:7:8:]", L"[1:2:3:4:5:6:7:8:]", "", Component(),
       CanonHostInfo::BROKEN, -1, ""},
      {"[1:2:3:4:5:6:192.168.0.1:]", L"[1:2:3:4:5:6:192.168.0.1:]", "",
       Component(), CanonHostInfo::BROKEN, -1, ""},

      // Cannot have negative numbers.
      {"[-1:2:3:4:5:6:7:8]", L"[-1:2:3:4:5:6:7:8]", "", Component(),
       CanonHostInfo::BROKEN, -1, ""},

      // Scope IDs (a "%" <scope_id> suffix inside the brackets) are not
      // allowed: every form below, with or without an ID after the percent
      // sign, is expected to be BROKEN.
      {"[1::%1]", L"[1::%1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[1::%eth0]", L"[1::%eth0]", "", Component(), CanonHostInfo::BROKEN, -1,
       ""},
      {"[1::%]", L"[1::%]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[%]", L"[%]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[::%:]", L"[::%:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},

      // Don't allow leading or trailing colons.
      {"[:0:0::0:0:8]", L"[:0:0::0:0:8]", "", Component(),
       CanonHostInfo::BROKEN, -1, ""},
      {"[0:0::0:0:8:]", L"[0:0::0:0:8:]", "", Component(),
       CanonHostInfo::BROKEN, -1, ""},
      {"[:0:0::0:0:8:]", L"[:0:0::0:0:8:]", "", Component(),
       CanonHostInfo::BROKEN, -1, ""},

      // We allow a single trailing dot.
      // ... omitted at this time ...
      // Two dots in a row means not an IP address.
      {"[::192.168..1]", L"[::192.168..1]", "", Component(),
       CanonHostInfo::BROKEN, -1, ""},
      // Any non-first components get truncated to one byte.
      // ... omitted at this time ...
      // Spaces should be rejected.
      {"[::1 hello]", L"[::1 hello]", "", Component(), CanonHostInfo::BROKEN,
       -1, ""},
  };

  for (size_t i = 0; i < std::size(cases); i++) {
    const IPAddressCase& test_case = cases[i];
    // Attach the row index and input to every failure in this iteration, so
    // that any failing expectation identifies the row it belongs to.
    SCOPED_TRACE(testing::Message()
                 << "iter " << i << " host " << test_case.input8);

    // 8-bit version.
    Component component(0, static_cast<int>(strlen(test_case.input8)));

    std::string out_str1;
    StdStringCanonOutput output1(&out_str1);
    CanonHostInfo host_info;
    CanonicalizeIPAddress(test_case.input8, component, &output1, &host_info);
    output1.Complete();

    EXPECT_EQ(test_case.expected_family, host_info.family);
    EXPECT_EQ(test_case.expected_address_hex,
              base::HexEncode(host_info.address,
                              static_cast<size_t>(host_info.AddressLength())));
    // Canonical text and component are only meaningful for IPv6 results.
    if (host_info.family == CanonHostInfo::IPV6) {
      EXPECT_STREQ(test_case.expected, out_str1.c_str());
      EXPECT_EQ(test_case.expected_component.begin, host_info.out_host.begin);
      EXPECT_EQ(test_case.expected_component.len, host_info.out_host.len);
    }

    // 16-bit version.
    std::u16string input16(
        test_utils::TruncateWStringToUTF16(test_case.input16));
    component = Component(0, static_cast<int>(input16.length()));

    std::string out_str2;
    StdStringCanonOutput output2(&out_str2);
    CanonicalizeIPAddress(input16.c_str(), component, &output2, &host_info);
    output2.Complete();

    EXPECT_EQ(test_case.expected_family, host_info.family);
    EXPECT_EQ(test_case.expected_address_hex,
              base::HexEncode(host_info.address,
                              static_cast<size_t>(host_info.AddressLength())));
    if (host_info.family == CanonHostInfo::IPV6) {
      EXPECT_STREQ(test_case.expected, out_str2.c_str());
      EXPECT_EQ(test_case.expected_component.begin, host_info.out_host.begin);
      EXPECT_EQ(test_case.expected_component.len, host_info.out_host.len);
    }
  }
}
1170
// CanonicalizeIPAddress() must not report an IP address when the host
// component it is given is empty, even though the underlying buffer holds a
// valid IPv4 literal.
TEST(URLCanonTest, IPEmpty) {
  const char kSpec[] = "192.168.0.1";

  // A default-constructed component first, then an explicit zero-length one.
  const Component kEmptyRanges[] = {Component(), Component(0, 0)};

  // The same output and host info objects are reused for both calls.
  std::string canonical;
  StdStringCanonOutput sink(&canonical);
  CanonHostInfo info;

  for (const Component& empty_range : kEmptyRanges) {
    CanonicalizeIPAddress(kSpec, empty_range, &sink, &info);
    EXPECT_FALSE(info.IsIPAddress());
  }
}
1184
1185 // Verifies that CanonicalizeHostSubstring produces the expected output and
1186 // does not "fix" IP addresses. Because this code is a subset of
1187 // CanonicalizeHost, the shared functionality is not tested.
TEST(URLCanonTest,CanonicalizeHostSubstring)1188 TEST(URLCanonTest, CanonicalizeHostSubstring) {
1189 // Basic sanity check.
1190 {
1191 std::string out_str;
1192 StdStringCanonOutput output(&out_str);
1193 EXPECT_TRUE(CanonicalizeHostSubstring("M\xc3\x9cNCHEN.com",
1194 Component(0, 12), &output));
1195 output.Complete();
1196 EXPECT_EQ("xn--mnchen-3ya.com", out_str);
1197 }
1198
1199 // Failure case.
1200 {
1201 std::string out_str;
1202 StdStringCanonOutput output(&out_str);
1203 EXPECT_FALSE(CanonicalizeHostSubstring(
1204 test_utils::TruncateWStringToUTF16(L"\xfdd0zyx.com").c_str(),
1205 Component(0, 8), &output));
1206 output.Complete();
1207 EXPECT_EQ("%EF%B7%90zyx.com", out_str);
1208 }
1209
1210 // Should return true for empty input strings.
1211 {
1212 std::string out_str;
1213 StdStringCanonOutput output(&out_str);
1214 EXPECT_TRUE(CanonicalizeHostSubstring("", Component(0, 0), &output));
1215 output.Complete();
1216 EXPECT_EQ(std::string(), out_str);
1217 }
1218
1219 // Numbers that look like IP addresses should not be changed.
1220 {
1221 std::string out_str;
1222 StdStringCanonOutput output(&out_str);
1223 EXPECT_TRUE(
1224 CanonicalizeHostSubstring("01.02.03.04", Component(0, 11), &output));
1225 output.Complete();
1226 EXPECT_EQ("01.02.03.04", out_str);
1227 }
1228 }
1229
// Tests CanonicalizeUserInfo() for both 8-bit and 16-bit input. Each case is
// a complete URL that is first parsed with ParseStandardURL() to obtain the
// username and password components; only those two components are then
// canonicalized, so the host part of each URL is not examined by this test.
TEST(URLCanonTest, UserInfo) {
  // Note that the canonicalizer should escape and treat empty components as
  // not being there.

  // We actually parse a full input URL so we can get the initial components.
  struct UserComponentCase {
    const char* input;
    const char* expected;
    Component expected_username;
    Component expected_password;
    bool expected_success;
  } user_info_cases[] = {
      {"http://user:pass@host.com/", "user:pass@", Component(0, 4),
       Component(5, 4), true},
      {"http://@host.com/", "", Component(0, -1), Component(0, -1), true},
      {"http://:@host.com/", "", Component(0, -1), Component(0, -1), true},
      {"http://foo:@host.com/", "foo@", Component(0, 3), Component(0, -1),
       true},
      {"http://:foo@host.com/", ":foo@", Component(0, 0), Component(1, 3),
       true},
      {"http://^ :$\t@host.com/", "%5E%20:$%09@", Component(0, 6),
       Component(7, 4), true},
      {"http://user:pass@/", "user:pass@", Component(0, 4), Component(5, 4),
       true},
      {"http://%2540:bar@domain.com/", "%2540:bar@", Component(0, 5),
       Component(6, 3), true},

      // IE7 compatibility: old versions allowed backslashes in usernames, but
      // IE7 does not. We disallow it as well.
      {"ftp://me\\mydomain:pass@foo.com/", "", Component(0, -1),
       Component(0, -1), true},
  };

  for (const auto& user_info_case : user_info_cases) {
    // Identify the failing input in every expectation below.
    SCOPED_TRACE(user_info_case.input);

    int url_len = static_cast<int>(strlen(user_info_case.input));
    Parsed parsed;
    ParseStandardURL(user_info_case.input, url_len, &parsed);
    Component out_user, out_pass;
    std::string out_str;
    StdStringCanonOutput output1(&out_str);

    bool success = CanonicalizeUserInfo(user_info_case.input, parsed.username,
                                        user_info_case.input, parsed.password,
                                        &output1, &out_user, &out_pass);
    output1.Complete();

    EXPECT_EQ(user_info_case.expected_success, success);
    EXPECT_EQ(user_info_case.expected, out_str);
    EXPECT_EQ(user_info_case.expected_username.begin, out_user.begin);
    EXPECT_EQ(user_info_case.expected_username.len, out_user.len);
    EXPECT_EQ(user_info_case.expected_password.begin, out_pass.begin);
    EXPECT_EQ(user_info_case.expected_password.len, out_pass.len);

    // Now try the wide version. The components parsed from the 8-bit input
    // are reused; this is valid because every input above is pure ASCII, so
    // the UTF-16 string has the same offsets.
    out_str.clear();
    StdStringCanonOutput output2(&out_str);
    std::u16string wide_input(base::UTF8ToUTF16(user_info_case.input));
    success = CanonicalizeUserInfo(wide_input.c_str(), parsed.username,
                                   wide_input.c_str(), parsed.password,
                                   &output2, &out_user, &out_pass);
    output2.Complete();

    EXPECT_EQ(user_info_case.expected_success, success);
    EXPECT_EQ(user_info_case.expected, out_str);
    EXPECT_EQ(user_info_case.expected_username.begin, out_user.begin);
    EXPECT_EQ(user_info_case.expected_username.len, out_user.len);
    EXPECT_EQ(user_info_case.expected_password.begin, out_pass.begin);
    EXPECT_EQ(user_info_case.expected_password.len, out_pass.len);
  }
}
1297
// Tests CanonicalizePort() for both 8-bit and 16-bit input.
//
// Only the emission of the number into the output buffer is checked here;
// scanning the number is covered by the parser unit tests. When a port is
// written, the expected output starts with ':' and the expected component
// begins just after it.
TEST(URLCanonTest, Port) {
  struct PortCase {
    const char* input;
    int default_port;
    const char* expected;
    Component expected_component;
    bool expected_success;
  };
  const PortCase kPortCases[] = {
      // Invalid input should be copied w/ failure.
      {"as df", 80, ":as%20df", Component(1, 7), false},
      {"-2", 80, ":-2", Component(1, 2), false},
      // Default port should be omitted.
      {"80", 80, "", Component(0, -1), true},
      {"8080", 80, ":8080", Component(1, 4), true},
      // PORT_UNSPECIFIED should mean always keep the port.
      {"80", PORT_UNSPECIFIED, ":80", Component(1, 2), true},
  };

  for (const PortCase& pc : kPortCases) {
    // The same input range serves both passes; it is measured on the 8-bit
    // string.
    const int input_len = static_cast<int>(strlen(pc.input));
    Component port_range(0, input_len);
    Component canon_range;
    std::string canonical;

    // Narrow (8-bit) pass.
    StdStringCanonOutput sink8(&canonical);
    bool ok = CanonicalizePort(pc.input, port_range, pc.default_port, &sink8,
                               &canon_range);
    sink8.Complete();

    EXPECT_EQ(pc.expected_success, ok);
    EXPECT_EQ(pc.expected, canonical);
    EXPECT_EQ(pc.expected_component.begin, canon_range.begin);
    EXPECT_EQ(pc.expected_component.len, canon_range.len);

    // Wide (16-bit) pass, writing into the same (cleared) string.
    canonical.clear();
    StdStringCanonOutput sink16(&canonical);
    const std::u16string wide_port(base::UTF8ToUTF16(pc.input));
    ok = CanonicalizePort(wide_port.c_str(), port_range, pc.default_port,
                          &sink16, &canon_range);
    sink16.Complete();

    EXPECT_EQ(pc.expected_success, ok);
    EXPECT_EQ(pc.expected, canonical);
    EXPECT_EQ(pc.expected_component.begin, canon_range.begin);
    EXPECT_EQ(pc.expected_component.len, canon_range.len);
  }
}
1350
1351 DualComponentCase kCommonPathCases[] = {
1352 // ----- path collapsing tests -----
1353 {"/././foo", L"/././foo", "/foo", Component(0, 4), true},
1354 {"/./.foo", L"/./.foo", "/.foo", Component(0, 5), true},
1355 {"/foo/.", L"/foo/.", "/foo/", Component(0, 5), true},
1356 {"/foo/./", L"/foo/./", "/foo/", Component(0, 5), true},
1357 // double dots followed by a slash or the end of the string count
1358 {"/foo/bar/..", L"/foo/bar/..", "/foo/", Component(0, 5), true},
1359 {"/foo/bar/../", L"/foo/bar/../", "/foo/", Component(0, 5), true},
1360 // don't count double dots when they aren't followed by a slash
1361 {"/foo/..bar", L"/foo/..bar", "/foo/..bar", Component(0, 10), true},
1362 // some in the middle
1363 {"/foo/bar/../ton", L"/foo/bar/../ton", "/foo/ton", Component(0, 8), true},
1364 {"/foo/bar/../ton/../../a", L"/foo/bar/../ton/../../a", "/a",
1365 Component(0, 2), true},
1366 // we should not be able to go above the root
1367 {"/foo/../../..", L"/foo/../../..", "/", Component(0, 1), true},
1368 {"/foo/../../../ton", L"/foo/../../../ton", "/ton", Component(0, 4), true},
1369 // escaped dots should be unescaped and treated the same as dots
1370 {"/foo/%2e", L"/foo/%2e", "/foo/", Component(0, 5), true},
1371 {"/foo/%2e%2", L"/foo/%2e%2", "/foo/.%2", Component(0, 8), true},
1372 {"/foo/%2e./%2e%2e/.%2e/%2e.bar", L"/foo/%2e./%2e%2e/.%2e/%2e.bar",
1373 "/..bar", Component(0, 6), true},
1374 // Multiple slashes in a row should be preserved and treated like empty
1375 // directory names.
1376 {"////../..", L"////../..", "//", Component(0, 2), true},
1377
1378 // ----- escaping tests -----
1379 {"/foo", L"/foo", "/foo", Component(0, 4), true},
1380 // Valid escape sequence
1381 {"/%20foo", L"/%20foo", "/%20foo", Component(0, 7), true},
1382 // Invalid escape sequence we should pass through unchanged.
1383 {"/foo%", L"/foo%", "/foo%", Component(0, 5), true},
1384 {"/foo%2", L"/foo%2", "/foo%2", Component(0, 6), true},
1385 // Invalid escape sequence: bad characters should be treated the same as
1386 // the surrounding text, not as escaped (in this case, UTF-8).
1387 {"/foo%2zbar", L"/foo%2zbar", "/foo%2zbar", Component(0, 10), true},
1388 {"/foo%2\xc2\xa9zbar", nullptr, "/foo%2%C2%A9zbar", Component(0, 16), true},
1389 {nullptr, L"/foo%2\xc2\xa9zbar", "/foo%2%C3%82%C2%A9zbar", Component(0, 22),
1390 true},
1391 // Regular characters that are escaped should remain escaped
1392 {"/foo%41%7a", L"/foo%41%7a", "/foo%41%7a", Component(0, 10), true},
1393 // Funny characters that are unescaped should be escaped
1394 {"/foo\x09\x91%91", nullptr, "/foo%09%91%91", Component(0, 13), true},
1395 {nullptr, L"/foo\x09\x91%91", "/foo%09%C2%91%91", Component(0, 16), true},
1396 // %00 should not cause failures.
1397 {"/foo%00%51", L"/foo%00%51", "/foo%00%51", Component(0, 10), true},
1398 // Some characters should be passed through unchanged regardless of esc.
1399 {"/(%28:%3A%29)", L"/(%28:%3A%29)", "/(%28:%3A%29)", Component(0, 13),
1400 true},
1401 // Characters that are properly escaped should not have the case changed
1402 // of hex letters.
1403 {"/%3A%3a%3C%3c", L"/%3A%3a%3C%3c", "/%3A%3a%3C%3c", Component(0, 13),
1404 true},
1405 // Funny characters that are unescaped should be escaped
1406 {"/foo\tbar", L"/foo\tbar", "/foo%09bar", Component(0, 10), true},
1407 // Hashes found in paths (possibly only when the caller explicitly sets
1408 // the path on an already-parsed URL) should be escaped.
1409 {"/foo#bar", L"/foo#bar", "/foo%23bar", Component(0, 10), true},
1410 // %7f should be allowed and %3D should not be unescaped (these were wrong
1411 // in a previous version).
1412 {"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd",
1413 "/%7Ffp3%3Eju%3Dduvgw%3Dd", Component(0, 24), true},
1414 // @ should be passed through unchanged (escaped or unescaped).
1415 {"/@asdf%40", L"/@asdf%40", "/@asdf%40", Component(0, 9), true},
1416 // Nested escape sequences no longer happen. See https://crbug.com/1252531.
1417 {"/%A%42", L"/%A%42", "/%A%42", Component(0, 6), true},
1418 {"/%%41B", L"/%%41B", "/%%41B", Component(0, 6), true},
1419 {"/%%41%42", L"/%%41%42", "/%%41%42", Component(0, 8), true},
1420 // Make sure truncated "nested" escapes don't result in reading off the
1421 // string end.
1422 {"/%%41", L"/%%41", "/%%41", Component(0, 5), true},
1423 // Don't unescape the leading '%' if unescaping doesn't result in a valid
1424 // new escape sequence.
1425 {"/%%470", L"/%%470", "/%%470", Component(0, 6), true},
1426 {"/%%2D%41", L"/%%2D%41", "/%%2D%41", Component(0, 8), true},
1427 // Don't erroneously downcast a UTF-16 character in a way that makes it
1428 // look like part of an escape sequence.
1429 {nullptr, L"/%%41\x0130", "/%%41%C4%B0", Component(0, 11), true},
1430
1431 // ----- encoding tests -----
1432 // Basic conversions
1433 {"/\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd",
1434 L"/\x4f60\x597d\x4f60\x597d", "/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD",
1435 Component(0, 37), true},
1436 // Unicode Noncharacter (U+FDD0) should not fail.
1437 {"/\xef\xb7\x90zyx", nullptr, "/%EF%B7%90zyx", Component(0, 13), true},
1438 {nullptr, L"/\xfdd0zyx", "/%EF%B7%90zyx", Component(0, 13), true},
1439 };
1440
1441 typedef bool (*CanonFunc8Bit)(const char*,
1442 const Component&,
1443 CanonOutput*,
1444 Component*);
1445 typedef bool (*CanonFunc16Bit)(const char16_t*,
1446 const Component&,
1447 CanonOutput*,
1448 Component*);
1449
DoPathTest(const DualComponentCase * path_cases,size_t num_cases,CanonFunc8Bit canon_func_8,CanonFunc16Bit canon_func_16)1450 void DoPathTest(const DualComponentCase* path_cases,
1451 size_t num_cases,
1452 CanonFunc8Bit canon_func_8,
1453 CanonFunc16Bit canon_func_16) {
1454 for (size_t i = 0; i < num_cases; i++) {
1455 testing::Message scope_message;
1456 scope_message << path_cases[i].input8 << "," << path_cases[i].input16;
1457 SCOPED_TRACE(scope_message);
1458 if (path_cases[i].input8) {
1459 int len = static_cast<int>(strlen(path_cases[i].input8));
1460 Component in_comp(0, len);
1461 Component out_comp;
1462 std::string out_str;
1463 StdStringCanonOutput output(&out_str);
1464 bool success =
1465 canon_func_8(path_cases[i].input8, in_comp, &output, &out_comp);
1466 output.Complete();
1467
1468 EXPECT_EQ(path_cases[i].expected_success, success);
1469 EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin);
1470 EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len);
1471 EXPECT_EQ(path_cases[i].expected, out_str);
1472 }
1473
1474 if (path_cases[i].input16) {
1475 std::u16string input16(
1476 test_utils::TruncateWStringToUTF16(path_cases[i].input16));
1477 int len = static_cast<int>(input16.length());
1478 Component in_comp(0, len);
1479 Component out_comp;
1480 std::string out_str;
1481 StdStringCanonOutput output(&out_str);
1482
1483 bool success =
1484 canon_func_16(input16.c_str(), in_comp, &output, &out_comp);
1485 output.Complete();
1486
1487 EXPECT_EQ(path_cases[i].expected_success, success);
1488 EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin);
1489 EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len);
1490 EXPECT_EQ(path_cases[i].expected, out_str);
1491 }
1492 }
1493 }
1494
// Exercises CanonicalizeSpecialPath() with the shared path cases, with an
// input containing an embedded NUL, and with cases that only apply to special
// URLs.
TEST(URLCanonTest, SpecialPath) {
  // Common test cases
  DoPathTest(kCommonPathCases, std::size(kCommonPathCases),
             CanonicalizeSpecialPath, CanonicalizeSpecialPath);

  // Manual test: embedded NULLs should be escaped and the URL should be marked
  // as valid. The length is given explicitly because strlen() would stop at
  // the embedded NUL.
  const char path_with_null[] = "/ab\0c";
  Component in_comp(0, 5);
  Component out_comp;

  std::string out_str;
  StdStringCanonOutput output(&out_str);
  bool success =
      CanonicalizeSpecialPath(path_with_null, in_comp, &output, &out_comp);
  output.Complete();
  EXPECT_TRUE(success);
  EXPECT_EQ("/ab%00c", out_str);

  // Test cases specific to special URLs.
  DualComponentCase special_path_cases[] = {
      // Canonical path for empty path is a slash.
      {"", L"", "/", Component(0, 1), true},
      // Backslashes should be used as path separators.
      {"\\a\\b", L"\\a\\b", "/a/b", Component(0, 4), true},
      {"/a\\..\\b", L"/a\\..\\b", "/b", Component(0, 2), true},
      {"/a\\.\\b", L"/a\\.\\b", "/a/b", Component(0, 4), true},
  };

  // Use the same canonicalizer for the 8-bit and the 16-bit inputs, matching
  // the common-case run above.
  DoPathTest(special_path_cases, std::size(special_path_cases),
             CanonicalizeSpecialPath, CanonicalizeSpecialPath);
}
1527
// Exercises CanonicalizeNonSpecialPath() with the shared path cases and with
// cases whose expectations only hold for non-special URLs.
TEST(URLCanonTest, NonSpecialPath) {
  // Cases that apply only to non-special URLs.
  const DualComponentCase kNonSpecialOnly[] = {
      // An empty path stays empty.
      {"", L"", "", Component(0, 0), true},
      // Backslashes are kept verbatim; they do not act as separators.
      {"/a\\..\\b", L"/a\\..\\b", "/a\\..\\b", Component(0, 7), true},
      {"/a\\./b", L"/a\\./b", "/a\\./b", Component(0, 6), true},
  };

  // Shared cases first, then the non-special ones.
  DoPathTest(kCommonPathCases, std::size(kCommonPathCases),
             CanonicalizeNonSpecialPath, CanonicalizeNonSpecialPath);
  DoPathTest(kNonSpecialOnly, std::size(kNonSpecialOnly),
             CanonicalizeNonSpecialPath, CanonicalizeNonSpecialPath);
}
1545
// Exercises CanonicalizePartialPath() with the shared path cases and with two
// inputs that do not start with a slash.
TEST(URLCanonTest, PartialPath) {
  // Shared cases.
  DoPathTest(kCommonPathCases, std::size(kCommonPathCases),
             CanonicalizePartialPath, CanonicalizePartialPath);

  // Inputs without a leading slash come back unchanged.
  const DualComponentCase kPartialOnly[] = {
      {".html", L".html", ".html", Component(0, 5), true},
      {"", L"", "", Component(0, 0), true},
  };
  DoPathTest(kPartialOnly, std::size(kPartialOnly), CanonicalizePartialPath,
             CanonicalizePartialPath);
}
1557
// Checks the string produced by CanonicalizeQuery() for 8-bit and 16-bit
// inputs, always passing a null converter. Only the output string is verified.
TEST(URLCanonTest, Query) {
  struct QueryCase {
    const char* input8;
    const wchar_t* input16;
    const char* expected;
  };
  const QueryCase kQueryCases[] = {
      // Regular ASCII case.
      {"foo=bar", L"foo=bar", "?foo=bar"},
      // Allow question marks in the query without escaping
      {"as?df", L"as?df", "?as?df"},
      // Always escape '#' since it would mark the ref.
      {"as#df", L"as#df", "?as%23df"},
      // Escape some questionable 8-bit characters, but never unescape.
      {"\x02hello\x7f bye", L"\x02hello\x7f bye", "?%02hello%7F%20bye"},
      {"%40%41123", L"%40%41123", "?%40%41123"},
      // Chinese input/output
      {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d",
       "?q=%E4%BD%A0%E5%A5%BD"},
      // Invalid UTF-8/16 input should be replaced with the replacement
      // character (U+FFFD, escaped as %EF%BF%BD).
      {"q=\xed\xed", L"q=\xd800\xd800", "?q=%EF%BF%BD%EF%BF%BD"},
      // Don't allow < or > because sometimes they are used for XSS if the
      // URL is echoed in content. Firefox does this, IE doesn't.
      {"q=<asdf>", L"q=<asdf>", "?q=%3Casdf%3E"},
      // Escape double quotemarks in the query.
      {"q=\"asdf\"", L"q=\"asdf\"", "?q=%22asdf%22"},
  };

  for (const QueryCase& test_case : kQueryCases) {
    // 8-bit input.
    if (test_case.input8 != nullptr) {
      std::string canonical;
      StdStringCanonOutput output(&canonical);
      Component canonical_range;
      const Component whole_input(
          0, static_cast<int>(strlen(test_case.input8)));

      CanonicalizeQuery(test_case.input8, whole_input, nullptr, &output,
                        &canonical_range);
      output.Complete();

      EXPECT_EQ(test_case.expected, canonical);
    }

    // 16-bit input.
    if (test_case.input16 != nullptr) {
      const std::u16string wide_input(
          test_utils::TruncateWStringToUTF16(test_case.input16));
      std::string canonical;
      StdStringCanonOutput output(&canonical);
      Component canonical_range;
      const Component whole_input(0, static_cast<int>(wide_input.length()));

      CanonicalizeQuery(wide_input.c_str(), whole_input, nullptr, &output,
                        &canonical_range);
      output.Complete();

      EXPECT_EQ(test_case.expected, canonical);
    }
  }

  // Extra test for input with an embedded NULL; the explicit length of 5
  // covers the bytes after it.
  std::string canonical_with_null;
  StdStringCanonOutput null_output(&canonical_with_null);
  Component null_range;
  CanonicalizeQuery("a \x00z\x01", Component(0, 5), nullptr, &null_output,
                    &null_range);
  null_output.Complete();
  EXPECT_EQ("?a%20%00z%01", canonical_with_null);
}
1624
// Checks the output string and output component produced by CanonicalizeRef()
// for 8-bit and 16-bit inputs. The |expected_success| field of the cases is
// not consulted here.
TEST(URLCanonTest, Ref) {
  // Refs are trivial, it just checks the encoding.
  const DualComponentCase kRefCases[] = {
      {"hello!", L"hello!", "#hello!", Component(1, 6), true},
      // We should escape spaces, double-quotes, angled braces, and backticks.
      {"hello, world", L"hello, world", "#hello,%20world", Component(1, 14),
       true},
      {"hello,\"world", L"hello,\"world", "#hello,%22world", Component(1, 14),
       true},
      {"hello,<world", L"hello,<world", "#hello,%3Cworld", Component(1, 14),
       true},
      {"hello,>world", L"hello,>world", "#hello,%3Eworld", Component(1, 14),
       true},
      {"hello,`world", L"hello,`world", "#hello,%60world", Component(1, 14),
       true},
      // UTF-8/wide input should be preserved
      {"\xc2\xa9", L"\xa9", "#%C2%A9", Component(1, 6), true},
      // Test a character that takes > 16 bits (U+10300 = old italic letter A)
      {"\xF0\x90\x8C\x80ss", L"\xd800\xdf00ss", "#%F0%90%8C%80ss",
       Component(1, 14), true},
      // Escaping should be preserved unchanged, even invalid ones
      {"%41%a", L"%41%a", "#%41%a", Component(1, 5), true},
      // Invalid UTF-8/16 input should be flagged and the input made valid
      {"\xc2", nullptr, "#%EF%BF%BD", Component(1, 9), true},
      {nullptr, L"\xd800\x597d", "#%EF%BF%BD%E5%A5%BD", Component(1, 18), true},
      // Test a Unicode invalid character.
      {"a\xef\xb7\x90", L"a\xfdd0", "#a%EF%B7%90", Component(1, 10), true},
      // Refs can have # signs and we should preserve them.
      {"asdf#qwer", L"asdf#qwer", "#asdf#qwer", Component(1, 9), true},
      {"#asdf", L"#asdf", "##asdf", Component(1, 5), true},
  };

  for (const DualComponentCase& test_case : kRefCases) {
    // 8-bit input
    if (test_case.input8 != nullptr) {
      std::string canonical;
      StdStringCanonOutput output(&canonical);
      Component canonical_range;
      const Component whole_input(
          0, static_cast<int>(strlen(test_case.input8)));

      CanonicalizeRef(test_case.input8, whole_input, &output,
                      &canonical_range);
      output.Complete();

      EXPECT_EQ(test_case.expected_component.begin, canonical_range.begin);
      EXPECT_EQ(test_case.expected_component.len, canonical_range.len);
      EXPECT_EQ(test_case.expected, canonical);
    }

    // 16-bit input
    if (test_case.input16 != nullptr) {
      const std::u16string wide_input(
          test_utils::TruncateWStringToUTF16(test_case.input16));
      std::string canonical;
      StdStringCanonOutput output(&canonical);
      Component canonical_range;
      const Component whole_input(0, static_cast<int>(wide_input.length()));

      CanonicalizeRef(wide_input.c_str(), whole_input, &output,
                      &canonical_range);
      output.Complete();

      EXPECT_EQ(test_case.expected_component.begin, canonical_range.begin);
      EXPECT_EQ(test_case.expected_component.len, canonical_range.len);
      EXPECT_EQ(test_case.expected, canonical);
    }
  }

  // Try one with an embedded NULL. It should be escaped as %00.
  const char null_input[5] = "ab\x00z";
  const Component null_input_component(0, 4);
  std::string canonical_with_null;
  StdStringCanonOutput null_output(&canonical_with_null);
  Component null_range;

  CanonicalizeRef(null_input, null_input_component, &null_output,
                  &null_range);
  null_output.Complete();

  EXPECT_EQ(1, null_range.begin);
  EXPECT_EQ(6, null_range.len);
  EXPECT_EQ("#ab%00z", canonical_with_null);
}
1707
// Parses each input with ParseStandardURL(), canonicalizes it with
// CanonicalizeStandardURL() and checks the success flag and the output string.
TEST(URLCanonTest, CanonicalizeStandardURL) {
  // The individual component canonicalize tests should have caught the cases
  // for each of those components. Here, we just need to test that the various
  // parts are included or excluded properly, and have the correct separators.
  // clang-format off
  struct URLCase {
    const char* input;
    const char* expected;
    bool expected_success;
  } cases[] = {
    {"http://www.google.com/foo?bar=baz#", "http://www.google.com/foo?bar=baz#",
     true},

    // Backslashes should get converted to forward slashes.
    {"http:\\\\www.google.com\\foo", "http://www.google.com/foo", true},

    // Busted refs shouldn't make the whole thing fail.
    {"http://www.google.com/asdf#\xc2",
     "http://www.google.com/asdf#%EF%BF%BD", true},

    // Basic port tests.
    {"http://foo:80/", "http://foo/", true},
    {"http://foo:81/", "http://foo:81/", true},
    {"httpa://foo:80/", "httpa://foo:80/", true},
    {"http://foo:-80/", "http://foo:-80/", false},

    {"https://foo:443/", "https://foo/", true},
    {"https://foo:80/", "https://foo:80/", true},
    {"ftp://foo:21/", "ftp://foo/", true},
    {"ftp://foo:80/", "ftp://foo:80/", true},
    {"gopher://foo:70/", "gopher://foo:70/", true},
    {"gopher://foo:443/", "gopher://foo:443/", true},
    {"ws://foo:80/", "ws://foo/", true},
    {"ws://foo:81/", "ws://foo:81/", true},
    {"ws://foo:443/", "ws://foo:443/", true},
    {"ws://foo:815/", "ws://foo:815/", true},
    {"wss://foo:80/", "wss://foo:80/", true},
    {"wss://foo:81/", "wss://foo:81/", true},
    {"wss://foo:443/", "wss://foo/", true},
    {"wss://foo:815/", "wss://foo:815/", true},

    // This particular code path ends up "backing up" to replace an invalid
    // host ICU generated with an escaped version. Test that in the context
    // of a full URL to make sure the backing up doesn't mess up the non-host
    // parts of the URL. "EF B9 AA" is U+FE6A which is a type of percent that
    // ICU will convert to an ASCII one, generating "%81".
    // The literal is split so that "81" is not absorbed into the preceding
    // \xaa hex escape.
    {"ws:)W\x1eW\xef\xb9\xaa"
     "81:80/",
     "ws://)w%1ew%81/", false},
    // Regression test for the last_invalid_percent_index bug described in
    // https://crbug.com/1080890#c10.
    {R"(HTTP:S/5%\../>%41)", "http://s/%3E%41", true},
  };
  // clang-format on

  for (const auto& test_case : cases) {
    // Identify the failing input in the test log.
    SCOPED_TRACE(test_case.input);

    const int url_len = static_cast<int>(strlen(test_case.input));
    Parsed parsed;
    ParseStandardURL(test_case.input, url_len, &parsed);

    Parsed out_parsed;
    std::string out_str;
    StdStringCanonOutput output(&out_str);
    const bool success = CanonicalizeStandardURL(
        test_case.input, parsed, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION,
        nullptr, &output, &out_parsed);
    output.Complete();

    EXPECT_EQ(test_case.expected_success, success);
    EXPECT_EQ(test_case.expected, out_str);
  }
}
1780
// Parses each input with ParseNonSpecialURL(), canonicalizes it with
// CanonicalizeNonSpecialURL() and checks the success flag and output string.
TEST(URLCanonTest, CanonicalizeNonSpecialURL) {
  // The individual component canonicalize tests should have caught the cases
  // for each of those components. Here, we just need to test that the various
  // parts are included or excluded properly, and have the correct separators.
  struct URLCase {
    const std::string_view input;
    const std::string_view expected;
    bool expected_success;
  };
  const URLCase kCases[] = {
      // Basic cases.
      {"git://host:80/path?a=b#ref", "git://host:80/path?a=b#ref", true},
      {"git://host", "git://host", true},
      {"git://host/", "git://host/", true},
      {"git://HosT/", "git://HosT/", true},
      {"git://..", "git://..", true},
      {"git://../", "git://../", true},
      {"git://../..", "git://../", true},

      // Empty hosts.
      {"git://", "git://", true},
      {"git:///", "git:///", true},
      {"git:////", "git:////", true},
      {"git:///a", "git:///a", true},
      {"git:///a/../b", "git:///b", true},
      {"git:///..", "git:///", true},

      // No hosts.
      {"git:/", "git:/", true},
      {"git:/a", "git:/a", true},
      {"git:/a/../b", "git:/b", true},
      {"git:/..", "git:/", true},
      {"git:/../", "git:/", true},
      {"git:/../..", "git:/", true},
      {"git:/.//a", "git:/.//a", true},

      // Users.
      {"git://@host", "git://host", true},
      {"git:// @host", "git://%20@host", true},
      {"git://\\@host", "git://%5C@host", true},

      // Paths.
      {"git://host/path", "git://host/path", true},
      {"git://host/p ath", "git://host/p%20ath", true},
      {"git://host/a/../b", "git://host/b", true},
      {"git://host/..", "git://host/", true},
      {"git://host/../", "git://host/", true},
      {"git://host/../..", "git://host/", true},
      {"git://host/.", "git://host/", true},
      {"git://host/./", "git://host/", true},
      {"git://host/./.", "git://host/", true},
      // Backslashes.
      {"git://host/a\\..\\b", "git://host/a\\..\\b", true},

      // IPv6.
      {"git://[1:2:0:0:5:0:0:0]", "git://[1:2:0:0:5::]", true},
      {"git://[1:2:0:0:5:0:0:0]/", "git://[1:2:0:0:5::]/", true},
      {"git://[1:2:0:0:5:0:0:0]/path", "git://[1:2:0:0:5::]/path", true},

      // IPv4 is unsupported.
      {"git://127.00.0.1", "git://127.00.0.1", true},
      {"git://127.1000.0.1", "git://127.1000.0.1", true},

      // Invalid URLs.
      {"git://@", "git://", false},
      // Forbidden host code points.
      {"git://<", "git://", false},
      {"git:// /", "git:///", false},
      // Backslashes cannot be used as host terminators.
      {"git://host\\a/../b", "git://host/b", false},

      // Opaque paths.
      {"git:", "git:", true},
      {"git:opaque", "git:opaque", true},
      {"git:o p a q u e", "git:o p a q u e", true},
      {"git: <", "git: <", true},
      {"git:opaque/a/../b", "git:opaque/a/../b", true},
      {"git:opaque\\a\\..\\b", "git:opaque\\a\\..\\b", true},
      {"git:\\a", "git:\\a", true},
      // Like URNs.
      {"git:a:b:c:123", "git:a:b:c:123", true},
  };

  for (const URLCase& test_case : kCases) {
    SCOPED_TRACE(test_case.input);

    const std::string_view spec = test_case.input;
    Parsed parsed;
    ParseNonSpecialURL(spec.data(), spec.size(), &parsed);

    std::string canonical;
    StdStringCanonOutput output(&canonical);
    Parsed out_parsed;
    const bool success =
        CanonicalizeNonSpecialURL(spec.data(), spec.size(), parsed,
                                  /*query_converter=*/nullptr, output,
                                  out_parsed);
    output.Complete();

    EXPECT_EQ(success, test_case.expected_success);
    EXPECT_EQ(canonical, test_case.expected);
  }
}
1878
// Verifies the Parsed structure filled in by CanonicalizeNonSpecialURL(); the
// canonical string itself is not inspected.
TEST(URLCanonTest, CanonicalizeNonSpecialURLOutputParsed) {
  // Test that out_parsed is correctly set.
  struct URLCase {
    const std::string_view input;
    // Currently, test only host and length.
    Component expected_output_parsed_host;
    int expected_output_parsed_length;
  };
  const URLCase kCases[] = {
      {"git:", Component(), 4},
      {"git:opaque", Component(), 10},
      {"git:/", Component(), 5},
      {"git://", Component(6, 0), 6},
      {"git:///", Component(6, 0), 7},
      // The length of "[1:2:0:0:5::]" is 13.
      {"git://[1:2:0:0:5:0:0:0]/", Component(6, 13), 20},
  };

  for (const URLCase& test_case : kCases) {
    SCOPED_TRACE(test_case.input);

    const std::string_view spec = test_case.input;
    Parsed parsed;
    ParseNonSpecialURL(spec.data(), spec.size(), &parsed);

    // The output sink is required by the API but its contents are ignored.
    std::string unused_out_str;
    StdStringCanonOutput unused_output(&unused_out_str);
    Parsed out_parsed;
    const bool success =
        CanonicalizeNonSpecialURL(spec.data(), spec.size(), parsed,
                                  /*query_converter=*/nullptr, unused_output,
                                  out_parsed);

    ASSERT_TRUE(success);
    EXPECT_EQ(out_parsed.host, test_case.expected_output_parsed_host);
    EXPECT_EQ(out_parsed.Length(), test_case.expected_output_parsed_length);
  }
}
1911
1912 // The codepath here is the same as for regular canonicalization, so we just
1913 // need to test that things are replaced or not correctly.
// Applies per-component replacements to standard URLs through
// ReplaceStandardURL() and checks the resulting string. A null entry in a
// ReplaceCase is forwarded to SetupReplComp() unchanged, as is kDeleteComp.
TEST(URLCanonTest, ReplaceStandardURL) {
  const ReplaceCase replace_cases[] = {
      // Common case of truncating the path.
      {"http://www.google.com/foo?bar=baz#ref", nullptr, nullptr, nullptr,
       nullptr, nullptr, "/", kDeleteComp, kDeleteComp,
       "http://www.google.com/"},
      // Replace everything
      {"http://a:b@google.com:22/foo;bar?baz@cat", "https", "me", "pw",
       "host.com", "99", "/path", "query", "ref",
       "https://me:pw@host.com:99/path?query#ref"},
      // Replace nothing
      {"http://a:b@google.com:22/foo?baz@cat", nullptr, nullptr, nullptr,
       nullptr, nullptr, nullptr, nullptr, nullptr,
       "http://a:b@google.com:22/foo?baz@cat"},
      // Replace scheme with filesystem. The result is garbage, but you asked
      // for it.
      {"http://a:b@google.com:22/foo?baz@cat", "filesystem", nullptr, nullptr,
       nullptr, nullptr, nullptr, nullptr, nullptr,
       "filesystem://a:b@google.com:22/foo?baz@cat"},
  };

  using R = Replacements<char>;  // Clean up syntax.

  for (const ReplaceCase& cur : replace_cases) {
    // Identify the failing base URL in the test log.
    SCOPED_TRACE(cur.base);

    const int base_len = static_cast<int>(strlen(cur.base));
    Parsed parsed;
    ParseStandardURL(cur.base, base_len, &parsed);

    R r;

    // Note that for the scheme we pass in a different clear function since
    // there is no function to clear the scheme.
    SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
    SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
    SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
    SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
    SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
    SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
    SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
    SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);

    std::string out_str;
    StdStringCanonOutput output(&out_str);
    Parsed out_parsed;
    ReplaceStandardURL(cur.base, parsed, r,
                       SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr,
                       &output, &out_parsed);
    output.Complete();

    EXPECT_EQ(cur.expected, out_str);
  }

  // The path pointer should be ignored if the address is invalid.
  {
    const char src[] = "http://www.google.com/here_is_the_path";
    const int src_len = static_cast<int>(strlen(src));

    Parsed parsed;
    ParseStandardURL(src, src_len, &parsed);

    // Replace the path to 0 length string. By using 1 as the string address,
    // the test should get an access violation if it tries to dereference it.
    Replacements<char> r;
    r.SetPath(reinterpret_cast<char*>(0x00000001), Component(0, 0));
    std::string out_str1;
    StdStringCanonOutput output1(&out_str1);
    Parsed new_parsed;
    ReplaceStandardURL(src, parsed, r,
                       SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr,
                       &output1, &new_parsed);
    output1.Complete();
    EXPECT_STREQ("http://www.google.com/", out_str1.c_str());

    // Same with an "invalid" path.
    r.SetPath(reinterpret_cast<char*>(0x00000001), Component());
    std::string out_str2;
    StdStringCanonOutput output2(&out_str2);
    ReplaceStandardURL(src, parsed, r,
                       SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr,
                       &output2, &new_parsed);
    output2.Complete();
    EXPECT_STREQ("http://www.google.com/", out_str2.c_str());
  }
}
1998
// Applies per-component replacements to file: URLs through ReplaceFileURL()
// and checks the resulting string.
TEST(URLCanonTest, ReplaceFileURL) {
  const ReplaceCase kCases[] = {
      // Replace everything
      {"file:///C:/gaba?query#ref", nullptr, nullptr, nullptr, "filer", nullptr,
       "/foo", "b", "c", "file://filer/foo?b#c"},
      // Replace nothing
      {"file:///C:/gaba?query#ref", nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, nullptr, "file:///C:/gaba?query#ref"},
      {"file:///Y:", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "file:///Y:"},
      {"file:///Y:/", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "file:///Y:/"},
      {"file:///./Y", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "file:///Y"},
      {"file:///./Y:", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "file:///Y:"},
      // Clear non-path components (common)
      {"file:///C:/gaba?query#ref", nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, kDeleteComp, kDeleteComp, "file:///C:/gaba"},
      // Replace path with something that doesn't begin with a slash and make
      // sure it gets added properly.
      {"file:///C:/gaba", nullptr, nullptr, nullptr, nullptr, nullptr,
       "interesting/", nullptr, nullptr, "file:///interesting/"},
      {"file:///home/gaba?query#ref", nullptr, nullptr, nullptr, "filer",
       nullptr, "/foo", "b", "c", "file://filer/foo?b#c"},
      {"file:///home/gaba?query#ref", nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, nullptr, nullptr, "file:///home/gaba?query#ref"},
      {"file:///home/gaba?query#ref", nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, kDeleteComp, kDeleteComp, "file:///home/gaba"},
      {"file:///home/gaba", nullptr, nullptr, nullptr, nullptr, nullptr,
       "interesting/", nullptr, nullptr, "file:///interesting/"},
      // Replace scheme -- shouldn't do anything.
      {"file:///C:/gaba?query#ref", "http", nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, nullptr, "file:///C:/gaba?query#ref"},
  };

  using R = Replacements<char>;  // Clean up syntax.

  for (const ReplaceCase& test_case : kCases) {
    SCOPED_TRACE(test_case.base);

    Parsed parsed;
    ParseFileURL(test_case.base, static_cast<int>(strlen(test_case.base)),
                 &parsed);

    // The scheme has no clear function of its own, so ClearRef is passed in
    // that slot.
    R replacements;
    SetupReplComp(&R::SetScheme, &R::ClearRef, &replacements,
                  test_case.scheme);
    SetupReplComp(&R::SetUsername, &R::ClearUsername, &replacements,
                  test_case.username);
    SetupReplComp(&R::SetPassword, &R::ClearPassword, &replacements,
                  test_case.password);
    SetupReplComp(&R::SetHost, &R::ClearHost, &replacements, test_case.host);
    SetupReplComp(&R::SetPort, &R::ClearPort, &replacements, test_case.port);
    SetupReplComp(&R::SetPath, &R::ClearPath, &replacements, test_case.path);
    SetupReplComp(&R::SetQuery, &R::ClearQuery, &replacements,
                  test_case.query);
    SetupReplComp(&R::SetRef, &R::ClearRef, &replacements, test_case.ref);

    std::string replaced;
    StdStringCanonOutput output(&replaced);
    Parsed out_parsed;
    ReplaceFileURL(test_case.base, parsed, replacements, nullptr, &output,
                   &out_parsed);
    output.Complete();

    EXPECT_EQ(test_case.expected, replaced);
  }
}
2062
// Applies per-component replacements to filesystem: URLs through
// ReplaceFileSystemURL() and checks the resulting string.
TEST(URLCanonTest, ReplaceFileSystemURL) {
  const ReplaceCase replace_cases[] = {
      // Replace everything in the outer URL.
      {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
       nullptr, nullptr, "/foo", "b", "c",
       "filesystem:file:///temporary/foo?b#c"},
      // Replace nothing
      {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
       nullptr, nullptr, nullptr, nullptr, nullptr,
       "filesystem:file:///temporary/gaba?query#ref"},
      // Clear non-path components (common)
      {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
       nullptr, nullptr, nullptr, kDeleteComp, kDeleteComp,
       "filesystem:file:///temporary/gaba"},
      // Replace path with something that doesn't begin with a slash and make
      // sure it gets added properly.
      {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
       nullptr, nullptr, "interesting/", nullptr, nullptr,
       "filesystem:file:///temporary/interesting/?query#ref"},
      // Replace scheme -- shouldn't do anything except canonicalize.
      {"filesystem:http://u:p@bar.com/t/gaba?query#ref", "http", nullptr,
       nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       "filesystem:http://bar.com/t/gaba?query#ref"},
      // Replace username -- shouldn't do anything except canonicalize.
      {"filesystem:http://u:p@bar.com/t/gaba?query#ref", nullptr, "u2", nullptr,
       nullptr, nullptr, nullptr, nullptr, nullptr,
       "filesystem:http://bar.com/t/gaba?query#ref"},
      // Replace password -- shouldn't do anything except canonicalize.
      {"filesystem:http://u:p@bar.com/t/gaba?query#ref", nullptr, nullptr,
       "pw2", nullptr, nullptr, nullptr, nullptr, nullptr,
       "filesystem:http://bar.com/t/gaba?query#ref"},
      // Replace host -- shouldn't do anything except canonicalize.
      {"filesystem:http://u:p@bar.com:80/t/gaba?query#ref", nullptr, nullptr,
       nullptr, "foo.com", nullptr, nullptr, nullptr, nullptr,
       "filesystem:http://bar.com/t/gaba?query#ref"},
      // Replace port -- shouldn't do anything except canonicalize.
      {"filesystem:http://u:p@bar.com:40/t/gaba?query#ref", nullptr, nullptr,
       nullptr, nullptr, "41", nullptr, nullptr, nullptr,
       "filesystem:http://bar.com:40/t/gaba?query#ref"},
  };

  using R = Replacements<char>;  // Clean up syntax.

  for (const ReplaceCase& cur : replace_cases) {
    // Identify the failing base URL in the test log.
    SCOPED_TRACE(cur.base);

    Parsed parsed = ParseFileSystemURL(cur.base);

    // The scheme has no clear function of its own, so ClearRef is passed in
    // that slot.
    R r;
    SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
    SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
    SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
    SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
    SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
    SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
    SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
    SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);

    std::string out_str;
    StdStringCanonOutput output(&out_str);
    Parsed out_parsed;
    ReplaceFileSystemURL(cur.base, parsed, r, nullptr, &output, &out_parsed);
    output.Complete();

    EXPECT_EQ(cur.expected, out_str);
  }
}
2128
// Applies scheme and path replacements to path URLs through ReplacePathURL()
// and checks the resulting string.
TEST(URLCanonTest, ReplacePathURL) {
  const ReplaceCase replace_cases[] = {
      // Replace everything
      {"data:foo", "javascript", nullptr, nullptr, nullptr, nullptr,
       "alert('foo?');", nullptr, nullptr, "javascript:alert('foo?');"},
      // Replace nothing
      {"data:foo", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "data:foo"},
      // Replace one or the other
      {"data:foo", "javascript", nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "javascript:foo"},
      {"data:foo", nullptr, nullptr, nullptr, nullptr, nullptr, "bar", nullptr,
       nullptr, "data:bar"},
      {"data:foo", nullptr, nullptr, nullptr, nullptr, nullptr, kDeleteComp,
       nullptr, nullptr, "data:"},
  };

  using R = Replacements<char>;  // Clean up syntax.

  for (const ReplaceCase& cur : replace_cases) {
    // Identify the failing base URL in the test log.
    SCOPED_TRACE(cur.base);

    const int base_len = static_cast<int>(strlen(cur.base));
    Parsed parsed;
    ParsePathURL(cur.base, base_len, false, &parsed);

    // The scheme has no clear function of its own, so ClearRef is passed in
    // that slot.
    R r;
    SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
    SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
    SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
    SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
    SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
    SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
    SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
    SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);

    std::string out_str;
    StdStringCanonOutput output(&out_str);
    Parsed out_parsed;
    ReplacePathURL(cur.base, parsed, r, &output, &out_parsed);
    output.Complete();

    EXPECT_EQ(cur.expected, out_str);
  }
}
2172
// Exercises ReplaceMailtoURL(): each row supplies a base mailto: URL, optional
// replacement text per component, and the full spec expected afterwards.
//
// The mailbox used in the fixtures must not contain characters that the
// mailto canonicalizer escapes (e.g. spaces), because the "expected" column
// is compared verbatim against the canonical output.
TEST(URLCanonTest, ReplaceMailtoURL) {
  ReplaceCase replace_cases[] = {
      // Replace everything
      {"mailto:jon@foo.com?body=sup", "mailto", nullptr, nullptr, nullptr,
       nullptr, "addr1", "to=tony", nullptr, "mailto:addr1?to=tony"},
      // Replace nothing
      {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, nullptr, nullptr, "mailto:jon@foo.com?body=sup"},
      // Replace the path
      {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
       nullptr, "jason", nullptr, nullptr, "mailto:jason?body=sup"},
      // Replace the query
      {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "custom=1", nullptr, "mailto:jon@foo.com?custom=1"},
      // Replace the path and query
      {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
       nullptr, "jason", "custom=1", nullptr, "mailto:jason?custom=1"},
      // Set the query to empty (should leave trailing question mark)
      {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "", nullptr, "mailto:jon@foo.com?"},
      // Clear the query
      {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "|", nullptr, "mailto:jon@foo.com"},
      // Clear the path
      {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
       nullptr, "|", nullptr, nullptr, "mailto:?body=sup"},
      // Clear the path + query
      {"mailto:", nullptr, nullptr, nullptr, nullptr, nullptr, "|", "|",
       nullptr, "mailto:"},
      // Setting the ref should have no effect
      {"mailto:addr1", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, "BLAH", "mailto:addr1"},
  };

  for (const auto& replace_case : replace_cases) {
    const ReplaceCase& cur = replace_case;
    Parsed parsed = ParseMailtoURL(cur.base);

    // Register one replacement per component slot of the test case.
    Replacements<char> r;
    typedef Replacements<char> R;
    SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
    SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
    SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
    SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
    SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
    SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
    SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
    SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);

    // Apply the replacements and flush the canonical output into |out_str|.
    std::string out_str;
    StdStringCanonOutput output(&out_str);
    Parsed out_parsed;
    ReplaceMailtoURL(cur.base, parsed, r, &output, &out_parsed);
    output.Complete();

    EXPECT_EQ(replace_case.expected, out_str);
  }
}
2231
// Runs CanonicalizeFileURL() over a platform-specific table of inputs and
// checks the success flag, the canonical spec, and the scheme/host/path
// components reported in the output Parsed structure.
TEST(URLCanonTest, CanonicalizeFileURL) {
  struct URLCase {
    const char* input;
    const char* expected;
    bool expected_success;
    Component expected_host;
    Component expected_path;
  };

  const URLCase kCases[] = {
#ifdef _WIN32
      // Windows-style paths
      {"file:c:\\foo\\bar.html", "file:///C:/foo/bar.html", true, Component(),
       Component(7, 16)},
      {"  File:c|////foo\\bar.html", "file:///C:////foo/bar.html", true,
       Component(), Component(7, 19)},
      {"file:", "file:///", true, Component(), Component(7, 1)},
      {"file:UNChost/path", "file://unchost/path", true, Component(7, 7),
       Component(14, 5)},
      // CanonicalizeFileURL supports absolute Windows style paths for IE
      // compatibility. Note that the caller must decide that this is a file
      // URL itself so it can call the file canonicalizer. This is usually
      // done automatically as part of relative URL resolving.
      {"c:\\foo\\bar", "file:///C:/foo/bar", true, Component(),
       Component(7, 11)},
      {"C|/foo/bar", "file:///C:/foo/bar", true, Component(), Component(7, 11)},
      {"/C|\\foo\\bar", "file:///C:/foo/bar", true, Component(),
       Component(7, 11)},
      {"//C|/foo/bar", "file:///C:/foo/bar", true, Component(),
       Component(7, 11)},
      {"//server/file", "file://server/file", true, Component(7, 6),
       Component(13, 5)},
      {"\\\\server\\file", "file://server/file", true, Component(7, 6),
       Component(13, 5)},
      {"/\\server/file", "file://server/file", true, Component(7, 6),
       Component(13, 5)},
      // We should preserve the number of slashes after the colon for IE
      // compatibility, except when there is none, in which case we should
      // add one.
      {"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, Component(),
       Component(7, 16)},
      {"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true,
       Component(), Component(7, 19)},
      // Three slashes should be non-UNC, even if there is no drive spec (IE
      // does this, which makes the resulting request invalid).
      {"file:///foo/bar.txt", "file:///foo/bar.txt", true, Component(),
       Component(7, 12)},
      // TODO(brettw) we should probably fail for invalid host names, which
      // would change the expected result on this test. We also currently allow
      // colon even though it's probably invalid, because its currently the
      // "natural" result of the way the canonicalizer is written. There doesn't
      // seem to be a strong argument for why allowing it here would be bad, so
      // we just tolerate it and the load will fail later.
      {"FILE:/\\/\\7:\\\\//foo\\bar.html", "file://7:////foo/bar.html", false,
       Component(7, 2), Component(9, 16)},
      {"file:filer/home\\me", "file://filer/home/me", true, Component(7, 5),
       Component(12, 8)},
      // Make sure relative paths can't go above the "C:"
      {"file:///C:/foo/../../../bar.html", "file:///C:/bar.html", true,
       Component(), Component(7, 12)},
      // Busted refs shouldn't make the whole thing fail.
      {"file:///C:/asdf#\xc2", "file:///C:/asdf#%EF%BF%BD", true, Component(),
       Component(7, 8)},
      {"file:///./s:", "file:///S:", true, Component(), Component(7, 3)},
#else
      // Unix-style paths
      {"file:///home/me", "file:///home/me", true, Component(),
       Component(7, 8)},
      // Windowsy ones should get still treated as Unix-style.
      {"file:c:\\foo\\bar.html", "file:///c:/foo/bar.html", true, Component(),
       Component(7, 16)},
      {"file:c|//foo\\bar.html", "file:///c%7C//foo/bar.html", true,
       Component(), Component(7, 19)},
      {"file:///./s:", "file:///s:", true, Component(), Component(7, 3)},
      // file: tests from WebKit (LayoutTests/fast/loader/url-parse-1.html)
      {"//", "file:///", true, Component(), Component(7, 1)},
      {"///", "file:///", true, Component(), Component(7, 1)},
      {"///test", "file:///test", true, Component(), Component(7, 5)},
      {"file://test", "file://test/", true, Component(7, 4), Component(11, 1)},
      {"file://localhost", "file://localhost/", true, Component(7, 9),
       Component(16, 1)},
      {"file://localhost/", "file://localhost/", true, Component(7, 9),
       Component(16, 1)},
      {"file://localhost/test", "file://localhost/test", true, Component(7, 9),
       Component(16, 5)},
#endif  // _WIN32
  };

  for (const URLCase& test_case : kCases) {
    // Parse the raw input as a file URL first; the canonicalizer consumes
    // both the text and its parsed component spans.
    const int input_len = static_cast<int>(strlen(test_case.input));
    Parsed input_parsed;
    ParseFileURL(test_case.input, input_len, &input_parsed);

    std::string canonical;
    StdStringCanonOutput output(&canonical);
    Parsed canonical_parsed;
    const bool ok =
        CanonicalizeFileURL(test_case.input, input_len, input_parsed, nullptr,
                            &output, &canonical_parsed);
    output.Complete();

    EXPECT_EQ(test_case.expected_success, ok);
    EXPECT_EQ(test_case.expected, canonical);

    // The file canonicalizer writes the spec with its own code path, so
    // verify the scheme ("file" = 4 chars at offset 0) was identified.
    EXPECT_EQ(0, canonical_parsed.scheme.begin);
    EXPECT_EQ(4, canonical_parsed.scheme.len);

    // Host span in the canonical output.
    EXPECT_EQ(test_case.expected_host.begin, canonical_parsed.host.begin);
    EXPECT_EQ(test_case.expected_host.len, canonical_parsed.host.len);

    // Path span in the canonical output.
    EXPECT_EQ(test_case.expected_path.begin, canonical_parsed.path.begin);
    EXPECT_EQ(test_case.expected_path.len, canonical_parsed.path.len);
  }
}
2345
// Runs CanonicalizeFileSystemURL() over a table of inputs and checks the
// success flag, the canonical spec, and the outer scheme span.
TEST(URLCanonTest, CanonicalizeFileSystemURL) {
  struct URLCase {
    const char* input;
    const char* expected;
    bool expected_success;
  };

  const URLCase kCases[] = {
      {"Filesystem:htTp://www.Foo.com:80/tempoRary",
       "filesystem:http://www.foo.com/tempoRary/", true},
      {"filesystem:httpS://www.foo.com/temporary/",
       "filesystem:https://www.foo.com/temporary/", true},
      {"filesystem:http://www.foo.com//", "filesystem:http://www.foo.com//",
       false},
      {"filesystem:http://www.foo.com/persistent/bob?query#ref",
       "filesystem:http://www.foo.com/persistent/bob?query#ref", true},
      {"filesystem:fIle://\\temporary/", "filesystem:file:///temporary/", true},
      {"filesystem:fiLe:///temporary", "filesystem:file:///temporary/", true},
      {"filesystem:File:///temporary/Bob?qUery#reF",
       "filesystem:file:///temporary/Bob?qUery#reF", true},
      {"FilEsysteM:htTp:E=/.", "filesystem:http://e=//", false},
  };

  for (const URLCase& test_case : kCases) {
    const Parsed input_parsed = ParseFileSystemURL(test_case.input);

    std::string canonical;
    StdStringCanonOutput output(&canonical);
    Parsed canonical_parsed;
    const bool ok = CanonicalizeFileSystemURL(
        test_case.input, input_parsed, nullptr, &output, &canonical_parsed);
    output.Complete();

    EXPECT_EQ(test_case.expected_success, ok);
    EXPECT_EQ(test_case.expected, canonical);

    // The filesystem canonicalizer writes the spec with its own code path, so
    // verify the scheme ("filesystem" = 10 chars at offset 0) was identified.
    EXPECT_EQ(0, canonical_parsed.scheme.begin);
    EXPECT_EQ(10, canonical_parsed.scheme.len);

    // A successfully canonicalized URL must report a non-empty path.
    if (ok) {
      EXPECT_GT(canonical_parsed.path.len, 0);
    }
  }
}
2388
// Runs CanonicalizePathURL() over a small table and checks that only the
// scheme is canonicalized, that no host is reported, and that inputs ending
// in ':' report no content.
TEST(URLCanonTest, CanonicalizePathURL) {
  // Path URLs should get canonicalized schemes but nothing else.
  struct PathCase {
    const char* input;
    const char* expected;
  };

  const PathCase kCases[] = {
      {"javascript:", "javascript:"},
      {"JavaScript:Foo", "javascript:Foo"},
      {"Foo:\":This /is interesting;?#", "foo:\":This /is interesting;?#"},

      // Unicode invalid characters should not cause failure. See
      // https://crbug.com/925614.
      {"javascript:\uFFFF", "javascript:%EF%BF%BF"},
  };

  for (const PathCase& test_case : kCases) {
    const int input_len = static_cast<int>(strlen(test_case.input));
    Parsed input_parsed;
    ParsePathURL(test_case.input, input_len, true, &input_parsed);

    std::string canonical;
    StdStringCanonOutput output(&canonical);
    Parsed canonical_parsed;
    const bool ok = CanonicalizePathURL(test_case.input, input_len,
                                        input_parsed, &output,
                                        &canonical_parsed);
    output.Complete();

    EXPECT_TRUE(ok);
    EXPECT_EQ(test_case.expected, canonical);

    // No host is expected: begin 0 with length -1.
    EXPECT_EQ(0, canonical_parsed.host.begin);
    EXPECT_EQ(-1, canonical_parsed.host.len);

    // When we end with a colon at the end, there should be no path.
    const bool ends_with_colon = test_case.input[input_len - 1] == ':';
    if (ends_with_colon) {
      EXPECT_EQ(0, canonical_parsed.GetContent().begin);
      EXPECT_EQ(-1, canonical_parsed.GetContent().len);
    }
  }
}
2429
// Runs CanonicalizePathURLPath() on each input twice -- once as an 8-bit
// string and once as a 16-bit string -- and checks that both produce the same
// expected output and an output component covering the whole result.
TEST(URLCanonTest, CanonicalizePathURLPath) {
  struct PathCase {
    std::string input;
    std::wstring input16;
    std::string expected;
  };

  const PathCase kCases[] = {
      {"Foo", L"Foo", "Foo"},
      {"\":This /is interesting;?#", L"\":This /is interesting;?#",
       "\":This /is interesting;?#"},
      {"\uFFFF", L"\uFFFF", "%EF%BF%BF"},
  };

  for (const PathCase& test_case : kCases) {
    // 8-bit string input
    {
      std::string narrow_result;
      StdStringCanonOutput narrow_output(&narrow_result);
      url::Component narrow_component;
      CanonicalizePathURLPath(test_case.input.data(),
                              Component(0, test_case.input.size()),
                              &narrow_output, &narrow_component);
      narrow_output.Complete();

      EXPECT_EQ(test_case.expected, narrow_result);

      // The output component must start at 0 and span the entire result.
      EXPECT_EQ(0, narrow_component.begin);
      EXPECT_EQ(test_case.expected.size(),
                static_cast<size_t>(narrow_component.len));
    }

    // 16-bit string input
    {
      std::string wide_result;
      StdStringCanonOutput wide_output(&wide_result);
      url::Component wide_component;
      // The wide literal is converted to a char16_t string by the test helper;
      // the component length passed below is the wide string's length.
      const std::u16string utf16_input(
          test_utils::TruncateWStringToUTF16(test_case.input16.data()));
      CanonicalizePathURLPath(utf16_input.c_str(),
                              Component(0, test_case.input16.size()),
                              &wide_output, &wide_component);
      wide_output.Complete();

      EXPECT_EQ(test_case.expected, wide_result);

      // The output component must start at 0 and span the entire result.
      EXPECT_EQ(0, wide_component.begin);
      EXPECT_EQ(test_case.expected.size(),
                static_cast<size_t>(wide_component.len));
    }
  }
}
2476
// Runs CanonicalizeMailtoURL() over a table of inputs and checks the success
// flag, the canonical spec, and the scheme/path/query spans reported in the
// output Parsed structure.
//
// Each row's expected_path / expected_query describe spans in the *canonical*
// output, so their lengths must match the escaped text in |expected|.
TEST(URLCanonTest, CanonicalizeMailtoURL) {
  struct URLCase {
    const char* input;
    const char* expected;
    bool expected_success;
    Component expected_path;
    Component expected_query;
  } cases[] = {
    // Null character should be escaped to %00.
    // Keep this test first in the list as it is handled specially below.
    {"mailto:addr1\0addr2?foo",
     "mailto:addr1%00addr2?foo",
     true, Component(7, 13), Component(21, 3)},
    {"mailto:addr1",
     "mailto:addr1",
     true, Component(7, 5), Component()},
    // A plain mailbox passes through unchanged (13 characters after
    // "mailto:").
    {"mailto:addr1@foo.com",
     "mailto:addr1@foo.com",
     true, Component(7, 13), Component()},
    // Trailing whitespace is stripped.
    {"MaIlTo:addr1 \t ",
     "mailto:addr1",
     true, Component(7, 5), Component()},
    {"MaIlTo:addr1?to=jon",
     "mailto:addr1?to=jon",
     true, Component(7, 5), Component(13,6)},
    {"mailto:addr1,addr2",
     "mailto:addr1,addr2",
     true, Component(7, 11), Component()},
    // Embedded spaces must be encoded.
    {"mailto:addr1, addr2",
     "mailto:addr1,%20addr2",
     true, Component(7, 14), Component()},
    {"mailto:addr1, addr2?subject=one two ",
     "mailto:addr1,%20addr2?subject=one%20two",
     true, Component(7, 14), Component(22, 17)},
    {"mailto:addr1%2caddr2",
     "mailto:addr1%2caddr2",
     true, Component(7, 13), Component()},
    {"mailto:\xF0\x90\x8C\x80",
     "mailto:%F0%90%8C%80",
     true, Component(7, 12), Component()},
    // Invalid -- UTF-8 encoded surrogate value.
    {"mailto:\xed\xa0\x80",
     "mailto:%EF%BF%BD%EF%BF%BD%EF%BF%BD",
     false, Component(7, 27), Component()},
    {"mailto:addr1?",
     "mailto:addr1?",
     true, Component(7, 5), Component(13, 0)},
    // Certain characters have special meanings and must be encoded.
    {"mailto:! \x22$&()+,-./09:;<=>@AZ[\\]&_`az{|}~\x7f?Query! \x22$&()+,-./09:;<=>@AZ[\\]&_`az{|}~",
     "mailto:!%20%22$&()+,-./09:;%3C=%3E@AZ[\\]&_%60az%7B%7C%7D~%7F?Query!%20%22$&()+,-./09:;%3C=%3E@AZ[\\]&_`az{|}~",
     true, Component(7, 53), Component(61, 47)},
  };

  // Define outside of loop to catch bugs where components aren't reset
  Parsed out_parsed;

  for (size_t i = 0; i < std::size(cases); i++) {
    int url_len = static_cast<int>(strlen(cases[i].input));
    if (i == 0) {
      // The first test case purposely has a '\0' in it -- don't count it
      // as the string terminator.
      url_len = 22;
    }

    std::string out_str;
    StdStringCanonOutput output(&out_str);
    // Parse and canonicalize using the explicit length so the embedded NUL in
    // the first case is treated as data.
    bool success = CanonicalizeMailtoURL(
        cases[i].input, url_len,
        ParseMailtoURL(std::string_view(cases[i].input, url_len)), &output,
        &out_parsed);
    output.Complete();

    EXPECT_EQ(cases[i].expected_success, success);
    EXPECT_EQ(cases[i].expected, out_str);

    // Make sure the spec was properly identified ("mailto" = 6 chars).
    EXPECT_EQ(0, out_parsed.scheme.begin);
    EXPECT_EQ(6, out_parsed.scheme.len);

    EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin);
    EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len);

    EXPECT_EQ(cases[i].expected_query.begin, out_parsed.query.begin);
    EXPECT_EQ(cases[i].expected_query.len, out_parsed.query.len);
  }
}
2565
2566 #ifndef WIN32
2567
// Checks the non-Windows _itoa_s implementation: NUL termination, refusal to
// write past the reported size, the array-deducing overload, and radix 16.
TEST(URLCanonTest, _itoa_s) {
  // The buffer is one byte larger than the size reported to the explicit-size
  // overload, and is pre-filled with 0xff before each step. That lets us
  // verify both that the result is NUL-terminated and that the extra trailing
  // byte is never touched.
  char buf[6];
  const size_t reported_size = sizeof(buf) - 1;
  const auto refill = [&buf]() { memset(buf, 0xff, sizeof(buf)); };

  refill();
  EXPECT_EQ(0, _itoa_s(12, buf, reported_size, 10));
  EXPECT_STREQ("12", buf);
  EXPECT_EQ('\xFF', buf[3]);

  // Test the edge cases - exactly the buffer size and one over
  refill();
  EXPECT_EQ(0, _itoa_s(1234, buf, reported_size, 10));
  EXPECT_STREQ("1234", buf);
  EXPECT_EQ('\xFF', buf[5]);

  refill();
  EXPECT_EQ(EINVAL, _itoa_s(12345, buf, reported_size, 10));
  EXPECT_EQ('\xFF', buf[5]);  // should never write to this location

  // Test the template overload (note that this will see the full buffer)
  refill();
  EXPECT_EQ(0, _itoa_s(12, buf, 10));
  EXPECT_STREQ("12", buf);
  EXPECT_EQ('\xFF', buf[3]);

  refill();
  EXPECT_EQ(0, _itoa_s(12345, buf, 10));
  EXPECT_STREQ("12345", buf);

  // Six digits plus the terminator do not fit even in the full buffer.
  EXPECT_EQ(EINVAL, _itoa_s(123456, buf, 10));

  // Test that radix 16 is supported.
  refill();
  EXPECT_EQ(0, _itoa_s(1234, buf, reported_size, 16));
  EXPECT_STREQ("4d2", buf);
  EXPECT_EQ('\xFF', buf[5]);
}
2606
// Checks the non-Windows _itow_s implementation: NUL termination, refusal to
// write past the reported size, and the array-deducing overload.
TEST(URLCanonTest, _itow_s) {
  // We fill the buffer with 0xff bytes (so every char16_t reads 0xffff) to
  // ensure that it's getting properly null-terminated. We also allocate one
  // element more than what we tell the explicit-size _itow_s about, and
  // ensure that the extra element is untouched.
  char16_t buf[6];
  const char fill_mem = 0xff;
  const char16_t fill_char = 0xffff;
  // Element count reported to the explicit-size overload (one short of the
  // real capacity).
  const size_t reported_size = sizeof(buf) / sizeof(buf[0]) - 1;

  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(12, buf, reported_size, 10));
  EXPECT_EQ(u"12", std::u16string(buf));
  EXPECT_EQ(fill_char, buf[3]);

  // Test the edge cases - exactly the buffer size and one over.
  // Refill first so the sentinel check below does not depend on what the
  // previous step left in the buffer.
  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(1234, buf, reported_size, 10));
  EXPECT_EQ(u"1234", std::u16string(buf));
  EXPECT_EQ(fill_char, buf[5]);

  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(EINVAL, _itow_s(12345, buf, reported_size, 10));
  EXPECT_EQ(fill_char, buf[5]);  // should never write to this location

  // Test the template overload (note that this will see the full buffer)
  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(12, buf, 10));
  EXPECT_EQ(u"12", std::u16string(buf));
  EXPECT_EQ(fill_char, buf[3]);

  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(12345, buf, 10));
  EXPECT_EQ(u"12345", std::u16string(buf));

  // Six digits plus the terminator do not fit even in the full buffer.
  EXPECT_EQ(EINVAL, _itow_s(123456, buf, 10));
}
2640
2641 #endif // !WIN32
2642
2643 // Returns true if the given two structures are the same.
ParsedIsEqual(const Parsed & a,const Parsed & b)2644 static bool ParsedIsEqual(const Parsed& a, const Parsed& b) {
2645 return a.scheme.begin == b.scheme.begin && a.scheme.len == b.scheme.len &&
2646 a.username.begin == b.username.begin && a.username.len == b.username.len &&
2647 a.password.begin == b.password.begin && a.password.len == b.password.len &&
2648 a.host.begin == b.host.begin && a.host.len == b.host.len &&
2649 a.port.begin == b.port.begin && a.port.len == b.port.len &&
2650 a.path.begin == b.path.begin && a.path.len == b.path.len &&
2651 a.query.begin == b.query.begin && a.query.len == b.query.len &&
2652 a.ref.begin == b.ref.begin && a.ref.len == b.ref.len;
2653 }
2654
TEST(URLCanonTest,ResolveRelativeURL)2655 TEST(URLCanonTest, ResolveRelativeURL) {
2656 struct RelativeCase {
2657 const char* base; // Input base URL: MUST BE CANONICAL
2658 bool is_base_hier; // Is the base URL hierarchical
2659 bool is_base_file; // Tells us if the base is a file URL.
2660 const char* test; // Input URL to test against.
2661 bool succeed_relative; // Whether we expect IsRelativeURL to succeed
2662 bool is_rel; // Whether we expect |test| to be relative or not.
2663 bool succeed_resolve; // Whether we expect ResolveRelativeURL to succeed.
2664 const char* resolved; // What we expect in the result when resolving.
2665 } rel_cases[] = {
2666 // Basic absolute input.
2667 {"http://host/a", true, false, "http://another/", true, false, false,
2668 nullptr},
2669 {"http://host/a", true, false, "http:////another/", true, false, false,
2670 nullptr},
2671 // Empty relative URLs should only remove the ref part of the URL,
2672 // leaving the rest unchanged.
2673 {"http://foo/bar", true, false, "", true, true, true, "http://foo/bar"},
2674 {"http://foo/bar#ref", true, false, "", true, true, true,
2675 "http://foo/bar"},
2676 {"http://foo/bar#", true, false, "", true, true, true, "http://foo/bar"},
2677 // Spaces at the ends of the relative path should be ignored.
2678 {"http://foo/bar", true, false, " another ", true, true, true,
2679 "http://foo/another"},
2680 {"http://foo/bar", true, false, " . ", true, true, true, "http://foo/"},
2681 {"http://foo/bar", true, false, " \t ", true, true, true,
2682 "http://foo/bar"},
2683 // Matching schemes without two slashes are treated as relative.
2684 {"http://host/a", true, false, "http:path", true, true, true,
2685 "http://host/path"},
2686 {"http://host/a/", true, false, "http:path", true, true, true,
2687 "http://host/a/path"},
2688 {"http://host/a", true, false, "http:/path", true, true, true,
2689 "http://host/path"},
2690 {"http://host/a", true, false, "HTTP:/path", true, true, true,
2691 "http://host/path"},
2692 // Nonmatching schemes are absolute.
2693 {"http://host/a", true, false, "https:host2", true, false, false,
2694 nullptr},
2695 {"http://host/a", true, false, "htto:/host2", true, false, false,
2696 nullptr},
2697 // Absolute path input
2698 {"http://host/a", true, false, "/b/c/d", true, true, true,
2699 "http://host/b/c/d"},
2700 {"http://host/a", true, false, "\\b\\c\\d", true, true, true,
2701 "http://host/b/c/d"},
2702 {"http://host/a", true, false, "/b/../c", true, true, true,
2703 "http://host/c"},
2704 {"http://host/a?b#c", true, false, "/b/../c", true, true, true,
2705 "http://host/c"},
2706 {"http://host/a", true, false, "\\b/../c?x#y", true, true, true,
2707 "http://host/c?x#y"},
2708 {"http://host/a?b#c", true, false, "/b/../c?x#y", true, true, true,
2709 "http://host/c?x#y"},
2710 // Relative path input
2711 {"http://host/a", true, false, "b", true, true, true, "http://host/b"},
2712 {"http://host/a", true, false, "bc/de", true, true, true,
2713 "http://host/bc/de"},
2714 {"http://host/a/", true, false, "bc/de?query#ref", true, true, true,
2715 "http://host/a/bc/de?query#ref"},
2716 {"http://host/a/", true, false, ".", true, true, true, "http://host/a/"},
2717 {"http://host/a/", true, false, "..", true, true, true, "http://host/"},
2718 {"http://host/a/", true, false, "./..", true, true, true, "http://host/"},
2719 {"http://host/a/", true, false, "../.", true, true, true, "http://host/"},
2720 {"http://host/a/", true, false, "././.", true, true, true,
2721 "http://host/a/"},
2722 {"http://host/a?query#ref", true, false, "../../../foo", true, true, true,
2723 "http://host/foo"},
2724 // Query input
2725 {"http://host/a", true, false, "?foo=bar", true, true, true,
2726 "http://host/a?foo=bar"},
2727 {"http://host/a?x=y#z", true, false, "?", true, true, true,
2728 "http://host/a?"},
2729 {"http://host/a?x=y#z", true, false, "?foo=bar#com", true, true, true,
2730 "http://host/a?foo=bar#com"},
2731 // Ref input
2732 {"http://host/a", true, false, "#ref", true, true, true,
2733 "http://host/a#ref"},
2734 {"http://host/a#b", true, false, "#", true, true, true, "http://host/a#"},
2735 {"http://host/a?foo=bar#hello", true, false, "#bye", true, true, true,
2736 "http://host/a?foo=bar#bye"},
2737 // Non-hierarchical base: no relative handling. Relative input should
2738 // error, and if a scheme is present, it should be treated as absolute.
2739 {"data:foobar", false, false, "baz.html", false, false, false, nullptr},
2740 {"data:foobar", false, false, "data:baz", true, false, false, nullptr},
2741 {"data:foobar", false, false, "data:/base", true, false, false, nullptr},
2742 // Non-hierarchical base: absolute input should succeed.
2743 {"data:foobar", false, false, "http://host/", true, false, false,
2744 nullptr},
2745 {"data:foobar", false, false, "http:host", true, false, false, nullptr},
2746 // Non-hierarchical base: empty URL should give error.
2747 {"data:foobar", false, false, "", false, false, false, nullptr},
2748 // Invalid schemes should be treated as relative.
2749 {"http://foo/bar", true, false, "./asd:fgh", true, true, true,
2750 "http://foo/asd:fgh"},
2751 {"http://foo/bar", true, false, ":foo", true, true, true,
2752 "http://foo/:foo"},
2753 {"http://foo/bar", true, false, " hello world", true, true, true,
2754 "http://foo/hello%20world"},
2755 {"data:asdf", false, false, ":foo", false, false, false, nullptr},
2756 {"data:asdf", false, false, "bad(':foo')", false, false, false, nullptr},
2757 // We should treat semicolons like any other character in URL resolving
2758 {"http://host/a", true, false, ";foo", true, true, true,
2759 "http://host/;foo"},
2760 {"http://host/a;", true, false, ";foo", true, true, true,
2761 "http://host/;foo"},
2762 {"http://host/a", true, false, ";/../bar", true, true, true,
2763 "http://host/bar"},
2764 // Relative URLs can also be written as "//foo/bar" which is relative to
2765 // the scheme. In this case, it would take the old scheme, so for http
2766 // the example would resolve to "http://foo/bar".
2767 {"http://host/a", true, false, "//another", true, true, true,
2768 "http://another/"},
2769 {"http://host/a", true, false, "//another/path?query#ref", true, true,
2770 true, "http://another/path?query#ref"},
2771 {"http://host/a", true, false, "///another/path", true, true, true,
2772 "http://another/path"},
2773 {"http://host/a", true, false, "//Another\\path", true, true, true,
2774 "http://another/path"},
2775 {"http://host/a", true, false, "//", true, true, false, "http:"},
2776 // IE will also allow one or the other to be a backslash to get the same
2777 // behavior.
2778 {"http://host/a", true, false, "\\/another/path", true, true, true,
2779 "http://another/path"},
2780 {"http://host/a", true, false, "/\\Another\\path", true, true, true,
2781 "http://another/path"},
2782 #ifdef WIN32
2783 // Resolving against Windows file base URLs.
2784 {"file:///C:/foo", true, true, "http://host/", true, false, false,
2785 nullptr},
2786 {"file:///C:/foo", true, true, "bar", true, true, true, "file:///C:/bar"},
2787 {"file:///C:/foo", true, true, "../../../bar.html", true, true, true,
2788 "file:///C:/bar.html"},
2789 {"file:///C:/foo", true, true, "/../bar.html", true, true, true,
2790 "file:///C:/bar.html"},
2791 // But two backslashes on Windows should be UNC so should be treated
2792 // as absolute.
2793 {"http://host/a", true, false, "\\\\another\\path", true, false, false,
2794 nullptr},
2795 // IE doesn't support drive specs starting with two slashes. It fails
2796 // immediately and doesn't even try to load. We fix it up to either
2797 // an absolute path or UNC depending on what it looks like.
2798 {"file:///C:/something", true, true, "//c:/foo", true, true, true,
2799 "file:///C:/foo"},
2800 {"file:///C:/something", true, true, "//localhost/c:/foo", true, true,
2801 true, "file:///C:/foo"},
2802 // Windows drive specs should be allowed and treated as absolute.
2803 {"file:///C:/foo", true, true, "c:", true, false, false, nullptr},
2804 {"file:///C:/foo", true, true, "c:/foo", true, false, false, nullptr},
2805 {"http://host/a", true, false, "c:\\foo", true, false, false, nullptr},
2806 // Relative paths with drive letters should be allowed when the base is
2807 // also a file.
2808 {"file:///C:/foo", true, true, "/z:/bar", true, true, true,
2809 "file:///Z:/bar"},
2810 // Treat absolute paths as being off of the drive.
2811 {"file:///C:/foo", true, true, "/bar", true, true, true,
2812 "file:///C:/bar"},
2813 {"file://localhost/C:/foo", true, true, "/bar", true, true, true,
2814 "file://localhost/C:/bar"},
2815 {"file:///C:/foo/com/", true, true, "/bar", true, true, true,
2816 "file:///C:/bar"},
2817 // On Windows, two slashes without a drive letter when the base is a file
2818 // means that the path is UNC.
2819 {"file:///C:/something", true, true, "//somehost/path", true, true, true,
2820 "file://somehost/path"},
2821 {"file:///C:/something", true, true, "/\\//somehost/path", true, true,
2822 true, "file://somehost/path"},
2823 #else
2824 // On Unix we fall back to relative behavior since there's nothing else
2825 // reasonable to do.
2826 {"http://host/a", true, false, "\\\\Another\\path", true, true, true,
2827 "http://another/path"},
2828 #endif
2829 // Even on Windows, we don't allow relative drive specs when the base
2830 // is not file.
2831 {"http://host/a", true, false, "/c:\\foo", true, true, true,
2832 "http://host/c:/foo"},
2833 {"http://host/a", true, false, "//c:\\foo", true, true, true,
2834 "http://c/foo"},
2835 // Cross-platform relative file: resolution behavior.
2836 {"file://host/a", true, true, "/", true, true, true, "file://host/"},
2837 {"file://host/a", true, true, "//", true, true, true, "file:///"},
2838 {"file://host/a", true, true, "/b", true, true, true, "file://host/b"},
2839 {"file://host/a", true, true, "//b", true, true, true, "file://b/"},
2840 // Ensure that ports aren't allowed for hosts relative to a file url.
2841 // Although the result string shows a host:port portion, the call to
2842 // resolve the relative URL returns false, indicating parse failure,
2843 // which is what is required.
2844 {"file:///foo.txt", true, true, "//host:80/bar.txt", true, true, false,
2845 "file://host:80/bar.txt"},
2846 // Filesystem URL tests; filesystem URLs are only valid and relative if
2847 // they have no scheme, e.g. "./index.html". There's no valid equivalent
2848 // to http:index.html.
2849 {"filesystem:http://host/t/path", true, false,
2850 "filesystem:http://host/t/path2", true, false, false, nullptr},
2851 {"filesystem:http://host/t/path", true, false,
2852 "filesystem:https://host/t/path2", true, false, false, nullptr},
2853 {"filesystem:http://host/t/path", true, false, "http://host/t/path2",
2854 true, false, false, nullptr},
2855 {"http://host/t/path", true, false, "filesystem:http://host/t/path2",
2856 true, false, false, nullptr},
2857 {"filesystem:http://host/t/path", true, false, "./path2", true, true,
2858 true, "filesystem:http://host/t/path2"},
2859 {"filesystem:http://host/t/path/", true, false, "path2", true, true, true,
2860 "filesystem:http://host/t/path/path2"},
2861 {"filesystem:http://host/t/path", true, false, "filesystem:http:path2",
2862 true, false, false, nullptr},
2863 // Absolute URLs are still not relative to a non-standard base URL.
2864 {"about:blank", false, false, "http://X/A", true, false, true, ""},
2865 {"about:blank", false, false, "content://content.Provider/", true, false,
2866 true, ""},
2867 };
2868
2869 for (const auto& cur_case : rel_cases) {
2870 Parsed parsed;
2871 int base_len = static_cast<int>(strlen(cur_case.base));
2872 if (cur_case.is_base_file)
2873 ParseFileURL(cur_case.base, base_len, &parsed);
2874 else if (cur_case.is_base_hier)
2875 ParseStandardURL(cur_case.base, base_len, &parsed);
2876 else
2877 ParsePathURL(cur_case.base, base_len, false, &parsed);
2878
2879 // First see if it is relative.
2880 int test_len = static_cast<int>(strlen(cur_case.test));
2881 bool is_relative;
2882 Component relative_component;
2883 bool succeed_is_rel = IsRelativeURL(
2884 cur_case.base, parsed, cur_case.test, test_len, cur_case.is_base_hier,
2885 &is_relative, &relative_component);
2886
2887 EXPECT_EQ(cur_case.succeed_relative, succeed_is_rel) <<
2888 "succeed is rel failure on " << cur_case.test;
2889 EXPECT_EQ(cur_case.is_rel, is_relative) <<
2890 "is rel failure on " << cur_case.test;
2891 // Now resolve it.
2892 if (succeed_is_rel && is_relative && cur_case.is_rel) {
2893 std::string resolved;
2894 StdStringCanonOutput output(&resolved);
2895 Parsed resolved_parsed;
2896
2897 bool succeed_resolve = ResolveRelativeURL(
2898 cur_case.base, parsed, cur_case.is_base_file, cur_case.test,
2899 relative_component, nullptr, &output, &resolved_parsed);
2900 output.Complete();
2901
2902 EXPECT_EQ(cur_case.succeed_resolve, succeed_resolve);
2903 EXPECT_EQ(cur_case.resolved, resolved) << " on " << cur_case.test;
2904
      // Verify that the output parsed structure is the same as the one
      // obtained by parsing the resolved URL from scratch.
2907 Parsed ref_parsed;
2908 int resolved_len = static_cast<int>(resolved.size());
2909 if (cur_case.is_base_file) {
2910 ParseFileURL(resolved.c_str(), resolved_len, &ref_parsed);
2911 } else if (cur_case.is_base_hier) {
2912 ParseStandardURL(resolved.c_str(), resolved_len, &ref_parsed);
2913 } else {
2914 ParsePathURL(resolved.c_str(), resolved_len, false, &ref_parsed);
2915 }
2916 EXPECT_TRUE(ParsedIsEqual(ref_parsed, resolved_parsed));
2917 }
2918 }
2919 }
2920
2921 class URLCanonTypedTest : public ::testing::TestWithParam<bool> {
2922 public:
URLCanonTypedTest()2923 URLCanonTypedTest()
2924 : use_standard_compliant_non_special_scheme_url_parsing_(GetParam()) {
2925 if (use_standard_compliant_non_special_scheme_url_parsing_) {
2926 scoped_feature_list_.InitAndEnableFeature(
2927 kStandardCompliantNonSpecialSchemeURLParsing);
2928 } else {
2929 scoped_feature_list_.InitAndDisableFeature(
2930 kStandardCompliantNonSpecialSchemeURLParsing);
2931 }
2932 }
2933
2934 protected:
2935 struct URLCase {
2936 const std::string_view input;
2937 const std::string_view expected;
2938 bool expected_success;
2939 };
2940
2941 struct ResolveRelativeURLCase {
2942 const std::string_view base;
2943 const std::string_view rel;
2944 const bool is_base_hier;
2945 const bool expected_base_is_valid;
2946 const bool expected_is_relative;
2947 const bool expected_succeed_resolve;
2948 const std::string_view expected_resolved_url;
2949 };
2950
TestNonSpecialResolveRelativeURL(const ResolveRelativeURLCase & relative_case)2951 void TestNonSpecialResolveRelativeURL(
2952 const ResolveRelativeURLCase& relative_case) {
2953 // The following test is similar to URLCanonTest::ResolveRelativeURL, but
2954 // simplified.
2955 Parsed parsed;
2956 if (use_standard_compliant_non_special_scheme_url_parsing_) {
2957 ParseNonSpecialURL(relative_case.base.data(), relative_case.base.size(),
2958 &parsed);
2959 } else {
2960 ParsePathURL(relative_case.base.data(), relative_case.base.size(),
2961 /*trim_path_end=*/true, &parsed);
2962 }
2963
2964 // First see if it is relative.
2965 bool is_relative;
2966 Component relative_component;
2967 bool succeed_is_rel = IsRelativeURL(
2968 relative_case.base.data(), parsed, relative_case.rel.data(),
2969 relative_case.rel.size(), relative_case.is_base_hier, &is_relative,
2970 &relative_component);
2971
2972 EXPECT_EQ(is_relative, relative_case.expected_is_relative);
2973 if (succeed_is_rel && is_relative) {
2974 std::string resolved_url;
2975 StdStringCanonOutput output(&resolved_url);
2976 Parsed resolved_parsed;
2977
2978 bool succeed_resolve = ResolveRelativeURL(
2979 relative_case.base.data(), parsed, relative_case.is_base_hier,
2980 relative_case.rel.data(), relative_component, nullptr, &output,
2981 &resolved_parsed);
2982 output.Complete();
2983
2984 EXPECT_EQ(succeed_resolve, relative_case.expected_succeed_resolve);
2985 EXPECT_EQ(resolved_url, relative_case.expected_resolved_url);
2986 }
2987 }
2988
2989 bool use_standard_compliant_non_special_scheme_url_parsing_;
2990
2991 private:
2992 base::test::ScopedFeatureList scoped_feature_list_;
2993 };
2994
TEST_P(URLCanonTypedTest, NonSpecialResolveRelativeURL) {
  // Feeds every entry of a case table to the shared fixture helper.
  const auto run_cases = [this](const auto& table) {
    for (const ResolveRelativeURLCase& relative_case : table) {
      TestNonSpecialResolveRelativeURL(relative_case);
    }
  };

  // Non-special URLs resolve differently depending on the flag, so each flag
  // state gets its own expectation table.
  if (!use_standard_compliant_non_special_scheme_url_parsing_) {
    const ResolveRelativeURLCase kFlagDisabledCases[] = {
        {"git://host", "path", true, true, true, true, "git://path"},
    };
    run_cases(kFlagDisabledCases);
    return;
  }

  const ResolveRelativeURLCase kFlagEnabledCases[] = {
      {"git://host", "path", true, true, true, true, "git://host/path"},
  };
  run_cases(kFlagEnabledCases);
}
3013
// Instantiates URLCanonTypedTest for both values of its bool parameter, so
// every TEST_P body runs once with
// kStandardCompliantNonSpecialSchemeURLParsing disabled and once with it
// enabled.
INSTANTIATE_TEST_SUITE_P(All, URLCanonTypedTest, ::testing::Bool());
3015
3016 // It used to be the case that when we did a replacement with a long buffer of
3017 // UTF-16 characters, we would get invalid data in the URL. This is because the
3018 // buffer that it used to hold the UTF-8 data was resized, while some pointers
3019 // were still kept to the old buffer that was removed.
TEST(URLCanonTest, ReplacementOverflow) {
  constexpr char kBaseSpec[] = "file:///C:/foo/bar";
  Parsed base_parsed;
  ParseFileURL(kBaseSpec, static_cast<int>(strlen(kBaseSpec)), &base_parsed);

  // Two components are overridden: the path gets something short, and the
  // query gets something long enough to trigger the bug.
  const std::u16string short_path(test_utils::TruncateWStringToUTF16(L"/foo"));
  const std::u16string long_query(4800, u'a');

  Replacements<char16_t> replacements;
  replacements.SetPath(short_path.c_str(), Component(0, 4));
  replacements.SetQuery(long_query.c_str(),
                        Component(0, static_cast<int>(long_query.length())));

  // Which flavor of ReplaceComponents is used (standard URLs, file URLs, ...)
  // is irrelevant: they all funnel into the same replacement function, which
  // is the one that was buggy.
  std::string replaced;
  StdStringCanonOutput replaced_output(&replaced);
  Parsed replaced_parsed;
  ReplaceFileURL(kBaseSpec, base_parsed, replacements, nullptr,
                 &replaced_output, &replaced_parsed);
  replaced_output.Complete();

  // Build the expected result: the new path followed by the long query.
  std::string expected = "file:///foo?";
  expected.append(long_query.length(), 'a');
  EXPECT_TRUE(expected == replaced);
}
3053
TEST(URLCanonTest, DefaultPortForScheme) {
  struct SchemePortCase {
    const char* scheme;
    const int expected_port;
  };

  const SchemePortCase kCases[] = {
      // Lower-case schemes with a known default port.
      {"http", 80},
      {"https", 443},
      {"ftp", 21},
      {"ws", 80},
      {"wss", 443},
      // An unknown scheme has no default port.
      {"fake-scheme", PORT_UNSPECIFIED},
      // Upper-case spellings are expected to yield no default port.
      {"HTTP", PORT_UNSPECIFIED},
      {"HTTPS", PORT_UNSPECIFIED},
      {"FTP", PORT_UNSPECIFIED},
      {"WS", PORT_UNSPECIFIED},
      {"WSS", PORT_UNSPECIFIED},
  };

  for (const SchemePortCase& port_case : kCases) {
    SCOPED_TRACE(port_case.scheme);
    const size_t scheme_length = strlen(port_case.scheme);
    EXPECT_EQ(port_case.expected_port,
              DefaultPortForScheme(port_case.scheme, scheme_length));
  }
}
3078
TEST(URLCanonTest, FindWindowsDriveLetter) {
  struct DriveLetterCase {
    std::string_view spec;
    int begin;
    int end;  // -1 for end of spec
    int expected_drive_letter_pos;
  };

  const DriveLetterCase kCases[] = {
      {"/", 0, -1, -1},

      {"c:/foo", 0, -1, 0},
      {"/c:/foo", 0, -1, 1},
      {"//c:/foo", 0, -1, -1},  // "//" does not canonicalize to "/"
      {"\\C|\\foo", 0, -1, 1},
      {"/cd:/foo", 0, -1, -1},  // "/c" does not canonicalize to "/"
      {"/./c:/foo", 0, -1, 3},
      {"/.//c:/foo", 0, -1, -1},  // "/.//" does not canonicalize to "/"
      {"/././c:/foo", 0, -1, 5},
      {"/abc/c:/foo", 0, -1, -1},  // "/abc/" does not canonicalize to "/"
      {"/abc/./../c:/foo", 0, -1, 10},

      {"/c:/c:/foo", 3, -1, 4},  // actual input is "/c:/foo"
      {"/c:/foo", 3, -1, -1},    // actual input is "/foo"
      {"/c:/foo", 0, 1, -1},     // actual input is "/"
  };

  for (const DriveLetterCase& test_case : kCases) {
    // An |end| of -1 is shorthand for "up to the end of the spec".
    const int end = test_case.end == -1
                        ? static_cast<int>(test_case.spec.size())
                        : test_case.end;

    // 8-bit input.
    const int position8 =
        FindWindowsDriveLetter(test_case.spec.data(), test_case.begin, end);
    EXPECT_EQ(test_case.expected_drive_letter_pos, position8)
        << "for " << test_case.spec << "[" << test_case.begin << ":" << end
        << "] (UTF-8)";

    // The same input widened to 16-bit must give the same answer.
    const std::u16string spec16 = base::ASCIIToUTF16(test_case.spec);
    const int position16 =
        FindWindowsDriveLetter(spec16.data(), test_case.begin, end);
    EXPECT_EQ(test_case.expected_drive_letter_pos, position16)
        << "for " << test_case.spec << "[" << test_case.begin << ":" << end
        << "] (UTF-16)";
  }
}
3119
// Exercises IDNToASCII() on ASCII, mixed and non-ASCII labels, and on labels
// that already carry the "xn--" prefix.
TEST(URLCanonTest, IDNToASCII) {
  RawCanonOutputW<1024> output;

  // Snapshot of what |output| currently holds. The buffer is reused between
  // cases (reset with set_length(0)) and nothing visible here guarantees that
  // it is NUL-terminated, so the contents are bounded by the recorded length
  // instead of being read as a C string.
  const auto output_contents = [&output]() {
    return std::u16string(output.data(),
                          static_cast<size_t>(output.length()));
  };

  // Basic ASCII test.
  std::u16string str = u"hello";
  EXPECT_TRUE(IDNToASCII(str, &output));
  EXPECT_EQ(u"hello", output_contents());
  output.set_length(0);

  // Mixed ASCII/non-ASCII.
  str = u"hellö";
  EXPECT_TRUE(IDNToASCII(str, &output));
  EXPECT_EQ(u"xn--hell-8qa", output_contents());
  output.set_length(0);

  // All non-ASCII.
  str = u"你好";
  EXPECT_TRUE(IDNToASCII(str, &output));
  EXPECT_EQ(u"xn--6qq79v", output_contents());
  output.set_length(0);

  // Characters that need mapping (the resulting Punycode is the encoding for
  // "1⁄4").
  str = u"¼";
  EXPECT_TRUE(IDNToASCII(str, &output));
  EXPECT_EQ(u"xn--14-c6t", output_contents());
  output.set_length(0);

  // String to encode already starts with "xn--", and all ASCII. Should not
  // modify the string.
  str = u"xn--hell-8qa";
  EXPECT_TRUE(IDNToASCII(str, &output));
  EXPECT_EQ(u"xn--hell-8qa", output_contents());
  output.set_length(0);

  // String to encode already starts with "xn--", and mixed ASCII/non-ASCII.
  // Should fail, due to a special case: if the label starts with "xn--", it
  // should be parsed as Punycode, which must be all ASCII.
  str = u"xn--hellö";
  EXPECT_FALSE(IDNToASCII(str, &output));
  output.set_length(0);

  // String to encode already starts with "xn--", and mixed ASCII/non-ASCII.
  // This tests that there is still an error for the character '⁄' (U+2044),
  // which would be a valid ASCII character, U+0044, if the high byte were
  // ignored.
  str = u"xn--1⁄4";
  EXPECT_FALSE(IDNToASCII(str, &output));
  output.set_length(0);
}
3170
ComponentCaseMatches(bool success,std::string_view out_str,const Component & out_comp,const DualComponentCase & expected)3171 void ComponentCaseMatches(bool success,
3172 std::string_view out_str,
3173 const Component& out_comp,
3174 const DualComponentCase& expected) {
3175 EXPECT_EQ(success, expected.expected_success);
3176 EXPECT_STREQ(out_str.data(), expected.expected);
3177 EXPECT_EQ(out_comp, expected.expected_component);
3178 }
3179
TEST(URLCanonTest, OpaqueHost) {
  const DualComponentCase kHostCases[] = {
      {"", L"", "", Component(), true},
      {"google.com", L"google.com", "google.com", Component(0, 10), true},
      // Upper case letters should be preserved.
      {"gooGle.com", L"gooGle.com", "gooGle.com", Component(0, 10), true},
      {"\x41", L"\x41", "A", Component(0, 1), true},
      {"\x61", L"\x61", "a", Component(0, 1), true},
      // Percent encode.
      {"\x10", L"\x10", "%10", Component(0, 3), true},
      // A valid percent encoding should be preserved.
      {"%41", L"%41", "%41", Component(0, 3), true},
      // An invalid percent encoding should be preserved too.
      {"%zz", L"%zz", "%zz", Component(0, 3), true},
      // UTF-16 HIRAGANA LETTER A (codepoint U+3042, "\xe3\x81\x82" in UTF-8).
      {"\xe3\x81\x82", L"\x3042", "%E3%81%82", Component(0, 9), true},
  };

  // Canonicalizes |spec| (8-bit or 16-bit) as a non-special host and checks
  // the result against |host_case|.
  const auto canonicalize_and_check = [](const auto* spec, int spec_len,
                                         const DualComponentCase& host_case) {
    std::string canonical;
    StdStringCanonOutput output(&canonical);
    Component canonical_component;
    const bool success = CanonicalizeNonSpecialHost(
        spec, Component(0, spec_len), output, canonical_component);
    output.Complete();
    ComponentCaseMatches(success, canonical, canonical_component, host_case);
  };

  // 8-bit version.
  for (const DualComponentCase& host_case : kHostCases) {
    SCOPED_TRACE(testing::Message() << "url: \"" << host_case.input8 << "\"");
    canonicalize_and_check(host_case.input8,
                           static_cast<int>(strlen(host_case.input8)),
                           host_case);
  }

  // UTF-16 version.
  for (const DualComponentCase& host_case : kHostCases) {
    SCOPED_TRACE(testing::Message() << "url: \"" << host_case.input16 << "\"");
    const std::u16string input16(
        test_utils::TruncateWStringToUTF16(host_case.input16));
    canonicalize_and_check(input16.c_str(),
                           static_cast<int>(input16.length()), host_case);
  }
}
3226
IPAddressCaseMatches(std::string_view out_str,const CanonHostInfo & host_info,const IPAddressCase & expected)3227 void IPAddressCaseMatches(std::string_view out_str,
3228 const CanonHostInfo& host_info,
3229 const IPAddressCase& expected) {
3230 EXPECT_EQ(host_info.family, expected.expected_family);
3231 EXPECT_STREQ(out_str.data(), expected.expected);
3232 EXPECT_EQ(base::HexEncode(host_info.address,
3233 static_cast<size_t>(host_info.AddressLength())),
3234 expected.expected_address_hex);
3235 if (expected.expected_family == CanonHostInfo::IPV4) {
3236 EXPECT_EQ(host_info.num_ipv4_components,
3237 expected.expected_num_ipv4_components);
3238 }
3239 }
3240
TEST(URLCanonTest, NonSpecialHostIPv6Address) {
  const IPAddressCase kIPAddressCases[] = {
      // Non-special URLs don't support IPv4. Family must be NEUTRAL.
      {"192.168.0.1", L"192.168.0.1", "192.168.0.1", Component(0, 11),
       CanonHostInfo::NEUTRAL, 0, ""},
      {"192", L"192", "192", Component(0, 3), CanonHostInfo::NEUTRAL, 0, ""},
      // "257" is allowed since the number is not considered as a part of IPv4.
      {"192.168.0.257", L"192.168.0.257", "192.168.0.257", Component(0, 13),
       CanonHostInfo::NEUTRAL, 0, ""},
      // IPv6.
      {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", Component(0, 14),
       CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},
      {"[::]", L"[::]", "[::]", Component(0, 4), CanonHostInfo::IPV6, -1,
       "00000000000000000000000000000000"},
      // Invalid hosts.
      {"#[::]", L"#[::]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[]", L"[]", "[]", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"a]", L"a]", "a]", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[a", L"[a", "[a", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"a[]", L"a[]", "a[]", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[]a", L"[]a", "[]a", Component(), CanonHostInfo::BROKEN, -1, ""},
  };

  // Canonicalizes |spec| (8-bit or 16-bit) as a non-special host in verbose
  // mode and checks the result against |ip_address_case|.
  const auto canonicalize_and_check =
      [](const auto* spec, int spec_len,
         const IPAddressCase& ip_address_case) {
        std::string canonical;
        StdStringCanonOutput output(&canonical);
        CanonHostInfo host_info;
        CanonicalizeNonSpecialHostVerbose(spec, Component(0, spec_len), output,
                                          host_info);
        output.Complete();
        IPAddressCaseMatches(canonical, host_info, ip_address_case);
      };

  // 8-bit version.
  for (const IPAddressCase& ip_address_case : kIPAddressCases) {
    SCOPED_TRACE(testing::Message()
                 << "url: \"" << ip_address_case.input8 << "\"");
    canonicalize_and_check(ip_address_case.input8,
                           static_cast<int>(strlen(ip_address_case.input8)),
                           ip_address_case);
  }

  // UTF-16 version.
  for (const IPAddressCase& ip_address_case : kIPAddressCases) {
    SCOPED_TRACE(testing::Message()
                 << "url: \"" << ip_address_case.input16 << "\"");
    const std::u16string input16(
        test_utils::TruncateWStringToUTF16(ip_address_case.input16));
    canonicalize_and_check(input16.c_str(),
                           static_cast<int>(input16.length()),
                           ip_address_case);
  }
}
3294
3295 } // namespace url
3296