xref: /aosp_15_r20/external/cronet/url/url_canon_unittest.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "url/url_canon.h"
6 
7 #include <errno.h>
8 #include <stddef.h>
9 #include <string_view>
10 
11 #include "base/strings/string_number_conversions.h"
12 #include "base/strings/utf_string_conversions.h"
13 #include "base/test/gtest_util.h"
14 #include "base/test/scoped_feature_list.h"
15 #include "testing/gtest/include/gtest/gtest.h"
16 #include "url/third_party/mozilla/url_parse.h"
17 #include "url/url_canon_internal.h"
18 #include "url/url_canon_stdstring.h"
19 #include "url/url_features.h"
20 #include "url/url_test_utils.h"
21 
22 namespace url {
23 
24 namespace {
25 
26 struct ComponentCase {
27   const char* input;
28   const char* expected;
29   Component expected_component;
30   bool expected_success;
31 };
32 
33 // ComponentCase but with dual 8-bit/16-bit input. Generally, the unit tests
34 // treat each input as optional, and will only try processing if non-NULL.
35 // The output is always 8-bit.
36 struct DualComponentCase {
37   const char* input8;
38   const wchar_t* input16;
39   const char* expected;
40   Component expected_component;
41   bool expected_success;
42 };
43 
44 // Test cases for CanonicalizeIPAddress(). The inputs are identical to
45 // DualComponentCase, but the output has extra CanonHostInfo fields.
46 struct IPAddressCase {
47   const char* input8;
48   const wchar_t* input16;
49   const char* expected;
50   Component expected_component;
51 
52   // CanonHostInfo fields, for verbose output.
53   CanonHostInfo::Family expected_family;
54   int expected_num_ipv4_components;
55   const char* expected_address_hex;  // Two hex chars per IP address byte.
56 };
57 
// Test vector for component replacement: a base URL, one optional replacement
// string per URL component, and the expected resulting URL.
// NOTE(review): the tests consuming this struct are outside this chunk;
// presumably each component string is passed to SetupReplComp() - confirm.
struct ReplaceCase {
  const char* base;  // URL the replacements are applied to.

  // One replacement string per component.
  const char* scheme;
  const char* username;
  const char* password;
  const char* host;
  const char* port;
  const char* path;
  const char* query;
  const char* ref;

  const char* expected;  // Expected URL after the replacements.
};
70 
// Sentinel used by the replacement tests: a replacement string that begins
// with this marker tells SetupReplComp() to call the component's clear
// function instead of its setter.
constexpr char kDeleteComp[] = "|";
74 
75 // Sets up a replacement for a single component. This is given pointers to
76 // the set and clear function for the component being replaced, and will
77 // either set the component (if it exists) or clear it (if the replacement
78 // string matches kDeleteComp).
79 //
80 // This template is currently used only for the 8-bit case, and the strlen
81 // causes it to fail in other cases. It is left a template in case we have
82 // tests for wide replacements.
83 template<typename CHAR>
SetupReplComp(void (Replacements<CHAR>::* set)(const CHAR *,const Component &),void (Replacements<CHAR>::* clear)(),Replacements<CHAR> * rep,const CHAR * str)84 void SetupReplComp(
85     void (Replacements<CHAR>::*set)(const CHAR*, const Component&),
86     void (Replacements<CHAR>::*clear)(),
87     Replacements<CHAR>* rep,
88     const CHAR* str) {
89   if (str && str[0] == kDeleteComp[0]) {
90     (rep->*clear)();
91   } else if (str) {
92     (rep->*set)(str, Component(0, static_cast<int>(strlen(str))));
93   }
94 }
95 
CanonicalizeSpecialPath(const char * spec,const Component & path,CanonOutput * output,Component * out_path)96 bool CanonicalizeSpecialPath(const char* spec,
97                              const Component& path,
98                              CanonOutput* output,
99                              Component* out_path) {
100   return CanonicalizePath(spec, path, CanonMode::kSpecialURL, output, out_path);
101 }
102 
CanonicalizeSpecialPath(const char16_t * spec,const Component & path,CanonOutput * output,Component * out_path)103 bool CanonicalizeSpecialPath(const char16_t* spec,
104                              const Component& path,
105                              CanonOutput* output,
106                              Component* out_path) {
107   return CanonicalizePath(spec, path, CanonMode::kSpecialURL, output, out_path);
108 }
109 
CanonicalizeNonSpecialPath(const char * spec,const Component & path,CanonOutput * output,Component * out_path)110 bool CanonicalizeNonSpecialPath(const char* spec,
111                                 const Component& path,
112                                 CanonOutput* output,
113                                 Component* out_path) {
114   return CanonicalizePath(spec, path, CanonMode::kNonSpecialURL, output,
115                           out_path);
116 }
117 
CanonicalizeNonSpecialPath(const char16_t * spec,const Component & path,CanonOutput * output,Component * out_path)118 bool CanonicalizeNonSpecialPath(const char16_t* spec,
119                                 const Component& path,
120                                 CanonOutput* output,
121                                 Component* out_path) {
122   return CanonicalizePath(spec, path, CanonMode::kNonSpecialURL, output,
123                           out_path);
124 }
125 
126 }  // namespace
127 
// Checks that AppendUTF8Value() writes the expected UTF-8 bytes for valid
// code points of every encoded length (one through four bytes).
TEST(URLCanonTest, DoAppendUTF8) {
  struct UTF8Case {
    unsigned input;      // Code point to encode.
    const char* output;  // Expected UTF-8 byte sequence.
  };
  const UTF8Case kValidCases[] = {
      {0x24, "\x24"},
      {0xA2, "\xC2\xA2"},
      {0x20AC, "\xE2\x82\xAC"},
      {0x24B62, "\xF0\xA4\xAD\xA2"},
      {0x10FFFF, "\xF4\x8F\xBF\xBF"},
  };

  for (const UTF8Case& test_case : kValidCases) {
    // Fresh string per case, so nothing leaks from one case to the next.
    std::string encoded;
    StdStringCanonOutput canon_output(&encoded);
    AppendUTF8Value(test_case.input, &canon_output);
    canon_output.Complete();
    EXPECT_EQ(test_case.output, encoded);
  }
}
149 
// Appending a value that is too large to be a code point must trip a DCHECK.
TEST(URLCanonTest, DoAppendUTF8Invalid) {
  std::string sink;
  StdStringCanonOutput canon_output(&sink);
  // 0x110000 is one past the largest value accepted in DoAppendUTF8 above.
  EXPECT_DCHECK_DEATH({
    AppendUTF8Value(0x110000, &canon_output);
    canon_output.Complete();
  });
}
159 
TEST(URLCanonTest,UTF)160 TEST(URLCanonTest, UTF) {
161   // Low-level test that we handle reading, canonicalization, and writing
162   // UTF-8/UTF-16 strings properly.
163   struct UTFCase {
164     const char* input8;
165     const wchar_t* input16;
166     bool expected_success;
167     const char* output;
168   } utf_cases[] = {
169       // Valid canonical input should get passed through & escaped.
170       {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"},
171       // Test a character that takes > 16 bits (U+10300 = old italic letter A)
172       {"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"},
173       // Non-shortest-form UTF-8 characters are invalid. The bad bytes should
174       // each be replaced with the invalid character (EF BF DB in UTF-8).
175       {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", nullptr, false,
176        "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%E5%A5%BD"},
177       // Invalid UTF-8 sequences should be marked as invalid (the first
178       // sequence is truncated).
179       {"\xe4\xa0\xe5\xa5\xbd", L"\xd800\x597d", false, "%EF%BF%BD%E5%A5%BD"},
180       // Character going off the end.
181       {"\xe4\xbd\xa0\xe5\xa5", L"\x4f60\xd800", false, "%E4%BD%A0%EF%BF%BD"},
182       // ...same with low surrogates with no high surrogate.
183       {nullptr, L"\xdc00", false, "%EF%BF%BD"},
184       // Test a UTF-8 encoded surrogate value is marked as invalid.
185       // ED A0 80 = U+D800
186       {"\xed\xa0\x80", nullptr, false, "%EF%BF%BD%EF%BF%BD%EF%BF%BD"},
187       // ...even when paired.
188       {"\xed\xa0\x80\xed\xb0\x80", nullptr, false,
189        "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD"},
190   };
191 
192   std::string out_str;
193   for (const auto& utf_case : utf_cases) {
194     if (utf_case.input8) {
195       out_str.clear();
196       StdStringCanonOutput output(&out_str);
197 
198       size_t input_len = strlen(utf_case.input8);
199       bool success = true;
200       for (size_t ch = 0; ch < input_len; ch++) {
201         success &=
202             AppendUTF8EscapedChar(utf_case.input8, &ch, input_len, &output);
203       }
204       output.Complete();
205       EXPECT_EQ(utf_case.expected_success, success);
206       EXPECT_EQ(utf_case.output, out_str);
207     }
208     if (utf_case.input16) {
209       out_str.clear();
210       StdStringCanonOutput output(&out_str);
211 
212       std::u16string input_str(
213           test_utils::TruncateWStringToUTF16(utf_case.input16));
214       size_t input_len = input_str.length();
215       bool success = true;
216       for (size_t ch = 0; ch < input_len; ch++) {
217         success &= AppendUTF8EscapedChar(input_str.c_str(), &ch, input_len,
218                                          &output);
219       }
220       output.Complete();
221       EXPECT_EQ(utf_case.expected_success, success);
222       EXPECT_EQ(utf_case.output, out_str);
223     }
224 
225     if (utf_case.input8 && utf_case.input16 && utf_case.expected_success) {
226       // Check that the UTF-8 and UTF-16 inputs are equivalent.
227 
228       // UTF-16 -> UTF-8
229       std::string input8_str(utf_case.input8);
230       std::u16string input16_str(
231           test_utils::TruncateWStringToUTF16(utf_case.input16));
232       EXPECT_EQ(input8_str, base::UTF16ToUTF8(input16_str));
233 
234       // UTF-8 -> UTF-16
235       EXPECT_EQ(input16_str, base::UTF8ToUTF16(input8_str));
236     }
237   }
238 }
239 
TEST(URLCanonTest,Scheme)240 TEST(URLCanonTest, Scheme) {
241   // Here, we're mostly testing that unusual characters are handled properly.
242   // The canonicalizer doesn't do any parsing or whitespace detection. It will
243   // also do its best on error, and will escape funny sequences (these won't be
244   // valid schemes and it will return error).
245   //
246   // Note that the canonicalizer will append a colon to the output to separate
247   // out the rest of the URL, which is not present in the input. We check,
248   // however, that the output range includes everything but the colon.
249   ComponentCase scheme_cases[] = {
250     {"http", "http:", Component(0, 4), true},
251     {"HTTP", "http:", Component(0, 4), true},
252     {" HTTP ", "%20http%20:", Component(0, 10), false},
253     {"htt: ", "htt%3A%20:", Component(0, 9), false},
254     {"\xe4\xbd\xa0\xe5\xa5\xbdhttp", "%E4%BD%A0%E5%A5%BDhttp:", Component(0, 22), false},
255       // Don't re-escape something already escaped. Note that it will
256       // "canonicalize" the 'A' to 'a', but that's OK.
257     {"ht%3Atp", "ht%3atp:", Component(0, 7), false},
258     {"", ":", Component(0, 0), false},
259   };
260 
261   std::string out_str;
262 
263   for (const auto& scheme_case : scheme_cases) {
264     int url_len = static_cast<int>(strlen(scheme_case.input));
265     Component in_comp(0, url_len);
266     Component out_comp;
267 
268     out_str.clear();
269     StdStringCanonOutput output1(&out_str);
270     bool success =
271         CanonicalizeScheme(scheme_case.input, in_comp, &output1, &out_comp);
272     output1.Complete();
273 
274     EXPECT_EQ(scheme_case.expected_success, success);
275     EXPECT_EQ(scheme_case.expected, out_str);
276     EXPECT_EQ(scheme_case.expected_component.begin, out_comp.begin);
277     EXPECT_EQ(scheme_case.expected_component.len, out_comp.len);
278 
279     // Now try the wide version.
280     out_str.clear();
281     StdStringCanonOutput output2(&out_str);
282 
283     std::u16string wide_input(base::UTF8ToUTF16(scheme_case.input));
284     in_comp.len = static_cast<int>(wide_input.length());
285     success = CanonicalizeScheme(wide_input.c_str(), in_comp, &output2,
286                                  &out_comp);
287     output2.Complete();
288 
289     EXPECT_EQ(scheme_case.expected_success, success);
290     EXPECT_EQ(scheme_case.expected, out_str);
291     EXPECT_EQ(scheme_case.expected_component.begin, out_comp.begin);
292     EXPECT_EQ(scheme_case.expected_component.len, out_comp.len);
293   }
294 
295   // Test the case where the scheme is declared nonexistent, it should be
296   // converted into an empty scheme.
297   Component out_comp;
298   out_str.clear();
299   StdStringCanonOutput output(&out_str);
300 
301   EXPECT_FALSE(CanonicalizeScheme("", Component(0, -1), &output, &out_comp));
302   output.Complete();
303 
304   EXPECT_EQ(":", out_str);
305   EXPECT_EQ(0, out_comp.begin);
306   EXPECT_EQ(0, out_comp.len);
307 }
308 
// Selects the IDNA mode the CanonHost tests run under.
enum class IDNAMode {
  kTransitional,
  kNonTransitional,
};
311 
312 class URLCanonHostTest
313     : public ::testing::Test,
314       public ::testing::WithParamInterface<IDNAMode> {
315  public:
URLCanonHostTest()316   URLCanonHostTest() {
317     if (GetParam() == IDNAMode::kNonTransitional) {
318       scoped_feature_list_.InitAndEnableFeature(kUseIDNA2008NonTransitional);
319     } else {
320       scoped_feature_list_.InitAndDisableFeature(kUseIDNA2008NonTransitional);
321     }
322   }
323 
324  private:
325   base::test::ScopedFeatureList scoped_feature_list_;
326 };
327 
328 INSTANTIATE_TEST_SUITE_P(All,
329                          URLCanonHostTest,
330                          ::testing::Values(IDNAMode::kTransitional,
331                                            IDNAMode::kNonTransitional));
332 
TEST_P(URLCanonHostTest,Host)333 TEST_P(URLCanonHostTest, Host) {
334   bool use_idna_non_transitional = IsUsingIDNA2008NonTransitional();
335 
336   // clang-format off
337   IPAddressCase host_cases[] = {
338       // Basic canonicalization, uppercase should be converted to lowercase.
339       {"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", Component(0, 10),
340        CanonHostInfo::NEUTRAL, -1, ""},
341       // TODO(https://crbug.com/1416013): Update the test after SPACE is
342       // correctly handled.
343       {"Goo%20 goo.com", L"Goo%20 goo.com", "goo%20%20goo.com",
344        Component(0, 16), CanonHostInfo::NEUTRAL, -1, ""},
345       // TODO(https://crbug.com/1416013): Update the test after ASTERISK is
346       // correctly handled.
347       {"Goo%2a*goo.com", L"Goo%2a*goo.com", "goo%2A%2Agoo.com",
348        Component(0, 16), CanonHostInfo::NEUTRAL, -1, ""},
349       // Exciting different types of spaces!
350       {nullptr, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", Component(0, 16),
351        CanonHostInfo::NEUTRAL, -1, ""},
352       // Other types of space (no-break, zero-width, zero-width-no-break) are
353       // name-prepped away to nothing.
354       {nullptr, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", Component(0, 10),
355        CanonHostInfo::NEUTRAL, -1, ""},
356       // Ideographic full stop (full-width period for Chinese, etc.) should be
357       // treated as a dot.
358       {nullptr,
359        L"www.foo\x3002"
360        L"bar.com",
361        "www.foo.bar.com", Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""},
362       // Invalid unicode characters should fail...
363       {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%B7%90zyx.com",
364        Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
365       // ...This is the same as the previous case but with the character escaped.
366       {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%B7%90zyx.com",
367        Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
368       // Test name prepping, fullwidth input should be converted to ASCII and
369       // NOT
370       // IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16.
371       {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com",
372        Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""},
373       // Test that fullwidth escaped values are properly name-prepped,
374       // then converted or rejected.
375       // ...%41 in fullwidth = 'A' (also as escaped UTF-8 input)
376       {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com",
377        "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
378       {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com",
379        "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
380       // ...%00 in fullwidth should fail (also as escaped UTF-8 input)
381       {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com",
382        "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
383       {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com",
384        "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
385       // ICU will convert weird percents into ASCII percents, but not unescape
386       // further. A weird percent is U+FE6A (EF B9 AA in UTF-8) which is a
387       // "small percent". At this point we should be within our rights to mark
388       // anything as invalid since the URL is corrupt or malicious. The code
389       // happens to allow ASCII characters (%41 = "A" -> 'a') to be unescaped
390       // and kept as valid, so we validate that behavior here, but this level
391       // of fixing the input shouldn't be seen as required. "%81" is invalid.
392       {"\xef\xb9\xaa"
393        "41.com",
394        L"\xfe6a"
395        L"41.com",
396        "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
397       {"%ef%b9%aa"
398        "41.com",
399        L"\xfe6a"
400        L"41.com",
401        "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
402       {"\xef\xb9\xaa"
403        "81.com",
404        L"\xfe6a"
405        L"81.com",
406        "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
407       {"%ef%b9%aa"
408        "81.com",
409        L"\xfe6a"
410        L"81.com",
411        "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
412       // Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN
413       {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd",
414        L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", Component(0, 14),
415        CanonHostInfo::NEUTRAL, -1, ""},
416       // See http://unicode.org/cldr/utility/idna.jsp for other
417       // examples/experiments and http://goo.gl/7yG11o
418       // for the full list of characters handled differently by
419       // IDNA 2003, UTS 46 (http://unicode.org/reports/tr46/ ) and IDNA 2008.
420 
421       // 4 Deviation characters are mapped/ignored in UTS 46 transitional
422       // mechanism. UTS 46, table 4 row (g).
423       // Sharp-s is mapped to 'ss' in IDNA 2003, not in IDNA 2008 or UTS 46
424       // after transitional period.
425       // Previously, it'd be "fussball.de".
426       {"fu\xc3\x9f"
427        "ball.de",
428        L"fu\x00df"
429        L"ball.de",
430        use_idna_non_transitional ? "xn--fuball-cta.de" : "fussball.de",
431        use_idna_non_transitional ? Component(0, 17) : Component(0, 11),
432        CanonHostInfo::NEUTRAL, -1, ""},
433 
434       // Final-sigma (U+03C2) was mapped to regular sigma (U+03C3).
435       // Previously, it'd be "xn--wxaikc9b".
436       {"\xcf\x83\xcf\x8c\xce\xbb\xce\xbf\xcf\x82", L"\x3c3\x3cc\x3bb\x3bf\x3c2",
437        use_idna_non_transitional ? "xn--wxaijb9b" : "xn--wxaikc6b",
438        Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""},
439 
440       // ZWNJ (U+200C) and ZWJ (U+200D) are mapped away in UTS 46 transitional
441       // handling as well as in IDNA 2003, but not thereafter.
442       {"a\xe2\x80\x8c"
443        "b\xe2\x80\x8d"
444        "c",
445        L"a\x200c"
446        L"b\x200d"
447        L"c",
448        use_idna_non_transitional ? "xn--abc-9m0ag" : "abc",
449        use_idna_non_transitional ? Component(0, 13) : Component(0, 3),
450        CanonHostInfo::NEUTRAL, -1, ""},
451 
452       // ZWJ between Devanagari characters was still mapped away in UTS 46
453       // transitional handling. IDNA 2008 gives xn--11bo0mv54g.
454       // Previously "xn--11bo0m".
455       {"\xe0\xa4\x95\xe0\xa5\x8d\xe2\x80\x8d\xe0\xa4\x9c",
456        L"\x915\x94d\x200d\x91c",
457        use_idna_non_transitional ? "xn--11bo0mv54g" : "xn--11bo0m",
458        use_idna_non_transitional ? Component(0, 14) : Component(0, 10),
459        CanonHostInfo::NEUTRAL, -1, ""},
460 
461       // Fullwidth exclamation mark is disallowed. UTS 46, table 4, row (b)
462       // However, we do allow this at the moment because we don't use
463       // STD3 rules and canonicalize full-width ASCII to ASCII.
464       {"wow\xef\xbc\x81", L"wow\xff01", "wow!", Component(0, 4),
465        CanonHostInfo::NEUTRAL, -1, ""},
466       // U+2132 (turned capital F) is disallowed. UTS 46, table 4, row (c)
467       // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
468       {"\xe2\x84\xb2oo", L"\x2132oo", "%E2%84%B2oo", Component(0, 11),
469        CanonHostInfo::BROKEN, -1, ""},
470       // U+2F868 (CJK Comp) is disallowed. UTS 46, table 4, row (d)
471       // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
472       {"\xf0\xaf\xa1\xa8\xe5\xa7\xbb.cn", L"\xd87e\xdc68\x59fb.cn",
473        "%F0%AF%A1%A8%E5%A7%BB.cn", Component(0, 24), CanonHostInfo::BROKEN, -1,
474        ""},
475       // Maps uppercase letters to lower case letters. UTS 46 table 4 row (e)
476       {"M\xc3\x9cNCHEN", L"M\xdcNCHEN", "xn--mnchen-3ya", Component(0, 14),
477        CanonHostInfo::NEUTRAL, -1, ""},
478       // An already-IDNA host is not modified.
479       {"xn--mnchen-3ya", L"xn--mnchen-3ya", "xn--mnchen-3ya", Component(0, 14),
480        CanonHostInfo::NEUTRAL, -1, ""},
481       // Symbol/punctuations are allowed in IDNA 2003/UTS46.
482       // Not allowed in IDNA 2008. UTS 46 table 4 row (f).
483       {"\xe2\x99\xa5ny.us", L"\x2665ny.us", "xn--ny-s0x.us", Component(0, 13),
484        CanonHostInfo::NEUTRAL, -1, ""},
485       // U+11013 is new in Unicode 6.0 and is allowed. UTS 46 table 4, row (h)
486       // We used to allow it because we passed through unassigned code points.
487       {"\xf0\x91\x80\x93.com", L"\xd804\xdc13.com", "xn--n00d.com",
488        Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""},
489       // U+0602 is disallowed in UTS46/IDNA 2008. UTS 46 table 4, row(i)
490       // Used to be allowed in IDNA 2003.
491       {"\xd8\x82.eg", L"\x602.eg", "%D8%82.eg", Component(0, 9),
492        CanonHostInfo::BROKEN, -1, ""},
493       // U+20B7 is new in Unicode 5.2 (not a part of IDNA 2003 based
494       // on Unicode 3.2). We did allow it in the past because we let unassigned
495       // code point pass. We continue to allow it even though it's a
496       // "punctuation and symbol" blocked in IDNA 2008.
497       // UTS 46 table 4, row (j)
498       {"\xe2\x82\xb7.com", L"\x20b7.com", "xn--wzg.com", Component(0, 11),
499        CanonHostInfo::NEUTRAL, -1, ""},
500       // Maps uppercase letters to lower case letters.
501       // In IDNA 2003, it's allowed without case-folding
502       // ( xn--bc-7cb.com ) because it's not defined in Unicode 3.2
503       // (added in Unicode 4.1). UTS 46 table 4 row (k)
504       {"bc\xc8\xba.com", L"bc\x23a.com", "xn--bc-is1a.com", Component(0, 15),
505        CanonHostInfo::NEUTRAL, -1, ""},
506       // Maps U+FF43 (Full Width Small Letter C) to 'c'.
507       {"ab\xef\xbd\x83.xyz", L"ab\xff43.xyz", "abc.xyz", Component(0, 7),
508        CanonHostInfo::NEUTRAL, -1, ""},
509       // Maps U+1D68C (Math Monospace Small C) to 'c'.
510       // U+1D68C = \xD835\xDE8C in UTF-16
511       {"ab\xf0\x9d\x9a\x8c.xyz", L"ab\xd835\xde8c.xyz", "abc.xyz",
512        Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""},
513       // BiDi check test
514       // "Divehi" in Divehi (Thaana script) ends with BidiClass=NSM.
515       // Disallowed in IDNA 2003 but now allowed in UTS 46/IDNA 2008.
516       {"\xde\x8b\xde\xa8\xde\x88\xde\xac\xde\x80\xde\xa8",
517        L"\x78b\x7a8\x788\x7ac\x780\x7a8", "xn--hqbpi0jcw", Component(0, 13),
518        CanonHostInfo::NEUTRAL, -1, ""},
519       // Disallowed in both IDNA 2003 and 2008 with BiDi check.
520       // Labels starting with a RTL character cannot end with a LTR character.
521       {"\xd8\xac\xd8\xa7\xd8\xb1xyz", L"\x62c\x627\x631xyz",
522        "%D8%AC%D8%A7%D8%B1xyz", Component(0, 21), CanonHostInfo::BROKEN, -1,
523        ""},
524       // Labels starting with a RTL character can end with BC=EN (European
525       // number). Disallowed in IDNA 2003 but now allowed.
526       {"\xd8\xac\xd8\xa7\xd8\xb1"
527        "2",
528        L"\x62c\x627\x631"
529        L"2",
530        "xn--2-ymcov", Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
531       // Labels starting with a RTL character cannot have "L" characters
532       // even if it ends with an BC=EN. Disallowed in both IDNA 2003/2008.
533       {"\xd8\xac\xd8\xa7\xd8\xb1xy2", L"\x62c\x627\x631xy2",
534        "%D8%AC%D8%A7%D8%B1xy2", Component(0, 21), CanonHostInfo::BROKEN, -1,
535        ""},
536       // Labels starting with a RTL character can end with BC=AN (Arabic number)
537       // Disallowed in IDNA 2003, but now allowed.
538       {"\xd8\xac\xd8\xa7\xd8\xb1\xd9\xa2", L"\x62c\x627\x631\x662",
539        "xn--mgbjq0r", Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
540       // Labels starting with a RTL character cannot have "L" characters
541       // even if it ends with an BC=AN (Arabic number).
542       // Disallowed in both IDNA 2003/2008.
543       {"\xd8\xac\xd8\xa7\xd8\xb1xy\xd9\xa2", L"\x62c\x627\x631xy\x662",
544        "%D8%AC%D8%A7%D8%B1xy%D9%A2", Component(0, 26), CanonHostInfo::BROKEN,
545        -1, ""},
546       // Labels starting with a RTL character cannot mix BC=EN and BC=AN
547       {"\xd8\xac\xd8\xa7\xd8\xb1xy2\xd9\xa2", L"\x62c\x627\x631xy2\x662",
548        "%D8%AC%D8%A7%D8%B1xy2%D9%A2", Component(0, 27), CanonHostInfo::BROKEN,
549        -1, ""},
550       // As of Unicode 6.2, U+20CF is not assigned. We do not allow it.
551       {"\xe2\x83\x8f.com", L"\x20cf.com", "%E2%83%8F.com", Component(0, 13),
552        CanonHostInfo::BROKEN, -1, ""},
553       // U+0080 is not allowed.
554       {"\xc2\x80.com", L"\x80.com", "%C2%80.com", Component(0, 10),
555        CanonHostInfo::BROKEN, -1, ""},
556       // Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped
558       // UTF-8 (wide case). The output should be equivalent to the true wide
559       // character input above.
560       {"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd",
561        L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba", Component(0, 14),
562        CanonHostInfo::NEUTRAL, -1, ""},
563       // Invalid escaped characters should fail and the percents should be
564       // escaped.
565       {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", Component(0, 10),
566        CanonHostInfo::BROKEN, -1, ""},
567       // If we get an invalid character that has been escaped.
568       {"%25", L"%25", "%25", Component(0, 3), CanonHostInfo::BROKEN, -1, ""},
569       {"hello%00", L"hello%00", "hello%00", Component(0, 8),
570        CanonHostInfo::BROKEN, -1, ""},
571       // Escaped numbers should be treated like IP addresses if they are.
572       {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01",
573        "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
574       {"%30%78%63%30%2e%30%32%35%30.01%2e",
575        L"%30%78%63%30%2e%30%32%35%30.01%2e", "192.168.0.1", Component(0, 11),
576        CanonHostInfo::IPV4, 3, "C0A80001"},
577       // Invalid escaping should trigger the regular host error handling.
578       {"%3g%78%63%30%2e%30%32%35%30%2E.01",
579        L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01",
580        Component(0, 17), CanonHostInfo::BROKEN, -1, ""},
581       // Something that isn't exactly an IP should get treated as a host and
582       // spaces escaped.
583       {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello",
584        Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
585       // Fullwidth and escaped UTF-8 fullwidth should still be treated as IP.
586       // These are "0Xc0.0250.01" in fullwidth.
587       {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%"
588        "8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%"
589        "8E\xef\xbc\x90\xef\xbc\x91",
590        L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10"
591        L"\xff11",
592        "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
593       // Broken IP addresses get marked as such.
594       {"192.168.0.257", L"192.168.0.257", "192.168.0.257", Component(0, 13),
595        CanonHostInfo::BROKEN, -1, ""},
596       {"[google.com]", L"[google.com]", "[google.com]", Component(0, 12),
597        CanonHostInfo::BROKEN, -1, ""},
598       // Cyrillic letter followed by '(' should return punycode for '(' escaped
599       // before punycode string was created. I.e.
600       // if '(' is escaped after punycode is created we would get xn--%28-8tb
601       // (incorrect).
602       {"\xd1\x82(", L"\x0442(", "xn--(-8tb", Component(0, 9),
603        CanonHostInfo::NEUTRAL, -1, ""},
604       // Address with all hexadecimal characters with leading number of 1<<32
605       // or greater and should return NEUTRAL rather than BROKEN if not all
606       // components are numbers.
607       {"12345678912345.de", L"12345678912345.de", "12345678912345.de",
608        Component(0, 17), CanonHostInfo::NEUTRAL, -1, ""},
609       {"1.12345678912345.de", L"1.12345678912345.de", "1.12345678912345.de",
610        Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
611       {"12345678912345.12345678912345.de", L"12345678912345.12345678912345.de",
612        "12345678912345.12345678912345.de", Component(0, 32),
613        CanonHostInfo::NEUTRAL, -1, ""},
614       {"1.2.0xB3A73CE5B59.de", L"1.2.0xB3A73CE5B59.de", "1.2.0xb3a73ce5b59.de",
615        Component(0, 20), CanonHostInfo::NEUTRAL, -1, ""},
616       {"12345678912345.0xde", L"12345678912345.0xde", "12345678912345.0xde",
617        Component(0, 19), CanonHostInfo::BROKEN, -1, ""},
618       // A label that starts with "xn--" but contains non-ASCII characters
619       // should
620       // be an error. Escape the invalid characters.
621       {"xn--m\xc3\xbcnchen", L"xn--m\xfcnchen", "xn--m%C3%BCnchen",
622        Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
623   };
624   // clang-format on
625 
626   // CanonicalizeHost() non-verbose.
627   std::string out_str;
628   for (const auto& host_case : host_cases) {
629     // Narrow version.
630     if (host_case.input8) {
631       int host_len = static_cast<int>(strlen(host_case.input8));
632       Component in_comp(0, host_len);
633       Component out_comp;
634 
635       out_str.clear();
636       StdStringCanonOutput output(&out_str);
637 
638       bool success =
639           CanonicalizeHost(host_case.input8, in_comp, &output, &out_comp);
640       output.Complete();
641 
642       EXPECT_EQ(host_case.expected_family != CanonHostInfo::BROKEN, success)
643           << "for input: " << host_case.input8;
644       EXPECT_EQ(host_case.expected, out_str)
645           << "for input: " << host_case.input8;
646       EXPECT_EQ(host_case.expected_component.begin, out_comp.begin)
647           << "for input: " << host_case.input8;
648       EXPECT_EQ(host_case.expected_component.len, out_comp.len)
649           << "for input: " << host_case.input8;
650     }
651 
652     // Wide version.
653     if (host_case.input16) {
654       std::u16string input16(
655           test_utils::TruncateWStringToUTF16(host_case.input16));
656       int host_len = static_cast<int>(input16.length());
657       Component in_comp(0, host_len);
658       Component out_comp;
659 
660       out_str.clear();
661       StdStringCanonOutput output(&out_str);
662 
663       bool success = CanonicalizeHost(input16.c_str(), in_comp, &output,
664                                       &out_comp);
665       output.Complete();
666 
667       EXPECT_EQ(host_case.expected_family != CanonHostInfo::BROKEN, success);
668       EXPECT_EQ(host_case.expected, out_str);
669       EXPECT_EQ(host_case.expected_component.begin, out_comp.begin);
670       EXPECT_EQ(host_case.expected_component.len, out_comp.len);
671     }
672   }
673 
674   // CanonicalizeHostVerbose()
675   for (const auto& host_case : host_cases) {
676     // Narrow version.
677     if (host_case.input8) {
678       int host_len = static_cast<int>(strlen(host_case.input8));
679       Component in_comp(0, host_len);
680 
681       out_str.clear();
682       StdStringCanonOutput output(&out_str);
683       CanonHostInfo host_info;
684 
685       CanonicalizeHostVerbose(host_case.input8, in_comp, &output, &host_info);
686       output.Complete();
687 
688       EXPECT_EQ(host_case.expected_family, host_info.family);
689       EXPECT_EQ(host_case.expected, out_str);
690       EXPECT_EQ(host_case.expected_component.begin, host_info.out_host.begin);
691       EXPECT_EQ(host_case.expected_component.len, host_info.out_host.len);
692       EXPECT_EQ(
693           host_case.expected_address_hex,
694           base::HexEncode(host_info.address,
695                           static_cast<size_t>(host_info.AddressLength())));
696       if (host_case.expected_family == CanonHostInfo::IPV4) {
697         EXPECT_EQ(host_case.expected_num_ipv4_components,
698                   host_info.num_ipv4_components);
699       }
700     }
701 
702     // Wide version.
703     if (host_case.input16) {
704       std::u16string input16(
705           test_utils::TruncateWStringToUTF16(host_case.input16));
706       int host_len = static_cast<int>(input16.length());
707       Component in_comp(0, host_len);
708 
709       out_str.clear();
710       StdStringCanonOutput output(&out_str);
711       CanonHostInfo host_info;
712 
713       CanonicalizeHostVerbose(input16.c_str(), in_comp, &output, &host_info);
714       output.Complete();
715 
716       EXPECT_EQ(host_case.expected_family, host_info.family);
717       EXPECT_EQ(host_case.expected, out_str);
718       EXPECT_EQ(host_case.expected_component.begin, host_info.out_host.begin);
719       EXPECT_EQ(host_case.expected_component.len, host_info.out_host.len);
720       EXPECT_EQ(
721           host_case.expected_address_hex,
722           base::HexEncode(host_info.address,
723                           static_cast<size_t>(host_info.AddressLength())));
724       if (host_case.expected_family == CanonHostInfo::IPV4) {
725         EXPECT_EQ(host_case.expected_num_ipv4_components,
726                   host_info.num_ipv4_components);
727       }
728     }
729   }
730 }
731 
// Exercises CanonicalizeSpecialHost() with single ASCII punctuation
// characters, split into three groups: characters that are accepted and
// emitted unchanged, characters that make canonicalization fail, and
// characters that are accepted but percent-escaped in the output.
TEST(URLCanonTest, SpecialHostPuncutationChar) {
  // '%' is not tested here. '%' is used for percent-escaping.
  const std::string_view allowed_host_chars[] = {
      "!", "\"", "$", "&", "'", "(", ")", "+", ",",
      "-", ".",  ";", "=", "_", "`", "{", "}", "~",
  };

  const std::string_view forbidden_host_chars[] = {
      "#", "/", ":", "<", ">", "?", "@", "[", "\\", "]", "^", "|",
  };

  // Standard non-compliant characters which are escaped. See
  // https://crbug.com/1416013.
  const struct EscapedCharTestCase {
    std::string_view input;
    std::string_view expected;
  } escaped_host_chars[] = {{" ", "%20"}, {"*", "%2A"}};

  // Allowed characters: success, and the output equals the input.
  for (const std::string_view input : allowed_host_chars) {
    std::string out_str;
    // Component takes int offsets; cast explicitly instead of narrowing
    // size_t implicitly.
    Component in_comp(0, static_cast<int>(input.size()));
    Component out_comp;
    StdStringCanonOutput output(&out_str);
    bool success =
        CanonicalizeSpecialHost(input.data(), in_comp, output, out_comp);
    EXPECT_TRUE(success) << "Input: " << input;
    output.Complete();
    EXPECT_EQ(out_str, input) << "Input: " << input;
  }

  // Forbidden characters: only the failure result is checked, not the output.
  for (const std::string_view input : forbidden_host_chars) {
    std::string out_str;
    Component in_comp(0, static_cast<int>(input.size()));
    Component out_comp;
    StdStringCanonOutput output(&out_str);
    EXPECT_FALSE(
        CanonicalizeSpecialHost(input.data(), in_comp, output, out_comp))
        << "Input: " << input;
  }

  // Escaped characters: success, and the output is the percent-escaped form.
  for (const auto& c : escaped_host_chars) {
    std::string out_str;
    Component in_comp(0, static_cast<int>(c.input.size()));
    Component out_comp;
    StdStringCanonOutput output(&out_str);
    bool success =
        CanonicalizeSpecialHost(c.input.data(), in_comp, output, out_comp);
    EXPECT_TRUE(success) << "Input: " << c.input;
    output.Complete();
    EXPECT_EQ(out_str, c.expected) << "Input: " << c.input;
  }
}
784 
// Verifies that CanonicalizeNonSpecialHost() fails for every forbidden host
// code point listed below, plus an embedded NUL.
TEST(URLCanonTest, ForbiddenHostCodePoint) {
  // Test only CanonicalizeNonSpecialHost.
  // CanonicalizeSpecialHost is not standard compliant yet.
  // See URLCanonTest::SpecialHostPuncutationChar.

  // https://url.spec.whatwg.org/#forbidden-host-code-point
  const std::string_view forbidden_host_chars[] = {
      "\x09", "\x0A", "\x0D", " ", "#",  "/", ":", "<",
      ">",    "?",    "@",    "[", "\\", "]", "^", "|",
  };

  for (const std::string_view input : forbidden_host_chars) {
    std::string out_str;
    Component in_comp(0, static_cast<int>(input.size()));
    Component out_comp;
    StdStringCanonOutput output(&out_str);
    EXPECT_FALSE(
        CanonicalizeNonSpecialHost(input.data(), in_comp, output, out_comp))
        << "Input: " << input;
  }

  // Test NUL manually: a std::string_view built from a string literal stops
  // at the first NUL, so this case cannot live in the table above.
  const char host_with_null[] = "a\0b";
  std::string out_str;
  // Cover all three characters ('a', NUL, 'b'); sizeof also counts the
  // implicit terminator, hence the "- 1".
  Component in_comp(0, static_cast<int>(sizeof(host_with_null) - 1));
  Component out_comp;
  StdStringCanonOutput output(&out_str);
  EXPECT_FALSE(
      CanonicalizeNonSpecialHost(host_with_null, in_comp, output, out_comp));
}
815 
TEST(URLCanonTest,IPv4)816 TEST(URLCanonTest, IPv4) {
817   // clang-format off
818   IPAddressCase cases[] = {
819     // Empty is not an IP address.
820     {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
821     {".", L".", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
822     // Regular IP addresses in different bases.
823     {"192.168.0.1", L"192.168.0.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
824     {"0300.0250.00.01", L"0300.0250.00.01", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
825     {"0xC0.0Xa8.0x0.0x1", L"0xC0.0Xa8.0x0.0x1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
826     // Non-IP addresses due to invalid characters.
827     {"192.168.9.com", L"192.168.9.com", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
828     // Hostnames with a numeric final component but other components that don't
829     // parse as numbers should be considered broken.
830     {"19a.168.0.1", L"19a.168.0.1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
831     {"19a.168.0.1.", L"19a.168.0.1.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
832     {"0308.0250.00.01", L"0308.0250.00.01", "", Component(), CanonHostInfo::BROKEN, -1, ""},
833     {"0308.0250.00.01.", L"0308.0250.00.01.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
834     {"0xCG.0xA8.0x0.0x1", L"0xCG.0xA8.0x0.0x1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
835     {"0xCG.0xA8.0x0.0x1.", L"0xCG.0xA8.0x0.0x1.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
836     // Non-numeric terminal compeonent should be considered not IPv4 hostnames, but valid.
837     {"19.168.0.1a", L"19.168.0.1a", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
838     {"0xC.0xA8.0x0.0x1G", L"0xC.0xA8.0x0.0x1G", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
839     // Hostnames that would be considered broken IPv4 hostnames should be considered valid non-IPv4 hostnames if they end with two dots instead of 0 or 1.
840     {"19a.168.0.1..", L"19a.168.0.1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
841     {"0308.0250.00.01..", L"0308.0250.00.01..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
842     {"0xCG.0xA8.0x0.0x1..", L"0xCG.0xA8.0x0.0x1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
843     // Hosts with components that aren't considered valid IPv4 numbers but are entirely numeric should be considered invalid.
844     {"1.2.3.08", L"1.2.3.08", "", Component(), CanonHostInfo::BROKEN, -1, ""},
845     {"1.2.3.08.", L"1.2.3.08.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
846     // If there are not enough components, the last one should fill them out.
847     {"192", L"192", "0.0.0.192", Component(0, 9), CanonHostInfo::IPV4, 1, "000000C0"},
848     {"0xC0a80001", L"0xC0a80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
849     {"030052000001", L"030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
850     {"000030052000001", L"000030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
851     {"192.168", L"192.168", "192.0.0.168", Component(0, 11), CanonHostInfo::IPV4, 2, "C00000A8"},
852     {"192.0x00A80001", L"192.0x000A80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
853     {"0xc0.052000001", L"0xc0.052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
854     {"192.168.1", L"192.168.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
855     // Hostnames with too many components, but a numeric final numeric component are invalid.
856     {"192.168.0.0.1", L"192.168.0.0.1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
857     // We allow a single trailing dot.
858     {"192.168.0.1.", L"192.168.0.1.", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
859     {"192.168.0.1. hello", L"192.168.0.1. hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
860     {"192.168.0.1..", L"192.168.0.1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
861     // Hosts with two dots in a row with a final numeric component are considered invalid.
862     {"192.168..1", L"192.168..1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
863     {"192.168..1.", L"192.168..1.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
864     // Any numerical overflow should be marked as BROKEN.
865     {"0x100.0", L"0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
866     {"0x100.0.0", L"0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
867     {"0x100.0.0.0", L"0x100.0.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
868     {"0.0x100.0.0", L"0.0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
869     {"0.0.0x100.0", L"0.0.0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
870     {"0.0.0.0x100", L"0.0.0.0x100", "", Component(), CanonHostInfo::BROKEN, -1, ""},
871     {"0.0.0x10000", L"0.0.0x10000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
872     {"0.0x1000000", L"0.0x1000000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
873     {"0x100000000", L"0x100000000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
874     // Repeat the previous tests, minus 1, to verify boundaries.
875     {"0xFF.0", L"0xFF.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 2, "FF000000"},
876     {"0xFF.0.0", L"0xFF.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 3, "FF000000"},
877     {"0xFF.0.0.0", L"0xFF.0.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "FF000000"},
878     {"0.0xFF.0.0", L"0.0xFF.0.0", "0.255.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "00FF0000"},
879     {"0.0.0xFF.0", L"0.0.0xFF.0", "0.0.255.0", Component(0, 9), CanonHostInfo::IPV4, 4, "0000FF00"},
880     {"0.0.0.0xFF", L"0.0.0.0xFF", "0.0.0.255", Component(0, 9), CanonHostInfo::IPV4, 4, "000000FF"},
881     {"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", Component(0, 11), CanonHostInfo::IPV4, 3, "0000FFFF"},
882     {"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", Component(0, 13), CanonHostInfo::IPV4, 2, "00FFFFFF"},
883     {"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", Component(0, 15), CanonHostInfo::IPV4, 1, "FFFFFFFF"},
884     // Old trunctations tests. They're all "BROKEN" now.
885     {"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", Component(), CanonHostInfo::BROKEN, -1, ""},
886     {"192.168.0.257", L"192.168.0.257", "", Component(), CanonHostInfo::BROKEN, -1, ""},
887     {"192.168.0xa20001", L"192.168.0xa20001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
888     {"192.015052000001", L"192.015052000001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
889     {"0X12C0a80001", L"0X12C0a80001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
890     {"276.1.2", L"276.1.2", "", Component(), CanonHostInfo::BROKEN, -1, ""},
891     // Too many components should be rejected, in valid ranges or not.
892     {"255.255.255.255.255", L"255.255.255.255.255", "", Component(), CanonHostInfo::BROKEN, -1, ""},
893     {"256.256.256.256.256", L"256.256.256.256.256", "", Component(), CanonHostInfo::BROKEN, -1, ""},
894     // Spaces should be rejected.
895     {"192.168.0.1 hello", L"192.168.0.1 hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
896     // Very large numbers.
897     {"0000000000000300.0x00000000000000fF.00000000000000001", L"0000000000000300.0x00000000000000fF.00000000000000001", "192.255.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0FF0001"},
898     {"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", L"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", "", Component(0, 11), CanonHostInfo::BROKEN, -1, ""},
899     // A number has no length limit, but long numbers can still overflow.
900     {"00000000000000000001", L"00000000000000000001", "0.0.0.1", Component(0, 7), CanonHostInfo::IPV4, 1, "00000001"},
901     {"0000000000000000100000000000000001", L"0000000000000000100000000000000001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
902     // If a long component is non-numeric, it's a hostname, *not* a broken IP.
903     {"0.0.0.000000000000000000z", L"0.0.0.000000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
904     {"0.0.0.100000000000000000z", L"0.0.0.100000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
905     // Truncation of all zeros should still result in 0.
906     {"0.00.0x.0x0", L"0.00.0x.0x0", "0.0.0.0", Component(0, 7), CanonHostInfo::IPV4, 4, "00000000"},
907     // Non-ASCII characters in final component should return NEUTRAL.
908     {"1.2.3.\xF0\x9F\x92\xA9", L"1.2.3.\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
909     {"1.2.3.4\xF0\x9F\x92\xA9", L"1.2.3.4\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
910     {"1.2.3.0x\xF0\x9F\x92\xA9", L"1.2.3.0x\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
911     {"1.2.3.0\xF0\x9F\x92\xA9", L"1.2.3.0\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
912     // Non-ASCII characters in other components should result in broken IPs when final component is numeric.
913     {"1.2.\xF0\x9F\x92\xA9.4", L"1.2.\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
914     {"1.2.3\xF0\x9F\x92\xA9.4", L"1.2.3\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
915     {"1.2.0x\xF0\x9F\x92\xA9.4", L"1.2.0x\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
916     {"1.2.0\xF0\x9F\x92\xA9.4", L"1.2.0\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
917     {"\xF0\x9F\x92\xA9.2.3.4", L"\xD83D\xDCA9.2.3.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
918   };
919   // clang-format on
920 
921   for (const auto& test_case : cases) {
922     SCOPED_TRACE(test_case.input8);
923 
924     // 8-bit version.
925     Component component(0, static_cast<int>(strlen(test_case.input8)));
926 
927     std::string out_str1;
928     StdStringCanonOutput output1(&out_str1);
929     CanonHostInfo host_info;
930     CanonicalizeIPAddress(test_case.input8, component, &output1, &host_info);
931     output1.Complete();
932 
933     EXPECT_EQ(test_case.expected_family, host_info.family);
934     EXPECT_EQ(test_case.expected_address_hex,
935               base::HexEncode(host_info.address,
936                               static_cast<size_t>(host_info.AddressLength())));
937     if (host_info.family == CanonHostInfo::IPV4) {
938       EXPECT_STREQ(test_case.expected, out_str1.c_str());
939       EXPECT_EQ(test_case.expected_component.begin, host_info.out_host.begin);
940       EXPECT_EQ(test_case.expected_component.len, host_info.out_host.len);
941       EXPECT_EQ(test_case.expected_num_ipv4_components,
942                 host_info.num_ipv4_components);
943     }
944 
945     // 16-bit version.
946     std::u16string input16(
947         test_utils::TruncateWStringToUTF16(test_case.input16));
948     component = Component(0, static_cast<int>(input16.length()));
949 
950     std::string out_str2;
951     StdStringCanonOutput output2(&out_str2);
952     CanonicalizeIPAddress(input16.c_str(), component, &output2, &host_info);
953     output2.Complete();
954 
955     EXPECT_EQ(test_case.expected_family, host_info.family);
956     EXPECT_EQ(test_case.expected_address_hex,
957               base::HexEncode(host_info.address,
958                               static_cast<size_t>(host_info.AddressLength())));
959     if (host_info.family == CanonHostInfo::IPV4) {
960       EXPECT_STREQ(test_case.expected, out_str2.c_str());
961       EXPECT_EQ(test_case.expected_component.begin, host_info.out_host.begin);
962       EXPECT_EQ(test_case.expected_component.len, host_info.out_host.len);
963       EXPECT_EQ(test_case.expected_num_ipv4_components,
964                 host_info.num_ipv4_components);
965     }
966   }
967 }
968 
TEST(URLCanonTest,IPv6)969 TEST(URLCanonTest, IPv6) {
970   IPAddressCase cases[] = {
971       // Empty is not an IP address.
972       {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
973       // Non-IPs with [:] characters are marked BROKEN.
974       {":", L":", "", Component(), CanonHostInfo::BROKEN, -1, ""},
975       {"[", L"[", "", Component(), CanonHostInfo::BROKEN, -1, ""},
976       {"[:", L"[:", "", Component(), CanonHostInfo::BROKEN, -1, ""},
977       {"]", L"]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
978       {":]", L":]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
979       {"[]", L"[]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
980       {"[:]", L"[:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
981       // Regular IP address is invalid without bounding '[' and ']'.
982       {"2001:db8::1", L"2001:db8::1", "", Component(), CanonHostInfo::BROKEN,
983        -1, ""},
984       {"[2001:db8::1", L"[2001:db8::1", "", Component(), CanonHostInfo::BROKEN,
985        -1, ""},
986       {"2001:db8::1]", L"2001:db8::1]", "", Component(), CanonHostInfo::BROKEN,
987        -1, ""},
988       // Regular IP addresses.
989       {"[::]", L"[::]", "[::]", Component(0, 4), CanonHostInfo::IPV6, -1,
990        "00000000000000000000000000000000"},
991       {"[::1]", L"[::1]", "[::1]", Component(0, 5), CanonHostInfo::IPV6, -1,
992        "00000000000000000000000000000001"},
993       {"[1::]", L"[1::]", "[1::]", Component(0, 5), CanonHostInfo::IPV6, -1,
994        "00010000000000000000000000000000"},
995 
996       // Leading zeros should be stripped.
997       {"[000:01:02:003:004:5:6:007]", L"[000:01:02:003:004:5:6:007]",
998        "[0:1:2:3:4:5:6:7]", Component(0, 17), CanonHostInfo::IPV6, -1,
999        "00000001000200030004000500060007"},
1000 
1001       // Upper case letters should be lowercased.
1002       {"[A:b:c:DE:fF:0:1:aC]", L"[A:b:c:DE:fF:0:1:aC]", "[a:b:c:de:ff:0:1:ac]",
1003        Component(0, 20), CanonHostInfo::IPV6, -1,
1004        "000A000B000C00DE00FF0000000100AC"},
1005 
1006       // The same address can be written with different contractions, but should
1007       // get canonicalized to the same thing.
1008       {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", Component(0, 14),
1009        CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},
1010       {"[1::2:0:0:3:0]", L"[1::2:0:0:3:0]", "[1::2:0:0:3:0]", Component(0, 14),
1011        CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},
1012 
1013       // Addresses with embedded IPv4.
1014       {"[::192.168.0.1]", L"[::192.168.0.1]", "[::c0a8:1]", Component(0, 10),
1015        CanonHostInfo::IPV6, -1, "000000000000000000000000C0A80001"},
1016       {"[::ffff:192.168.0.1]", L"[::ffff:192.168.0.1]", "[::ffff:c0a8:1]",
1017        Component(0, 15), CanonHostInfo::IPV6, -1,
1018        "00000000000000000000FFFFC0A80001"},
1019       {"[::eeee:192.168.0.1]", L"[::eeee:192.168.0.1]", "[::eeee:c0a8:1]",
1020        Component(0, 15), CanonHostInfo::IPV6, -1,
1021        "00000000000000000000EEEEC0A80001"},
1022       {"[2001::192.168.0.1]", L"[2001::192.168.0.1]", "[2001::c0a8:1]",
1023        Component(0, 14), CanonHostInfo::IPV6, -1,
1024        "200100000000000000000000C0A80001"},
1025       {"[1:2:192.168.0.1:5:6]", L"[1:2:192.168.0.1:5:6]", "", Component(),
1026        CanonHostInfo::BROKEN, -1, ""},
1027 
1028       // IPv4 embedded IPv6 addresses
1029       {"[::ffff:192.1.2]", L"[::ffff:192.1.2]", "[::ffff:c001:2]", Component(),
1030        CanonHostInfo::BROKEN, -1, ""},
1031       {"[::ffff:192.1]", L"[::ffff:192.1]", "[::ffff:c000:1]", Component(),
1032        CanonHostInfo::BROKEN, -1, ""},
1033       {"[::ffff:192.1.2.3.4]", L"[::ffff:192.1.2.3.4]", "", Component(),
1034        CanonHostInfo::BROKEN, -1, ""},
1035 
1036       // IPv4 using hex.
1037       // TODO(eroman): Should this format be disallowed?
1038       {"[::ffff:0xC0.0Xa8.0x0.0x1]", L"[::ffff:0xC0.0Xa8.0x0.0x1]",
1039        "[::ffff:c0a8:1]", Component(0, 15), CanonHostInfo::IPV6, -1,
1040        "00000000000000000000FFFFC0A80001"},
1041 
1042       // There may be zeros surrounding the "::" contraction.
1043       {"[0:0::0:0:8]", L"[0:0::0:0:8]", "[::8]", Component(0, 5),
1044        CanonHostInfo::IPV6, -1, "00000000000000000000000000000008"},
1045 
1046       {"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", Component(0, 13),
1047        CanonHostInfo::IPV6, -1, "20010DB8000000000000000000000001"},
1048 
1049       // Can only have one "::" contraction in an IPv6 string literal.
1050       {"[2001::db8::1]", L"[2001::db8::1]", "", Component(),
1051        CanonHostInfo::BROKEN, -1, ""},
1052       // No more than 2 consecutive ':'s.
1053       {"[2001:db8:::1]", L"[2001:db8:::1]", "", Component(),
1054        CanonHostInfo::BROKEN, -1, ""},
1055       {"[:::]", L"[:::]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
1056       // Non-IP addresses due to invalid characters.
1057       {"[2001::.com]", L"[2001::.com]", "", Component(), CanonHostInfo::BROKEN,
1058        -1, ""},
1059       // If there are not enough components, the last one should fill them out.
1060       // ... omitted at this time ...
1061       // Too many components means not an IP address. Similarly, with too few
1062       // if using IPv4 compat or mapped addresses.
1063       {"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", Component(),
1064        CanonHostInfo::BROKEN, -1, ""},
1065       {"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", Component(),
1066        CanonHostInfo::BROKEN, -1, ""},
1067       {"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", Component(),
1068        CanonHostInfo::BROKEN, -1, ""},
1069       // Too many bits (even though 8 components, the last one holds 32 bits).
1070       {"[0:0:0:0:0:0:0:192.168.0.1]", L"[0:0:0:0:0:0:0:192.168.0.1]", "",
1071        Component(), CanonHostInfo::BROKEN, -1, ""},
1072 
1073       // Too many bits specified -- the contraction would have to be zero-length
1074       // to not exceed 128 bits.
1075       {"[1:2:3:4:5:6::192.168.0.1]", L"[1:2:3:4:5:6::192.168.0.1]", "",
1076        Component(), CanonHostInfo::BROKEN, -1, ""},
1077 
1078       // The contraction is for 16 bits of zero.
1079       {"[1:2:3:4:5:6::8]", L"[1:2:3:4:5:6::8]", "[1:2:3:4:5:6:0:8]",
1080        Component(0, 17), CanonHostInfo::IPV6, -1,
1081        "00010002000300040005000600000008"},
1082 
1083       // Cannot have a trailing colon.
1084       {"[1:2:3:4:5:6:7:8:]", L"[1:2:3:4:5:6:7:8:]", "", Component(),
1085        CanonHostInfo::BROKEN, -1, ""},
1086       {"[1:2:3:4:5:6:192.168.0.1:]", L"[1:2:3:4:5:6:192.168.0.1:]", "",
1087        Component(), CanonHostInfo::BROKEN, -1, ""},
1088 
1089       // Cannot have negative numbers.
1090       {"[-1:2:3:4:5:6:7:8]", L"[-1:2:3:4:5:6:7:8]", "", Component(),
1091        CanonHostInfo::BROKEN, -1, ""},
1092 
1093       // Scope ID -- the URL may contain an optional ["%" <scope_id>] section.
1094       // The scope_id should be included in the canonicalized URL, and is an
1095       // unsigned decimal number.
1096 
1097       // Invalid because no ID was given after the percent.
1098 
1099       // Don't allow scope-id
1100       {"[1::%1]", L"[1::%1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
1101       {"[1::%eth0]", L"[1::%eth0]", "", Component(), CanonHostInfo::BROKEN, -1,
1102        ""},
1103       {"[1::%]", L"[1::%]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
1104       {"[%]", L"[%]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
1105       {"[::%:]", L"[::%:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
1106 
1107       // Don't allow leading or trailing colons.
1108       {"[:0:0::0:0:8]", L"[:0:0::0:0:8]", "", Component(),
1109        CanonHostInfo::BROKEN, -1, ""},
1110       {"[0:0::0:0:8:]", L"[0:0::0:0:8:]", "", Component(),
1111        CanonHostInfo::BROKEN, -1, ""},
1112       {"[:0:0::0:0:8:]", L"[:0:0::0:0:8:]", "", Component(),
1113        CanonHostInfo::BROKEN, -1, ""},
1114 
1115       // We allow a single trailing dot.
1116       // ... omitted at this time ...
1117       // Two dots in a row means not an IP address.
1118       {"[::192.168..1]", L"[::192.168..1]", "", Component(),
1119        CanonHostInfo::BROKEN, -1, ""},
1120       // Any non-first components get truncated to one byte.
1121       // ... omitted at this time ...
1122       // Spaces should be rejected.
1123       {"[::1 hello]", L"[::1 hello]", "", Component(), CanonHostInfo::BROKEN,
1124        -1, ""},
1125   };
1126 
1127   for (size_t i = 0; i < std::size(cases); i++) {
1128     // 8-bit version.
1129     Component component(0, static_cast<int>(strlen(cases[i].input8)));
1130 
1131     std::string out_str1;
1132     StdStringCanonOutput output1(&out_str1);
1133     CanonHostInfo host_info;
1134     CanonicalizeIPAddress(cases[i].input8, component, &output1, &host_info);
1135     output1.Complete();
1136 
1137     EXPECT_EQ(cases[i].expected_family, host_info.family);
1138     EXPECT_EQ(cases[i].expected_address_hex,
1139               base::HexEncode(host_info.address,
1140                               static_cast<size_t>(host_info.AddressLength())))
1141         << "iter " << i << " host " << cases[i].input8;
1142     if (host_info.family == CanonHostInfo::IPV6) {
1143       EXPECT_STREQ(cases[i].expected, out_str1.c_str());
1144       EXPECT_EQ(cases[i].expected_component.begin,
1145                 host_info.out_host.begin);
1146       EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
1147     }
1148 
1149     // 16-bit version.
1150     std::u16string input16(
1151         test_utils::TruncateWStringToUTF16(cases[i].input16));
1152     component = Component(0, static_cast<int>(input16.length()));
1153 
1154     std::string out_str2;
1155     StdStringCanonOutput output2(&out_str2);
1156     CanonicalizeIPAddress(input16.c_str(), component, &output2, &host_info);
1157     output2.Complete();
1158 
1159     EXPECT_EQ(cases[i].expected_family, host_info.family);
1160     EXPECT_EQ(cases[i].expected_address_hex,
1161               base::HexEncode(host_info.address,
1162                               static_cast<size_t>(host_info.AddressLength())));
1163     if (host_info.family == CanonHostInfo::IPV6) {
1164       EXPECT_STREQ(cases[i].expected, out_str2.c_str());
1165       EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin);
1166       EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
1167     }
1168   }
1169 }
1170 
// Verifies that CanonicalizeIPAddress() does not report an IP address when the
// host component selects nothing, even though the underlying buffer holds a
// valid IPv4 literal.
TEST(URLCanonTest, IPEmpty) {
  std::string out_str1;
  StdStringCanonOutput output1(&out_str1);
  CanonHostInfo host_info;

  // The buffer itself would canonicalize as an IP address; only the component
  // passed alongside it varies.
  const char spec[] = "192.168.0.1";

  // Default-constructed component.
  CanonicalizeIPAddress(spec, Component(), &output1, &host_info);
  EXPECT_FALSE(host_info.IsIPAddress());

  // Explicit zero-length component at offset 0.
  CanonicalizeIPAddress(spec, Component(0, 0), &output1, &host_info);
  EXPECT_FALSE(host_info.IsIPAddress());
}
1184 
1185 // Verifies that CanonicalizeHostSubstring produces the expected output and
1186 // does not "fix" IP addresses. Because this code is a subset of
1187 // CanonicalizeHost, the shared functionality is not tested.
TEST(URLCanonTest,CanonicalizeHostSubstring)1188 TEST(URLCanonTest, CanonicalizeHostSubstring) {
1189   // Basic sanity check.
1190   {
1191     std::string out_str;
1192     StdStringCanonOutput output(&out_str);
1193     EXPECT_TRUE(CanonicalizeHostSubstring("M\xc3\x9cNCHEN.com",
1194                                           Component(0, 12), &output));
1195     output.Complete();
1196     EXPECT_EQ("xn--mnchen-3ya.com", out_str);
1197   }
1198 
1199   // Failure case.
1200   {
1201     std::string out_str;
1202     StdStringCanonOutput output(&out_str);
1203     EXPECT_FALSE(CanonicalizeHostSubstring(
1204         test_utils::TruncateWStringToUTF16(L"\xfdd0zyx.com").c_str(),
1205         Component(0, 8), &output));
1206     output.Complete();
1207     EXPECT_EQ("%EF%B7%90zyx.com", out_str);
1208   }
1209 
1210   // Should return true for empty input strings.
1211   {
1212     std::string out_str;
1213     StdStringCanonOutput output(&out_str);
1214     EXPECT_TRUE(CanonicalizeHostSubstring("", Component(0, 0), &output));
1215     output.Complete();
1216     EXPECT_EQ(std::string(), out_str);
1217   }
1218 
1219   // Numbers that look like IP addresses should not be changed.
1220   {
1221     std::string out_str;
1222     StdStringCanonOutput output(&out_str);
1223     EXPECT_TRUE(
1224         CanonicalizeHostSubstring("01.02.03.04", Component(0, 11), &output));
1225     output.Complete();
1226     EXPECT_EQ("01.02.03.04", out_str);
1227   }
1228 }
1229 
TEST(URLCanonTest,UserInfo)1230 TEST(URLCanonTest, UserInfo) {
1231   // Note that the canonicalizer should escape and treat empty components as
1232   // not being there.
1233 
1234   // We actually parse a full input URL so we can get the initial components.
1235   struct UserComponentCase {
1236     const char* input;
1237     const char* expected;
1238     Component expected_username;
1239     Component expected_password;
1240     bool expected_success;
1241   } user_info_cases[] = {
1242     {"http://user:[email protected]/", "user:pass@", Component(0, 4), Component(5, 4), true},
1243     {"http://@host.com/", "", Component(0, -1), Component(0, -1), true},
1244     {"http://:@host.com/", "", Component(0, -1), Component(0, -1), true},
1245     {"http://foo:@host.com/", "foo@", Component(0, 3), Component(0, -1), true},
1246     {"http://:[email protected]/", ":foo@", Component(0, 0), Component(1, 3), true},
1247     {"http://^ :$\[email protected]/", "%5E%20:$%09@", Component(0, 6), Component(7, 4), true},
1248     {"http://user:pass@/", "user:pass@", Component(0, 4), Component(5, 4), true},
1249     {"http://%2540:[email protected]/", "%2540:bar@", Component(0, 5), Component(6, 3), true },
1250 
1251       // IE7 compatibility: old versions allowed backslashes in usernames, but
1252       // IE7 does not. We disallow it as well.
1253     {"ftp://me\\mydomain:[email protected]/", "", Component(0, -1), Component(0, -1), true},
1254   };
1255 
1256   for (const auto& user_info_case : user_info_cases) {
1257     int url_len = static_cast<int>(strlen(user_info_case.input));
1258     Parsed parsed;
1259     ParseStandardURL(user_info_case.input, url_len, &parsed);
1260     Component out_user, out_pass;
1261     std::string out_str;
1262     StdStringCanonOutput output1(&out_str);
1263 
1264     bool success = CanonicalizeUserInfo(user_info_case.input, parsed.username,
1265                                         user_info_case.input, parsed.password,
1266                                         &output1, &out_user, &out_pass);
1267     output1.Complete();
1268 
1269     EXPECT_EQ(user_info_case.expected_success, success);
1270     EXPECT_EQ(user_info_case.expected, out_str);
1271     EXPECT_EQ(user_info_case.expected_username.begin, out_user.begin);
1272     EXPECT_EQ(user_info_case.expected_username.len, out_user.len);
1273     EXPECT_EQ(user_info_case.expected_password.begin, out_pass.begin);
1274     EXPECT_EQ(user_info_case.expected_password.len, out_pass.len);
1275 
1276     // Now try the wide version
1277     out_str.clear();
1278     StdStringCanonOutput output2(&out_str);
1279     std::u16string wide_input(base::UTF8ToUTF16(user_info_case.input));
1280     success = CanonicalizeUserInfo(wide_input.c_str(),
1281                                    parsed.username,
1282                                    wide_input.c_str(),
1283                                    parsed.password,
1284                                    &output2,
1285                                    &out_user,
1286                                    &out_pass);
1287     output2.Complete();
1288 
1289     EXPECT_EQ(user_info_case.expected_success, success);
1290     EXPECT_EQ(user_info_case.expected, out_str);
1291     EXPECT_EQ(user_info_case.expected_username.begin, out_user.begin);
1292     EXPECT_EQ(user_info_case.expected_username.len, out_user.len);
1293     EXPECT_EQ(user_info_case.expected_password.begin, out_pass.begin);
1294     EXPECT_EQ(user_info_case.expected_password.len, out_pass.len);
1295   }
1296 }
1297 
// Exercises CanonicalizePort() with narrow and wide input.
TEST(URLCanonTest, Port) {
  // Only the emission of the number into the output buffer is verified here;
  // scanning the number is left to the parser unit tests.
  //
  // CanonicalizePort prepends the ':' separator itself, so every non-empty
  // expectation starts with a colon and the expected component begins at 1.
  struct PortCase {
    const char* input;
    int default_port;
    const char* expected;
    Component expected_component;
    bool expected_success;
  } port_cases[] = {
      // Invalid input is expected to be copied (escaped) and flagged as a
      // failure.
      {"as df", 80, ":as%20df", Component(1, 7), false},
      {"-2", 80, ":-2", Component(1, 2), false},
      // A port equal to the default is expected to be dropped.
      {"80", 80, "", Component(0, -1), true},
      {"8080", 80, ":8080", Component(1, 4), true},
      // With PORT_UNSPECIFIED the port is expected to be kept.
      {"80", PORT_UNSPECIFIED, ":80", Component(1, 2), true},
  };

  for (const PortCase& tc : port_cases) {
    const Component port_range(0, static_cast<int>(strlen(tc.input)));
    Component canon_range;
    std::string canon_text;

    // Narrow (8-bit) input.
    StdStringCanonOutput narrow_sink(&canon_text);
    bool ok = CanonicalizePort(tc.input, port_range, tc.default_port,
                               &narrow_sink, &canon_range);
    narrow_sink.Complete();

    EXPECT_EQ(tc.expected_success, ok);
    EXPECT_EQ(tc.expected, canon_text);
    EXPECT_EQ(tc.expected_component.begin, canon_range.begin);
    EXPECT_EQ(tc.expected_component.len, canon_range.len);

    // Wide (16-bit) input, writing into the same (emptied) string.
    canon_text.clear();
    StdStringCanonOutput wide_sink(&canon_text);
    const std::u16string wide_port = base::UTF8ToUTF16(tc.input);
    ok = CanonicalizePort(wide_port.c_str(), port_range, tc.default_port,
                          &wide_sink, &canon_range);
    wide_sink.Complete();

    EXPECT_EQ(tc.expected_success, ok);
    EXPECT_EQ(tc.expected, canon_text);
    EXPECT_EQ(tc.expected_component.begin, canon_range.begin);
    EXPECT_EQ(tc.expected_component.len, canon_range.len);
  }
}
1350 
1351 DualComponentCase kCommonPathCases[] = {
1352     // ----- path collapsing tests -----
1353     {"/././foo", L"/././foo", "/foo", Component(0, 4), true},
1354     {"/./.foo", L"/./.foo", "/.foo", Component(0, 5), true},
1355     {"/foo/.", L"/foo/.", "/foo/", Component(0, 5), true},
1356     {"/foo/./", L"/foo/./", "/foo/", Component(0, 5), true},
1357     // double dots followed by a slash or the end of the string count
1358     {"/foo/bar/..", L"/foo/bar/..", "/foo/", Component(0, 5), true},
1359     {"/foo/bar/../", L"/foo/bar/../", "/foo/", Component(0, 5), true},
1360     // don't count double dots when they aren't followed by a slash
1361     {"/foo/..bar", L"/foo/..bar", "/foo/..bar", Component(0, 10), true},
1362     // some in the middle
1363     {"/foo/bar/../ton", L"/foo/bar/../ton", "/foo/ton", Component(0, 8), true},
1364     {"/foo/bar/../ton/../../a", L"/foo/bar/../ton/../../a", "/a",
1365      Component(0, 2), true},
1366     // we should not be able to go above the root
1367     {"/foo/../../..", L"/foo/../../..", "/", Component(0, 1), true},
1368     {"/foo/../../../ton", L"/foo/../../../ton", "/ton", Component(0, 4), true},
1369     // escaped dots should be unescaped and treated the same as dots
1370     {"/foo/%2e", L"/foo/%2e", "/foo/", Component(0, 5), true},
1371     {"/foo/%2e%2", L"/foo/%2e%2", "/foo/.%2", Component(0, 8), true},
1372     {"/foo/%2e./%2e%2e/.%2e/%2e.bar", L"/foo/%2e./%2e%2e/.%2e/%2e.bar",
1373      "/..bar", Component(0, 6), true},
1374     // Multiple slashes in a row should be preserved and treated like empty
1375     // directory names.
1376     {"////../..", L"////../..", "//", Component(0, 2), true},
1377 
1378     // ----- escaping tests -----
1379     {"/foo", L"/foo", "/foo", Component(0, 4), true},
1380     // Valid escape sequence
1381     {"/%20foo", L"/%20foo", "/%20foo", Component(0, 7), true},
1382     // Invalid escape sequence we should pass through unchanged.
1383     {"/foo%", L"/foo%", "/foo%", Component(0, 5), true},
1384     {"/foo%2", L"/foo%2", "/foo%2", Component(0, 6), true},
1385     // Invalid escape sequence: bad characters should be treated the same as
1386     // the surrounding text, not as escaped (in this case, UTF-8).
1387     {"/foo%2zbar", L"/foo%2zbar", "/foo%2zbar", Component(0, 10), true},
1388     {"/foo%2\xc2\xa9zbar", nullptr, "/foo%2%C2%A9zbar", Component(0, 16), true},
1389     {nullptr, L"/foo%2\xc2\xa9zbar", "/foo%2%C3%82%C2%A9zbar", Component(0, 22),
1390      true},
1391     // Regular characters that are escaped should remain escaped
1392     {"/foo%41%7a", L"/foo%41%7a", "/foo%41%7a", Component(0, 10), true},
1393     // Funny characters that are unescaped should be escaped
1394     {"/foo\x09\x91%91", nullptr, "/foo%09%91%91", Component(0, 13), true},
1395     {nullptr, L"/foo\x09\x91%91", "/foo%09%C2%91%91", Component(0, 16), true},
1396     // %00 should not cause failures.
1397     {"/foo%00%51", L"/foo%00%51", "/foo%00%51", Component(0, 10), true},
1398     // Some characters should be passed through unchanged regardless of esc.
1399     {"/(%28:%3A%29)", L"/(%28:%3A%29)", "/(%28:%3A%29)", Component(0, 13),
1400      true},
1401     // Characters that are properly escaped should not have the case changed
1402     // of hex letters.
1403     {"/%3A%3a%3C%3c", L"/%3A%3a%3C%3c", "/%3A%3a%3C%3c", Component(0, 13),
1404      true},
1405     // Funny characters that are unescaped should be escaped
1406     {"/foo\tbar", L"/foo\tbar", "/foo%09bar", Component(0, 10), true},
1407     // Hashes found in paths (possibly only when the caller explicitly sets
1408     // the path on an already-parsed URL) should be escaped.
1409     {"/foo#bar", L"/foo#bar", "/foo%23bar", Component(0, 10), true},
1410     // %7f should be allowed and %3D should not be unescaped (these were wrong
1411     // in a previous version).
1412     {"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd",
1413      "/%7Ffp3%3Eju%3Dduvgw%3Dd", Component(0, 24), true},
1414     // @ should be passed through unchanged (escaped or unescaped).
1415     {"/@asdf%40", L"/@asdf%40", "/@asdf%40", Component(0, 9), true},
1416     // Nested escape sequences no longer happen. See https://crbug.com/1252531.
1417     {"/%A%42", L"/%A%42", "/%A%42", Component(0, 6), true},
1418     {"/%%41B", L"/%%41B", "/%%41B", Component(0, 6), true},
1419     {"/%%41%42", L"/%%41%42", "/%%41%42", Component(0, 8), true},
1420     // Make sure truncated "nested" escapes don't result in reading off the
1421     // string end.
1422     {"/%%41", L"/%%41", "/%%41", Component(0, 5), true},
1423     // Don't unescape the leading '%' if unescaping doesn't result in a valid
1424     // new escape sequence.
1425     {"/%%470", L"/%%470", "/%%470", Component(0, 6), true},
1426     {"/%%2D%41", L"/%%2D%41", "/%%2D%41", Component(0, 8), true},
1427     // Don't erroneously downcast a UTF-16 character in a way that makes it
1428     // look like part of an escape sequence.
1429     {nullptr, L"/%%41\x0130", "/%%41%C4%B0", Component(0, 11), true},
1430 
1431     // ----- encoding tests -----
1432     // Basic conversions
1433     {"/\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd",
1434      L"/\x4f60\x597d\x4f60\x597d", "/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD",
1435      Component(0, 37), true},
1436     // Unicode Noncharacter (U+FDD0) should not fail.
1437     {"/\xef\xb7\x90zyx", nullptr, "/%EF%B7%90zyx", Component(0, 13), true},
1438     {nullptr, L"/\xfdd0zyx", "/%EF%B7%90zyx", Component(0, 13), true},
1439 };
1440 
1441 typedef bool (*CanonFunc8Bit)(const char*,
1442                               const Component&,
1443                               CanonOutput*,
1444                               Component*);
1445 typedef bool (*CanonFunc16Bit)(const char16_t*,
1446                                const Component&,
1447                                CanonOutput*,
1448                                Component*);
1449 
DoPathTest(const DualComponentCase * path_cases,size_t num_cases,CanonFunc8Bit canon_func_8,CanonFunc16Bit canon_func_16)1450 void DoPathTest(const DualComponentCase* path_cases,
1451                 size_t num_cases,
1452                 CanonFunc8Bit canon_func_8,
1453                 CanonFunc16Bit canon_func_16) {
1454   for (size_t i = 0; i < num_cases; i++) {
1455     testing::Message scope_message;
1456     scope_message << path_cases[i].input8 << "," << path_cases[i].input16;
1457     SCOPED_TRACE(scope_message);
1458     if (path_cases[i].input8) {
1459       int len = static_cast<int>(strlen(path_cases[i].input8));
1460       Component in_comp(0, len);
1461       Component out_comp;
1462       std::string out_str;
1463       StdStringCanonOutput output(&out_str);
1464       bool success =
1465           canon_func_8(path_cases[i].input8, in_comp, &output, &out_comp);
1466       output.Complete();
1467 
1468       EXPECT_EQ(path_cases[i].expected_success, success);
1469       EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin);
1470       EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len);
1471       EXPECT_EQ(path_cases[i].expected, out_str);
1472     }
1473 
1474     if (path_cases[i].input16) {
1475       std::u16string input16(
1476           test_utils::TruncateWStringToUTF16(path_cases[i].input16));
1477       int len = static_cast<int>(input16.length());
1478       Component in_comp(0, len);
1479       Component out_comp;
1480       std::string out_str;
1481       StdStringCanonOutput output(&out_str);
1482 
1483       bool success =
1484           canon_func_16(input16.c_str(), in_comp, &output, &out_comp);
1485       output.Complete();
1486 
1487       EXPECT_EQ(path_cases[i].expected_success, success);
1488       EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin);
1489       EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len);
1490       EXPECT_EQ(path_cases[i].expected, out_str);
1491     }
1492   }
1493 }
1494 
// Path canonicalization for special URLs: the shared table, an embedded-NUL
// input, and cases that only hold for special URLs.
TEST(URLCanonTest, SpecialPath) {
  // Common test cases
  DoPathTest(kCommonPathCases, std::size(kCommonPathCases),
             CanonicalizeSpecialPath, CanonicalizeSpecialPath);

  // Manual test: embedded NULLs should be escaped and the URL should be marked
  // as valid.
  const char path_with_null[] = "/ab\0c";
  Component in_comp(0, 5);
  Component out_comp;

  std::string out_str;
  StdStringCanonOutput output(&out_str);
  bool success =
      CanonicalizeSpecialPath(path_with_null, in_comp, &output, &out_comp);
  output.Complete();
  EXPECT_TRUE(success);
  EXPECT_EQ("/ab%00c", out_str);
  // The output component should span the whole canonical path written above.
  EXPECT_EQ(0, out_comp.begin);
  EXPECT_EQ(7, out_comp.len);

  // Test cases specific on special URLs.
  DualComponentCase special_path_cases[] = {
      // Canonical path for empty path is a slash.
      {"", L"", "/", Component(0, 1), true},
      // Backslashes should be used as path separators.
      {"\\a\\b", L"\\a\\b", "/a/b", Component(0, 4), true},
      {"/a\\..\\b", L"/a\\..\\b", "/b", Component(0, 2), true},
      {"/a\\.\\b", L"/a\\.\\b", "/a/b", Component(0, 4), true},
  };

  // Run the special-only cases through both widths of
  // CanonicalizeSpecialPath, matching the common-case run above.
  DoPathTest(special_path_cases, std::size(special_path_cases),
             CanonicalizeSpecialPath, CanonicalizeSpecialPath);
  // Keep covering the 16-bit CanonicalizePath entry point as well.
  DoPathTest(special_path_cases, std::size(special_path_cases),
             CanonicalizeSpecialPath, CanonicalizePath);
}
1527 
// Path canonicalization for non-special URLs: the shared table first, then
// rows whose expectations only apply to non-special URLs.
TEST(URLCanonTest, NonSpecialPath) {
  // Shared cases.
  DoPathTest(kCommonPathCases, std::size(kCommonPathCases),
             CanonicalizeNonSpecialPath, CanonicalizeNonSpecialPath);

  // Non-special-only cases.
  DualComponentCase only_non_special[] = {
      // An empty path is expected to stay empty.
      {"", L"", "", Component(0, 0), true},
      // Backslashes are expected to be kept verbatim, so "\.." and "\." are
      // not collapsed.
      {"/a\\..\\b", L"/a\\..\\b", "/a\\..\\b", Component(0, 7), true},
      {"/a\\./b", L"/a\\./b", "/a\\./b", Component(0, 6), true},
  };

  DoPathTest(only_non_special, std::size(only_non_special),
             CanonicalizeNonSpecialPath, CanonicalizeNonSpecialPath);
}
1545 
// CanonicalizePartialPath() against the shared table plus two inputs that do
// not begin with a slash.
TEST(URLCanonTest, PartialPath) {
  // Shared cases.
  DoPathTest(kCommonPathCases, std::size(kCommonPathCases),
             CanonicalizePartialPath, CanonicalizePartialPath);

  // Partial-path-only cases: both inputs are expected back unchanged.
  DualComponentCase only_partial[] = {
      {".html", L".html", ".html", Component(0, 5), true},
      {"", L"", "", Component(0, 0), true},
  };
  DoPathTest(only_partial, std::size(only_partial), CanonicalizePartialPath,
             CanonicalizePartialPath);
}
1557 
TEST(URLCanonTest,Query)1558 TEST(URLCanonTest, Query) {
1559   struct QueryCase {
1560     const char* input8;
1561     const wchar_t* input16;
1562     const char* expected;
1563   } query_cases[] = {
1564       // Regular ASCII case.
1565     {"foo=bar", L"foo=bar", "?foo=bar"},
1566       // Allow question marks in the query without escaping
1567     {"as?df", L"as?df", "?as?df"},
1568       // Always escape '#' since it would mark the ref.
1569     {"as#df", L"as#df", "?as%23df"},
1570       // Escape some questionable 8-bit characters, but never unescape.
1571     {"\x02hello\x7f bye", L"\x02hello\x7f bye", "?%02hello%7F%20bye"},
1572     {"%40%41123", L"%40%41123", "?%40%41123"},
1573       // Chinese input/output
1574     {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "?q=%E4%BD%A0%E5%A5%BD"},
1575       // Invalid UTF-8/16 input should be replaced with invalid characters.
1576     {"q=\xed\xed", L"q=\xd800\xd800", "?q=%EF%BF%BD%EF%BF%BD"},
1577       // Don't allow < or > because sometimes they are used for XSS if the
1578       // URL is echoed in content. Firefox does this, IE doesn't.
1579     {"q=<asdf>", L"q=<asdf>", "?q=%3Casdf%3E"},
1580       // Escape double quotemarks in the query.
1581     {"q=\"asdf\"", L"q=\"asdf\"", "?q=%22asdf%22"},
1582   };
1583 
1584   for (const auto& query_case : query_cases) {
1585     Component out_comp;
1586 
1587     if (query_case.input8) {
1588       int len = static_cast<int>(strlen(query_case.input8));
1589       Component in_comp(0, len);
1590       std::string out_str;
1591 
1592       StdStringCanonOutput output(&out_str);
1593       CanonicalizeQuery(query_case.input8, in_comp, nullptr, &output,
1594                         &out_comp);
1595       output.Complete();
1596 
1597       EXPECT_EQ(query_case.expected, out_str);
1598     }
1599 
1600     if (query_case.input16) {
1601       std::u16string input16(
1602           test_utils::TruncateWStringToUTF16(query_case.input16));
1603       int len = static_cast<int>(input16.length());
1604       Component in_comp(0, len);
1605       std::string out_str;
1606 
1607       StdStringCanonOutput output(&out_str);
1608       CanonicalizeQuery(input16.c_str(), in_comp, nullptr, &output, &out_comp);
1609       output.Complete();
1610 
1611       EXPECT_EQ(query_case.expected, out_str);
1612     }
1613   }
1614 
1615   // Extra test for input with embedded NULL;
1616   std::string out_str;
1617   StdStringCanonOutput output(&out_str);
1618   Component out_comp;
1619   CanonicalizeQuery("a \x00z\x01", Component(0, 5), nullptr, &output,
1620                     &out_comp);
1621   output.Complete();
1622   EXPECT_EQ("?a%20%00z%01", out_str);
1623 }
1624 
TEST(URLCanonTest,Ref)1625 TEST(URLCanonTest, Ref) {
1626   // Refs are trivial, it just checks the encoding.
1627   DualComponentCase ref_cases[] = {
1628       {"hello!", L"hello!", "#hello!", Component(1, 6), true},
1629       // We should escape spaces, double-quotes, angled braces, and backtics.
1630       {"hello, world", L"hello, world", "#hello,%20world", Component(1, 14),
1631        true},
1632       {"hello,\"world", L"hello,\"world", "#hello,%22world", Component(1, 14),
1633        true},
1634       {"hello,<world", L"hello,<world", "#hello,%3Cworld", Component(1, 14),
1635        true},
1636       {"hello,>world", L"hello,>world", "#hello,%3Eworld", Component(1, 14),
1637        true},
1638       {"hello,`world", L"hello,`world", "#hello,%60world", Component(1, 14),
1639        true},
1640       // UTF-8/wide input should be preserved
1641       {"\xc2\xa9", L"\xa9", "#%C2%A9", Component(1, 6), true},
1642       // Test a characer that takes > 16 bits (U+10300 = old italic letter A)
1643       {"\xF0\x90\x8C\x80ss", L"\xd800\xdf00ss", "#%F0%90%8C%80ss",
1644        Component(1, 14), true},
1645       // Escaping should be preserved unchanged, even invalid ones
1646       {"%41%a", L"%41%a", "#%41%a", Component(1, 5), true},
1647       // Invalid UTF-8/16 input should be flagged and the input made valid
1648       {"\xc2", nullptr, "#%EF%BF%BD", Component(1, 9), true},
1649       {nullptr, L"\xd800\x597d", "#%EF%BF%BD%E5%A5%BD", Component(1, 18), true},
1650       // Test a Unicode invalid character.
1651       {"a\xef\xb7\x90", L"a\xfdd0", "#a%EF%B7%90", Component(1, 10), true},
1652       // Refs can have # signs and we should preserve them.
1653       {"asdf#qwer", L"asdf#qwer", "#asdf#qwer", Component(1, 9), true},
1654       {"#asdf", L"#asdf", "##asdf", Component(1, 5), true},
1655   };
1656 
1657   for (const auto& ref_case : ref_cases) {
1658     // 8-bit input
1659     if (ref_case.input8) {
1660       int len = static_cast<int>(strlen(ref_case.input8));
1661       Component in_comp(0, len);
1662       Component out_comp;
1663 
1664       std::string out_str;
1665       StdStringCanonOutput output(&out_str);
1666       CanonicalizeRef(ref_case.input8, in_comp, &output, &out_comp);
1667       output.Complete();
1668 
1669       EXPECT_EQ(ref_case.expected_component.begin, out_comp.begin);
1670       EXPECT_EQ(ref_case.expected_component.len, out_comp.len);
1671       EXPECT_EQ(ref_case.expected, out_str);
1672     }
1673 
1674     // 16-bit input
1675     if (ref_case.input16) {
1676       std::u16string input16(
1677           test_utils::TruncateWStringToUTF16(ref_case.input16));
1678       int len = static_cast<int>(input16.length());
1679       Component in_comp(0, len);
1680       Component out_comp;
1681 
1682       std::string out_str;
1683       StdStringCanonOutput output(&out_str);
1684       CanonicalizeRef(input16.c_str(), in_comp, &output, &out_comp);
1685       output.Complete();
1686 
1687       EXPECT_EQ(ref_case.expected_component.begin, out_comp.begin);
1688       EXPECT_EQ(ref_case.expected_component.len, out_comp.len);
1689       EXPECT_EQ(ref_case.expected, out_str);
1690     }
1691   }
1692 
1693   // Try one with an embedded NULL. It should be stripped.
1694   const char null_input[5] = "ab\x00z";
1695   Component null_input_component(0, 4);
1696   Component out_comp;
1697 
1698   std::string out_str;
1699   StdStringCanonOutput output(&out_str);
1700   CanonicalizeRef(null_input, null_input_component, &output, &out_comp);
1701   output.Complete();
1702 
1703   EXPECT_EQ(1, out_comp.begin);
1704   EXPECT_EQ(6, out_comp.len);
1705   EXPECT_EQ("#ab%00z", out_str);
1706 }
1707 
TEST(URLCanonTest,CanonicalizeStandardURL)1708 TEST(URLCanonTest, CanonicalizeStandardURL) {
1709   // The individual component canonicalize tests should have caught the cases
1710   // for each of those components. Here, we just need to test that the various
1711   // parts are included or excluded properly, and have the correct separators.
1712   // clang-format off
1713   struct URLCase {
1714     const char* input;
1715     const char* expected;
1716     bool expected_success;
1717   } cases[] = {
1718     {"http://www.google.com/foo?bar=baz#", "http://www.google.com/foo?bar=baz#",
1719      true},
1720 
1721       // Backslashes should get converted to forward slashes.
1722       {"http:\\\\www.google.com\\foo", "http://www.google.com/foo", true},
1723 
1724       // Busted refs shouldn't make the whole thing fail.
1725       {"http://www.google.com/asdf#\xc2",
1726        "http://www.google.com/asdf#%EF%BF%BD", true},
1727 
1728       // Basic port tests.
1729       {"http://foo:80/", "http://foo/", true},
1730       {"http://foo:81/", "http://foo:81/", true},
1731       {"httpa://foo:80/", "httpa://foo:80/", true},
1732       {"http://foo:-80/", "http://foo:-80/", false},
1733 
1734       {"https://foo:443/", "https://foo/", true},
1735       {"https://foo:80/", "https://foo:80/", true},
1736       {"ftp://foo:21/", "ftp://foo/", true},
1737       {"ftp://foo:80/", "ftp://foo:80/", true},
1738       {"gopher://foo:70/", "gopher://foo:70/", true},
1739       {"gopher://foo:443/", "gopher://foo:443/", true},
1740       {"ws://foo:80/", "ws://foo/", true},
1741       {"ws://foo:81/", "ws://foo:81/", true},
1742       {"ws://foo:443/", "ws://foo:443/", true},
1743       {"ws://foo:815/", "ws://foo:815/", true},
1744       {"wss://foo:80/", "wss://foo:80/", true},
1745       {"wss://foo:81/", "wss://foo:81/", true},
1746       {"wss://foo:443/", "wss://foo/", true},
1747       {"wss://foo:815/", "wss://foo:815/", true},
1748 
1749       // This particular code path ends up "backing up" to replace an invalid
1750       // host ICU generated with an escaped version. Test that in the context
1751       // of a full URL to make sure the backing up doesn't mess up the non-host
1752       // parts of the URL. "EF B9 AA" is U+FE6A which is a type of percent that
1753       // ICU will convert to an ASCII one, generating "%81".
1754       {"ws:)W\x1eW\xef\xb9\xaa"
1755        "81:80/",
1756        "ws://)w%1ew%81/", false},
1757       // Regression test for the last_invalid_percent_index bug described in
1758       // https://crbug.com/1080890#c10.
1759       {R"(HTTP:S/5%\../>%41)", "http://s/%3E%41", true},
1760   };
1761   // clang-format on
1762 
1763   for (const auto& i : cases) {
1764     int url_len = static_cast<int>(strlen(i.input));
1765     Parsed parsed;
1766     ParseStandardURL(i.input, url_len, &parsed);
1767 
1768     Parsed out_parsed;
1769     std::string out_str;
1770     StdStringCanonOutput output(&out_str);
1771     bool success = CanonicalizeStandardURL(
1772         i.input, parsed, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr,
1773         &output, &out_parsed);
1774     output.Complete();
1775 
1776     EXPECT_EQ(i.expected_success, success);
1777     EXPECT_EQ(i.expected, out_str);
1778   }
1779 }
1780 
TEST(URLCanonTest,CanonicalizeNonSpecialURL)1781 TEST(URLCanonTest, CanonicalizeNonSpecialURL) {
1782   // The individual component canonicalize tests should have caught the cases
1783   // for each of those components. Here, we just need to test that the various
1784   // parts are included or excluded properly, and have the correct separators.
1785   struct URLCase {
1786     const std::string_view input;
1787     const std::string_view expected;
1788     bool expected_success;
1789   } cases[] = {
1790       // Basic cases.
1791       {"git://host:80/path?a=b#ref", "git://host:80/path?a=b#ref", true},
1792       {"git://host", "git://host", true},
1793       {"git://host/", "git://host/", true},
1794       {"git://HosT/", "git://HosT/", true},
1795       {"git://..", "git://..", true},
1796       {"git://../", "git://../", true},
1797       {"git://../..", "git://../", true},
1798 
1799       // Empty hosts.
1800       {"git://", "git://", true},
1801       {"git:///", "git:///", true},
1802       {"git:////", "git:////", true},
1803       {"git:///a", "git:///a", true},
1804       {"git:///a/../b", "git:///b", true},
1805       {"git:///..", "git:///", true},
1806 
1807       // No hosts.
1808       {"git:/", "git:/", true},
1809       {"git:/a", "git:/a", true},
1810       {"git:/a/../b", "git:/b", true},
1811       {"git:/..", "git:/", true},
1812       {"git:/../", "git:/", true},
1813       {"git:/../..", "git:/", true},
1814       {"git:/.//a", "git:/.//a", true},
1815 
1816       // Users.
1817       {"git://@host", "git://host", true},
1818       {"git:// @host", "git://%20@host", true},
1819       {"git://\\@host", "git://%5C@host", true},
1820 
1821       // Paths.
1822       {"git://host/path", "git://host/path", true},
1823       {"git://host/p ath", "git://host/p%20ath", true},
1824       {"git://host/a/../b", "git://host/b", true},
1825       {"git://host/..", "git://host/", true},
1826       {"git://host/../", "git://host/", true},
1827       {"git://host/../..", "git://host/", true},
1828       {"git://host/.", "git://host/", true},
1829       {"git://host/./", "git://host/", true},
1830       {"git://host/./.", "git://host/", true},
1831       // Backslashes.
1832       {"git://host/a\\..\\b", "git://host/a\\..\\b", true},
1833 
1834       // IPv6.
1835       {"git://[1:2:0:0:5:0:0:0]", "git://[1:2:0:0:5::]", true},
1836       {"git://[1:2:0:0:5:0:0:0]/", "git://[1:2:0:0:5::]/", true},
1837       {"git://[1:2:0:0:5:0:0:0]/path", "git://[1:2:0:0:5::]/path", true},
1838 
1839       // IPv4 is unsupported.
1840       {"git://127.00.0.1", "git://127.00.0.1", true},
1841       {"git://127.1000.0.1", "git://127.1000.0.1", true},
1842 
1843       // Invalid URLs.
1844       {"git://@", "git://", false},
1845       // Forbidden host code points.
1846       {"git://<", "git://", false},
1847       {"git:// /", "git:///", false},
1848       // Backslashes cannot be used as host terminators.
1849       {"git://host\\a/../b", "git://host/b", false},
1850 
1851       // Opaque paths.
1852       {"git:", "git:", true},
1853       {"git:opaque", "git:opaque", true},
1854       {"git:o p a q u e", "git:o p a q u e", true},
1855       {"git: <", "git: <", true},
1856       {"git:opaque/a/../b", "git:opaque/a/../b", true},
1857       {"git:opaque\\a\\..\\b", "git:opaque\\a\\..\\b", true},
1858       {"git:\\a", "git:\\a", true},
1859       // Like URNs.
1860       {"git:a:b:c:123", "git:a:b:c:123", true},
1861   };
1862 
1863   for (const auto& i : cases) {
1864     SCOPED_TRACE(i.input);
1865     Parsed parsed;
1866     ParseNonSpecialURL(i.input.data(), i.input.size(), &parsed);
1867     Parsed out_parsed;
1868     std::string out_str;
1869     StdStringCanonOutput output(&out_str);
1870     bool success = CanonicalizeNonSpecialURL(
1871         i.input.data(), i.input.size(), parsed,
1872         /*query_converter=*/nullptr, output, out_parsed);
1873     output.Complete();
1874     EXPECT_EQ(success, i.expected_success);
1875     EXPECT_EQ(out_str, i.expected);
1876   }
1877 }
1878 
// Checks the Parsed structure that CanonicalizeNonSpecialURL() fills in; the
// canonical text itself is not inspected here.
TEST(URLCanonTest, CanonicalizeNonSpecialURLOutputParsed) {
  struct URLCase {
    const std::string_view input;
    // Only the host component and the total length are verified for now.
    Component expected_output_parsed_host;
    int expected_output_parsed_length;
  } cases[] = {
      {"git:", Component(), 4},
      {"git:opaque", Component(), 10},
      {"git:/", Component(), 5},
      {"git://", Component(6, 0), 6},
      {"git:///", Component(6, 0), 7},
      // The length of "[1:2:0:0:5::]" is 13.
      {"git://[1:2:0:0:5:0:0:0]/", Component(6, 13), 20},
  };

  for (const URLCase& tc : cases) {
    SCOPED_TRACE(tc.input);

    Parsed input_parsed;
    ParseNonSpecialURL(tc.input.data(), tc.input.size(), &input_parsed);

    Parsed canon_parsed;
    std::string ignored_text;
    StdStringCanonOutput ignored_sink(&ignored_text);
    const bool ok = CanonicalizeNonSpecialURL(
        tc.input.data(), tc.input.size(), input_parsed,
        /*query_converter=*/nullptr, ignored_sink, canon_parsed);

    // Every row is expected to canonicalize; abort the test otherwise.
    ASSERT_TRUE(ok);
    EXPECT_EQ(canon_parsed.host, tc.expected_output_parsed_host);
    EXPECT_EQ(canon_parsed.Length(), tc.expected_output_parsed_length);
  }
}
1911 
1912 // The codepath here is the same as for regular canonicalization, so we just
1913 // need to test that things are replaced or not correctly.
TEST(URLCanonTest,ReplaceStandardURL)1914 TEST(URLCanonTest, ReplaceStandardURL) {
1915   ReplaceCase replace_cases[] = {
1916       // Common case of truncating the path.
1917       {"http://www.google.com/foo?bar=baz#ref", nullptr, nullptr, nullptr,
1918        nullptr, nullptr, "/", kDeleteComp, kDeleteComp,
1919        "http://www.google.com/"},
1920       // Replace everything
1921       {"http://a:[email protected]:22/foo;bar?baz@cat", "https", "me", "pw",
1922        "host.com", "99", "/path", "query", "ref",
1923        "https://me:[email protected]:99/path?query#ref"},
1924       // Replace nothing
1925       {"http://a:[email protected]:22/foo?baz@cat", nullptr, nullptr, nullptr,
1926        nullptr, nullptr, nullptr, nullptr, nullptr,
1927        "http://a:[email protected]:22/foo?baz@cat"},
1928       // Replace scheme with filesystem. The result is garbage, but you asked
1929       // for it.
1930       {"http://a:[email protected]:22/foo?baz@cat", "filesystem", nullptr, nullptr,
1931        nullptr, nullptr, nullptr, nullptr, nullptr,
1932        "filesystem://a:[email protected]:22/foo?baz@cat"},
1933   };
1934 
1935   for (const auto& replace_case : replace_cases) {
1936     const ReplaceCase& cur = replace_case;
1937     int base_len = static_cast<int>(strlen(cur.base));
1938     Parsed parsed;
1939     ParseStandardURL(cur.base, base_len, &parsed);
1940 
1941     Replacements<char> r;
1942     typedef Replacements<char> R;  // Clean up syntax.
1943 
1944     // Note that for the scheme we pass in a different clear function since
1945     // there is no function to clear the scheme.
1946     SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
1947     SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
1948     SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
1949     SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
1950     SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
1951     SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
1952     SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
1953     SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);
1954 
1955     std::string out_str;
1956     StdStringCanonOutput output(&out_str);
1957     Parsed out_parsed;
1958     ReplaceStandardURL(replace_case.base, parsed, r,
1959                        SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr,
1960                        &output, &out_parsed);
1961     output.Complete();
1962 
1963     EXPECT_EQ(replace_case.expected, out_str);
1964   }
1965 
1966   // The path pointer should be ignored if the address is invalid.
1967   {
1968     const char src[] = "http://www.google.com/here_is_the_path";
1969     int src_len = static_cast<int>(strlen(src));
1970 
1971     Parsed parsed;
1972     ParseStandardURL(src, src_len, &parsed);
1973 
1974     // Replace the path to 0 length string. By using 1 as the string address,
1975     // the test should get an access violation if it tries to dereference it.
1976     Replacements<char> r;
1977     r.SetPath(reinterpret_cast<char*>(0x00000001), Component(0, 0));
1978     std::string out_str1;
1979     StdStringCanonOutput output1(&out_str1);
1980     Parsed new_parsed;
1981     ReplaceStandardURL(src, parsed, r,
1982                        SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr,
1983                        &output1, &new_parsed);
1984     output1.Complete();
1985     EXPECT_STREQ("http://www.google.com/", out_str1.c_str());
1986 
1987     // Same with an "invalid" path.
1988     r.SetPath(reinterpret_cast<char*>(0x00000001), Component());
1989     std::string out_str2;
1990     StdStringCanonOutput output2(&out_str2);
1991     ReplaceStandardURL(src, parsed, r,
1992                        SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr,
1993                        &output2, &new_parsed);
1994     output2.Complete();
1995     EXPECT_STREQ("http://www.google.com/", out_str2.c_str());
1996   }
1997 }
1998 
TEST(URLCanonTest,ReplaceFileURL)1999 TEST(URLCanonTest, ReplaceFileURL) {
2000   ReplaceCase replace_cases[] = {
2001       // Replace everything
2002       {"file:///C:/gaba?query#ref", nullptr, nullptr, nullptr, "filer", nullptr,
2003        "/foo", "b", "c", "file://filer/foo?b#c"},
2004       // Replace nothing
2005       {"file:///C:/gaba?query#ref", nullptr, nullptr, nullptr, nullptr, nullptr,
2006        nullptr, nullptr, nullptr, "file:///C:/gaba?query#ref"},
2007       {"file:///Y:", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
2008        nullptr, nullptr, "file:///Y:"},
2009       {"file:///Y:/", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
2010        nullptr, nullptr, "file:///Y:/"},
2011       {"file:///./Y", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
2012        nullptr, nullptr, "file:///Y"},
2013       {"file:///./Y:", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
2014        nullptr, nullptr, "file:///Y:"},
2015       // Clear non-path components (common)
2016       {"file:///C:/gaba?query#ref", nullptr, nullptr, nullptr, nullptr, nullptr,
2017        nullptr, kDeleteComp, kDeleteComp, "file:///C:/gaba"},
2018       // Replace path with something that doesn't begin with a slash and make
2019       // sure it gets added properly.
2020       {"file:///C:/gaba", nullptr, nullptr, nullptr, nullptr, nullptr,
2021        "interesting/", nullptr, nullptr, "file:///interesting/"},
2022       {"file:///home/gaba?query#ref", nullptr, nullptr, nullptr, "filer",
2023        nullptr, "/foo", "b", "c", "file://filer/foo?b#c"},
2024       {"file:///home/gaba?query#ref", nullptr, nullptr, nullptr, nullptr,
2025        nullptr, nullptr, nullptr, nullptr, "file:///home/gaba?query#ref"},
2026       {"file:///home/gaba?query#ref", nullptr, nullptr, nullptr, nullptr,
2027        nullptr, nullptr, kDeleteComp, kDeleteComp, "file:///home/gaba"},
2028       {"file:///home/gaba", nullptr, nullptr, nullptr, nullptr, nullptr,
2029        "interesting/", nullptr, nullptr, "file:///interesting/"},
2030       // Replace scheme -- shouldn't do anything.
2031       {"file:///C:/gaba?query#ref", "http", nullptr, nullptr, nullptr, nullptr,
2032        nullptr, nullptr, nullptr, "file:///C:/gaba?query#ref"},
2033   };
2034 
2035   for (const auto& replace_case : replace_cases) {
2036     const ReplaceCase& cur = replace_case;
2037     SCOPED_TRACE(cur.base);
2038     int base_len = static_cast<int>(strlen(cur.base));
2039     Parsed parsed;
2040     ParseFileURL(cur.base, base_len, &parsed);
2041 
2042     Replacements<char> r;
2043     typedef Replacements<char> R;  // Clean up syntax.
2044     SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
2045     SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
2046     SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
2047     SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
2048     SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
2049     SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
2050     SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
2051     SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);
2052 
2053     std::string out_str;
2054     StdStringCanonOutput output(&out_str);
2055     Parsed out_parsed;
2056     ReplaceFileURL(cur.base, parsed, r, nullptr, &output, &out_parsed);
2057     output.Complete();
2058 
2059     EXPECT_EQ(replace_case.expected, out_str);
2060   }
2061 }
2062 
TEST(URLCanonTest,ReplaceFileSystemURL)2063 TEST(URLCanonTest, ReplaceFileSystemURL) {
2064   ReplaceCase replace_cases[] = {
2065       // Replace everything in the outer URL.
2066       {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
2067        nullptr, nullptr, "/foo", "b", "c",
2068        "filesystem:file:///temporary/foo?b#c"},
2069       // Replace nothing
2070       {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
2071        nullptr, nullptr, nullptr, nullptr, nullptr,
2072        "filesystem:file:///temporary/gaba?query#ref"},
2073       // Clear non-path components (common)
2074       {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
2075        nullptr, nullptr, nullptr, kDeleteComp, kDeleteComp,
2076        "filesystem:file:///temporary/gaba"},
2077       // Replace path with something that doesn't begin with a slash and make
2078       // sure it gets added properly.
2079       {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
2080        nullptr, nullptr, "interesting/", nullptr, nullptr,
2081        "filesystem:file:///temporary/interesting/?query#ref"},
2082       // Replace scheme -- shouldn't do anything except canonicalize.
2083       {"filesystem:http://u:[email protected]/t/gaba?query#ref", "http", nullptr,
2084        nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
2085        "filesystem:http://bar.com/t/gaba?query#ref"},
2086       // Replace username -- shouldn't do anything except canonicalize.
2087       {"filesystem:http://u:[email protected]/t/gaba?query#ref", nullptr, "u2", nullptr,
2088        nullptr, nullptr, nullptr, nullptr, nullptr,
2089        "filesystem:http://bar.com/t/gaba?query#ref"},
2090       // Replace password -- shouldn't do anything except canonicalize.
2091       {"filesystem:http://u:[email protected]/t/gaba?query#ref", nullptr, nullptr,
2092        "pw2", nullptr, nullptr, nullptr, nullptr, nullptr,
2093        "filesystem:http://bar.com/t/gaba?query#ref"},
2094       // Replace host -- shouldn't do anything except canonicalize.
2095       {"filesystem:http://u:[email protected]:80/t/gaba?query#ref", nullptr, nullptr,
2096        nullptr, "foo.com", nullptr, nullptr, nullptr, nullptr,
2097        "filesystem:http://bar.com/t/gaba?query#ref"},
2098       // Replace port -- shouldn't do anything except canonicalize.
2099       {"filesystem:http://u:[email protected]:40/t/gaba?query#ref", nullptr, nullptr,
2100        nullptr, nullptr, "41", nullptr, nullptr, nullptr,
2101        "filesystem:http://bar.com:40/t/gaba?query#ref"},
2102   };
2103 
2104   for (const auto& replace_case : replace_cases) {
2105     const ReplaceCase& cur = replace_case;
2106     Parsed parsed = ParseFileSystemURL(cur.base);
2107 
2108     Replacements<char> r;
2109     typedef Replacements<char> R;  // Clean up syntax.
2110     SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
2111     SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
2112     SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
2113     SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
2114     SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
2115     SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
2116     SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
2117     SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);
2118 
2119     std::string out_str;
2120     StdStringCanonOutput output(&out_str);
2121     Parsed out_parsed;
2122     ReplaceFileSystemURL(cur.base, parsed, r, nullptr, &output, &out_parsed);
2123     output.Complete();
2124 
2125     EXPECT_EQ(replace_case.expected, out_str);
2126   }
2127 }
2128 
TEST(URLCanonTest, ReplacePathURL) {
  // Each row is a base path URL, one replacement per component (scheme,
  // username, password, host, port, path, query, ref), and the expected
  // output of ReplacePathURL().
  const ReplaceCase kCases[] = {
      // Replace everything
      {"data:foo", "javascript", nullptr, nullptr, nullptr, nullptr,
       "alert('foo?');", nullptr, nullptr, "javascript:alert('foo?');"},
      // Replace nothing
      {"data:foo", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "data:foo"},
      // Replace one or the other
      {"data:foo", "javascript", nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "javascript:foo"},
      {"data:foo", nullptr, nullptr, nullptr, nullptr, nullptr, "bar", nullptr,
       nullptr, "data:bar"},
      {"data:foo", nullptr, nullptr, nullptr, nullptr, nullptr, kDeleteComp,
       nullptr, nullptr, "data:"},
  };

  using Repl = Replacements<char>;  // Shorthand for the member pointers below.

  for (const ReplaceCase& tc : kCases) {
    // Parse the base as a path URL; the boolean argument is passed as false,
    // exactly as in the original call.
    Parsed base_parsed;
    ParsePathURL(tc.base, static_cast<int>(strlen(tc.base)), false,
                 &base_parsed);

    // Load one replacement per component. The scheme is paired with ClearRef.
    Repl repl;
    SetupReplComp(&Repl::SetScheme, &Repl::ClearRef, &repl, tc.scheme);
    SetupReplComp(&Repl::SetUsername, &Repl::ClearUsername, &repl,
                  tc.username);
    SetupReplComp(&Repl::SetPassword, &Repl::ClearPassword, &repl,
                  tc.password);
    SetupReplComp(&Repl::SetHost, &Repl::ClearHost, &repl, tc.host);
    SetupReplComp(&Repl::SetPort, &Repl::ClearPort, &repl, tc.port);
    SetupReplComp(&Repl::SetPath, &Repl::ClearPath, &repl, tc.path);
    SetupReplComp(&Repl::SetQuery, &Repl::ClearQuery, &repl, tc.query);
    SetupReplComp(&Repl::SetRef, &Repl::ClearRef, &repl, tc.ref);

    // Run the replacement and compare the produced spec.
    std::string actual;
    StdStringCanonOutput sink(&actual);
    Parsed result_parsed;
    ReplacePathURL(tc.base, base_parsed, repl, &sink, &result_parsed);
    sink.Complete();

    EXPECT_EQ(tc.expected, actual);
  }
}
2172 
TEST(URLCanonTest,ReplaceMailtoURL)2173 TEST(URLCanonTest, ReplaceMailtoURL) {
2174   ReplaceCase replace_cases[] = {
2175       // Replace everything
2176       {"mailto:[email protected]?body=sup", "mailto", nullptr, nullptr, nullptr,
2177        nullptr, "addr1", "to=tony", nullptr, "mailto:addr1?to=tony"},
2178       // Replace nothing
2179       {"mailto:[email protected]?body=sup", nullptr, nullptr, nullptr, nullptr,
2180        nullptr, nullptr, nullptr, nullptr, "mailto:[email protected]?body=sup"},
2181       // Replace the path
2182       {"mailto:[email protected]?body=sup", nullptr, nullptr, nullptr, nullptr,
2183        nullptr, "jason", nullptr, nullptr, "mailto:jason?body=sup"},
2184       // Replace the query
2185       {"mailto:[email protected]?body=sup", nullptr, nullptr, nullptr, nullptr,
2186        nullptr, nullptr, "custom=1", nullptr, "mailto:[email protected]?custom=1"},
2187       // Replace the path and query
2188       {"mailto:[email protected]?body=sup", nullptr, nullptr, nullptr, nullptr,
2189        nullptr, "jason", "custom=1", nullptr, "mailto:jason?custom=1"},
2190       // Set the query to empty (should leave trailing question mark)
2191       {"mailto:[email protected]?body=sup", nullptr, nullptr, nullptr, nullptr,
2192        nullptr, nullptr, "", nullptr, "mailto:[email protected]?"},
2193       // Clear the query
2194       {"mailto:[email protected]?body=sup", nullptr, nullptr, nullptr, nullptr,
2195        nullptr, nullptr, "|", nullptr, "mailto:[email protected]"},
2196       // Clear the path
2197       {"mailto:[email protected]?body=sup", nullptr, nullptr, nullptr, nullptr,
2198        nullptr, "|", nullptr, nullptr, "mailto:?body=sup"},
2199       // Clear the path + query
2200       {"mailto:", nullptr, nullptr, nullptr, nullptr, nullptr, "|", "|",
2201        nullptr, "mailto:"},
2202       // Setting the ref should have no effect
2203       {"mailto:addr1", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
2204        nullptr, "BLAH", "mailto:addr1"},
2205   };
2206 
2207   for (const auto& replace_case : replace_cases) {
2208     const ReplaceCase& cur = replace_case;
2209     Parsed parsed = ParseMailtoURL(cur.base);
2210 
2211     Replacements<char> r;
2212     typedef Replacements<char> R;
2213     SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
2214     SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
2215     SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
2216     SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
2217     SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
2218     SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
2219     SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
2220     SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);
2221 
2222     std::string out_str;
2223     StdStringCanonOutput output(&out_str);
2224     Parsed out_parsed;
2225     ReplaceMailtoURL(cur.base, parsed, r, &output, &out_parsed);
2226     output.Complete();
2227 
2228     EXPECT_EQ(replace_case.expected, out_str);
2229   }
2230 }
2231 
TEST(URLCanonTest,CanonicalizeFileURL)2232 TEST(URLCanonTest, CanonicalizeFileURL) {
2233   struct URLCase {
2234     const char* input;
2235     const char* expected;
2236     bool expected_success;
2237     Component expected_host;
2238     Component expected_path;
2239   } cases[] = {
2240 #ifdef _WIN32
2241       // Windows-style paths
2242       {"file:c:\\foo\\bar.html", "file:///C:/foo/bar.html", true, Component(),
2243        Component(7, 16)},
2244       {"  File:c|////foo\\bar.html", "file:///C:////foo/bar.html", true,
2245        Component(), Component(7, 19)},
2246       {"file:", "file:///", true, Component(), Component(7, 1)},
2247       {"file:UNChost/path", "file://unchost/path", true, Component(7, 7),
2248        Component(14, 5)},
2249       // CanonicalizeFileURL supports absolute Windows style paths for IE
2250       // compatibility. Note that the caller must decide that this is a file
2251       // URL itself so it can call the file canonicalizer. This is usually
2252       // done automatically as part of relative URL resolving.
2253       {"c:\\foo\\bar", "file:///C:/foo/bar", true, Component(),
2254        Component(7, 11)},
2255       {"C|/foo/bar", "file:///C:/foo/bar", true, Component(), Component(7, 11)},
2256       {"/C|\\foo\\bar", "file:///C:/foo/bar", true, Component(),
2257        Component(7, 11)},
2258       {"//C|/foo/bar", "file:///C:/foo/bar", true, Component(),
2259        Component(7, 11)},
2260       {"//server/file", "file://server/file", true, Component(7, 6),
2261        Component(13, 5)},
2262       {"\\\\server\\file", "file://server/file", true, Component(7, 6),
2263        Component(13, 5)},
2264       {"/\\server/file", "file://server/file", true, Component(7, 6),
2265        Component(13, 5)},
2266       // We should preserve the number of slashes after the colon for IE
2267       // compatibility, except when there is none, in which case we should
2268       // add one.
2269       {"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, Component(),
2270        Component(7, 16)},
2271       {"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true,
2272        Component(), Component(7, 19)},
2273       // Three slashes should be non-UNC, even if there is no drive spec (IE
2274       // does this, which makes the resulting request invalid).
2275       {"file:///foo/bar.txt", "file:///foo/bar.txt", true, Component(),
2276        Component(7, 12)},
2277       // TODO(brettw) we should probably fail for invalid host names, which
2278       // would change the expected result on this test. We also currently allow
2279       // colon even though it's probably invalid, because its currently the
2280       // "natural" result of the way the canonicalizer is written. There doesn't
2281       // seem to be a strong argument for why allowing it here would be bad, so
2282       // we just tolerate it and the load will fail later.
2283       {"FILE:/\\/\\7:\\\\//foo\\bar.html", "file://7:////foo/bar.html", false,
2284        Component(7, 2), Component(9, 16)},
2285       {"file:filer/home\\me", "file://filer/home/me", true, Component(7, 5),
2286        Component(12, 8)},
2287       // Make sure relative paths can't go above the "C:"
2288       {"file:///C:/foo/../../../bar.html", "file:///C:/bar.html", true,
2289        Component(), Component(7, 12)},
2290       // Busted refs shouldn't make the whole thing fail.
2291       {"file:///C:/asdf#\xc2", "file:///C:/asdf#%EF%BF%BD", true, Component(),
2292        Component(7, 8)},
2293       {"file:///./s:", "file:///S:", true, Component(), Component(7, 3)},
2294 #else
2295       // Unix-style paths
2296       {"file:///home/me", "file:///home/me", true, Component(),
2297        Component(7, 8)},
2298       // Windowsy ones should get still treated as Unix-style.
2299       {"file:c:\\foo\\bar.html", "file:///c:/foo/bar.html", true, Component(),
2300        Component(7, 16)},
2301       {"file:c|//foo\\bar.html", "file:///c%7C//foo/bar.html", true,
2302        Component(), Component(7, 19)},
2303       {"file:///./s:", "file:///s:", true, Component(), Component(7, 3)},
2304       // file: tests from WebKit (LayoutTests/fast/loader/url-parse-1.html)
2305       {"//", "file:///", true, Component(), Component(7, 1)},
2306       {"///", "file:///", true, Component(), Component(7, 1)},
2307       {"///test", "file:///test", true, Component(), Component(7, 5)},
2308       {"file://test", "file://test/", true, Component(7, 4), Component(11, 1)},
2309       {"file://localhost", "file://localhost/", true, Component(7, 9),
2310        Component(16, 1)},
2311       {"file://localhost/", "file://localhost/", true, Component(7, 9),
2312        Component(16, 1)},
2313       {"file://localhost/test", "file://localhost/test", true, Component(7, 9),
2314        Component(16, 5)},
2315 #endif  // _WIN32
2316   };
2317 
2318   for (const auto& i : cases) {
2319     int url_len = static_cast<int>(strlen(i.input));
2320     Parsed parsed;
2321     ParseFileURL(i.input, url_len, &parsed);
2322 
2323     Parsed out_parsed;
2324     std::string out_str;
2325     StdStringCanonOutput output(&out_str);
2326     bool success = CanonicalizeFileURL(i.input, url_len, parsed, nullptr,
2327                                        &output, &out_parsed);
2328     output.Complete();
2329 
2330     EXPECT_EQ(i.expected_success, success);
2331     EXPECT_EQ(i.expected, out_str);
2332 
2333     // Make sure the spec was properly identified, the file canonicalizer has
2334     // different code for writing the spec.
2335     EXPECT_EQ(0, out_parsed.scheme.begin);
2336     EXPECT_EQ(4, out_parsed.scheme.len);
2337 
2338     EXPECT_EQ(i.expected_host.begin, out_parsed.host.begin);
2339     EXPECT_EQ(i.expected_host.len, out_parsed.host.len);
2340 
2341     EXPECT_EQ(i.expected_path.begin, out_parsed.path.begin);
2342     EXPECT_EQ(i.expected_path.len, out_parsed.path.len);
2343   }
2344 }
2345 
TEST(URLCanonTest, CanonicalizeFileSystemURL) {
  // One row per input: the raw spec, the expected canonical spec, and whether
  // CanonicalizeFileSystemURL() should report success.
  struct FileSystemCase {
    const char* input;
    const char* expected;
    bool expected_success;
  };

  const FileSystemCase kCases[] = {
      {"Filesystem:htTp://www.Foo.com:80/tempoRary",
       "filesystem:http://www.foo.com/tempoRary/", true},
      {"filesystem:httpS://www.foo.com/temporary/",
       "filesystem:https://www.foo.com/temporary/", true},
      {"filesystem:http://www.foo.com//", "filesystem:http://www.foo.com//",
       false},
      {"filesystem:http://www.foo.com/persistent/bob?query#ref",
       "filesystem:http://www.foo.com/persistent/bob?query#ref", true},
      {"filesystem:fIle://\\temporary/", "filesystem:file:///temporary/", true},
      {"filesystem:fiLe:///temporary", "filesystem:file:///temporary/", true},
      {"filesystem:File:///temporary/Bob?qUery#reF",
       "filesystem:file:///temporary/Bob?qUery#reF", true},
      {"FilEsysteM:htTp:E=/.", "filesystem:http://e=//", false},
  };

  for (const FileSystemCase& tc : kCases) {
    const Parsed in_parsed = ParseFileSystemURL(tc.input);

    // Canonicalize into a std::string-backed output.
    Parsed canon_parsed;
    std::string canon_spec;
    StdStringCanonOutput sink(&canon_spec);
    const bool ok = CanonicalizeFileSystemURL(tc.input, in_parsed, nullptr,
                                              &sink, &canon_parsed);
    sink.Complete();

    EXPECT_EQ(tc.expected_success, ok);
    EXPECT_EQ(tc.expected, canon_spec);

    // Make sure the spec was properly identified, the filesystem canonicalizer
    // has different code for writing the spec.
    EXPECT_EQ(0, canon_parsed.scheme.begin);
    EXPECT_EQ(10, canon_parsed.scheme.len);

    // A successful canonicalization must yield a non-empty path.
    if (ok) {
      EXPECT_GT(canon_parsed.path.len, 0);
    }
  }
}
2388 
TEST(URLCanonTest, CanonicalizePathURL) {
  // Path URLs should get canonicalized schemes but nothing else.
  struct Row {
    const char* input;
    const char* expected;
  };

  const Row kRows[] = {
      {"javascript:", "javascript:"},
      {"JavaScript:Foo", "javascript:Foo"},
      {"Foo:\":This /is interesting;?#", "foo:\":This /is interesting;?#"},

      // Unicode invalid characters should not cause failure. See
      // https://crbug.com/925614.
      {"javascript:\uFFFF", "javascript:%EF%BF%BF"},
  };

  for (const Row& row : kRows) {
    // Parse as a path URL; the boolean argument is passed as true, exactly as
    // in the original call.
    const int input_len = static_cast<int>(strlen(row.input));
    Parsed in_parsed;
    ParsePathURL(row.input, input_len, true, &in_parsed);

    // Canonicalize into a std::string-backed output.
    Parsed canon_parsed;
    std::string canon_spec;
    StdStringCanonOutput sink(&canon_spec);
    const bool ok = CanonicalizePathURL(row.input, input_len, in_parsed, &sink,
                                        &canon_parsed);
    sink.Complete();

    EXPECT_TRUE(ok);
    EXPECT_EQ(row.expected, canon_spec);

    // The host component is expected to be left as (0, -1).
    EXPECT_EQ(0, canon_parsed.host.begin);
    EXPECT_EQ(-1, canon_parsed.host.len);

    // When we end with a colon at the end, there should be no path.
    const bool ends_with_colon = row.input[input_len - 1] == ':';
    if (ends_with_colon) {
      EXPECT_EQ(0, canon_parsed.GetContent().begin);
      EXPECT_EQ(-1, canon_parsed.GetContent().len);
    }
  }
}
2429 
TEST(URLCanonTest, CanonicalizePathURLPath) {
  // Each row carries the same path as 8-bit and wide input, plus the single
  // expected 8-bit output shared by both.
  struct Row {
    std::string input;
    std::wstring input16;
    std::string expected;
  };

  const Row kRows[] = {
      {"Foo", L"Foo", "Foo"},
      {"\":This /is interesting;?#", L"\":This /is interesting;?#",
       "\":This /is interesting;?#"},
      {"\uFFFF", L"\uFFFF", "%EF%BF%BF"},
  };

  for (const Row& row : kRows) {
    const size_t expected_len = row.expected.size();

    // 8-bit string input
    {
      std::string narrow_out;
      StdStringCanonOutput narrow_sink(&narrow_out);
      Component narrow_component;
      CanonicalizePathURLPath(
          row.input.data(), Component(0, static_cast<int>(row.input.size())),
          &narrow_sink, &narrow_component);
      narrow_sink.Complete();

      EXPECT_EQ(row.expected, narrow_out);

      // The output component should span the whole canonical string.
      EXPECT_EQ(0, narrow_component.begin);
      EXPECT_EQ(expected_len, static_cast<size_t>(narrow_component.len));
    }

    // 16-bit string input
    {
      std::string wide_out;
      StdStringCanonOutput wide_sink(&wide_out);
      Component wide_component;
      const std::u16string utf16_input(
          test_utils::TruncateWStringToUTF16(row.input16.data()));
      // The component length is taken from the wide-string form of the input,
      // as in the original call.
      CanonicalizePathURLPath(
          utf16_input.c_str(),
          Component(0, static_cast<int>(row.input16.size())), &wide_sink,
          &wide_component);
      wide_sink.Complete();

      EXPECT_EQ(row.expected, wide_out);

      EXPECT_EQ(0, wide_component.begin);
      EXPECT_EQ(expected_len, static_cast<size_t>(wide_component.len));
    }
  }
}
2476 
TEST(URLCanonTest,CanonicalizeMailtoURL)2477 TEST(URLCanonTest, CanonicalizeMailtoURL) {
2478   struct URLCase {
2479     const char* input;
2480     const char* expected;
2481     bool expected_success;
2482     Component expected_path;
2483     Component expected_query;
2484   } cases[] = {
2485     // Null character should be escaped to %00.
2486     // Keep this test first in the list as it is handled specially below.
2487     {"mailto:addr1\0addr2?foo",
2488      "mailto:addr1%00addr2?foo",
2489      true, Component(7, 13), Component(21, 3)},
2490     {"mailto:addr1",
2491      "mailto:addr1",
2492      true, Component(7, 5), Component()},
2493     {"mailto:[email protected]",
2494      "mailto:[email protected]",
2495      true, Component(7, 13), Component()},
2496     // Trailing whitespace is stripped.
2497     {"MaIlTo:addr1 \t ",
2498      "mailto:addr1",
2499      true, Component(7, 5), Component()},
2500     {"MaIlTo:addr1?to=jon",
2501      "mailto:addr1?to=jon",
2502      true, Component(7, 5), Component(13,6)},
2503     {"mailto:addr1,addr2",
2504      "mailto:addr1,addr2",
2505      true, Component(7, 11), Component()},
2506     // Embedded spaces must be encoded.
2507     {"mailto:addr1, addr2",
2508      "mailto:addr1,%20addr2",
2509      true, Component(7, 14), Component()},
2510     {"mailto:addr1, addr2?subject=one two ",
2511      "mailto:addr1,%20addr2?subject=one%20two",
2512      true, Component(7, 14), Component(22, 17)},
2513     {"mailto:addr1%2caddr2",
2514      "mailto:addr1%2caddr2",
2515      true, Component(7, 13), Component()},
2516     {"mailto:\xF0\x90\x8C\x80",
2517      "mailto:%F0%90%8C%80",
2518      true, Component(7, 12), Component()},
2519     // Invalid -- UTF-8 encoded surrogate value.
2520     {"mailto:\xed\xa0\x80",
2521      "mailto:%EF%BF%BD%EF%BF%BD%EF%BF%BD",
2522      false, Component(7, 27), Component()},
2523     {"mailto:addr1?",
2524      "mailto:addr1?",
2525      true, Component(7, 5), Component(13, 0)},
2526     // Certain characters have special meanings and must be encoded.
2527     {"mailto:! \x22$&()+,-./09:;<=>@AZ[\\]&_`az{|}~\x7f?Query! \x22$&()+,-./09:;<=>@AZ[\\]&_`az{|}~",
2528      "mailto:!%20%22$&()+,-./09:;%3C=%3E@AZ[\\]&_%60az%7B%7C%7D~%7F?Query!%20%22$&()+,-./09:;%3C=%3E@AZ[\\]&_`az{|}~",
2529      true, Component(7, 53), Component(61, 47)},
2530   };
2531 
2532   // Define outside of loop to catch bugs where components aren't reset
2533   Parsed out_parsed;
2534 
2535   for (size_t i = 0; i < std::size(cases); i++) {
2536     int url_len = static_cast<int>(strlen(cases[i].input));
2537     if (i == 0) {
2538       // The first test case purposely has a '\0' in it -- don't count it
2539       // as the string terminator.
2540       url_len = 22;
2541     }
2542 
2543     std::string out_str;
2544     StdStringCanonOutput output(&out_str);
2545     bool success = CanonicalizeMailtoURL(
2546         cases[i].input, url_len,
2547         ParseMailtoURL(std::string_view(cases[i].input, url_len)), &output,
2548         &out_parsed);
2549     output.Complete();
2550 
2551     EXPECT_EQ(cases[i].expected_success, success);
2552     EXPECT_EQ(cases[i].expected, out_str);
2553 
2554     // Make sure the spec was properly identified
2555     EXPECT_EQ(0, out_parsed.scheme.begin);
2556     EXPECT_EQ(6, out_parsed.scheme.len);
2557 
2558     EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin);
2559     EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len);
2560 
2561     EXPECT_EQ(cases[i].expected_query.begin, out_parsed.query.begin);
2562     EXPECT_EQ(cases[i].expected_query.len, out_parsed.query.len);
2563   }
2564 }
2565 
2566 #ifndef WIN32
2567 
TEST(URLCanonTest, _itoa_s) {
  // The buffer is one char larger than the size reported to the explicit-size
  // overload of _itoa_s, so the final element acts as a canary. Refilling with
  // 0xff before each call also shows that the output is null-terminated.
  char buf[6];
  const size_t kReportedSize = sizeof(buf) - 1;
  const auto refill = [&buf]() { memset(buf, 0xff, sizeof(buf)); };

  refill();
  EXPECT_EQ(0, _itoa_s(12, buf, kReportedSize, 10));
  EXPECT_STREQ("12", buf);
  EXPECT_EQ('\xFF', buf[3]);

  // Test the edge cases - exactly the buffer size and one over
  refill();
  EXPECT_EQ(0, _itoa_s(1234, buf, kReportedSize, 10));
  EXPECT_STREQ("1234", buf);
  EXPECT_EQ('\xFF', buf[5]);

  refill();
  EXPECT_EQ(EINVAL, _itoa_s(12345, buf, kReportedSize, 10));
  EXPECT_EQ('\xFF', buf[5]);  // should never write to this location

  // Test the template overload (note that this will see the full buffer)
  refill();
  EXPECT_EQ(0, _itoa_s(12, buf, 10));
  EXPECT_STREQ("12", buf);
  EXPECT_EQ('\xFF', buf[3]);

  refill();
  EXPECT_EQ(0, _itoa_s(12345, buf, 10));
  EXPECT_STREQ("12345", buf);

  // Six digits plus the terminator do not fit in the full buffer.
  EXPECT_EQ(EINVAL, _itoa_s(123456, buf, 10));

  // Test that radix 16 is supported.
  refill();
  EXPECT_EQ(0, _itoa_s(1234, buf, kReportedSize, 16));
  EXPECT_STREQ("4d2", buf);
  EXPECT_EQ('\xFF', buf[5]);
}
2606 
TEST(URLCanonTest, _itow_s) {
  // We fill the buffer with 0xff bytes to ensure that it's getting properly
  // null-terminated. We also allocate one character more than what we tell
  // _itow_s about, and ensure that the extra character is untouched.
  char16_t buf[6];
  const int fill_mem = 0xff;
  const char16_t fill_char = 0xffff;
  // Size, in characters, reported to the explicit-size overload.
  const size_t reported_size = sizeof(buf) / sizeof(buf[0]) - 1;

  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(12, buf, reported_size, 10));
  EXPECT_EQ(u"12", std::u16string(buf));
  EXPECT_EQ(fill_char, buf[3]);

  // Test the edge cases - exactly the buffer size and one over
  // Refill first so this check does not depend on the previous call's output.
  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(1234, buf, reported_size, 10));
  EXPECT_EQ(u"1234", std::u16string(buf));
  EXPECT_EQ(fill_char, buf[5]);

  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(EINVAL, _itow_s(12345, buf, reported_size, 10));
  EXPECT_EQ(fill_char, buf[5]);  // should never write to this location

  // Test the template overload (note that this will see the full buffer)
  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(12, buf, 10));
  EXPECT_EQ(u"12", std::u16string(buf));
  EXPECT_EQ(fill_char, buf[3]);

  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(12345, buf, 10));
  EXPECT_EQ(u"12345", std::u16string(buf));

  // Six digits plus the terminator do not fit in the full buffer.
  EXPECT_EQ(EINVAL, _itow_s(123456, buf, 10));
}
2640 
2641 #endif  // !WIN32
2642 
2643 // Returns true if the given two structures are the same.
ParsedIsEqual(const Parsed & a,const Parsed & b)2644 static bool ParsedIsEqual(const Parsed& a, const Parsed& b) {
2645   return a.scheme.begin == b.scheme.begin && a.scheme.len == b.scheme.len &&
2646          a.username.begin == b.username.begin && a.username.len == b.username.len &&
2647          a.password.begin == b.password.begin && a.password.len == b.password.len &&
2648          a.host.begin == b.host.begin && a.host.len == b.host.len &&
2649          a.port.begin == b.port.begin && a.port.len == b.port.len &&
2650          a.path.begin == b.path.begin && a.path.len == b.path.len &&
2651          a.query.begin == b.query.begin && a.query.len == b.query.len &&
2652          a.ref.begin == b.ref.begin && a.ref.len == b.ref.len;
2653 }
2654 
TEST(URLCanonTest,ResolveRelativeURL)2655 TEST(URLCanonTest, ResolveRelativeURL) {
2656   struct RelativeCase {
2657     const char* base;      // Input base URL: MUST BE CANONICAL
2658     bool is_base_hier;     // Is the base URL hierarchical
2659     bool is_base_file;     // Tells us if the base is a file URL.
2660     const char* test;      // Input URL to test against.
2661     bool succeed_relative; // Whether we expect IsRelativeURL to succeed
2662     bool is_rel;           // Whether we expect |test| to be relative or not.
2663     bool succeed_resolve;  // Whether we expect ResolveRelativeURL to succeed.
2664     const char* resolved;  // What we expect in the result when resolving.
2665   } rel_cases[] = {
2666       // Basic absolute input.
2667       {"http://host/a", true, false, "http://another/", true, false, false,
2668        nullptr},
2669       {"http://host/a", true, false, "http:////another/", true, false, false,
2670        nullptr},
2671       // Empty relative URLs should only remove the ref part of the URL,
2672       // leaving the rest unchanged.
2673       {"http://foo/bar", true, false, "", true, true, true, "http://foo/bar"},
2674       {"http://foo/bar#ref", true, false, "", true, true, true,
2675        "http://foo/bar"},
2676       {"http://foo/bar#", true, false, "", true, true, true, "http://foo/bar"},
2677       // Spaces at the ends of the relative path should be ignored.
2678       {"http://foo/bar", true, false, "  another  ", true, true, true,
2679        "http://foo/another"},
2680       {"http://foo/bar", true, false, "  .  ", true, true, true, "http://foo/"},
2681       {"http://foo/bar", true, false, " \t ", true, true, true,
2682        "http://foo/bar"},
2683       // Matching schemes without two slashes are treated as relative.
2684       {"http://host/a", true, false, "http:path", true, true, true,
2685        "http://host/path"},
2686       {"http://host/a/", true, false, "http:path", true, true, true,
2687        "http://host/a/path"},
2688       {"http://host/a", true, false, "http:/path", true, true, true,
2689        "http://host/path"},
2690       {"http://host/a", true, false, "HTTP:/path", true, true, true,
2691        "http://host/path"},
2692       // Nonmatching schemes are absolute.
2693       {"http://host/a", true, false, "https:host2", true, false, false,
2694        nullptr},
2695       {"http://host/a", true, false, "htto:/host2", true, false, false,
2696        nullptr},
2697       // Absolute path input
2698       {"http://host/a", true, false, "/b/c/d", true, true, true,
2699        "http://host/b/c/d"},
2700       {"http://host/a", true, false, "\\b\\c\\d", true, true, true,
2701        "http://host/b/c/d"},
2702       {"http://host/a", true, false, "/b/../c", true, true, true,
2703        "http://host/c"},
2704       {"http://host/a?b#c", true, false, "/b/../c", true, true, true,
2705        "http://host/c"},
2706       {"http://host/a", true, false, "\\b/../c?x#y", true, true, true,
2707        "http://host/c?x#y"},
2708       {"http://host/a?b#c", true, false, "/b/../c?x#y", true, true, true,
2709        "http://host/c?x#y"},
2710       // Relative path input
2711       {"http://host/a", true, false, "b", true, true, true, "http://host/b"},
2712       {"http://host/a", true, false, "bc/de", true, true, true,
2713        "http://host/bc/de"},
2714       {"http://host/a/", true, false, "bc/de?query#ref", true, true, true,
2715        "http://host/a/bc/de?query#ref"},
2716       {"http://host/a/", true, false, ".", true, true, true, "http://host/a/"},
2717       {"http://host/a/", true, false, "..", true, true, true, "http://host/"},
2718       {"http://host/a/", true, false, "./..", true, true, true, "http://host/"},
2719       {"http://host/a/", true, false, "../.", true, true, true, "http://host/"},
2720       {"http://host/a/", true, false, "././.", true, true, true,
2721        "http://host/a/"},
2722       {"http://host/a?query#ref", true, false, "../../../foo", true, true, true,
2723        "http://host/foo"},
2724       // Query input
2725       {"http://host/a", true, false, "?foo=bar", true, true, true,
2726        "http://host/a?foo=bar"},
2727       {"http://host/a?x=y#z", true, false, "?", true, true, true,
2728        "http://host/a?"},
2729       {"http://host/a?x=y#z", true, false, "?foo=bar#com", true, true, true,
2730        "http://host/a?foo=bar#com"},
2731       // Ref input
2732       {"http://host/a", true, false, "#ref", true, true, true,
2733        "http://host/a#ref"},
2734       {"http://host/a#b", true, false, "#", true, true, true, "http://host/a#"},
2735       {"http://host/a?foo=bar#hello", true, false, "#bye", true, true, true,
2736        "http://host/a?foo=bar#bye"},
2737       // Non-hierarchical base: no relative handling. Relative input should
2738       // error, and if a scheme is present, it should be treated as absolute.
2739       {"data:foobar", false, false, "baz.html", false, false, false, nullptr},
2740       {"data:foobar", false, false, "data:baz", true, false, false, nullptr},
2741       {"data:foobar", false, false, "data:/base", true, false, false, nullptr},
2742       // Non-hierarchical base: absolute input should succeed.
2743       {"data:foobar", false, false, "http://host/", true, false, false,
2744        nullptr},
2745       {"data:foobar", false, false, "http:host", true, false, false, nullptr},
2746       // Non-hierarchical base: empty URL should give error.
2747       {"data:foobar", false, false, "", false, false, false, nullptr},
2748       // Invalid schemes should be treated as relative.
2749       {"http://foo/bar", true, false, "./asd:fgh", true, true, true,
2750        "http://foo/asd:fgh"},
2751       {"http://foo/bar", true, false, ":foo", true, true, true,
2752        "http://foo/:foo"},
2753       {"http://foo/bar", true, false, " hello world", true, true, true,
2754        "http://foo/hello%20world"},
2755       {"data:asdf", false, false, ":foo", false, false, false, nullptr},
2756       {"data:asdf", false, false, "bad(':foo')", false, false, false, nullptr},
2757       // We should treat semicolons like any other character in URL resolving
2758       {"http://host/a", true, false, ";foo", true, true, true,
2759        "http://host/;foo"},
2760       {"http://host/a;", true, false, ";foo", true, true, true,
2761        "http://host/;foo"},
2762       {"http://host/a", true, false, ";/../bar", true, true, true,
2763        "http://host/bar"},
2764       // Relative URLs can also be written as "//foo/bar" which is relative to
2765       // the scheme. In this case, it would take the old scheme, so for http
2766       // the example would resolve to "http://foo/bar".
2767       {"http://host/a", true, false, "//another", true, true, true,
2768        "http://another/"},
2769       {"http://host/a", true, false, "//another/path?query#ref", true, true,
2770        true, "http://another/path?query#ref"},
2771       {"http://host/a", true, false, "///another/path", true, true, true,
2772        "http://another/path"},
2773       {"http://host/a", true, false, "//Another\\path", true, true, true,
2774        "http://another/path"},
2775       {"http://host/a", true, false, "//", true, true, false, "http:"},
2776       // IE will also allow one or the other to be a backslash to get the same
2777       // behavior.
2778       {"http://host/a", true, false, "\\/another/path", true, true, true,
2779        "http://another/path"},
2780       {"http://host/a", true, false, "/\\Another\\path", true, true, true,
2781        "http://another/path"},
2782 #ifdef WIN32
2783       // Resolving against Windows file base URLs.
2784       {"file:///C:/foo", true, true, "http://host/", true, false, false,
2785        nullptr},
2786       {"file:///C:/foo", true, true, "bar", true, true, true, "file:///C:/bar"},
2787       {"file:///C:/foo", true, true, "../../../bar.html", true, true, true,
2788        "file:///C:/bar.html"},
2789       {"file:///C:/foo", true, true, "/../bar.html", true, true, true,
2790        "file:///C:/bar.html"},
2791       // But two backslashes on Windows should be UNC so should be treated
2792       // as absolute.
2793       {"http://host/a", true, false, "\\\\another\\path", true, false, false,
2794        nullptr},
2795       // IE doesn't support drive specs starting with two slashes. It fails
2796       // immediately and doesn't even try to load. We fix it up to either
2797       // an absolute path or UNC depending on what it looks like.
2798       {"file:///C:/something", true, true, "//c:/foo", true, true, true,
2799        "file:///C:/foo"},
2800       {"file:///C:/something", true, true, "//localhost/c:/foo", true, true,
2801        true, "file:///C:/foo"},
2802       // Windows drive specs should be allowed and treated as absolute.
2803       {"file:///C:/foo", true, true, "c:", true, false, false, nullptr},
2804       {"file:///C:/foo", true, true, "c:/foo", true, false, false, nullptr},
2805       {"http://host/a", true, false, "c:\\foo", true, false, false, nullptr},
2806       // Relative paths with drive letters should be allowed when the base is
2807       // also a file.
2808       {"file:///C:/foo", true, true, "/z:/bar", true, true, true,
2809        "file:///Z:/bar"},
2810       // Treat absolute paths as being off of the drive.
2811       {"file:///C:/foo", true, true, "/bar", true, true, true,
2812        "file:///C:/bar"},
2813       {"file://localhost/C:/foo", true, true, "/bar", true, true, true,
2814        "file://localhost/C:/bar"},
2815       {"file:///C:/foo/com/", true, true, "/bar", true, true, true,
2816        "file:///C:/bar"},
2817       // On Windows, two slashes without a drive letter when the base is a file
2818       // means that the path is UNC.
2819       {"file:///C:/something", true, true, "//somehost/path", true, true, true,
2820        "file://somehost/path"},
2821       {"file:///C:/something", true, true, "/\\//somehost/path", true, true,
2822        true, "file://somehost/path"},
2823 #else
2824       // On Unix we fall back to relative behavior since there's nothing else
2825       // reasonable to do.
2826       {"http://host/a", true, false, "\\\\Another\\path", true, true, true,
2827        "http://another/path"},
2828 #endif
2829       // Even on Windows, we don't allow relative drive specs when the base
2830       // is not file.
2831       {"http://host/a", true, false, "/c:\\foo", true, true, true,
2832        "http://host/c:/foo"},
2833       {"http://host/a", true, false, "//c:\\foo", true, true, true,
2834        "http://c/foo"},
2835       // Cross-platform relative file: resolution behavior.
2836       {"file://host/a", true, true, "/", true, true, true, "file://host/"},
2837       {"file://host/a", true, true, "//", true, true, true, "file:///"},
2838       {"file://host/a", true, true, "/b", true, true, true, "file://host/b"},
2839       {"file://host/a", true, true, "//b", true, true, true, "file://b/"},
2840       // Ensure that ports aren't allowed for hosts relative to a file url.
2841       // Although the result string shows a host:port portion, the call to
2842       // resolve the relative URL returns false, indicating parse failure,
2843       // which is what is required.
2844       {"file:///foo.txt", true, true, "//host:80/bar.txt", true, true, false,
2845        "file://host:80/bar.txt"},
2846       // Filesystem URL tests; filesystem URLs are only valid and relative if
2847       // they have no scheme, e.g. "./index.html". There's no valid equivalent
2848       // to http:index.html.
2849       {"filesystem:http://host/t/path", true, false,
2850        "filesystem:http://host/t/path2", true, false, false, nullptr},
2851       {"filesystem:http://host/t/path", true, false,
2852        "filesystem:https://host/t/path2", true, false, false, nullptr},
2853       {"filesystem:http://host/t/path", true, false, "http://host/t/path2",
2854        true, false, false, nullptr},
2855       {"http://host/t/path", true, false, "filesystem:http://host/t/path2",
2856        true, false, false, nullptr},
2857       {"filesystem:http://host/t/path", true, false, "./path2", true, true,
2858        true, "filesystem:http://host/t/path2"},
2859       {"filesystem:http://host/t/path/", true, false, "path2", true, true, true,
2860        "filesystem:http://host/t/path/path2"},
2861       {"filesystem:http://host/t/path", true, false, "filesystem:http:path2",
2862        true, false, false, nullptr},
2863       // Absolute URLs are still not relative to a non-standard base URL.
2864       {"about:blank", false, false, "http://X/A", true, false, true, ""},
2865       {"about:blank", false, false, "content://content.Provider/", true, false,
2866        true, ""},
2867   };
2868 
2869   for (const auto& cur_case : rel_cases) {
2870     Parsed parsed;
2871     int base_len = static_cast<int>(strlen(cur_case.base));
2872     if (cur_case.is_base_file)
2873       ParseFileURL(cur_case.base, base_len, &parsed);
2874     else if (cur_case.is_base_hier)
2875       ParseStandardURL(cur_case.base, base_len, &parsed);
2876     else
2877       ParsePathURL(cur_case.base, base_len, false, &parsed);
2878 
2879     // First see if it is relative.
2880     int test_len = static_cast<int>(strlen(cur_case.test));
2881     bool is_relative;
2882     Component relative_component;
2883     bool succeed_is_rel = IsRelativeURL(
2884         cur_case.base, parsed, cur_case.test, test_len, cur_case.is_base_hier,
2885         &is_relative, &relative_component);
2886 
2887     EXPECT_EQ(cur_case.succeed_relative, succeed_is_rel) <<
2888         "succeed is rel failure on " << cur_case.test;
2889     EXPECT_EQ(cur_case.is_rel, is_relative) <<
2890         "is rel failure on " << cur_case.test;
2891     // Now resolve it.
2892     if (succeed_is_rel && is_relative && cur_case.is_rel) {
2893       std::string resolved;
2894       StdStringCanonOutput output(&resolved);
2895       Parsed resolved_parsed;
2896 
2897       bool succeed_resolve = ResolveRelativeURL(
2898           cur_case.base, parsed, cur_case.is_base_file, cur_case.test,
2899           relative_component, nullptr, &output, &resolved_parsed);
2900       output.Complete();
2901 
2902       EXPECT_EQ(cur_case.succeed_resolve, succeed_resolve);
2903       EXPECT_EQ(cur_case.resolved, resolved) << " on " << cur_case.test;
2904 
2905       // Verify that the output parsed structure is the same as parsing a
2906       // the URL freshly.
2907       Parsed ref_parsed;
2908       int resolved_len = static_cast<int>(resolved.size());
2909       if (cur_case.is_base_file) {
2910         ParseFileURL(resolved.c_str(), resolved_len, &ref_parsed);
2911       } else if (cur_case.is_base_hier) {
2912         ParseStandardURL(resolved.c_str(), resolved_len, &ref_parsed);
2913       } else {
2914         ParsePathURL(resolved.c_str(), resolved_len, false, &ref_parsed);
2915       }
2916       EXPECT_TRUE(ParsedIsEqual(ref_parsed, resolved_parsed));
2917     }
2918   }
2919 }
2920 
2921 class URLCanonTypedTest : public ::testing::TestWithParam<bool> {
2922  public:
URLCanonTypedTest()2923   URLCanonTypedTest()
2924       : use_standard_compliant_non_special_scheme_url_parsing_(GetParam()) {
2925     if (use_standard_compliant_non_special_scheme_url_parsing_) {
2926       scoped_feature_list_.InitAndEnableFeature(
2927           kStandardCompliantNonSpecialSchemeURLParsing);
2928     } else {
2929       scoped_feature_list_.InitAndDisableFeature(
2930           kStandardCompliantNonSpecialSchemeURLParsing);
2931     }
2932   }
2933 
2934  protected:
2935   struct URLCase {
2936     const std::string_view input;
2937     const std::string_view expected;
2938     bool expected_success;
2939   };
2940 
2941   struct ResolveRelativeURLCase {
2942     const std::string_view base;
2943     const std::string_view rel;
2944     const bool is_base_hier;
2945     const bool expected_base_is_valid;
2946     const bool expected_is_relative;
2947     const bool expected_succeed_resolve;
2948     const std::string_view expected_resolved_url;
2949   };
2950 
TestNonSpecialResolveRelativeURL(const ResolveRelativeURLCase & relative_case)2951   void TestNonSpecialResolveRelativeURL(
2952       const ResolveRelativeURLCase& relative_case) {
2953     // The following test is similar to URLCanonTest::ResolveRelativeURL, but
2954     // simplified.
2955     Parsed parsed;
2956     if (use_standard_compliant_non_special_scheme_url_parsing_) {
2957       ParseNonSpecialURL(relative_case.base.data(), relative_case.base.size(),
2958                          &parsed);
2959     } else {
2960       ParsePathURL(relative_case.base.data(), relative_case.base.size(),
2961                    /*trim_path_end=*/true, &parsed);
2962     }
2963 
2964     // First see if it is relative.
2965     bool is_relative;
2966     Component relative_component;
2967     bool succeed_is_rel = IsRelativeURL(
2968         relative_case.base.data(), parsed, relative_case.rel.data(),
2969         relative_case.rel.size(), relative_case.is_base_hier, &is_relative,
2970         &relative_component);
2971 
2972     EXPECT_EQ(is_relative, relative_case.expected_is_relative);
2973     if (succeed_is_rel && is_relative) {
2974       std::string resolved_url;
2975       StdStringCanonOutput output(&resolved_url);
2976       Parsed resolved_parsed;
2977 
2978       bool succeed_resolve = ResolveRelativeURL(
2979           relative_case.base.data(), parsed, relative_case.is_base_hier,
2980           relative_case.rel.data(), relative_component, nullptr, &output,
2981           &resolved_parsed);
2982       output.Complete();
2983 
2984       EXPECT_EQ(succeed_resolve, relative_case.expected_succeed_resolve);
2985       EXPECT_EQ(resolved_url, relative_case.expected_resolved_url);
2986     }
2987   }
2988 
2989   bool use_standard_compliant_non_special_scheme_url_parsing_;
2990 
2991  private:
2992   base::test::ScopedFeatureList scoped_feature_list_;
2993 };
2994 
TEST_P(URLCanonTypedTest, NonSpecialResolveRelativeURL) {
  // Test flag-dependent behaviors of non-special URLs: resolving "path"
  // against "git://host" is expected to yield a different URL depending on
  // whether the feature is enabled.
  const std::string_view expected_resolved =
      use_standard_compliant_non_special_scheme_url_parsing_
          ? "git://host/path"
          : "git://path";

  const ResolveRelativeURLCase flag_dependent_cases[] = {
      {"git://host", "path", true, true, true, true, expected_resolved},
  };
  for (const auto& relative_case : flag_dependent_cases) {
    TestNonSpecialResolveRelativeURL(relative_case);
  }
}

// Run every URLCanonTypedTest with the parameter set to both false and true.
INSTANTIATE_TEST_SUITE_P(All, URLCanonTypedTest, ::testing::Bool());
3015 
3016 // It used to be the case that when we did a replacement with a long buffer of
3017 // UTF-16 characters, we would get invalid data in the URL. This is because the
3018 // buffer that it used to hold the UTF-8 data was resized, while some pointers
3019 // were still kept to the old buffer that was removed.
TEST(URLCanonTest,ReplacementOverflow)3020 TEST(URLCanonTest, ReplacementOverflow) {
3021   const char src[] = "file:///C:/foo/bar";
3022   int src_len = static_cast<int>(strlen(src));
3023   Parsed parsed;
3024   ParseFileURL(src, src_len, &parsed);
3025 
3026   // Override two components, the path with something short, and the query with
3027   // something long enough to trigger the bug.
3028   Replacements<char16_t> repl;
3029   std::u16string new_query;
3030   for (int i = 0; i < 4800; i++)
3031     new_query.push_back('a');
3032 
3033   std::u16string new_path(test_utils::TruncateWStringToUTF16(L"/foo"));
3034   repl.SetPath(new_path.c_str(), Component(0, 4));
3035   repl.SetQuery(new_query.c_str(),
3036                 Component(0, static_cast<int>(new_query.length())));
3037 
3038   // Call ReplaceComponents on the string. It doesn't matter if we call it for
3039   // standard URLs, file URLs, etc, since they will go to the same replacement
3040   // function that was buggy.
3041   Parsed repl_parsed;
3042   std::string repl_str;
3043   StdStringCanonOutput repl_output(&repl_str);
3044   ReplaceFileURL(src, parsed, repl, nullptr, &repl_output, &repl_parsed);
3045   repl_output.Complete();
3046 
3047   // Generate the expected string and check.
3048   std::string expected("file:///foo?");
3049   for (size_t i = 0; i < new_query.length(); i++)
3050     expected.push_back('a');
3051   EXPECT_TRUE(expected == repl_str);
3052 }
3053 
// Checks the port DefaultPortForScheme() reports for a set of schemes. The
// table expects lower-case known schemes to map to their ports, and both an
// unknown scheme and upper-case spellings to map to PORT_UNSPECIFIED.
TEST(URLCanonTest, DefaultPortForScheme) {
  struct SchemePort {
    const char* scheme;
    const int expected_port;
  };
  const SchemePort expectations[] = {
      {"http", 80},
      {"https", 443},
      {"ftp", 21},
      {"ws", 80},
      {"wss", 443},
      {"fake-scheme", PORT_UNSPECIFIED},
      {"HTTP", PORT_UNSPECIFIED},
      {"HTTPS", PORT_UNSPECIFIED},
      {"FTP", PORT_UNSPECIFIED},
      {"WS", PORT_UNSPECIFIED},
      {"WSS", PORT_UNSPECIFIED},
  };

  for (const SchemePort& expectation : expectations) {
    SCOPED_TRACE(expectation.scheme);
    const auto actual_port =
        DefaultPortForScheme(expectation.scheme, strlen(expectation.scheme));
    EXPECT_EQ(expectation.expected_port, actual_port);
  }
}
3078 
// Runs FindWindowsDriveLetter() over the [begin, end) range of each spec, for
// both the 8-bit input and its UTF-16 conversion, and compares the returned
// position with the expected one (-1 in the table means none is expected).
TEST(URLCanonTest, FindWindowsDriveLetter) {
  struct TestCase {
    std::string_view spec;
    int begin;
    int end;  // -1 for end of spec
    int expected_drive_letter_pos;
  } cases[] = {
      {"/", 0, -1, -1},

      {"c:/foo", 0, -1, 0},
      {"/c:/foo", 0, -1, 1},
      {"//c:/foo", 0, -1, -1},  // "//" does not canonicalize to "/"
      {"\\C|\\foo", 0, -1, 1},
      {"/cd:/foo", 0, -1, -1},  // "/c" does not canonicalize to "/"
      {"/./c:/foo", 0, -1, 3},
      {"/.//c:/foo", 0, -1, -1},  // "/.//" does not canonicalize to "/"
      {"/././c:/foo", 0, -1, 5},
      {"/abc/c:/foo", 0, -1, -1},  // "/abc/" does not canonicalize to "/"
      {"/abc/./../c:/foo", 0, -1, 10},

      {"/c:/c:/foo", 3, -1, 4},  // actual input is "/c:/foo"
      {"/c:/foo", 3, -1, -1},    // actual input is "/foo"
      {"/c:/foo", 0, 1, -1},     // actual input is "/"
  };

  for (const auto& test_case : cases) {
    // Translate the "-1 means end of spec" shorthand into a real offset.
    const int range_end = (test_case.end == -1)
                              ? static_cast<int>(test_case.spec.size())
                              : test_case.end;

    const auto pos_utf8 = FindWindowsDriveLetter(test_case.spec.data(),
                                                 test_case.begin, range_end);
    EXPECT_EQ(test_case.expected_drive_letter_pos, pos_utf8)
        << "for " << test_case.spec << "[" << test_case.begin << ":"
        << range_end << "] (UTF-8)";

    const std::u16string spec_utf16 = base::ASCIIToUTF16(test_case.spec);
    const auto pos_utf16 = FindWindowsDriveLetter(spec_utf16.data(),
                                                  test_case.begin, range_end);
    EXPECT_EQ(test_case.expected_drive_letter_pos, pos_utf16)
        << "for " << test_case.spec << "[" << test_case.begin << ":"
        << range_end << "] (UTF-16)";
  }
}
3119 
// Exercises IDNToASCII() on ASCII, mixed and non-ASCII labels, and on labels
// that already carry the "xn--" prefix.
TEST(URLCanonTest, IDNToASCII) {
  RawCanonOutputW<1024> output;

  // Snapshot of what has been written to |output|. Exactly length()
  // characters are read, so the comparison neither depends on the buffer
  // being NUL-terminated nor picks up characters left behind by an earlier,
  // longer conversion in this reused buffer.
  const auto written = [&output]() {
    return std::u16string(output.data(), output.length());
  };

  // Basic ASCII test.
  std::u16string str = u"hello";
  EXPECT_TRUE(IDNToASCII(str, &output));
  EXPECT_EQ(u"hello", written());
  output.set_length(0);

  // Mixed ASCII/non-ASCII.
  str = u"hellö";
  EXPECT_TRUE(IDNToASCII(str, &output));
  EXPECT_EQ(u"xn--hell-8qa", written());
  output.set_length(0);

  // All non-ASCII.
  str = u"你好";
  EXPECT_TRUE(IDNToASCII(str, &output));
  EXPECT_EQ(u"xn--6qq79v", written());
  output.set_length(0);

  // Characters that need mapping (the resulting Punycode is the encoding for
  // "1⁄4").
  str = u"¼";
  EXPECT_TRUE(IDNToASCII(str, &output));
  EXPECT_EQ(u"xn--14-c6t", written());
  output.set_length(0);

  // String to encode already starts with "xn--", and all ASCII. Should not
  // modify the string.
  str = u"xn--hell-8qa";
  EXPECT_TRUE(IDNToASCII(str, &output));
  EXPECT_EQ(u"xn--hell-8qa", written());
  output.set_length(0);

  // String to encode already starts with "xn--", and mixed ASCII/non-ASCII.
  // Should fail, due to a special case: if the label starts with "xn--", it
  // should be parsed as Punycode, which must be all ASCII.
  str = u"xn--hellö";
  EXPECT_FALSE(IDNToASCII(str, &output));
  output.set_length(0);

  // String to encode already starts with "xn--", and mixed ASCII/non-ASCII.
  // This tests that there is still an error for the character '⁄' (U+2044),
  // which would be a valid ASCII character, U+0044, if the high byte were
  // ignored.
  str = u"xn--1⁄4";
  EXPECT_FALSE(IDNToASCII(str, &output));
  output.set_length(0);
}
3170 
ComponentCaseMatches(bool success,std::string_view out_str,const Component & out_comp,const DualComponentCase & expected)3171 void ComponentCaseMatches(bool success,
3172                           std::string_view out_str,
3173                           const Component& out_comp,
3174                           const DualComponentCase& expected) {
3175   EXPECT_EQ(success, expected.expected_success);
3176   EXPECT_STREQ(out_str.data(), expected.expected);
3177   EXPECT_EQ(out_comp, expected.expected_component);
3178 }
3179 
// Feeds each host through CanonicalizeNonSpecialHost(), first as 8-bit input
// and then as UTF-16 input, and checks the result with
// ComponentCaseMatches().
TEST(URLCanonTest, OpaqueHost) {
  DualComponentCase host_cases[] = {
      {"", L"", "", Component(), true},
      {"google.com", L"google.com", "google.com", Component(0, 10), true},
      // Upper case letters should be preserved.
      {"gooGle.com", L"gooGle.com", "gooGle.com", Component(0, 10), true},
      {"\x41", L"\x41", "A", Component(0, 1), true},
      {"\x61", L"\x61", "a", Component(0, 1), true},
      // Percent encode.
      {"\x10", L"\x10", "%10", Component(0, 3), true},
      // A valid percent encoding should be preserved.
      {"%41", L"%41", "%41", Component(0, 3), true},
      // An invalid percent encoding should be preserved too.
      {"%zz", L"%zz", "%zz", Component(0, 3), true},
      // UTF-16 HIRAGANA LETTER A (codepoint U+3042, "\xe3\x81\x82" in UTF-8).
      {"\xe3\x81\x82", L"\x3042", "%E3%81%82", Component(0, 9), true},
  };

  // 8-bit version.
  for (const auto& test_case : host_cases) {
    SCOPED_TRACE(testing::Message() << "url: \"" << test_case.input8 << "\"");
    const Component in_comp(0, static_cast<int>(strlen(test_case.input8)));
    std::string canonical;
    StdStringCanonOutput canon_output(&canonical);
    Component canonical_comp;
    const bool ok = CanonicalizeNonSpecialHost(test_case.input8, in_comp,
                                               canon_output, canonical_comp);
    canon_output.Complete();
    ComponentCaseMatches(ok, canonical, canonical_comp, test_case);
  }

  // UTF-16 version.
  for (const auto& test_case : host_cases) {
    SCOPED_TRACE(testing::Message() << "url: \"" << test_case.input16 << "\"");
    const std::u16string wide_input(
        test_utils::TruncateWStringToUTF16(test_case.input16));
    const Component in_comp(0, static_cast<int>(wide_input.length()));
    std::string canonical;
    StdStringCanonOutput canon_output(&canonical);
    Component canonical_comp;
    const bool ok = CanonicalizeNonSpecialHost(wide_input.c_str(), in_comp,
                                               canon_output, canonical_comp);
    canon_output.Complete();
    ComponentCaseMatches(ok, canonical, canonical_comp, test_case);
  }
}
3226 
IPAddressCaseMatches(std::string_view out_str,const CanonHostInfo & host_info,const IPAddressCase & expected)3227 void IPAddressCaseMatches(std::string_view out_str,
3228                           const CanonHostInfo& host_info,
3229                           const IPAddressCase& expected) {
3230   EXPECT_EQ(host_info.family, expected.expected_family);
3231   EXPECT_STREQ(out_str.data(), expected.expected);
3232   EXPECT_EQ(base::HexEncode(host_info.address,
3233                             static_cast<size_t>(host_info.AddressLength())),
3234             expected.expected_address_hex);
3235   if (expected.expected_family == CanonHostInfo::IPV4) {
3236     EXPECT_EQ(host_info.num_ipv4_components,
3237               expected.expected_num_ipv4_components);
3238   }
3239 }
3240 
// Feeds each host through CanonicalizeNonSpecialHostVerbose(), first as 8-bit
// input and then as UTF-16 input, and checks the output string and the
// resulting CanonHostInfo with IPAddressCaseMatches().
TEST(URLCanonTest, NonSpecialHostIPv6Address) {
  IPAddressCase ip_address_cases[] = {
      // Non-special URLs don't support IPv4. Family must be NEUTRAL.
      {"192.168.0.1", L"192.168.0.1", "192.168.0.1", Component(0, 11),
       CanonHostInfo::NEUTRAL, 0, ""},
      {"192", L"192", "192", Component(0, 3), CanonHostInfo::NEUTRAL, 0, ""},
      // "257" is allowed since the number is not considered as a part of IPv4.
      {"192.168.0.257", L"192.168.0.257", "192.168.0.257", Component(0, 13),
       CanonHostInfo::NEUTRAL, 0, ""},
      // IPv6.
      {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", Component(0, 14),
       CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},
      {"[::]", L"[::]", "[::]", Component(0, 4), CanonHostInfo::IPV6, -1,
       "00000000000000000000000000000000"},
      // Invalid hosts.
      {"#[::]", L"#[::]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[]", L"[]", "[]", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"a]", L"a]", "a]", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[a", L"[a", "[a", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"a[]", L"a[]", "a[]", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[]a", L"[]a", "[]a", Component(), CanonHostInfo::BROKEN, -1, ""},
  };

  // 8-bit version.
  for (const auto& test_case : ip_address_cases) {
    SCOPED_TRACE(testing::Message()
                 << "url: \"" << test_case.input8 << "\"");
    const Component in_comp(0, static_cast<int>(strlen(test_case.input8)));
    std::string canonical;
    StdStringCanonOutput canon_output(&canonical);
    CanonHostInfo info;
    CanonicalizeNonSpecialHostVerbose(test_case.input8, in_comp, canon_output,
                                      info);
    canon_output.Complete();
    IPAddressCaseMatches(canonical, info, test_case);
  }

  // UTF-16 version.
  for (const auto& test_case : ip_address_cases) {
    SCOPED_TRACE(testing::Message()
                 << "url: \"" << test_case.input16 << "\"");
    const std::u16string wide_input(
        test_utils::TruncateWStringToUTF16(test_case.input16));
    const Component in_comp(0, static_cast<int>(wide_input.length()));
    std::string canonical;
    StdStringCanonOutput canon_output(&canonical);
    CanonHostInfo info;
    CanonicalizeNonSpecialHostVerbose(wide_input.c_str(), in_comp,
                                      canon_output, info);
    canon_output.Complete();
    IPAddressCaseMatches(canonical, info, test_case);
  }
}
3294 
3295 }  // namespace url
3296