xref: /aosp_15_r20/external/cronet/net/base/url_util.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "net/base/url_util.h"
6 
7 #include "build/build_config.h"
8 
9 #if BUILDFLAG(IS_POSIX)
10 #include <netinet/in.h>
11 #elif BUILDFLAG(IS_WIN)
12 #include <ws2tcpip.h>
13 #endif
14 
15 #include <optional>
16 #include <string_view>
17 
18 #include "base/check_op.h"
19 #include "base/containers/fixed_flat_set.h"
20 #include "base/strings/escape.h"
21 #include "base/strings/strcat.h"
22 #include "base/strings/string_util.h"
23 #include "base/strings/stringprintf.h"
24 #include "base/strings/utf_string_conversions.h"
25 #include "net/base/ip_address.h"
26 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
27 #include "url/gurl.h"
28 #include "url/scheme_host_port.h"
29 #include "url/url_canon.h"
30 #include "url/url_canon_internal.h"
31 #include "url/url_canon_ip.h"
32 #include "url/url_constants.h"
33 #include "url/url_util.h"
34 
35 namespace net {
36 
37 namespace {
38 
// Returns true if |c| is a lowercase ASCII letter or an ASCII digit.
// Uppercase letters are deliberately not accepted: callers pass hosts whose
// uppercase characters have already been normalized away.
bool IsHostCharAlphanumeric(char c) {
  const bool is_lowercase_letter = ('a' <= c) && (c <= 'z');
  const bool is_digit = ('0' <= c) && (c <= '9');
  return is_lowercase_letter || is_digit;
}
44 
IsNormalizedLocalhostTLD(std::string_view host)45 bool IsNormalizedLocalhostTLD(std::string_view host) {
46   return base::EndsWith(host, ".localhost",
47                         base::CompareCase::INSENSITIVE_ASCII);
48 }
49 
50 // Helper function used by GetIdentityFromURL. If |escaped_text| can be "safely
51 // unescaped" to a valid UTF-8 string, return that string, as UTF-16. Otherwise,
52 // convert it as-is to UTF-16. "Safely unescaped" is defined as having no
53 // escaped character between '0x00' and '0x1F', inclusive.
UnescapeIdentityString(std::string_view escaped_text)54 std::u16string UnescapeIdentityString(std::string_view escaped_text) {
55   std::string unescaped_text;
56   if (base::UnescapeBinaryURLComponentSafe(
57           escaped_text, false /* fail_on_path_separators */, &unescaped_text)) {
58     std::u16string result;
59     if (base::UTF8ToUTF16(unescaped_text.data(), unescaped_text.length(),
60                           &result)) {
61       return result;
62     }
63   }
64   return base::UTF8ToUTF16(escaped_text);
65 }
66 
67 }  // namespace
68 
// Returns a copy of |url| whose query has an extra "name=value" pair appended.
// Both |name| and |value| are escaped with base::EscapeQueryParamValue(), and
// the pair is separated from any existing query text by '&'.
GURL AppendQueryParameter(const GURL& url,
                          std::string_view name,
                          std::string_view value) {
  const std::string escaped_name = base::EscapeQueryParamValue(name, true);
  const std::string escaped_value = base::EscapeQueryParamValue(value, true);

  std::string new_query(url.query());
  if (!new_query.empty()) {
    new_query += "&";
  }
  new_query += escaped_name;
  new_query += "=";
  new_query += escaped_value;

  GURL::Replacements query_replacement;
  query_replacement.SetQueryStr(new_query);
  return url.ReplaceComponents(query_replacement);
}
83 
// Returns a copy of |url| in which the first query pair whose key equals the
// escaped form of |name| is replaced by "name=value". If no such pair exists
// the new pair is appended. When |value| is std::nullopt the first matching
// pair is dropped instead and nothing is appended. All other pairs are copied
// through verbatim, joined with '&'.
GURL AppendOrReplaceQueryParameter(const GURL& url,
                                   std::string_view name,
                                   std::optional<std::string_view> value) {
  const std::string escaped_name = base::EscapeQueryParamValue(name, true);
  const bool keep_param = value.has_value();

  // The text that stands in for the matched pair; only built when the
  // parameter is being kept.
  std::string replacement_pair;
  if (keep_param) {
    replacement_pair =
        escaped_name + "=" + base::EscapeQueryParamValue(*value, true);
  }

  std::string new_query;
  const auto append_pair = [&new_query](std::string_view pair) {
    if (!new_query.empty()) {
      new_query += "&";
    }
    new_query += pair;
  };

  const std::string_view old_query = url.query_piece();
  url::Component remaining(0, old_query.size());
  url::Component key_part;
  url::Component value_part;
  bool handled_target = false;
  while (url::ExtractQueryKeyValue(old_query, &remaining, &key_part,
                                   &value_part)) {
    // Only the first pair with a matching key is treated as the target.
    const bool is_target =
        !handled_target &&
        old_query.substr(key_part.begin, key_part.len) == escaped_name;
    if (!is_target) {
      // Copy the pair through unchanged, from key start to value end.
      append_pair(old_query.substr(key_part.begin,
                                   value_part.end() - key_part.begin));
      continue;
    }

    handled_target = true;
    if (keep_param) {
      append_pair(replacement_pair);
    }
  }

  if (!handled_target && keep_param) {
    append_pair(replacement_pair);
  }

  GURL::Replacements query_replacement;
  query_replacement.SetQueryStr(new_query);
  return url.ReplaceComponents(query_replacement);
}
128 
// Returns a copy of |url| whose ref component is set to |ref|.
GURL AppendOrReplaceRef(const GURL& url, const std::string_view& ref) {
  GURL::Replacements ref_replacement;
  ref_replacement.SetRefStr(ref);
  const GURL url_with_ref = url.ReplaceComponents(ref_replacement);
  return url_with_ref;
}
134 
QueryIterator(const GURL & url)135 QueryIterator::QueryIterator(const GURL& url)
136     : url_(url), at_end_(!url.is_valid()) {
137   if (!at_end_) {
138     query_ = url.parsed_for_possibly_invalid_spec().query;
139     Advance();
140   }
141 }
142 
// Destructor, defaulted here in the implementation file.
143 QueryIterator::~QueryIterator() = default;
144 
GetKey() const145 std::string_view QueryIterator::GetKey() const {
146   DCHECK(!at_end_);
147   if (key_.is_nonempty())
148     return std::string_view(url_->spec()).substr(key_.begin, key_.len);
149   return std::string_view();
150 }
151 
GetValue() const152 std::string_view QueryIterator::GetValue() const {
153   DCHECK(!at_end_);
154   if (value_.is_nonempty())
155     return std::string_view(url_->spec()).substr(value_.begin, value_.len);
156   return std::string_view();
157 }
158 
GetUnescapedValue()159 const std::string& QueryIterator::GetUnescapedValue() {
160   DCHECK(!at_end_);
161   if (value_.is_nonempty() && unescaped_value_.empty()) {
162     unescaped_value_ = base::UnescapeURLComponent(
163         GetValue(),
164         base::UnescapeRule::SPACES | base::UnescapeRule::PATH_SEPARATORS |
165             base::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS |
166             base::UnescapeRule::REPLACE_PLUS_WITH_SPACE);
167   }
168   return unescaped_value_;
169 }
170 
// Returns |at_end_|: true when the iterator was built from an invalid URL or
// when the last Advance() found no further key/value pair.
IsAtEnd() const171 bool QueryIterator::IsAtEnd() const {
172   return at_end_;
173 }
174 
Advance()175 void QueryIterator::Advance() {
176   DCHECK(!at_end_);
177   key_.reset();
178   value_.reset();
179   unescaped_value_.clear();
180   at_end_ = !url::ExtractQueryKeyValue(url_->spec(), &query_, &key_, &value_);
181 }
182 
GetValueForKeyInQuery(const GURL & url,std::string_view search_key,std::string * out_value)183 bool GetValueForKeyInQuery(const GURL& url,
184                            std::string_view search_key,
185                            std::string* out_value) {
186   for (QueryIterator it(url); !it.IsAtEnd(); it.Advance()) {
187     if (it.GetKey() == search_key) {
188       *out_value = it.GetUnescapedValue();
189       return true;
190     }
191   }
192   return false;
193 }
194 
ParseHostAndPort(std::string_view input,std::string * host,int * port)195 bool ParseHostAndPort(std::string_view input, std::string* host, int* port) {
196   if (input.empty())
197     return false;
198 
199   url::Component auth_component(0, input.size());
200   url::Component username_component;
201   url::Component password_component;
202   url::Component hostname_component;
203   url::Component port_component;
204 
205   // `input` is not NUL-terminated, so `input.data()` must be accompanied by a
206   // length. In these calls, `url::Component` provides an offset and length.
207   url::ParseAuthority(input.data(), auth_component, &username_component,
208                       &password_component, &hostname_component,
209                       &port_component);
210 
211   // There shouldn't be a username/password.
212   if (username_component.is_valid() || password_component.is_valid())
213     return false;
214 
215   if (hostname_component.is_empty())
216     return false;  // Failed parsing.
217 
218   int parsed_port_number = -1;
219   if (port_component.is_nonempty()) {
220     parsed_port_number = url::ParsePort(input.data(), port_component);
221 
222     // If parsing failed, port_number will be either PORT_INVALID or
223     // PORT_UNSPECIFIED, both of which are negative.
224     if (parsed_port_number < 0)
225       return false;  // Failed parsing the port number.
226   }
227 
228   if (port_component.len == 0)
229     return false;  // Reject inputs like "foo:"
230 
231   unsigned char tmp_ipv6_addr[16];
232 
233   // If the hostname starts with a bracket, it is either an IPv6 literal or
234   // invalid. If it is an IPv6 literal then strip the brackets.
235   if (hostname_component.len > 0 && input[hostname_component.begin] == '[') {
236     if (input[hostname_component.end() - 1] == ']' &&
237         url::IPv6AddressToNumber(input.data(), hostname_component,
238                                  tmp_ipv6_addr)) {
239       // Strip the brackets.
240       hostname_component.begin++;
241       hostname_component.len -= 2;
242     } else {
243       return false;
244     }
245   }
246 
247   // Pass results back to caller.
248   *host = std::string(
249       input.substr(hostname_component.begin, hostname_component.len));
250   *port = parsed_port_number;
251 
252   return true;  // Success.
253 }
254 
GetHostAndPort(const GURL & url)255 std::string GetHostAndPort(const GURL& url) {
256   // For IPv6 literals, GURL::host() already includes the brackets so it is
257   // safe to just append a colon.
258   return base::StringPrintf("%s:%d", url.host().c_str(),
259                             url.EffectiveIntPort());
260 }
261 
GetHostAndOptionalPort(const GURL & url)262 std::string GetHostAndOptionalPort(const GURL& url) {
263   // For IPv6 literals, GURL::host() already includes the brackets
264   // so it is safe to just append a colon.
265   if (url.has_port())
266     return base::StringPrintf("%s:%s", url.host().c_str(), url.port().c_str());
267   return url.host();
268 }
269 
GetHostAndOptionalPort(const url::SchemeHostPort & scheme_host_port)270 NET_EXPORT std::string GetHostAndOptionalPort(
271     const url::SchemeHostPort& scheme_host_port) {
272   int default_port = url::DefaultPortForScheme(
273       scheme_host_port.scheme().data(),
274       static_cast<int>(scheme_host_port.scheme().length()));
275   if (default_port != scheme_host_port.port()) {
276     return base::StringPrintf("%s:%i", scheme_host_port.host().c_str(),
277                               scheme_host_port.port());
278   }
279   return scheme_host_port.host();
280 }
281 
// Returns |host| with one trailing '.' removed, if there is one. A host that
// consists of a single character is returned unchanged, so "." stays ".".
std::string TrimEndingDot(std::string_view host) {
  const bool has_removable_dot = host.length() > 1 && host.back() == '.';
  if (has_removable_dot) {
    return std::string(host.substr(0, host.length() - 1));
  }
  return std::string(host);
}
290 
GetHostOrSpecFromURL(const GURL & url)291 std::string GetHostOrSpecFromURL(const GURL& url) {
292   return url.has_host() ? TrimEndingDot(url.host_piece()) : url.spec();
293 }
294 
// Returns everything after the first '.' in |domain|, or an empty string when
// |domain| contains no '.'.
std::string GetSuperdomain(std::string_view domain) {
  const size_t first_dot = domain.find('.');
  const bool has_dot = (first_dot != std::string_view::npos);
  return has_dot ? std::string(domain.substr(first_dot + 1)) : std::string();
}
301 
// Returns true if |subdomain| equals |superdomain|, or if |subdomain| ends
// with |superdomain| and the character immediately before that suffix is a
// '.'.
bool IsSubdomainOf(std::string_view subdomain, std::string_view superdomain) {
  const size_t sub_length = subdomain.length();
  const size_t super_length = superdomain.length();

  // A name that is not strictly longer can only match by being identical.
  if (sub_length <= super_length) {
    return subdomain == superdomain;
  }

  // |subdomain| is strictly longer here, so |tail_start| is at least 1 and
  // the character in front of the tail always exists.
  const size_t tail_start = sub_length - super_length;
  if (subdomain.substr(tail_start) != superdomain) {
    return false;
  }
  return subdomain[tail_start - 1] == '.';
}
316 
CanonicalizeHost(std::string_view host,url::CanonHostInfo * host_info)317 std::string CanonicalizeHost(std::string_view host,
318                              url::CanonHostInfo* host_info) {
319   // Try to canonicalize the host.
320   const url::Component raw_host_component(0, static_cast<int>(host.length()));
321   std::string canon_host;
322   url::StdStringCanonOutput canon_host_output(&canon_host);
323   // A url::StdStringCanonOutput starts off with a zero length buffer. The
324   // first time through Grow() immediately resizes it to 32 bytes, incurring
325   // a malloc. With libcxx a 22 byte or smaller request can be accommodated
326   // within the std::string itself (i.e. no malloc occurs). Start the buffer
327   // off at the max size to avoid a malloc on short strings.
328   // NOTE: To ensure the final size is correctly reflected, it's necessary
329   // to call Complete() which will adjust the size to the actual bytes written.
330   // This is handled below for success cases, while failure cases discard all
331   // the output.
332   const int kCxxMaxStringBufferSizeWithoutMalloc = 22;
333   canon_host_output.Resize(kCxxMaxStringBufferSizeWithoutMalloc);
334   url::CanonicalizeHostVerbose(host.data(), raw_host_component,
335                                &canon_host_output, host_info);
336 
337   if (host_info->out_host.is_nonempty() &&
338       host_info->family != url::CanonHostInfo::BROKEN) {
339     // Success!  Assert that there's no extra garbage.
340     canon_host_output.Complete();
341     DCHECK_EQ(host_info->out_host.len, static_cast<int>(canon_host.length()));
342   } else {
343     // Empty host, or canonicalization failed.  We'll return empty.
344     canon_host.clear();
345   }
346 
347   return canon_host;
348 }
349 
IsCanonicalizedHostCompliant(std::string_view host)350 bool IsCanonicalizedHostCompliant(std::string_view host) {
351   if (host.empty() || host.size() > 254 ||
352       (host.back() != '.' && host.size() == 254)) {
353     return false;
354   }
355 
356   bool in_component = false;
357   bool most_recent_component_started_alphanumeric = false;
358   size_t label_size = 0;
359 
360   for (char c : host) {
361     ++label_size;
362     if (!in_component) {
363       most_recent_component_started_alphanumeric = IsHostCharAlphanumeric(c);
364       if (!most_recent_component_started_alphanumeric && (c != '-') &&
365           (c != '_')) {
366         return false;
367       }
368       in_component = true;
369     } else if (c == '.') {
370       in_component = false;
371       if (label_size > 64 || label_size == 1) {
372         // Label should not be empty or longer than 63 characters (+1 for '.'
373         // character included in `label_size`).
374         return false;
375       } else {
376         label_size = 0;
377       }
378     } else if (!IsHostCharAlphanumeric(c) && (c != '-') && (c != '_')) {
379       return false;
380     }
381   }
382 
383   // Check for too-long label when not ended with final '.'.
384   if (label_size > 63)
385     return false;
386 
387   return most_recent_component_started_alphanumeric;
388 }
389 
IsHostnameNonUnique(std::string_view hostname)390 bool IsHostnameNonUnique(std::string_view hostname) {
391   // CanonicalizeHost requires surrounding brackets to parse an IPv6 address.
392   const std::string host_or_ip = hostname.find(':') != std::string::npos
393                                      ? base::StrCat({"[", hostname, "]"})
394                                      : std::string(hostname);
395   url::CanonHostInfo host_info;
396   std::string canonical_name = CanonicalizeHost(host_or_ip, &host_info);
397 
398   // If canonicalization fails, then the input is truly malformed. However,
399   // to avoid mis-reporting bad inputs as "non-unique", treat them as unique.
400   if (canonical_name.empty())
401     return false;
402 
403   // If |hostname| is an IP address, check to see if it's in an IANA-reserved
404   // range reserved for non-publicly routable networks.
405   if (host_info.IsIPAddress()) {
406     IPAddress host_addr;
407     if (!host_addr.AssignFromIPLiteral(hostname.substr(
408             host_info.out_host.begin, host_info.out_host.len))) {
409       return false;
410     }
411     switch (host_info.family) {
412       case url::CanonHostInfo::IPV4:
413       case url::CanonHostInfo::IPV6:
414         return !host_addr.IsPubliclyRoutable();
415       case url::CanonHostInfo::NEUTRAL:
416       case url::CanonHostInfo::BROKEN:
417         return false;
418     }
419   }
420 
421   // Check for a registry controlled portion of |hostname|, ignoring private
422   // registries, as they already chain to ICANN-administered registries,
423   // and explicitly ignoring unknown registries.
424   //
425   // Note: This means that as new gTLDs are introduced on the Internet, they
426   // will be treated as non-unique until the registry controlled domain list
427   // is updated. However, because gTLDs are expected to provide significant
428   // advance notice to deprecate older versions of this code, this an
429   // acceptable tradeoff.
430   return !registry_controlled_domains::HostHasRegistryControlledDomain(
431       canonical_name, registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
432       registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
433 }
434 
IsLocalhost(const GURL & url)435 bool IsLocalhost(const GURL& url) {
436   return HostStringIsLocalhost(url.HostNoBracketsPiece());
437 }
438 
HostStringIsLocalhost(std::string_view host)439 bool HostStringIsLocalhost(std::string_view host) {
440   IPAddress ip_address;
441   if (ip_address.AssignFromIPLiteral(host))
442     return ip_address.IsLoopback();
443   return IsLocalHostname(host);
444 }
445 
// Returns |url| with its username, password and ref removed. |url| must be
// valid.
GURL SimplifyUrlForRequest(const GURL& url) {
  DCHECK(url.is_valid());

  // Fast path: when there is nothing to strip, return the URL as is and skip
  // the re-canonicalization that ReplaceComponents performs.
  const bool has_strippable_parts =
      url.has_username() || url.has_password() || url.has_ref();
  if (!has_strippable_parts) {
    return url;
  }

  GURL::Replacements strip_identity_and_ref;
  strip_identity_and_ref.ClearUsername();
  strip_identity_and_ref.ClearPassword();
  strip_identity_and_ref.ClearRef();
  return url.ReplaceComponents(strip_identity_and_ref);
}
457 
// Returns |url| with its scheme changed to https when the scheme is wss, and
// to http otherwise. |url| must have a ws or wss scheme.
GURL ChangeWebSocketSchemeToHttpScheme(const GURL& url) {
  DCHECK(url.SchemeIsWSOrWSS());

  GURL::Replacements scheme_replacement;
  if (url.SchemeIs(url::kWssScheme)) {
    scheme_replacement.SetSchemeStr(url::kHttpsScheme);
  } else {
    scheme_replacement.SetSchemeStr(url::kHttpScheme);
  }
  return url.ReplaceComponents(scheme_replacement);
}
465 
IsStandardSchemeWithNetworkHost(std::string_view scheme)466 bool IsStandardSchemeWithNetworkHost(std::string_view scheme) {
467   // file scheme is special. Windows file share origins can have network hosts.
468   if (scheme == url::kFileScheme)
469     return true;
470 
471   url::SchemeType scheme_type;
472   if (!url::GetStandardSchemeType(
473           scheme.data(), url::Component(0, scheme.length()), &scheme_type)) {
474     return false;
475   }
476   return scheme_type == url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION ||
477          scheme_type == url::SCHEME_WITH_HOST_AND_PORT;
478 }
479 
GetIdentityFromURL(const GURL & url,std::u16string * username,std::u16string * password)480 void GetIdentityFromURL(const GURL& url,
481                         std::u16string* username,
482                         std::u16string* password) {
483   *username = UnescapeIdentityString(url.username());
484   *password = UnescapeIdentityString(url.password());
485 }
486 
HasGoogleHost(const GURL & url)487 bool HasGoogleHost(const GURL& url) {
488   return IsGoogleHost(url.host_piece());
489 }
490 
// Returns true if |host| ends with one of a fixed list of Google-owned domain
// suffixes. Every suffix starts with '.', so a bare "google.com" does not
// match while "www.google.com" does.
bool IsGoogleHost(std::string_view host) {
  static constexpr std::string_view kGoogleHostSuffixes[] = {
      ".google.com",
      ".youtube.com",
      ".gmail.com",
      ".doubleclick.net",
      ".gstatic.com",
      ".googlevideo.com",
      ".googleusercontent.com",
      ".googlesyndication.com",
      ".google-analytics.com",
      ".googleadservices.com",
      ".googleapis.com",
      ".ytimg.com",
  };

  // The comparison is case-sensitive. That is sufficient because the list
  // above is all lowercase, and a GURL's host name is always canonicalized to
  // lowercase as well.
  for (const std::string_view suffix : kGoogleHostSuffixes) {
    if (host.size() < suffix.size()) {
      continue;
    }
    if (host.substr(host.size() - suffix.size()) == suffix) {
      return true;
    }
  }
  return false;
}
516 
IsGoogleHostWithAlpnH3(std::string_view host)517 bool IsGoogleHostWithAlpnH3(std::string_view host) {
518   return base::EqualsCaseInsensitiveASCII(host, "google.com") ||
519          base::EqualsCaseInsensitiveASCII(host, "www.google.com");
520 }
521 
IsLocalHostname(std::string_view host)522 bool IsLocalHostname(std::string_view host) {
523   // Remove any trailing '.'.
524   if (!host.empty() && *host.rbegin() == '.')
525     host.remove_suffix(1);
526 
527   return base::EqualsCaseInsensitiveASCII(host, "localhost") ||
528          IsNormalizedLocalhostTLD(host);
529 }
530 
UnescapePercentEncodedUrl(std::string_view input)531 std::string UnescapePercentEncodedUrl(std::string_view input) {
532   std::string result(input);
533   // Replace any 0x2B (+) with 0x20 (SP).
534   for (char& c : result) {
535     if (c == '+') {
536       c = ' ';
537     }
538   }
539   // Run UTF-8 decoding without BOM on the percent-decoding.
540   url::RawCanonOutputT<char16_t> canon_output;
541   url::DecodeURLEscapeSequences(result, url::DecodeURLMode::kUTF8,
542                                 &canon_output);
543   return base::UTF16ToUTF8(canon_output.view());
544 }
545 
546 }  // namespace net
547