xref: /aosp_15_r20/external/cronet/url/url_util.h (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef URL_URL_UTIL_H_
6 #define URL_URL_UTIL_H_
7 
8 #include <memory>
9 #include <string>
10 #include <string_view>
11 #include <vector>
12 
13 #include "base/component_export.h"
14 #include "url/third_party/mozilla/url_parse.h"
15 #include "url/url_canon.h"
16 #include "url/url_constants.h"
17 
18 namespace url {
19 
20 // Init ------------------------------------------------------------------------
21 
22 // Test-only: used by tests that need to reset schemes. Note that this can only
23 // be used in conjunction with ScopedSchemeRegistryForTests.
24 COMPONENT_EXPORT(URL) void ClearSchemesForTests();
25 
26 class ScopedSchemeRegistryInternal;  // Only forward-declared in this header.
27 
28 // Stores the SchemeRegistry upon creation so that tests can modify a copy of
29 // it; the original SchemeRegistry is restored when this object is deleted.
30 class COMPONENT_EXPORT(URL) ScopedSchemeRegistryForTests {
31  public:
32   ScopedSchemeRegistryForTests();
33   ~ScopedSchemeRegistryForTests();
34 
35  private:
36   std::unique_ptr<ScopedSchemeRegistryInternal> internal_;  // Owned state.
37 };
38 
39 // Schemes ---------------------------------------------------------------------
40 
41 // Changes the behavior of SchemeHostPort / Origin to allow non-standard schemes
42 // to be specified, instead of canonicalizing them to an invalid SchemeHostPort
43 // or opaque Origin, respectively. This is used for Android WebView backwards
44 // compatibility, which allows the use of custom schemes: content hosted in
45 // Android WebView assumes that one URL with a non-standard scheme will be
46 // same-origin with another URL that has the same non-standard scheme.
47 //
48 // Not thread-safe.
49 COMPONENT_EXPORT(URL) void EnableNonStandardSchemesForAndroidWebView();
50 
51 // Returns whether SchemeHostPort and Origin allow non-standard schemes.
52 COMPONENT_EXPORT(URL) bool AllowNonStandardSchemesForAndroidWebView();
53 
54 // The following Add*Scheme methods are not thread-safe and cannot be called
55 // concurrently with any other url_util function. They will assert if the lists
56 // of schemes have been locked (see LockSchemeRegistries), or used.
57 
58 // Adds an application-defined scheme to the internal list of "standard-format"
59 // URL schemes. A standard-format scheme adheres to what RFC 3986 calls "generic
60 // URI syntax" (https://tools.ietf.org/html/rfc3986#section-3).
61 
62 COMPONENT_EXPORT(URL)
63 void AddStandardScheme(const char* new_scheme, SchemeType scheme_type);
64 
65 // Returns the list of schemes registered for "standard" URLs. Note that this
66 // should not be used if you just need to check whether your protocol is
67 // standard. Use the IsStandard() function declared below instead, as it is
68 // much more efficient. This function should only be used where you need to
69 // perform other operations against the standard scheme list.
70 COMPONENT_EXPORT(URL)
71 std::vector<std::string> GetStandardSchemes();
72 
73 // Adds the application-defined scheme |new_scheme| to the internal list of
74 // schemes allowed for referrers (see IsReferrerScheme).
75 COMPONENT_EXPORT(URL)
76 void AddReferrerScheme(const char* new_scheme, SchemeType scheme_type);
77 
// NOTE(review): each Get*Schemes() accessor in this group returns a const
// reference to a vector of strings; presumably that is the list its Add*Scheme()
// sibling appends to — confirm against the implementation.

78 // Adds an application-defined scheme to the list of schemes that do not trigger
79 // mixed content warnings.
80 COMPONENT_EXPORT(URL) void AddSecureScheme(const char* new_scheme);
81 COMPONENT_EXPORT(URL) const std::vector<std::string>& GetSecureSchemes();
82 
83 // Adds an application-defined scheme to the list of schemes that normal pages
84 // cannot link to or access (i.e., with the same security rules as those applied
85 // to "file" URLs).
86 COMPONENT_EXPORT(URL) void AddLocalScheme(const char* new_scheme);
87 COMPONENT_EXPORT(URL) const std::vector<std::string>& GetLocalSchemes();
88 
89 // Adds an application-defined scheme to the list of schemes that cause pages
90 // loaded with them to not have access to pages loaded with any other URL
91 // scheme.
92 COMPONENT_EXPORT(URL) void AddNoAccessScheme(const char* new_scheme);
93 COMPONENT_EXPORT(URL) const std::vector<std::string>& GetNoAccessSchemes();
94 
95 // Adds an application-defined scheme to the list of schemes that can be sent
96 // CORS requests.
97 COMPONENT_EXPORT(URL) void AddCorsEnabledScheme(const char* new_scheme);
98 COMPONENT_EXPORT(URL) const std::vector<std::string>& GetCorsEnabledSchemes();
99 
100 // Adds an application-defined scheme to the list of web schemes that can be
101 // used by the web to store data (e.g. cookies, local storage, ...). This is
102 // to differentiate them from schemes that can store data but are not used on
103 // the web (e.g. an application's internal schemes) or schemes that are used on
104 // the web but cannot store data.
105 COMPONENT_EXPORT(URL) void AddWebStorageScheme(const char* new_scheme);
106 COMPONENT_EXPORT(URL) const std::vector<std::string>& GetWebStorageSchemes();
107 
108 // Adds an application-defined scheme to the list of schemes that can bypass the
109 // Content-Security-Policy (CSP) checks.
110 COMPONENT_EXPORT(URL) void AddCSPBypassingScheme(const char* new_scheme);
111 COMPONENT_EXPORT(URL) const std::vector<std::string>& GetCSPBypassingSchemes();
112 
113 // Adds an application-defined scheme to the list of schemes that are strictly
114 // empty documents, allowing them to commit synchronously.
115 COMPONENT_EXPORT(URL) void AddEmptyDocumentScheme(const char* new_scheme);
116 COMPONENT_EXPORT(URL) const std::vector<std::string>& GetEmptyDocumentSchemes();
117 
118 // Adds a scheme with a predefined default handler.
119 //
120 // The (|new_scheme|, |handler|) pair of strings must be normalized protocol
121 // handler parameters as described in the Custom Handler specification.
122 // https://html.spec.whatwg.org/multipage/system-state.html#normalize-protocol-handler-parameters
123 COMPONENT_EXPORT(URL)
124 void AddPredefinedHandlerScheme(const char* new_scheme, const char* handler);
// Returns, by value, a vector of string pairs.
// NOTE(review): presumably each pair is a (scheme, handler) registered through
// AddPredefinedHandlerScheme() — confirm. Also, std::pair is used here but
// <utility> is not among the headers this file includes directly.
125 COMPONENT_EXPORT(URL)
126 std::vector<std::pair<std::string, std::string>> GetPredefinedHandlerSchemes();
127 
128 // Sets a flag to prevent future calls to Add*Scheme from succeeding.
129 //
130 // This is designed to help prevent errors in multithreaded applications.
131 // Normal usage would be to call Add*Scheme for your custom schemes at
132 // the beginning of program initialization, and then LockSchemeRegistries. This
133 // prevents future callers from mistakenly calling Add*Scheme when the
134 // program is running with multiple threads, where such usage would be
135 // dangerous.
136 //
137 // We could have had Add*Scheme use a lock instead, but that would add
138 // some platform-specific dependencies we don't otherwise have now, and is
139 // overkill considering that the normal usage is so simple.
140 COMPONENT_EXPORT(URL) void LockSchemeRegistries();
141 
142 // Locates the scheme in the given string and places it into |found_scheme|,
143 // which may be nullptr to indicate the caller does not care about the range.
144 //
145 // Returns whether the given |compare| scheme matches the scheme found in the
146 // input (if any). The |compare| scheme must be a valid canonical scheme or
147 // the result of the comparison is undefined.
148 COMPONENT_EXPORT(URL)
149 bool FindAndCompareScheme(const char* str,
150                           int str_len,
151                           const char* compare,
152                           Component* found_scheme);
153 COMPONENT_EXPORT(URL)
154 bool FindAndCompareScheme(const char16_t* str,
155                           int str_len,
156                           const char* compare,
157                           Component* found_scheme);
FindAndCompareScheme(const std::string & str,const char * compare,Component * found_scheme)158 inline bool FindAndCompareScheme(const std::string& str,
159                                  const char* compare,
160                                  Component* found_scheme) {
161   return FindAndCompareScheme(str.data(), static_cast<int>(str.size()),
162                               compare, found_scheme);
163 }
FindAndCompareScheme(const std::u16string & str,const char * compare,Component * found_scheme)164 inline bool FindAndCompareScheme(const std::u16string& str,
165                                  const char* compare,
166                                  Component* found_scheme) {
167   return FindAndCompareScheme(str.data(), static_cast<int>(str.size()),
168                               compare, found_scheme);
169 }
170 
171 // Returns true if the scheme identified by the |scheme| component within
172 // |spec| is in the list of known standard-format schemes (see AddStandardScheme).
173 COMPONENT_EXPORT(URL)
174 bool IsStandard(const char* spec, const Component& scheme);
175 COMPONENT_EXPORT(URL)
176 bool IsStandard(const char16_t* spec, const Component& scheme);
177 
// Takes the scheme text directly as a string_view instead of a
// (spec, Component) pair.
// NOTE(review): presumably performs the same standard-scheme check as
// IsStandard(). Unlike its neighbours, this declaration carries no
// COMPONENT_EXPORT(URL) — confirm both.
178 bool IsStandardScheme(std::string_view scheme);
179 
180 // Returns true if the scheme identified by the |scheme| component within
181 // |spec| is in the list of allowed schemes for referrers (see AddReferrerScheme).
182 COMPONENT_EXPORT(URL)
183 bool IsReferrerScheme(const char* spec, const Component& scheme);
184 
185 // If the scheme identified by the |scheme| component within |spec| is in the
186 // list of known standard-format schemes (see AddStandardScheme), sets |type|
187 // to that scheme's SchemeType and returns true.
// NOTE(review): presumably returns false otherwise; whether |type| is touched in
// that case is not stated here — confirm against the implementation.
188 COMPONENT_EXPORT(URL)
189 bool GetStandardSchemeType(const char* spec,
190                            const Component& scheme,
191                            SchemeType* type);
192 COMPONENT_EXPORT(URL)
193 bool GetStandardSchemeType(const char16_t* spec,
194                            const Component& scheme,
195                            SchemeType* type);
196 
197 // Hosts  ----------------------------------------------------------------------
198 
199 // Returns true if the |canonical_host| matches or is in the same domain as the
200 // given |canonical_domain| string. For example, if the canonicalized hostname
201 // is "www.google.com", this will return true for "com", "google.com", and
202 // "www.google.com" domains.
203 //
204 // If either of the input string_views is empty, the return value is false. The
205 // input domain should match host canonicalization rules, i.e., it should be
206 // lowercase except for escape chars.
207 COMPONENT_EXPORT(URL)
208 bool DomainIs(std::string_view canonical_host,
209               std::string_view canonical_domain);
210 
211 // Returns true if the hostname |host| is an IP address. Note: this function
212 // isn't very cheap, as it must re-parse the host to verify.
213 COMPONENT_EXPORT(URL) bool HostIsIPAddress(std::string_view host);
214 
215 // URL library wrappers --------------------------------------------------------
216 
217 // Parses the given spec according to the extracted scheme type. Normal users
218 // should use the URL object, although this may be useful if performance is
219 // critical and you don't want to do the heap allocation for the std::string.
220 //
221 // As with the Canonicalize* functions, the charset converter can
222 // be nullptr to use UTF-8 (it will be faster in this case).
223 //
224 // Returns true if a valid URL was produced, false if not. On failure, the
225 // output and parsed structures will still be filled and will be consistent,
226 // but they will not represent a loadable URL.
227 COMPONENT_EXPORT(URL)
228 bool Canonicalize(const char* spec,
229                   int spec_len,
230                   bool trim_path_end,
231                   CharsetConverter* charset_converter,
232                   CanonOutput* output,
233                   Parsed* output_parsed);
234 COMPONENT_EXPORT(URL)
235 bool Canonicalize(const char16_t* spec,
236                   int spec_len,
237                   bool trim_path_end,
238                   CharsetConverter* charset_converter,
239                   CanonOutput* output,
240                   Parsed* output_parsed);
241 
242 // Resolves a potentially relative URL relative to the given parsed base URL.
243 // The base MUST be valid. The resulting canonical URL and parsed information
244 // will be placed into the given out variables.
245 //
246 // The |relative| input need not be relative. If we discover that it's absolute,
247 // this will produce a canonical version of that URL. See Canonicalize() for
248 // more about the charset_converter.
249 //
250 // Returns true if the output is valid, false if the input could not produce
251 // a valid URL.
252 COMPONENT_EXPORT(URL)
253 bool ResolveRelative(const char* base_spec,
254                      int base_spec_len,
255                      const Parsed& base_parsed,
256                      const char* relative,
257                      int relative_length,
258                      CharsetConverter* charset_converter,
259                      CanonOutput* output,
260                      Parsed* output_parsed);
261 COMPONENT_EXPORT(URL)
262 bool ResolveRelative(const char* base_spec,
263                      int base_spec_len,
264                      const Parsed& base_parsed,
265                      const char16_t* relative,
266                      int relative_length,
267                      CharsetConverter* charset_converter,
268                      CanonOutput* output,
269                      Parsed* output_parsed);
270 
271 // Replaces components in the given VALID input URL. The new canonical URL info
272 // is written to |output| and |out_parsed|. The two overloads differ only in the
// character type of |replacements| (char vs. char16_t).
273 //
274 // Returns true if the resulting URL is valid.
275 COMPONENT_EXPORT(URL)
276 bool ReplaceComponents(const char* spec,
277                        int spec_len,
278                        const Parsed& parsed,
279                        const Replacements<char>& replacements,
280                        CharsetConverter* charset_converter,
281                        CanonOutput* output,
282                        Parsed* out_parsed);
283 COMPONENT_EXPORT(URL)
284 bool ReplaceComponents(const char* spec,
285                        int spec_len,
286                        const Parsed& parsed,
287                        const Replacements<char16_t>& replacements,
288                        CharsetConverter* charset_converter,
289                        CanonOutput* output,
290                        Parsed* out_parsed);
291 
292 // String helper functions -----------------------------------------------------
293 
// Decoding mode accepted by DecodeURLEscapeSequences().
294 enum class DecodeURLMode {
295   // UTF-8 decode only. Invalid byte sequences are replaced with U+FFFD.
296   kUTF8,
297   // Try UTF-8 decoding first. If the input contains byte sequences that are
298   // invalid for UTF-8, apply a byte-to-Unicode mapping instead.
299   kUTF8OrIsomorphic,
300 };
301 
302 // Unescapes the given string |input| using URL escaping rules; see
// DecodeURLMode for the decoding choices selected by |mode|.
// NOTE(review): presumably the decoded text is written to |output| — confirm.
303 COMPONENT_EXPORT(URL)
304 void DecodeURLEscapeSequences(std::string_view input,
305                               DecodeURLMode mode,
306                               CanonOutputW* output);
307 
308 // Escapes the given string |input| as defined by the JS method
// encodeURIComponent. See
309 // https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/encodeURIComponent
310 COMPONENT_EXPORT(URL)
311 void EncodeURIComponent(std::string_view input, CanonOutput* output);
312 
313 // Returns true if `c` is a character that does not require escaping in
314 // encodeURIComponent (see EncodeURIComponent()).
315 // TODO(crbug.com/1481056): Remove this when event-level reportEvent is removed
316 // (if it is still this function's only consumer).
317 COMPONENT_EXPORT(URL)
318 bool IsURIComponentChar(char c);
319 
320 // Checks an arbitrary string for invalid escape sequences.
321 //
322 // A valid percent-encoding is '%' followed by exactly two hex digits. This
323 // function returns true if an occurrence of '%' is found that is followed by
324 // anything other than two hex digits.
325 COMPONENT_EXPORT(URL)
326 bool HasInvalidURLEscapeSequences(std::string_view input);
327 
328 // Returns whether |scheme| is affected by the Android WebView Hack.
329 bool IsAndroidWebViewHackEnabledScheme(std::string_view scheme);
330 }  // namespace url
331 
332 #endif  // URL_URL_UTIL_H_
333