xref: /aosp_15_r20/external/cronet/url/url_canon_pathurl.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Functions for canonicalizing "path" URLs. Not to be confused with the path
6 // of a URL, these are URLs that have no authority section, only a path. For
7 // example, "javascript:" and "data:".
8 
9 #include "url/url_canon.h"
10 #include "url/url_canon_internal.h"
11 
12 namespace url {
13 
14 namespace {
15 
16 // Canonicalize the given |component| from |source| into |output| and
17 // |new_component|. If |separator| is non-zero, it is pre-pended to |output|
18 // prior to the canonicalized component; i.e. for the '?' or '#' characters.
19 template <typename CHAR, typename UCHAR>
DoCanonicalizePathComponent(const CHAR * source,const Component & component,char separator,CanonOutput * output,Component * new_component)20 void DoCanonicalizePathComponent(const CHAR* source,
21                                  const Component& component,
22                                  char separator,
23                                  CanonOutput* output,
24                                  Component* new_component) {
25   if (component.is_valid()) {
26     if (separator)
27       output->push_back(separator);
28     // Copy the path using path URL's more lax escaping rules (think for
29     // javascript:). We convert to UTF-8 and escape characters from the
30     // C0 control percent-encode set, but leave all other characters alone.
31     // This helps readability of JavaScript.
32     // https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state
33     // https://url.spec.whatwg.org/#c0-control-percent-encode-set
34     new_component->begin = output->length();
35     size_t end = static_cast<size_t>(component.end());
36     for (size_t i = static_cast<size_t>(component.begin); i < end; i++) {
37       UCHAR uch = static_cast<UCHAR>(source[i]);
38       if (IsInC0ControlPercentEncodeSet(uch)) {
39         AppendUTF8EscapedChar(source, &i, end, output);
40       } else {
41         output->push_back(static_cast<char>(uch));
42       }
43     }
44     new_component->len = output->length() - new_component->begin;
45   } else {
46     // Empty part.
47     new_component->reset();
48   }
49 }
50 
51 template <typename CHAR, typename UCHAR>
DoCanonicalizePathURL(const URLComponentSource<CHAR> & source,const Parsed & parsed,CanonOutput * output,Parsed * new_parsed)52 bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
53                            const Parsed& parsed,
54                            CanonOutput* output,
55                            Parsed* new_parsed) {
56   // Scheme: this will append the colon.
57   bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
58                                     output, &new_parsed->scheme);
59 
60   // We assume there's no authority for path URLs. Note that hosts should never
61   // have -1 length.
62   new_parsed->username.reset();
63   new_parsed->password.reset();
64   new_parsed->host.reset();
65   new_parsed->port.reset();
66 
67   // Canonicalize path via the weaker path URL rules.
68   //
69   // Note: parsing the path part should never cause a failure, see
70   // https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state
71   DoCanonicalizePathComponent<CHAR, UCHAR>(source.path, parsed.path, '\0',
72                                            output, &new_parsed->path);
73 
74   // Similar to mailto:, always use the default UTF-8 charset converter for
75   // query.
76   CanonicalizeQuery(source.query, parsed.query, nullptr, output,
77                     &new_parsed->query);
78 
79   CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
80 
81   return success;
82 }
83 
84 }  // namespace
85 
CanonicalizePathURL(const char * spec,int spec_len,const Parsed & parsed,CanonOutput * output,Parsed * new_parsed)86 bool CanonicalizePathURL(const char* spec,
87                          int spec_len,
88                          const Parsed& parsed,
89                          CanonOutput* output,
90                          Parsed* new_parsed) {
91   return DoCanonicalizePathURL<char, unsigned char>(
92       URLComponentSource<char>(spec), parsed, output, new_parsed);
93 }
94 
CanonicalizePathURL(const char16_t * spec,int spec_len,const Parsed & parsed,CanonOutput * output,Parsed * new_parsed)95 bool CanonicalizePathURL(const char16_t* spec,
96                          int spec_len,
97                          const Parsed& parsed,
98                          CanonOutput* output,
99                          Parsed* new_parsed) {
100   return DoCanonicalizePathURL<char16_t, char16_t>(
101       URLComponentSource<char16_t>(spec), parsed, output, new_parsed);
102 }
103 
CanonicalizePathURLPath(const char * source,const Component & component,CanonOutput * output,Component * new_component)104 void CanonicalizePathURLPath(const char* source,
105                              const Component& component,
106                              CanonOutput* output,
107                              Component* new_component) {
108   DoCanonicalizePathComponent<char, unsigned char>(source, component, '\0',
109                                                    output, new_component);
110 }
111 
CanonicalizePathURLPath(const char16_t * source,const Component & component,CanonOutput * output,Component * new_component)112 void CanonicalizePathURLPath(const char16_t* source,
113                              const Component& component,
114                              CanonOutput* output,
115                              Component* new_component) {
116   DoCanonicalizePathComponent<char16_t, char16_t>(source, component, '\0',
117                                                   output, new_component);
118 }
119 
ReplacePathURL(const char * base,const Parsed & base_parsed,const Replacements<char> & replacements,CanonOutput * output,Parsed * new_parsed)120 bool ReplacePathURL(const char* base,
121                     const Parsed& base_parsed,
122                     const Replacements<char>& replacements,
123                     CanonOutput* output,
124                     Parsed* new_parsed) {
125   URLComponentSource<char> source(base);
126   Parsed parsed(base_parsed);
127   SetupOverrideComponents(base, replacements, &source, &parsed);
128   return DoCanonicalizePathURL<char, unsigned char>(
129       source, parsed, output, new_parsed);
130 }
131 
ReplacePathURL(const char * base,const Parsed & base_parsed,const Replacements<char16_t> & replacements,CanonOutput * output,Parsed * new_parsed)132 bool ReplacePathURL(const char* base,
133                     const Parsed& base_parsed,
134                     const Replacements<char16_t>& replacements,
135                     CanonOutput* output,
136                     Parsed* new_parsed) {
137   RawCanonOutput<1024> utf8;
138   URLComponentSource<char> source(base);
139   Parsed parsed(base_parsed);
140   SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
141   return DoCanonicalizePathURL<char, unsigned char>(
142       source, parsed, output, new_parsed);
143 }
144 
145 }  // namespace url
146