xref: /aosp_15_r20/external/cronet/base/strings/escape.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2020 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/strings/escape.h"
6 
7 #include <ostream>
8 
9 #include "base/check_op.h"
10 #include "base/feature_list.h"
11 #include "base/features.h"
12 #include "base/strings/string_number_conversions.h"
13 #include "base/strings/string_piece.h"
14 #include "base/strings/string_util.h"
15 #include "base/strings/utf_string_conversion_utils.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/third_party/icu/icu_utf.h"
18 
19 namespace base {
20 
21 namespace {
22 
// A fast bit-vector map keyed by byte value.
//
// Stores 256 bits in an array of eight 32-bit words: the top three bits of a
// byte select the word and the low five bits select the bit inside that word.
struct Charmap {
  // Returns true when the bit that corresponds to |c| is set.
  bool Contains(unsigned char c) const {
    // The one being shifted is an unsigned 32-bit value so that selecting
    // bit 31 never shifts into the sign bit of a signed int.
    return (map[c >> 5] & (uint32_t{1} << (c & 31))) != 0;
  }

  uint32_t map[8];
};
34 
35 // Given text to escape and a Charmap defining which values to escape,
36 // return an escaped string.  If use_plus is true, spaces are converted
37 // to +, otherwise, if spaces are in the charmap, they are converted to
38 // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if
39 // '%' is in the charmap, it is converted to %25.
Escape(StringPiece text,const Charmap & charmap,bool use_plus,bool keep_escaped=false)40 std::string Escape(StringPiece text,
41                    const Charmap& charmap,
42                    bool use_plus,
43                    bool keep_escaped = false) {
44   std::string escaped;
45   escaped.reserve(text.length() * 3);
46   for (size_t i = 0; i < text.length(); ++i) {
47     unsigned char c = static_cast<unsigned char>(text[i]);
48     if (use_plus && ' ' == c) {
49       escaped.push_back('+');
50     } else if (keep_escaped && '%' == c && i + 2 < text.length() &&
51                IsHexDigit(text[i + 1]) && IsHexDigit(text[i + 2])) {
52       escaped.push_back('%');
53     } else if (charmap.Contains(c)) {
54       escaped.push_back('%');
55       AppendHexEncodedByte(c, escaped);
56     } else {
57       escaped.push_back(static_cast<char>(c));
58     }
59   }
60   return escaped;
61 }
62 
63 // Convert a character |c| to a form that will not be mistaken as HTML.
64 template <class str>
AppendEscapedCharForHTMLImpl(typename str::value_type c,str * output)65 void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
66   static constexpr struct {
67     char key;
68     StringPiece replacement;
69   } kCharsToEscape[] = {
70       {'<', "&lt;"},   {'>', "&gt;"},   {'&', "&amp;"},
71       {'"', "&quot;"}, {'\'', "&#39;"},
72   };
73   for (const auto& char_to_escape : kCharsToEscape) {
74     if (c == char_to_escape.key) {
75       output->append(std::begin(char_to_escape.replacement),
76                      std::end(char_to_escape.replacement));
77       return;
78     }
79   }
80   output->push_back(c);
81 }
82 
// Returns a copy of |input| in which every character has been passed through
// AppendEscapedCharForHTMLImpl(), so the result is not interpreted as HTML.
template <typename T, typename CharT = typename T::value_type>
std::basic_string<CharT> EscapeForHTMLImpl(T input) {
  std::basic_string<CharT> escaped;
  // Sized for the common case where nothing needs escaping.
  escaped.reserve(input.size());

  for (auto it = input.begin(); it != input.end(); ++it) {
    AppendEscapedCharForHTMLImpl(*it, &escaped);
  }
  return escaped;
}
95 
96 // Everything except alphanumerics and -._~
97 // See RFC 3986 for the list of unreserved characters.
98 static const Charmap kUnreservedCharmap = {
99     {0xffffffffL, 0xfc009fffL, 0x78000001L, 0xb8000001L, 0xffffffffL,
100      0xffffffffL, 0xffffffffL, 0xffffffffL}};
101 
102 // Everything except alphanumerics and !'()*-._~
103 // See RFC 2396 for the list of reserved characters.
104 static const Charmap kQueryCharmap = {{0xffffffffL, 0xfc00987dL, 0x78000001L,
105                                        0xb8000001L, 0xffffffffL, 0xffffffffL,
106                                        0xffffffffL, 0xffffffffL}};
107 
108 // non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
109 static const Charmap kPathCharmap = {{0xffffffffL, 0xd400002dL, 0x78000000L,
110                                       0xb8000001L, 0xffffffffL, 0xffffffffL,
111                                       0xffffffffL, 0xffffffffL}};
112 
113 #if BUILDFLAG(IS_APPLE)
114 // non-printable, non-7bit, and (including space)  "#%<>[\]^`{|}
115 static const Charmap kNSURLCharmap = {{0xffffffffL, 0x5000002dL, 0x78000000L,
116                                        0xb8000001L, 0xffffffffL, 0xffffffffL,
117                                        0xffffffffL, 0xffffffffL}};
118 #endif  // BUILDFLAG(IS_APPLE)
119 
120 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
121 static const Charmap kUrlEscape = {{0xffffffffL, 0xf80008fdL, 0x78000001L,
122                                     0xb8000001L, 0xffffffffL, 0xffffffffL,
123                                     0xffffffffL, 0xffffffffL}};
124 
125 // non-7bit, as well as %.
126 static const Charmap kNonASCIICharmapAndPercent = {
127     {0x00000000L, 0x00000020L, 0x00000000L, 0x00000000L, 0xffffffffL,
128      0xffffffffL, 0xffffffffL, 0xffffffffL}};
129 
130 // non-7bit
131 static const Charmap kNonASCIICharmap = {{0x00000000L, 0x00000000L, 0x00000000L,
132                                           0x00000000L, 0xffffffffL, 0xffffffffL,
133                                           0xffffffffL, 0xffffffffL}};
134 
135 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
136 // !'()*-._~#[]
137 static const Charmap kExternalHandlerCharmap = {
138     {0xffffffffL, 0x50000025L, 0x50000000L, 0xb8000001L, 0xffffffffL,
139      0xffffffffL, 0xffffffffL, 0xffffffffL}};
140 
// Lookup table indexed by ASCII value. An entry is nonzero when the
// corresponding character may be unescaped in normal URLs. The zero entries
// are characters that may change how a URL is parsed, so they are not
// unescaped by default. In many cases spaces should not be unescaped either,
// but that is controlled by parameters to Unescape*.
//
// The basic rule is that nothing that would change parsing, such as # or ?,
// may be unescaped. &, = and + are also kept escaped because they could be
// part of a query and unescaping them could change the server's parsing of
// the query. \ is kept escaped because src/url/ will convert it to a /.
//
// Lastly, nothing that lacks a canonical representation in a URL may be
// unescaped: doing so would change the URL, and copy and paste, or pressing
// enter in the URL bar, could then behave differently. The characters in this
// category are the ones labeled PASS (allow either escaped or unescaped) in
// the big lookup table at the top of url/url_canon_path.cc. Also, characters
// that have CHAR_QUERY set in url/url_canon_internal.cc but are not allowed
// in query strings according to http://www.ietf.org/rfc/rfc3261.txt are not
// unescaped, to avoid turning a URL that is valid according to the spec into
// an invalid one.
// clang-format off
const char kUrlUnescape[128] = {
    // 0x00 - 0x0F: NUL and control characters.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x10 - 0x1F: control characters.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x20 - 0x2F:
    // ' ' !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
    // 0x30 - 0x3F:
    // 0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
    // 0x40 - 0x4F:
    // @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x50 - 0x5F:
    // P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
    // 0x60 - 0x6F:
    // `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x70 - 0x7F:
    // p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  <DEL>
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
};
// clang-format on
180 
181 // Attempts to unescape the sequence at |index| within |escaped_text|.  If
182 // successful, sets |value| to the unescaped value.  Returns whether
183 // unescaping succeeded.
UnescapeUnsignedByteAtIndex(StringPiece escaped_text,size_t index,unsigned char * value)184 bool UnescapeUnsignedByteAtIndex(StringPiece escaped_text,
185                                  size_t index,
186                                  unsigned char* value) {
187   if ((index + 2) >= escaped_text.size())
188     return false;
189   if (escaped_text[index] != '%')
190     return false;
191   char most_sig_digit(escaped_text[index + 1]);
192   char least_sig_digit(escaped_text[index + 2]);
193   if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
194     *value = static_cast<unsigned char>(HexDigitToInt(most_sig_digit) * 16 +
195                                         HexDigitToInt(least_sig_digit));
196     return true;
197   }
198   return false;
199 }
200 
201 // Attempts to unescape and decode a UTF-8-encoded percent-escaped character at
202 // the specified index. On success, returns true, sets |code_point_out| to be
203 // the character's code point and |unescaped_out| to be the unescaped UTF-8
204 // string. |unescaped_out| will always be 1/3rd the length of the substring of
205 // |escaped_text| that corresponds to the unescaped character.
UnescapeUTF8CharacterAtIndex(StringPiece escaped_text,size_t index,base_icu::UChar32 * code_point_out,std::string * unescaped_out)206 bool UnescapeUTF8CharacterAtIndex(StringPiece escaped_text,
207                                   size_t index,
208                                   base_icu::UChar32* code_point_out,
209                                   std::string* unescaped_out) {
210   DCHECK(unescaped_out->empty());
211 
212   unsigned char bytes[CBU8_MAX_LENGTH];
213   if (!UnescapeUnsignedByteAtIndex(escaped_text, index, &bytes[0]))
214     return false;
215 
216   size_t num_bytes = 1;
217 
218   // If this is a lead byte, need to collect trail bytes as well.
219   if (CBU8_IS_LEAD(bytes[0])) {
220     // Look for the last trail byte of the UTF-8 character.  Give up once
221     // reach max character length number of bytes, or hit an unescaped
222     // character. No need to check length of escaped_text, as
223     // UnescapeUnsignedByteAtIndex checks lengths.
224     while (num_bytes < std::size(bytes) &&
225            UnescapeUnsignedByteAtIndex(escaped_text, index + num_bytes * 3,
226                                        &bytes[num_bytes]) &&
227            CBU8_IS_TRAIL(bytes[num_bytes])) {
228       ++num_bytes;
229     }
230   }
231 
232   size_t char_index = 0;
233   // Check if the unicode "character" that was just unescaped is valid.
234   if (!ReadUnicodeCharacter(reinterpret_cast<char*>(bytes), num_bytes,
235                             &char_index, code_point_out)) {
236     return false;
237   }
238 
239   // It's possible that a prefix of |bytes| forms a valid UTF-8 character,
240   // and the rest are not valid UTF-8, so need to update |num_bytes| based
241   // on the result of ReadUnicodeCharacter().
242   num_bytes = char_index + 1;
243   *unescaped_out = std::string(reinterpret_cast<char*>(bytes), num_bytes);
244   return true;
245 }
246 
247 // This method takes a Unicode code point and returns true if it should be
248 // unescaped, based on |rules|.
ShouldUnescapeCodePoint(UnescapeRule::Type rules,base_icu::UChar32 code_point)249 bool ShouldUnescapeCodePoint(UnescapeRule::Type rules,
250                              base_icu::UChar32 code_point) {
251   // If this is an ASCII character, use the lookup table.
252   if (code_point >= 0 && code_point < 0x80) {
253     return kUrlUnescape[static_cast<size_t>(code_point)] ||
254            // Allow some additional unescaping when flags are set.
255            (code_point == ' ' && (rules & UnescapeRule::SPACES)) ||
256            // Allow any of the prohibited but non-control characters when doing
257            // "special" chars.
258            ((code_point == '/' || code_point == '\\') &&
259             (rules & UnescapeRule::PATH_SEPARATORS)) ||
260            (code_point > ' ' && code_point != '/' && code_point != '\\' &&
261             (rules & UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS));
262   }
263 
264   // Compare the code point against a list of characters that can be used
265   // to spoof other URLs.
266   //
267   // Can't use icu to make this cleaner, because Cronet cannot depend on
268   // icu, and currently uses this file.
269   // TODO(https://crbug.com/829873): Try to make this use icu, both to
270   // protect against regressions as the Unicode standard is updated and to
271   // reduce the number of long lists of characters.
272   return !(
273       // Per http://tools.ietf.org/html/rfc3987#section-4.1, certain BiDi
274       // control characters are not allowed to appear unescaped in URLs.
275       code_point == 0x200E ||  // LEFT-TO-RIGHT MARK         (%E2%80%8E)
276       code_point == 0x200F ||  // RIGHT-TO-LEFT MARK         (%E2%80%8F)
277       code_point == 0x202A ||  // LEFT-TO-RIGHT EMBEDDING    (%E2%80%AA)
278       code_point == 0x202B ||  // RIGHT-TO-LEFT EMBEDDING    (%E2%80%AB)
279       code_point == 0x202C ||  // POP DIRECTIONAL FORMATTING (%E2%80%AC)
280       code_point == 0x202D ||  // LEFT-TO-RIGHT OVERRIDE     (%E2%80%AD)
281       code_point == 0x202E ||  // RIGHT-TO-LEFT OVERRIDE     (%E2%80%AE)
282 
283       // The Unicode Technical Report (TR9) as referenced by RFC 3987 above has
284       // since added some new BiDi control characters that are not safe to
285       // unescape. http://www.unicode.org/reports/tr9
286       code_point == 0x061C ||  // ARABIC LETTER MARK         (%D8%9C)
287       code_point == 0x2066 ||  // LEFT-TO-RIGHT ISOLATE      (%E2%81%A6)
288       code_point == 0x2067 ||  // RIGHT-TO-LEFT ISOLATE      (%E2%81%A7)
289       code_point == 0x2068 ||  // FIRST STRONG ISOLATE       (%E2%81%A8)
290       code_point == 0x2069 ||  // POP DIRECTIONAL ISOLATE    (%E2%81%A9)
291 
292       // The following spoofable characters are also banned in unescaped URLs,
293       // because they could be used to imitate parts of a web browser's UI.
294       code_point == 0x1F50F ||  // LOCK WITH INK PEN    (%F0%9F%94%8F)
295       code_point == 0x1F510 ||  // CLOSED LOCK WITH KEY (%F0%9F%94%90)
296       code_point == 0x1F512 ||  // LOCK                 (%F0%9F%94%92)
297       code_point == 0x1F513 ||  // OPEN LOCK            (%F0%9F%94%93)
298 
299       // Spaces are also banned, as they can be used to scroll text out of view.
300       code_point == 0x0085 ||  // NEXT LINE                  (%C2%85)
301       code_point == 0x00A0 ||  // NO-BREAK SPACE             (%C2%A0)
302       code_point == 0x1680 ||  // OGHAM SPACE MARK           (%E1%9A%80)
303       code_point == 0x2000 ||  // EN QUAD                    (%E2%80%80)
304       code_point == 0x2001 ||  // EM QUAD                    (%E2%80%81)
305       code_point == 0x2002 ||  // EN SPACE                   (%E2%80%82)
306       code_point == 0x2003 ||  // EM SPACE                   (%E2%80%83)
307       code_point == 0x2004 ||  // THREE-PER-EM SPACE         (%E2%80%84)
308       code_point == 0x2005 ||  // FOUR-PER-EM SPACE          (%E2%80%85)
309       code_point == 0x2006 ||  // SIX-PER-EM SPACE           (%E2%80%86)
310       code_point == 0x2007 ||  // FIGURE SPACE               (%E2%80%87)
311       code_point == 0x2008 ||  // PUNCTUATION SPACE          (%E2%80%88)
312       code_point == 0x2009 ||  // THIN SPACE                 (%E2%80%89)
313       code_point == 0x200A ||  // HAIR SPACE                 (%E2%80%8A)
314       code_point == 0x2028 ||  // LINE SEPARATOR             (%E2%80%A8)
315       code_point == 0x2029 ||  // PARAGRAPH SEPARATOR        (%E2%80%A9)
316       code_point == 0x202F ||  // NARROW NO-BREAK SPACE      (%E2%80%AF)
317       code_point == 0x205F ||  // MEDIUM MATHEMATICAL SPACE  (%E2%81%9F)
318       code_point == 0x3000 ||  // IDEOGRAPHIC SPACE          (%E3%80%80)
319       // U+2800 is rendered as a space, but is not considered whitespace (see
320       // crbug.com/1068531).
321       code_point == 0x2800 ||  // BRAILLE PATTERN BLANK      (%E2%A0%80)
322 
323       // Default Ignorable ([:Default_Ignorable_Code_Point=Yes:]) and Format
324       // characters ([:Cf:]) are also banned (see crbug.com/824715).
325       code_point == 0x00AD ||  // SOFT HYPHEN               (%C2%AD)
326       code_point == 0x034F ||  // COMBINING GRAPHEME JOINER (%CD%8F)
327       // Arabic number formatting
328       (code_point >= 0x0600 && code_point <= 0x0605) ||
329       // U+061C is already banned as a BiDi control character.
330       code_point == 0x06DD ||  // ARABIC END OF AYAH          (%DB%9D)
331       code_point == 0x070F ||  // SYRIAC ABBREVIATION MARK    (%DC%8F)
332       code_point == 0x08E2 ||  // ARABIC DISPUTED END OF AYAH (%E0%A3%A2)
333       code_point == 0x115F ||  // HANGUL CHOSEONG FILLER      (%E1%85%9F)
334       code_point == 0x1160 ||  // HANGUL JUNGSEONG FILLER     (%E1%85%A0)
335       code_point == 0x17B4 ||  // KHMER VOWEL INHERENT AQ     (%E1%9E%B4)
336       code_point == 0x17B5 ||  // KHMER VOWEL INHERENT AA     (%E1%9E%B5)
337       code_point == 0x180B ||  // MONGOLIAN FREE VARIATION SELECTOR ONE
338                                // (%E1%A0%8B)
339       code_point == 0x180C ||  // MONGOLIAN FREE VARIATION SELECTOR TWO
340                                // (%E1%A0%8C)
341       code_point == 0x180D ||  // MONGOLIAN FREE VARIATION SELECTOR THREE
342                                // (%E1%A0%8D)
343       code_point == 0x180E ||  // MONGOLIAN VOWEL SEPARATOR   (%E1%A0%8E)
344       code_point == 0x200B ||  // ZERO WIDTH SPACE            (%E2%80%8B)
345       code_point == 0x200C ||  // ZERO WIDTH SPACE NON-JOINER (%E2%80%8C)
346       code_point == 0x200D ||  // ZERO WIDTH JOINER           (%E2%80%8D)
347       // U+200E, U+200F, U+202A--202E, and U+2066--2069 are already banned as
348       // BiDi control characters.
349       code_point == 0x2060 ||  // WORD JOINER          (%E2%81%A0)
350       code_point == 0x2061 ||  // FUNCTION APPLICATION (%E2%81%A1)
351       code_point == 0x2062 ||  // INVISIBLE TIMES      (%E2%81%A2)
352       code_point == 0x2063 ||  // INVISIBLE SEPARATOR  (%E2%81%A3)
353       code_point == 0x2064 ||  // INVISIBLE PLUS       (%E2%81%A4)
354       code_point == 0x2065 ||  // null (%E2%81%A5)
355       // 0x2066--0x2069 are already banned as a BiDi control characters.
356       // General Punctuation - Deprecated (U+206A--206F)
357       (code_point >= 0x206A && code_point <= 0x206F) ||
358       code_point == 0x3164 ||  // HANGUL FILLER (%E3%85%A4)
359       (code_point >= 0xFFF0 && code_point <= 0xFFF8) ||  // null
360       // Variation selectors (%EF%B8%80 -- %EF%B8%8F)
361       (code_point >= 0xFE00 && code_point <= 0xFE0F) ||
362       code_point == 0xFEFF ||   // ZERO WIDTH NO-BREAK SPACE (%EF%BB%BF)
363       code_point == 0xFFA0 ||   // HALFWIDTH HANGUL FILLER (%EF%BE%A0)
364       code_point == 0xFFF9 ||   // INTERLINEAR ANNOTATION ANCHOR     (%EF%BF%B9)
365       code_point == 0xFFFA ||   // INTERLINEAR ANNOTATION SEPARATOR  (%EF%BF%BA)
366       code_point == 0xFFFB ||   // INTERLINEAR ANNOTATION TERMINATOR (%EF%BF%BB)
367       code_point == 0x110BD ||  // KAITHI NUMBER SIGN       (%F0%91%82%BD)
368       code_point == 0x110CD ||  // KAITHI NUMBER SIGN ABOVE (%F0%91%83%8D)
369       // Egyptian hieroglyph formatting (%F0%93%90%B0 -- %F0%93%90%B8)
370       (code_point >= 0x13430 && code_point <= 0x13438) ||
371       // Shorthand format controls (%F0%9B%B2%A0 -- %F0%9B%B2%A3)
372       (code_point >= 0x1BCA0 && code_point <= 0x1BCA3) ||
373       // Beams and slurs (%F0%9D%85%B3 -- %F0%9D%85%BA)
374       (code_point >= 0x1D173 && code_point <= 0x1D17A) ||
375       // Tags, Variation Selectors, nulls
376       (code_point >= 0xE0000 && code_point <= 0xE0FFF));
377 }
378 
379 // Unescapes |escaped_text| according to |rules|, returning the resulting
380 // string.  Fills in an |adjustments| parameter, if non-nullptr, so it reflects
381 // the alterations done to the string that are not one-character-to-one-
382 // character.  The resulting |adjustments| will always be sorted by increasing
383 // offset.
UnescapeURLWithAdjustmentsImpl(StringPiece escaped_text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)384 std::string UnescapeURLWithAdjustmentsImpl(
385     StringPiece escaped_text,
386     UnescapeRule::Type rules,
387     OffsetAdjuster::Adjustments* adjustments) {
388   if (adjustments)
389     adjustments->clear();
390   // Do not unescape anything, return the |escaped_text| text.
391   if (rules == UnescapeRule::NONE)
392     return std::string(escaped_text);
393 
394   // The output of the unescaping is always smaller than the input, so we can
395   // reserve the input size to make sure we have enough buffer and don't have
396   // to allocate in the loop below.
397   std::string result;
398   result.reserve(escaped_text.length());
399 
400   // Locations of adjusted text.
401   for (size_t i = 0, max = escaped_text.size(); i < max;) {
402     // Try to unescape the character.
403     base_icu::UChar32 code_point;
404     std::string unescaped;
405     if (!UnescapeUTF8CharacterAtIndex(escaped_text, i, &code_point,
406                                       &unescaped)) {
407       // Check if the next character can be unescaped, but not as a valid UTF-8
408       // character. In that case, just unescaped and write the non-sense
409       // character.
410       //
411       // TODO(https://crbug.com/829868): Do not unescape illegal UTF-8
412       // sequences.
413       unsigned char non_utf8_byte;
414       if (UnescapeUnsignedByteAtIndex(escaped_text, i, &non_utf8_byte)) {
415         result.push_back(static_cast<char>(non_utf8_byte));
416         if (adjustments)
417           adjustments->push_back(OffsetAdjuster::Adjustment(i, 3, 1));
418         i += 3;
419         continue;
420       }
421 
422       // Character is not escaped, so append as is, unless it's a '+' and
423       // REPLACE_PLUS_WITH_SPACE is being applied.
424       if (escaped_text[i] == '+' &&
425           (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)) {
426         result.push_back(' ');
427       } else {
428         result.push_back(escaped_text[i]);
429       }
430       ++i;
431       continue;
432     }
433 
434     DCHECK(!unescaped.empty());
435 
436     if (!ShouldUnescapeCodePoint(rules, code_point)) {
437       // If it's a valid UTF-8 character, but not safe to unescape, copy all
438       // bytes directly.
439       result.append(escaped_text.substr(i, 3 * unescaped.length()));
440       i += unescaped.length() * 3;
441       continue;
442     }
443 
444     // If the code point is allowed, and append the entire unescaped character.
445     result.append(unescaped);
446     if (adjustments) {
447       for (size_t j = 0; j < unescaped.length(); ++j) {
448         adjustments->push_back(OffsetAdjuster::Adjustment(i + j * 3, 3, 1));
449       }
450     }
451     i += 3 * unescaped.length();
452   }
453 
454   return result;
455 }
456 
457 }  // namespace
458 
EscapeAllExceptUnreserved(StringPiece text)459 std::string EscapeAllExceptUnreserved(StringPiece text) {
460   return Escape(text, kUnreservedCharmap, false);
461 }
462 
EscapeQueryParamValue(StringPiece text,bool use_plus)463 std::string EscapeQueryParamValue(StringPiece text, bool use_plus) {
464   return Escape(text, kQueryCharmap, use_plus);
465 }
466 
EscapePath(StringPiece path)467 std::string EscapePath(StringPiece path) {
468   return Escape(path, kPathCharmap, false);
469 }
470 
471 #if BUILDFLAG(IS_APPLE)
EscapeNSURLPrecursor(StringPiece precursor)472 std::string EscapeNSURLPrecursor(StringPiece precursor) {
473   return Escape(precursor, kNSURLCharmap, false, true);
474 }
475 #endif  // BUILDFLAG(IS_APPLE)
476 
EscapeUrlEncodedData(StringPiece path,bool use_plus)477 std::string EscapeUrlEncodedData(StringPiece path, bool use_plus) {
478   return Escape(path, kUrlEscape, use_plus);
479 }
480 
EscapeNonASCIIAndPercent(StringPiece input)481 std::string EscapeNonASCIIAndPercent(StringPiece input) {
482   return Escape(input, kNonASCIICharmapAndPercent, false);
483 }
484 
EscapeNonASCII(StringPiece input)485 std::string EscapeNonASCII(StringPiece input) {
486   return Escape(input, kNonASCIICharmap, false);
487 }
488 
EscapeExternalHandlerValue(StringPiece text)489 std::string EscapeExternalHandlerValue(StringPiece text) {
490   return Escape(text, kExternalHandlerCharmap, false, true);
491 }
492 
AppendEscapedCharForHTML(char c,std::string * output)493 void AppendEscapedCharForHTML(char c, std::string* output) {
494   AppendEscapedCharForHTMLImpl(c, output);
495 }
496 
EscapeForHTML(StringPiece input)497 std::string EscapeForHTML(StringPiece input) {
498   return EscapeForHTMLImpl(input);
499 }
500 
EscapeForHTML(StringPiece16 input)501 std::u16string EscapeForHTML(StringPiece16 input) {
502   return EscapeForHTMLImpl(input);
503 }
504 
UnescapeURLComponent(StringPiece escaped_text,UnescapeRule::Type rules)505 std::string UnescapeURLComponent(StringPiece escaped_text,
506                                  UnescapeRule::Type rules) {
507   return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, nullptr);
508 }
509 
UnescapeAndDecodeUTF8URLComponentWithAdjustments(StringPiece text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)510 std::u16string UnescapeAndDecodeUTF8URLComponentWithAdjustments(
511     StringPiece text,
512     UnescapeRule::Type rules,
513     OffsetAdjuster::Adjustments* adjustments) {
514   std::u16string result;
515   OffsetAdjuster::Adjustments unescape_adjustments;
516   std::string unescaped_url(
517       UnescapeURLWithAdjustmentsImpl(text, rules, &unescape_adjustments));
518   if (UTF8ToUTF16WithAdjustments(unescaped_url.data(), unescaped_url.length(),
519                                  &result, adjustments)) {
520     // Character set looks like it's valid.
521     if (adjustments) {
522       OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments,
523                                                  adjustments);
524     }
525     return result;
526   }
527   // Character set is not valid.  Return the escaped version.
528   return UTF8ToUTF16WithAdjustments(text, adjustments);
529 }
530 
UnescapeBinaryURLComponent(StringPiece escaped_text,UnescapeRule::Type rules)531 std::string UnescapeBinaryURLComponent(StringPiece escaped_text,
532                                        UnescapeRule::Type rules) {
533   // Only NORMAL and REPLACE_PLUS_WITH_SPACE are supported.
534   DCHECK(rules != UnescapeRule::NONE);
535   DCHECK(!(rules &
536            ~(UnescapeRule::NORMAL | UnescapeRule::REPLACE_PLUS_WITH_SPACE)));
537 
538   // It is not possible to read the feature state when this function is invoked
539   // before FeatureList initialization. In that case, fallback to the feature's
540   // default state.
541   //
542   // TODO(crbug.com/1321924): Cleanup this feature.
543   const bool optimize_data_urls_feature_is_enabled =
544       base::FeatureList::GetInstance()
545           ? base::FeatureList::IsEnabled(features::kOptimizeDataUrls)
546           : features::kOptimizeDataUrls.default_state ==
547                 base::FEATURE_ENABLED_BY_DEFAULT;
548 
549   // If there are no '%' characters in the string, there will be nothing to
550   // unescape, so we can take the fast path.
551   if (optimize_data_urls_feature_is_enabled &&
552       escaped_text.find('%') == StringPiece::npos) {
553     std::string unescaped_text(escaped_text);
554     if (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)
555       std::replace(unescaped_text.begin(), unescaped_text.end(), '+', ' ');
556     return unescaped_text;
557   }
558 
559   std::string unescaped_text;
560 
561   // The output of the unescaping is always smaller than the input, so we can
562   // reserve the input size to make sure we have enough buffer and don't have
563   // to allocate in the loop below.
564   // Increase capacity before size, as just resizing can grow capacity
565   // needlessly beyond our requested size.
566   unescaped_text.reserve(escaped_text.size());
567   unescaped_text.resize(escaped_text.size());
568 
569   size_t output_index = 0;
570 
571   for (size_t i = 0, max = escaped_text.size(); i < max;) {
572     unsigned char byte;
573     // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
574     // to call.
575     if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
576       unescaped_text[output_index++] = static_cast<char>(byte);
577       i += 3;
578       continue;
579     }
580 
581     if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
582         escaped_text[i] == '+') {
583       unescaped_text[output_index++] = ' ';
584       ++i;
585       continue;
586     }
587 
588     unescaped_text[output_index++] = escaped_text[i++];
589   }
590 
591   DCHECK_LE(output_index, unescaped_text.size());
592   unescaped_text.resize(output_index);
593   return unescaped_text;
594 }
595 
UnescapeBinaryURLComponentSafe(StringPiece escaped_text,bool fail_on_path_separators,std::string * unescaped_text)596 bool UnescapeBinaryURLComponentSafe(StringPiece escaped_text,
597                                     bool fail_on_path_separators,
598                                     std::string* unescaped_text) {
599   unescaped_text->clear();
600 
601   std::set<unsigned char> illegal_encoded_bytes;
602   for (unsigned char c = '\x00'; c < '\x20'; ++c) {
603     illegal_encoded_bytes.insert(c);
604   }
605   if (fail_on_path_separators) {
606     illegal_encoded_bytes.insert('/');
607     illegal_encoded_bytes.insert('\\');
608   }
609   if (ContainsEncodedBytes(escaped_text, illegal_encoded_bytes))
610     return false;
611 
612   *unescaped_text = UnescapeBinaryURLComponent(escaped_text);
613   return true;
614 }
615 
ContainsEncodedBytes(StringPiece escaped_text,const std::set<unsigned char> & bytes)616 bool ContainsEncodedBytes(StringPiece escaped_text,
617                           const std::set<unsigned char>& bytes) {
618   for (size_t i = 0, max = escaped_text.size(); i < max;) {
619     unsigned char byte;
620     // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
621     // to call.
622     if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
623       if (bytes.find(byte) != bytes.end())
624         return true;
625 
626       i += 3;
627       continue;
628     }
629 
630     ++i;
631   }
632 
633   return false;
634 }
635 
UnescapeForHTML(StringPiece16 input)636 std::u16string UnescapeForHTML(StringPiece16 input) {
637   static const struct {
638     const char* ampersand_code;
639     const char16_t replacement;
640   } kEscapeToChars[] = {
641       {"&lt;", '<'},   {"&gt;", '>'},   {"&amp;", '&'},
642       {"&quot;", '"'}, {"&#39;", '\''},
643   };
644   constexpr size_t kEscapeToCharsCount = std::size(kEscapeToChars);
645 
646   if (input.find(u"&") == std::string::npos)
647     return std::u16string(input);
648 
649   std::u16string ampersand_chars[kEscapeToCharsCount];
650   std::u16string text(input);
651   for (std::u16string::iterator iter = text.begin(); iter != text.end();
652        ++iter) {
653     if (*iter == '&') {
654       // Potential ampersand encode char.
655       size_t index = static_cast<size_t>(iter - text.begin());
656       for (size_t i = 0; i < std::size(kEscapeToChars); i++) {
657         if (ampersand_chars[i].empty()) {
658           ampersand_chars[i] = ASCIIToUTF16(kEscapeToChars[i].ampersand_code);
659         }
660         if (text.find(ampersand_chars[i], index) == index) {
661           text.replace(
662               iter, iter + static_cast<ptrdiff_t>(ampersand_chars[i].length()),
663               1, kEscapeToChars[i].replacement);
664           break;
665         }
666       }
667     }
668   }
669   return text;
670 }
671 
672 }  // namespace base
673