1 // Copyright 2020 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/strings/escape.h"
6
7 #include <ostream>
8
9 #include "base/check_op.h"
10 #include "base/feature_list.h"
11 #include "base/features.h"
12 #include "base/strings/string_number_conversions.h"
13 #include "base/strings/string_piece.h"
14 #include "base/strings/string_util.h"
15 #include "base/strings/utf_string_conversion_utils.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/third_party/icu/icu_utf.h"
18
19 namespace base {
20
21 namespace {
22
// A fast bit-vector map for 8-bit characters.
//
// Stores 256 bits in an array of eight 32-bit words: bit (c & 31) of word
// (c >> 5) is set when character |c| is a member of the map.
struct Charmap {
  // Returns true if |c| is a member of this map.
  bool Contains(unsigned char c) const {
    // Build the mask from an unsigned 1 so that selecting bit 31 never shifts
    // into the sign bit of a signed int.
    return (map[c >> 5] & (uint32_t{1} << (c & 31))) != 0;
  }

  uint32_t map[8];
};
34
35 // Given text to escape and a Charmap defining which values to escape,
36 // return an escaped string. If use_plus is true, spaces are converted
37 // to +, otherwise, if spaces are in the charmap, they are converted to
38 // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if
39 // '%' is in the charmap, it is converted to %25.
Escape(StringPiece text,const Charmap & charmap,bool use_plus,bool keep_escaped=false)40 std::string Escape(StringPiece text,
41 const Charmap& charmap,
42 bool use_plus,
43 bool keep_escaped = false) {
44 std::string escaped;
45 escaped.reserve(text.length() * 3);
46 for (size_t i = 0; i < text.length(); ++i) {
47 unsigned char c = static_cast<unsigned char>(text[i]);
48 if (use_plus && ' ' == c) {
49 escaped.push_back('+');
50 } else if (keep_escaped && '%' == c && i + 2 < text.length() &&
51 IsHexDigit(text[i + 1]) && IsHexDigit(text[i + 2])) {
52 escaped.push_back('%');
53 } else if (charmap.Contains(c)) {
54 escaped.push_back('%');
55 AppendHexEncodedByte(c, escaped);
56 } else {
57 escaped.push_back(static_cast<char>(c));
58 }
59 }
60 return escaped;
61 }
62
63 // Convert a character |c| to a form that will not be mistaken as HTML.
64 template <class str>
AppendEscapedCharForHTMLImpl(typename str::value_type c,str * output)65 void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
66 static constexpr struct {
67 char key;
68 StringPiece replacement;
69 } kCharsToEscape[] = {
70 {'<', "<"}, {'>', ">"}, {'&', "&"},
71 {'"', """}, {'\'', "'"},
72 };
73 for (const auto& char_to_escape : kCharsToEscape) {
74 if (c == char_to_escape.key) {
75 output->append(std::begin(char_to_escape.replacement),
76 std::end(char_to_escape.replacement));
77 return;
78 }
79 }
80 output->push_back(c);
81 }
82
// Returns a copy of |input| in which every character has been passed through
// AppendEscapedCharForHTMLImpl(), so the result will not be interpreted as
// HTML. |T| is the input string-view type; the result is a basic_string over
// the same character type.
template <typename T, typename CharT = typename T::value_type>
std::basic_string<CharT> EscapeForHTMLImpl(T input) {
  std::basic_string<CharT> escaped;
  // Sized for the common case where nothing needs escaping.
  escaped.reserve(input.size());

  for (auto it = input.begin(); it != input.end(); ++it) {
    AppendEscapedCharForHTMLImpl(*it, &escaped);
  }

  return escaped;
}
95
// The Charmaps below list the bytes that Escape() percent-encodes. Word k of
// each initializer covers bytes 32*k through 32*k+31, least significant bit
// first (see Charmap::Contains), so word 0 is the C0 control range, words 1-3
// are printable ASCII plus DEL, and words 4-7 are the non-7bit bytes.

// Everything except alphanumerics and -._~
// See RFC 3986 for the list of unreserved characters.
static const Charmap kUnreservedCharmap = {
    {0xffffffffL, 0xfc009fffL, 0x78000001L, 0xb8000001L, 0xffffffffL,
     0xffffffffL, 0xffffffffL, 0xffffffffL}};

// Everything except alphanumerics and !'()*-._~
// See RFC 2396 for the list of reserved characters.
static const Charmap kQueryCharmap = {{0xffffffffL, 0xfc00987dL, 0x78000001L,
                                       0xb8000001L, 0xffffffffL, 0xffffffffL,
                                       0xffffffffL, 0xffffffffL}};

// non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|}
static const Charmap kPathCharmap = {{0xffffffffL, 0xd400002dL, 0x78000000L,
                                      0xb8000001L, 0xffffffffL, 0xffffffffL,
                                      0xffffffffL, 0xffffffffL}};

#if BUILDFLAG(IS_APPLE)
// non-printable, non-7bit, and (including space) "#%<>[\]^`{|}
static const Charmap kNSURLCharmap = {{0xffffffffL, 0x5000002dL, 0x78000000L,
                                       0xb8000001L, 0xffffffffL, 0xffffffffL,
                                       0xffffffffL, 0xffffffffL}};
#endif  // BUILDFLAG(IS_APPLE)

// non-printable, non-7bit, and (including space) "#$%&'+;<=>?@[\]^`{|}
// Note that '!' is NOT in this map (bit 1 of the second word is clear), while
// '@' is (bit 0 of the third word is set).
static const Charmap kUrlEscape = {{0xffffffffL, 0xf80008fdL, 0x78000001L,
                                    0xb8000001L, 0xffffffffL, 0xffffffffL,
                                    0xffffffffL, 0xffffffffL}};

// non-7bit, as well as %.
static const Charmap kNonASCIICharmapAndPercent = {
    {0x00000000L, 0x00000020L, 0x00000000L, 0x00000000L, 0xffffffffL,
     0xffffffffL, 0xffffffffL, 0xffffffffL}};

// non-7bit
static const Charmap kNonASCIICharmap = {{0x00000000L, 0x00000000L, 0x00000000L,
                                          0x00000000L, 0xffffffffL, 0xffffffffL,
                                          0xffffffffL, 0xffffffffL}};

// Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
// !'()*-._~#[]
static const Charmap kExternalHandlerCharmap = {
    {0xffffffffL, 0x50000025L, 0x50000000L, 0xb8000001L, 0xffffffffL,
     0xffffffffL, 0xffffffffL, 0xffffffffL}};
140
// Lookup table indexed by ASCII code. An entry is nonzero when the
// corresponding character may always be unescaped in normal URLs. The zero
// entries are the characters that may change the parsing of a URL, so we
// don't want to unescape them unconditionally. In many cases we won't want to
// unescape spaces either, but that is controlled by parameters to Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing,
// like # or ?. We also can't unescape &, =, or + since that could be part of a
// query and that could change the server's parsing of the query. Nor can we
// unescape \ since src/url/ will convert it to a /.
//
// Lastly, we can't unescape anything that doesn't have a canonical
// representation in a URL. This means that unescaping will change the URL, and
// you could get different behavior if you copy and paste the URL, or press
// enter in the URL bar. The list of characters that fall into this category
// are the ones labeled PASS (allow either escaped or unescaped) in the big
// lookup table at the top of url/url_canon_path.cc. Also, characters
// that have CHAR_QUERY set in url/url_canon_internal.cc but are not
// allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are
// not unescaped, to avoid turning a valid URL according to spec into an
// invalid one.
//
// Each row below holds 16 entries; the comment above a row names the
// characters that the row's entries correspond to, in order.
// clang-format off
const char kUrlUnescape[128] = {
  // Null, control chars... (0x00 - 0x1F)
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // ' ' ! " # $ % & ' ( ) * + , - . /
  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
  // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
  // @ A B C D E F G H I J K L M N O
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // P Q R S T U V W X Y Z [ \ ] ^ _
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
  // ` a b c d e f g h i j k l m n o
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // p q r s t u v w x y z { | } ~ <DEL>
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
};
// clang-format on
180
181 // Attempts to unescape the sequence at |index| within |escaped_text|. If
182 // successful, sets |value| to the unescaped value. Returns whether
183 // unescaping succeeded.
UnescapeUnsignedByteAtIndex(StringPiece escaped_text,size_t index,unsigned char * value)184 bool UnescapeUnsignedByteAtIndex(StringPiece escaped_text,
185 size_t index,
186 unsigned char* value) {
187 if ((index + 2) >= escaped_text.size())
188 return false;
189 if (escaped_text[index] != '%')
190 return false;
191 char most_sig_digit(escaped_text[index + 1]);
192 char least_sig_digit(escaped_text[index + 2]);
193 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
194 *value = static_cast<unsigned char>(HexDigitToInt(most_sig_digit) * 16 +
195 HexDigitToInt(least_sig_digit));
196 return true;
197 }
198 return false;
199 }
200
201 // Attempts to unescape and decode a UTF-8-encoded percent-escaped character at
202 // the specified index. On success, returns true, sets |code_point_out| to be
203 // the character's code point and |unescaped_out| to be the unescaped UTF-8
204 // string. |unescaped_out| will always be 1/3rd the length of the substring of
205 // |escaped_text| that corresponds to the unescaped character.
UnescapeUTF8CharacterAtIndex(StringPiece escaped_text,size_t index,base_icu::UChar32 * code_point_out,std::string * unescaped_out)206 bool UnescapeUTF8CharacterAtIndex(StringPiece escaped_text,
207 size_t index,
208 base_icu::UChar32* code_point_out,
209 std::string* unescaped_out) {
210 DCHECK(unescaped_out->empty());
211
212 unsigned char bytes[CBU8_MAX_LENGTH];
213 if (!UnescapeUnsignedByteAtIndex(escaped_text, index, &bytes[0]))
214 return false;
215
216 size_t num_bytes = 1;
217
218 // If this is a lead byte, need to collect trail bytes as well.
219 if (CBU8_IS_LEAD(bytes[0])) {
220 // Look for the last trail byte of the UTF-8 character. Give up once
221 // reach max character length number of bytes, or hit an unescaped
222 // character. No need to check length of escaped_text, as
223 // UnescapeUnsignedByteAtIndex checks lengths.
224 while (num_bytes < std::size(bytes) &&
225 UnescapeUnsignedByteAtIndex(escaped_text, index + num_bytes * 3,
226 &bytes[num_bytes]) &&
227 CBU8_IS_TRAIL(bytes[num_bytes])) {
228 ++num_bytes;
229 }
230 }
231
232 size_t char_index = 0;
233 // Check if the unicode "character" that was just unescaped is valid.
234 if (!ReadUnicodeCharacter(reinterpret_cast<char*>(bytes), num_bytes,
235 &char_index, code_point_out)) {
236 return false;
237 }
238
239 // It's possible that a prefix of |bytes| forms a valid UTF-8 character,
240 // and the rest are not valid UTF-8, so need to update |num_bytes| based
241 // on the result of ReadUnicodeCharacter().
242 num_bytes = char_index + 1;
243 *unescaped_out = std::string(reinterpret_cast<char*>(bytes), num_bytes);
244 return true;
245 }
246
247 // This method takes a Unicode code point and returns true if it should be
248 // unescaped, based on |rules|.
ShouldUnescapeCodePoint(UnescapeRule::Type rules,base_icu::UChar32 code_point)249 bool ShouldUnescapeCodePoint(UnescapeRule::Type rules,
250 base_icu::UChar32 code_point) {
251 // If this is an ASCII character, use the lookup table.
252 if (code_point >= 0 && code_point < 0x80) {
253 return kUrlUnescape[static_cast<size_t>(code_point)] ||
254 // Allow some additional unescaping when flags are set.
255 (code_point == ' ' && (rules & UnescapeRule::SPACES)) ||
256 // Allow any of the prohibited but non-control characters when doing
257 // "special" chars.
258 ((code_point == '/' || code_point == '\\') &&
259 (rules & UnescapeRule::PATH_SEPARATORS)) ||
260 (code_point > ' ' && code_point != '/' && code_point != '\\' &&
261 (rules & UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS));
262 }
263
264 // Compare the code point against a list of characters that can be used
265 // to spoof other URLs.
266 //
267 // Can't use icu to make this cleaner, because Cronet cannot depend on
268 // icu, and currently uses this file.
269 // TODO(https://crbug.com/829873): Try to make this use icu, both to
270 // protect against regressions as the Unicode standard is updated and to
271 // reduce the number of long lists of characters.
272 return !(
273 // Per http://tools.ietf.org/html/rfc3987#section-4.1, certain BiDi
274 // control characters are not allowed to appear unescaped in URLs.
275 code_point == 0x200E || // LEFT-TO-RIGHT MARK (%E2%80%8E)
276 code_point == 0x200F || // RIGHT-TO-LEFT MARK (%E2%80%8F)
277 code_point == 0x202A || // LEFT-TO-RIGHT EMBEDDING (%E2%80%AA)
278 code_point == 0x202B || // RIGHT-TO-LEFT EMBEDDING (%E2%80%AB)
279 code_point == 0x202C || // POP DIRECTIONAL FORMATTING (%E2%80%AC)
280 code_point == 0x202D || // LEFT-TO-RIGHT OVERRIDE (%E2%80%AD)
281 code_point == 0x202E || // RIGHT-TO-LEFT OVERRIDE (%E2%80%AE)
282
283 // The Unicode Technical Report (TR9) as referenced by RFC 3987 above has
284 // since added some new BiDi control characters that are not safe to
285 // unescape. http://www.unicode.org/reports/tr9
286 code_point == 0x061C || // ARABIC LETTER MARK (%D8%9C)
287 code_point == 0x2066 || // LEFT-TO-RIGHT ISOLATE (%E2%81%A6)
288 code_point == 0x2067 || // RIGHT-TO-LEFT ISOLATE (%E2%81%A7)
289 code_point == 0x2068 || // FIRST STRONG ISOLATE (%E2%81%A8)
290 code_point == 0x2069 || // POP DIRECTIONAL ISOLATE (%E2%81%A9)
291
292 // The following spoofable characters are also banned in unescaped URLs,
293 // because they could be used to imitate parts of a web browser's UI.
294 code_point == 0x1F50F || // LOCK WITH INK PEN (%F0%9F%94%8F)
295 code_point == 0x1F510 || // CLOSED LOCK WITH KEY (%F0%9F%94%90)
296 code_point == 0x1F512 || // LOCK (%F0%9F%94%92)
297 code_point == 0x1F513 || // OPEN LOCK (%F0%9F%94%93)
298
299 // Spaces are also banned, as they can be used to scroll text out of view.
300 code_point == 0x0085 || // NEXT LINE (%C2%85)
301 code_point == 0x00A0 || // NO-BREAK SPACE (%C2%A0)
302 code_point == 0x1680 || // OGHAM SPACE MARK (%E1%9A%80)
303 code_point == 0x2000 || // EN QUAD (%E2%80%80)
304 code_point == 0x2001 || // EM QUAD (%E2%80%81)
305 code_point == 0x2002 || // EN SPACE (%E2%80%82)
306 code_point == 0x2003 || // EM SPACE (%E2%80%83)
307 code_point == 0x2004 || // THREE-PER-EM SPACE (%E2%80%84)
308 code_point == 0x2005 || // FOUR-PER-EM SPACE (%E2%80%85)
309 code_point == 0x2006 || // SIX-PER-EM SPACE (%E2%80%86)
310 code_point == 0x2007 || // FIGURE SPACE (%E2%80%87)
311 code_point == 0x2008 || // PUNCTUATION SPACE (%E2%80%88)
312 code_point == 0x2009 || // THIN SPACE (%E2%80%89)
313 code_point == 0x200A || // HAIR SPACE (%E2%80%8A)
314 code_point == 0x2028 || // LINE SEPARATOR (%E2%80%A8)
315 code_point == 0x2029 || // PARAGRAPH SEPARATOR (%E2%80%A9)
316 code_point == 0x202F || // NARROW NO-BREAK SPACE (%E2%80%AF)
317 code_point == 0x205F || // MEDIUM MATHEMATICAL SPACE (%E2%81%9F)
318 code_point == 0x3000 || // IDEOGRAPHIC SPACE (%E3%80%80)
319 // U+2800 is rendered as a space, but is not considered whitespace (see
320 // crbug.com/1068531).
321 code_point == 0x2800 || // BRAILLE PATTERN BLANK (%E2%A0%80)
322
323 // Default Ignorable ([:Default_Ignorable_Code_Point=Yes:]) and Format
324 // characters ([:Cf:]) are also banned (see crbug.com/824715).
325 code_point == 0x00AD || // SOFT HYPHEN (%C2%AD)
326 code_point == 0x034F || // COMBINING GRAPHEME JOINER (%CD%8F)
327 // Arabic number formatting
328 (code_point >= 0x0600 && code_point <= 0x0605) ||
329 // U+061C is already banned as a BiDi control character.
330 code_point == 0x06DD || // ARABIC END OF AYAH (%DB%9D)
331 code_point == 0x070F || // SYRIAC ABBREVIATION MARK (%DC%8F)
332 code_point == 0x08E2 || // ARABIC DISPUTED END OF AYAH (%E0%A3%A2)
333 code_point == 0x115F || // HANGUL CHOSEONG FILLER (%E1%85%9F)
334 code_point == 0x1160 || // HANGUL JUNGSEONG FILLER (%E1%85%A0)
335 code_point == 0x17B4 || // KHMER VOWEL INHERENT AQ (%E1%9E%B4)
336 code_point == 0x17B5 || // KHMER VOWEL INHERENT AA (%E1%9E%B5)
337 code_point == 0x180B || // MONGOLIAN FREE VARIATION SELECTOR ONE
338 // (%E1%A0%8B)
339 code_point == 0x180C || // MONGOLIAN FREE VARIATION SELECTOR TWO
340 // (%E1%A0%8C)
341 code_point == 0x180D || // MONGOLIAN FREE VARIATION SELECTOR THREE
342 // (%E1%A0%8D)
343 code_point == 0x180E || // MONGOLIAN VOWEL SEPARATOR (%E1%A0%8E)
344 code_point == 0x200B || // ZERO WIDTH SPACE (%E2%80%8B)
345 code_point == 0x200C || // ZERO WIDTH SPACE NON-JOINER (%E2%80%8C)
346 code_point == 0x200D || // ZERO WIDTH JOINER (%E2%80%8D)
347 // U+200E, U+200F, U+202A--202E, and U+2066--2069 are already banned as
348 // BiDi control characters.
349 code_point == 0x2060 || // WORD JOINER (%E2%81%A0)
350 code_point == 0x2061 || // FUNCTION APPLICATION (%E2%81%A1)
351 code_point == 0x2062 || // INVISIBLE TIMES (%E2%81%A2)
352 code_point == 0x2063 || // INVISIBLE SEPARATOR (%E2%81%A3)
353 code_point == 0x2064 || // INVISIBLE PLUS (%E2%81%A4)
354 code_point == 0x2065 || // null (%E2%81%A5)
355 // 0x2066--0x2069 are already banned as a BiDi control characters.
356 // General Punctuation - Deprecated (U+206A--206F)
357 (code_point >= 0x206A && code_point <= 0x206F) ||
358 code_point == 0x3164 || // HANGUL FILLER (%E3%85%A4)
359 (code_point >= 0xFFF0 && code_point <= 0xFFF8) || // null
360 // Variation selectors (%EF%B8%80 -- %EF%B8%8F)
361 (code_point >= 0xFE00 && code_point <= 0xFE0F) ||
362 code_point == 0xFEFF || // ZERO WIDTH NO-BREAK SPACE (%EF%BB%BF)
363 code_point == 0xFFA0 || // HALFWIDTH HANGUL FILLER (%EF%BE%A0)
364 code_point == 0xFFF9 || // INTERLINEAR ANNOTATION ANCHOR (%EF%BF%B9)
365 code_point == 0xFFFA || // INTERLINEAR ANNOTATION SEPARATOR (%EF%BF%BA)
366 code_point == 0xFFFB || // INTERLINEAR ANNOTATION TERMINATOR (%EF%BF%BB)
367 code_point == 0x110BD || // KAITHI NUMBER SIGN (%F0%91%82%BD)
368 code_point == 0x110CD || // KAITHI NUMBER SIGN ABOVE (%F0%91%83%8D)
369 // Egyptian hieroglyph formatting (%F0%93%90%B0 -- %F0%93%90%B8)
370 (code_point >= 0x13430 && code_point <= 0x13438) ||
371 // Shorthand format controls (%F0%9B%B2%A0 -- %F0%9B%B2%A3)
372 (code_point >= 0x1BCA0 && code_point <= 0x1BCA3) ||
373 // Beams and slurs (%F0%9D%85%B3 -- %F0%9D%85%BA)
374 (code_point >= 0x1D173 && code_point <= 0x1D17A) ||
375 // Tags, Variation Selectors, nulls
376 (code_point >= 0xE0000 && code_point <= 0xE0FFF));
377 }
378
379 // Unescapes |escaped_text| according to |rules|, returning the resulting
380 // string. Fills in an |adjustments| parameter, if non-nullptr, so it reflects
381 // the alterations done to the string that are not one-character-to-one-
382 // character. The resulting |adjustments| will always be sorted by increasing
383 // offset.
UnescapeURLWithAdjustmentsImpl(StringPiece escaped_text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)384 std::string UnescapeURLWithAdjustmentsImpl(
385 StringPiece escaped_text,
386 UnescapeRule::Type rules,
387 OffsetAdjuster::Adjustments* adjustments) {
388 if (adjustments)
389 adjustments->clear();
390 // Do not unescape anything, return the |escaped_text| text.
391 if (rules == UnescapeRule::NONE)
392 return std::string(escaped_text);
393
394 // The output of the unescaping is always smaller than the input, so we can
395 // reserve the input size to make sure we have enough buffer and don't have
396 // to allocate in the loop below.
397 std::string result;
398 result.reserve(escaped_text.length());
399
400 // Locations of adjusted text.
401 for (size_t i = 0, max = escaped_text.size(); i < max;) {
402 // Try to unescape the character.
403 base_icu::UChar32 code_point;
404 std::string unescaped;
405 if (!UnescapeUTF8CharacterAtIndex(escaped_text, i, &code_point,
406 &unescaped)) {
407 // Check if the next character can be unescaped, but not as a valid UTF-8
408 // character. In that case, just unescaped and write the non-sense
409 // character.
410 //
411 // TODO(https://crbug.com/829868): Do not unescape illegal UTF-8
412 // sequences.
413 unsigned char non_utf8_byte;
414 if (UnescapeUnsignedByteAtIndex(escaped_text, i, &non_utf8_byte)) {
415 result.push_back(static_cast<char>(non_utf8_byte));
416 if (adjustments)
417 adjustments->push_back(OffsetAdjuster::Adjustment(i, 3, 1));
418 i += 3;
419 continue;
420 }
421
422 // Character is not escaped, so append as is, unless it's a '+' and
423 // REPLACE_PLUS_WITH_SPACE is being applied.
424 if (escaped_text[i] == '+' &&
425 (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)) {
426 result.push_back(' ');
427 } else {
428 result.push_back(escaped_text[i]);
429 }
430 ++i;
431 continue;
432 }
433
434 DCHECK(!unescaped.empty());
435
436 if (!ShouldUnescapeCodePoint(rules, code_point)) {
437 // If it's a valid UTF-8 character, but not safe to unescape, copy all
438 // bytes directly.
439 result.append(escaped_text.substr(i, 3 * unescaped.length()));
440 i += unescaped.length() * 3;
441 continue;
442 }
443
444 // If the code point is allowed, and append the entire unescaped character.
445 result.append(unescaped);
446 if (adjustments) {
447 for (size_t j = 0; j < unescaped.length(); ++j) {
448 adjustments->push_back(OffsetAdjuster::Adjustment(i + j * 3, 3, 1));
449 }
450 }
451 i += 3 * unescaped.length();
452 }
453
454 return result;
455 }
456
457 } // namespace
458
EscapeAllExceptUnreserved(StringPiece text)459 std::string EscapeAllExceptUnreserved(StringPiece text) {
460 return Escape(text, kUnreservedCharmap, false);
461 }
462
EscapeQueryParamValue(StringPiece text,bool use_plus)463 std::string EscapeQueryParamValue(StringPiece text, bool use_plus) {
464 return Escape(text, kQueryCharmap, use_plus);
465 }
466
EscapePath(StringPiece path)467 std::string EscapePath(StringPiece path) {
468 return Escape(path, kPathCharmap, false);
469 }
470
471 #if BUILDFLAG(IS_APPLE)
EscapeNSURLPrecursor(StringPiece precursor)472 std::string EscapeNSURLPrecursor(StringPiece precursor) {
473 return Escape(precursor, kNSURLCharmap, false, true);
474 }
475 #endif // BUILDFLAG(IS_APPLE)
476
EscapeUrlEncodedData(StringPiece path,bool use_plus)477 std::string EscapeUrlEncodedData(StringPiece path, bool use_plus) {
478 return Escape(path, kUrlEscape, use_plus);
479 }
480
EscapeNonASCIIAndPercent(StringPiece input)481 std::string EscapeNonASCIIAndPercent(StringPiece input) {
482 return Escape(input, kNonASCIICharmapAndPercent, false);
483 }
484
EscapeNonASCII(StringPiece input)485 std::string EscapeNonASCII(StringPiece input) {
486 return Escape(input, kNonASCIICharmap, false);
487 }
488
EscapeExternalHandlerValue(StringPiece text)489 std::string EscapeExternalHandlerValue(StringPiece text) {
490 return Escape(text, kExternalHandlerCharmap, false, true);
491 }
492
AppendEscapedCharForHTML(char c,std::string * output)493 void AppendEscapedCharForHTML(char c, std::string* output) {
494 AppendEscapedCharForHTMLImpl(c, output);
495 }
496
EscapeForHTML(StringPiece input)497 std::string EscapeForHTML(StringPiece input) {
498 return EscapeForHTMLImpl(input);
499 }
500
EscapeForHTML(StringPiece16 input)501 std::u16string EscapeForHTML(StringPiece16 input) {
502 return EscapeForHTMLImpl(input);
503 }
504
UnescapeURLComponent(StringPiece escaped_text,UnescapeRule::Type rules)505 std::string UnescapeURLComponent(StringPiece escaped_text,
506 UnescapeRule::Type rules) {
507 return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, nullptr);
508 }
509
UnescapeAndDecodeUTF8URLComponentWithAdjustments(StringPiece text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)510 std::u16string UnescapeAndDecodeUTF8URLComponentWithAdjustments(
511 StringPiece text,
512 UnescapeRule::Type rules,
513 OffsetAdjuster::Adjustments* adjustments) {
514 std::u16string result;
515 OffsetAdjuster::Adjustments unescape_adjustments;
516 std::string unescaped_url(
517 UnescapeURLWithAdjustmentsImpl(text, rules, &unescape_adjustments));
518 if (UTF8ToUTF16WithAdjustments(unescaped_url.data(), unescaped_url.length(),
519 &result, adjustments)) {
520 // Character set looks like it's valid.
521 if (adjustments) {
522 OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments,
523 adjustments);
524 }
525 return result;
526 }
527 // Character set is not valid. Return the escaped version.
528 return UTF8ToUTF16WithAdjustments(text, adjustments);
529 }
530
UnescapeBinaryURLComponent(StringPiece escaped_text,UnescapeRule::Type rules)531 std::string UnescapeBinaryURLComponent(StringPiece escaped_text,
532 UnescapeRule::Type rules) {
533 // Only NORMAL and REPLACE_PLUS_WITH_SPACE are supported.
534 DCHECK(rules != UnescapeRule::NONE);
535 DCHECK(!(rules &
536 ~(UnescapeRule::NORMAL | UnescapeRule::REPLACE_PLUS_WITH_SPACE)));
537
538 // It is not possible to read the feature state when this function is invoked
539 // before FeatureList initialization. In that case, fallback to the feature's
540 // default state.
541 //
542 // TODO(crbug.com/1321924): Cleanup this feature.
543 const bool optimize_data_urls_feature_is_enabled =
544 base::FeatureList::GetInstance()
545 ? base::FeatureList::IsEnabled(features::kOptimizeDataUrls)
546 : features::kOptimizeDataUrls.default_state ==
547 base::FEATURE_ENABLED_BY_DEFAULT;
548
549 // If there are no '%' characters in the string, there will be nothing to
550 // unescape, so we can take the fast path.
551 if (optimize_data_urls_feature_is_enabled &&
552 escaped_text.find('%') == StringPiece::npos) {
553 std::string unescaped_text(escaped_text);
554 if (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)
555 std::replace(unescaped_text.begin(), unescaped_text.end(), '+', ' ');
556 return unescaped_text;
557 }
558
559 std::string unescaped_text;
560
561 // The output of the unescaping is always smaller than the input, so we can
562 // reserve the input size to make sure we have enough buffer and don't have
563 // to allocate in the loop below.
564 // Increase capacity before size, as just resizing can grow capacity
565 // needlessly beyond our requested size.
566 unescaped_text.reserve(escaped_text.size());
567 unescaped_text.resize(escaped_text.size());
568
569 size_t output_index = 0;
570
571 for (size_t i = 0, max = escaped_text.size(); i < max;) {
572 unsigned char byte;
573 // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
574 // to call.
575 if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
576 unescaped_text[output_index++] = static_cast<char>(byte);
577 i += 3;
578 continue;
579 }
580
581 if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
582 escaped_text[i] == '+') {
583 unescaped_text[output_index++] = ' ';
584 ++i;
585 continue;
586 }
587
588 unescaped_text[output_index++] = escaped_text[i++];
589 }
590
591 DCHECK_LE(output_index, unescaped_text.size());
592 unescaped_text.resize(output_index);
593 return unescaped_text;
594 }
595
UnescapeBinaryURLComponentSafe(StringPiece escaped_text,bool fail_on_path_separators,std::string * unescaped_text)596 bool UnescapeBinaryURLComponentSafe(StringPiece escaped_text,
597 bool fail_on_path_separators,
598 std::string* unescaped_text) {
599 unescaped_text->clear();
600
601 std::set<unsigned char> illegal_encoded_bytes;
602 for (unsigned char c = '\x00'; c < '\x20'; ++c) {
603 illegal_encoded_bytes.insert(c);
604 }
605 if (fail_on_path_separators) {
606 illegal_encoded_bytes.insert('/');
607 illegal_encoded_bytes.insert('\\');
608 }
609 if (ContainsEncodedBytes(escaped_text, illegal_encoded_bytes))
610 return false;
611
612 *unescaped_text = UnescapeBinaryURLComponent(escaped_text);
613 return true;
614 }
615
ContainsEncodedBytes(StringPiece escaped_text,const std::set<unsigned char> & bytes)616 bool ContainsEncodedBytes(StringPiece escaped_text,
617 const std::set<unsigned char>& bytes) {
618 for (size_t i = 0, max = escaped_text.size(); i < max;) {
619 unsigned char byte;
620 // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
621 // to call.
622 if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
623 if (bytes.find(byte) != bytes.end())
624 return true;
625
626 i += 3;
627 continue;
628 }
629
630 ++i;
631 }
632
633 return false;
634 }
635
UnescapeForHTML(StringPiece16 input)636 std::u16string UnescapeForHTML(StringPiece16 input) {
637 static const struct {
638 const char* ampersand_code;
639 const char16_t replacement;
640 } kEscapeToChars[] = {
641 {"<", '<'}, {">", '>'}, {"&", '&'},
642 {""", '"'}, {"'", '\''},
643 };
644 constexpr size_t kEscapeToCharsCount = std::size(kEscapeToChars);
645
646 if (input.find(u"&") == std::string::npos)
647 return std::u16string(input);
648
649 std::u16string ampersand_chars[kEscapeToCharsCount];
650 std::u16string text(input);
651 for (std::u16string::iterator iter = text.begin(); iter != text.end();
652 ++iter) {
653 if (*iter == '&') {
654 // Potential ampersand encode char.
655 size_t index = static_cast<size_t>(iter - text.begin());
656 for (size_t i = 0; i < std::size(kEscapeToChars); i++) {
657 if (ampersand_chars[i].empty()) {
658 ampersand_chars[i] = ASCIIToUTF16(kEscapeToChars[i].ampersand_code);
659 }
660 if (text.find(ampersand_chars[i], index) == index) {
661 text.replace(
662 iter, iter + static_cast<ptrdiff_t>(ampersand_chars[i].length()),
663 1, kEscapeToChars[i].replacement);
664 break;
665 }
666 }
667 }
668 }
669 return text;
670 }
671
672 } // namespace base
673