xref: /aosp_15_r20/external/jsoup/src/main/java/org/jsoup/nodes/Entities.java (revision 6da8f8c4bc310ad659121b84dd089062417a2ce2)
1 package org.jsoup.nodes;
2 
3 import org.jsoup.SerializationException;
4 import org.jsoup.internal.StringUtil;
5 import org.jsoup.helper.Validate;
6 import org.jsoup.nodes.Document.OutputSettings;
7 import org.jsoup.parser.CharacterReader;
8 import org.jsoup.parser.Parser;
9 import org.jspecify.annotations.Nullable;
10 
11 import java.io.IOException;
12 import java.nio.charset.CharsetEncoder;
13 import java.util.Arrays;
14 import java.util.HashMap;
15 
16 import static org.jsoup.nodes.Document.OutputSettings.*;
17 import static org.jsoup.nodes.Entities.EscapeMode.base;
18 import static org.jsoup.nodes.Entities.EscapeMode.extended;
19 
20 /**
21  * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C
22  * HTML named character references</a>.
23  */
24 public class Entities {
25     private static final int empty = -1;
26     private static final String emptyName = "";
27     static final int codepointRadix = 36;
28     private static final char[] codeDelims = {',', ';'};
29     private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references
30 
31     public enum EscapeMode {
32         /**
33          * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
34          */
35         xhtml(EntitiesData.xmlPoints, 4),
36         /**
37          * Default HTML output entities.
38          */
39         base(EntitiesData.basePoints, 106),
40         /**
41          * Complete HTML entities.
42          */
43         extended(EntitiesData.fullPoints, 2125);
44 
45         // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
46         private String[] nameKeys;
47         private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.
48 
49         // table of codepoints to named entities.
50         private int[] codeKeys; // we don't support multicodepoints to single named value currently
51         private String[] nameVals;
52 
EscapeMode(String file, int size)53         EscapeMode(String file, int size) {
54             load(this, file, size);
55         }
56 
codepointForName(final String name)57         int codepointForName(final String name) {
58             int index = Arrays.binarySearch(nameKeys, name);
59             return index >= 0 ? codeVals[index] : empty;
60         }
61 
nameForCodepoint(final int codepoint)62         String nameForCodepoint(final int codepoint) {
63             final int index = Arrays.binarySearch(codeKeys, codepoint);
64             if (index >= 0) {
65                 // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower
66                 // (and binary search for same item with multi results is undefined
67                 return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ?
68                     nameVals[index + 1] : nameVals[index];
69             }
70             return emptyName;
71         }
72 
size()73         private int size() {
74             return nameKeys.length;
75         }
76     }
77 
Entities()78     private Entities() {
79     }
80 
81     /**
82      * Check if the input is a known named entity
83      *
84      * @param name the possible entity name (e.g. "lt" or "amp")
85      * @return true if a known named entity
86      */
isNamedEntity(final String name)87     public static boolean isNamedEntity(final String name) {
88         return extended.codepointForName(name) != empty;
89     }
90 
91     /**
92      * Check if the input is a known named entity in the base entity set.
93      *
94      * @param name the possible entity name (e.g. "lt" or "amp")
95      * @return true if a known named entity in the base set
96      * @see #isNamedEntity(String)
97      */
isBaseNamedEntity(final String name)98     public static boolean isBaseNamedEntity(final String name) {
99         return base.codepointForName(name) != empty;
100     }
101 
102     /**
103      * Get the character(s) represented by the named entity
104      *
105      * @param name entity (e.g. "lt" or "amp")
106      * @return the string value of the character(s) represented by this entity, or "" if not defined
107      */
getByName(String name)108     public static String getByName(String name) {
109         String val = multipoints.get(name);
110         if (val != null)
111             return val;
112         int codepoint = extended.codepointForName(name);
113         if (codepoint != empty)
114             return new String(new int[]{codepoint}, 0, 1);
115         return emptyName;
116     }
117 
codepointsForName(final String name, final int[] codepoints)118     public static int codepointsForName(final String name, final int[] codepoints) {
119         String val = multipoints.get(name);
120         if (val != null) {
121             codepoints[0] = val.codePointAt(0);
122             codepoints[1] = val.codePointAt(1);
123             return 2;
124         }
125         int codepoint = extended.codepointForName(name);
126         if (codepoint != empty) {
127             codepoints[0] = codepoint;
128             return 1;
129         }
130         return 0;
131     }
132 
133     /**
134      * HTML escape an input string. That is, {@code <} is returned as {@code &lt;}
135      *
136      * @param string the un-escaped string to escape
137      * @param out the output settings to use
138      * @return the escaped string
139      */
escape(String string, OutputSettings out)140     public static String escape(String string, OutputSettings out) {
141         if (string == null)
142             return "";
143         StringBuilder accum = StringUtil.borrowBuilder();
144         try {
145             escape(accum, string, out, false, false, false, false);
146         } catch (IOException e) {
147             throw new SerializationException(e); // doesn't happen
148         }
149         return StringUtil.releaseBuilder(accum);
150     }
151 
152     /**
153      * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as
154      * {@code &lt;}
155      *
156      * @param string the un-escaped string to escape
157      * @return the escaped string
158      */
escape(String string)159     public static String escape(String string) {
160         if (DefaultOutput == null)
161             DefaultOutput = new OutputSettings();
162         return escape(string, DefaultOutput);
163     }
164     private static @Nullable OutputSettings DefaultOutput; // lazy-init, to break circular dependency with OutputSettings
165 
166     // this method does a lot, but other breakups cause rescanning and stringbuilder generations
escape(Appendable accum, String string, OutputSettings out, boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing)167     static void escape(Appendable accum, String string, OutputSettings out,
168                        boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing) throws IOException {
169 
170         boolean lastWasWhite = false;
171         boolean reachedNonWhite = false;
172         final EscapeMode escapeMode = out.escapeMode();
173         final CharsetEncoder encoder = out.encoder();
174         final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder()
175         final int length = string.length();
176 
177         int codePoint;
178         boolean skipped = false;
179         for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
180             codePoint = string.codePointAt(offset);
181 
182             if (normaliseWhite) {
183                 if (StringUtil.isWhitespace(codePoint)) {
184                     if (stripLeadingWhite && !reachedNonWhite) continue;
185                     if (lastWasWhite) continue;
186                     if (trimTrailing) {
187                         skipped = true;
188                         continue;
189                     }
190                     accum.append(' ');
191                     lastWasWhite = true;
192                     continue;
193                 } else {
194                     lastWasWhite = false;
195                     reachedNonWhite = true;
196                     if (skipped) {
197                         accum.append(' '); // wasn't the end, so need to place a normalized space
198                         skipped = false;
199                     }
200                 }
201             }
202             // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
203             if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
204                 final char c = (char) codePoint;
205                 // html specific and required escapes:
206                 switch (c) {
207                     case '&':
208                         accum.append("&amp;");
209                         break;
210                     case 0xA0:
211                         if (escapeMode != EscapeMode.xhtml)
212                             accum.append("&nbsp;");
213                         else
214                             accum.append("&#xa0;");
215                         break;
216                     case '<':
217                         // escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val
218                         if (!inAttribute || escapeMode == EscapeMode.xhtml || out.syntax() == Syntax.xml)
219                             accum.append("&lt;");
220                         else
221                             accum.append(c);
222                         break;
223                     case '>':
224                         if (!inAttribute)
225                             accum.append("&gt;");
226                         else
227                             accum.append(c);
228                         break;
229                     case '"':
230                         if (inAttribute)
231                             accum.append("&quot;");
232                         else
233                             accum.append(c);
234                         break;
235                     // we escape ascii control <x20 (other than tab, line-feed, carriage return)  for XML compliance (required) and HTML ease of reading (not required) - https://www.w3.org/TR/xml/#charsets
236                     case 0x9:
237                     case 0xA:
238                     case 0xD:
239                         accum.append(c);
240                         break;
241                     default:
242                         if (c < 0x20 || !canEncode(coreCharset, c, encoder))
243                             appendEncoded(accum, escapeMode, codePoint);
244                         else
245                             accum.append(c);
246                 }
247             } else {
248                 final String c = new String(Character.toChars(codePoint));
249                 if (encoder.canEncode(c)) // uses fallback encoder for simplicity
250                     accum.append(c);
251                 else
252                     appendEncoded(accum, escapeMode, codePoint);
253             }
254         }
255     }
256 
appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint)257     private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException {
258         final String name = escapeMode.nameForCodepoint(codePoint);
259         if (!emptyName.equals(name)) // ok for identity check
260             accum.append('&').append(name).append(';');
261         else
262             accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
263     }
264 
265     /**
266      * Un-escape an HTML escaped string. That is, {@code &lt;} is returned as {@code <}.
267      *
268      * @param string the HTML string to un-escape
269      * @return the unescaped string
270      */
unescape(String string)271     public static String unescape(String string) {
272         return unescape(string, false);
273     }
274 
275     /**
276      * Unescape the input string.
277      *
278      * @param string to un-HTML-escape
279      * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
280      * @return unescaped string
281      */
unescape(String string, boolean strict)282     static String unescape(String string, boolean strict) {
283         return Parser.unescapeEntities(string, strict);
284     }
285 
286     /*
287      * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
288      * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
289      * performance may be bad. We can add more encoders for common character sets that are impacted by performance
290      * issues on Android if required.
291      *
292      * Benchmarks:     *
293      * OLD toHtml() impl v New (fastpath) in millis
294      * Wiki: 1895, 16
295      * CNN: 6378, 55
296      * Alterslash: 3013, 28
297      * Jsoup: 167, 2
298      */
canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback)299     private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
300         // todo add more charset tests if impacted by Android's bad perf in canEncode
301         switch (charset) {
302             case ascii:
303                 return c < 0x80;
304             case utf:
305                 return true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above
306             default:
307                 return fallback.canEncode(c);
308         }
309     }
310 
311     enum CoreCharset {
312         ascii, utf, fallback;
313 
byName(final String name)314         static CoreCharset byName(final String name) {
315             if (name.equals("US-ASCII"))
316                 return ascii;
317             if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al
318                 return utf;
319             return fallback;
320         }
321     }
322 
load(EscapeMode e, String pointsData, int size)323     private static void load(EscapeMode e, String pointsData, int size) {
324         e.nameKeys = new String[size];
325         e.codeVals = new int[size];
326         e.codeKeys = new int[size];
327         e.nameVals = new String[size];
328 
329         int i = 0;
330         CharacterReader reader = new CharacterReader(pointsData);
331         try {
332             while (!reader.isEmpty()) {
333                 // NotNestedLessLess=10913,824;1887&
334 
335                 final String name = reader.consumeTo('=');
336                 reader.advance();
337                 final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix);
338                 final char codeDelim = reader.current();
339                 reader.advance();
340                 final int cp2;
341                 if (codeDelim == ',') {
342                     cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix);
343                     reader.advance();
344                 } else {
345                     cp2 = empty;
346                 }
347                 final String indexS = reader.consumeTo('&');
348                 final int index = Integer.parseInt(indexS, codepointRadix);
349                 reader.advance();
350 
351                 e.nameKeys[i] = name;
352                 e.codeVals[i] = cp1;
353                 e.codeKeys[index] = cp1;
354                 e.nameVals[index] = name;
355 
356                 if (cp2 != empty) {
357                     multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2));
358                 }
359                 i++;
360             }
361 
362             Validate.isTrue(i == size, "Unexpected count of entities loaded");
363         } finally {
364             reader.close();
365         }
366     }
367 }
368