1 package org.jsoup.nodes; 2 3 import org.jsoup.SerializationException; 4 import org.jsoup.internal.StringUtil; 5 import org.jsoup.helper.Validate; 6 import org.jsoup.nodes.Document.OutputSettings; 7 import org.jsoup.parser.CharacterReader; 8 import org.jsoup.parser.Parser; 9 import org.jspecify.annotations.Nullable; 10 11 import java.io.IOException; 12 import java.nio.charset.CharsetEncoder; 13 import java.util.Arrays; 14 import java.util.HashMap; 15 16 import static org.jsoup.nodes.Document.OutputSettings.*; 17 import static org.jsoup.nodes.Entities.EscapeMode.base; 18 import static org.jsoup.nodes.Entities.EscapeMode.extended; 19 20 /** 21 * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C 22 * HTML named character references</a>. 23 */ 24 public class Entities { 25 private static final int empty = -1; 26 private static final String emptyName = ""; 27 static final int codepointRadix = 36; 28 private static final char[] codeDelims = {',', ';'}; 29 private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references 30 31 public enum EscapeMode { 32 /** 33 * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only. 34 */ 35 xhtml(EntitiesData.xmlPoints, 4), 36 /** 37 * Default HTML output entities. 38 */ 39 base(EntitiesData.basePoints, 106), 40 /** 41 * Complete HTML entities. 42 */ 43 extended(EntitiesData.fullPoints, 2125); 44 45 // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities. 46 private String[] nameKeys; 47 private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints. 48 49 // table of codepoints to named entities. 50 private int[] codeKeys; // we don't support multicodepoints to single named value currently 51 private String[] nameVals; 52 EscapeMode(String file, int size)53 EscapeMode(String file, int size) { 54 load(this, file, size); 55 } 56 codepointForName(final String name)57 int codepointForName(final String name) { 58 int index = Arrays.binarySearch(nameKeys, name); 59 return index >= 0 ? codeVals[index] : empty; 60 } 61 nameForCodepoint(final int codepoint)62 String nameForCodepoint(final int codepoint) { 63 final int index = Arrays.binarySearch(codeKeys, codepoint); 64 if (index >= 0) { 65 // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower 66 // (and binary search for same item with multi results is undefined 67 return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ? 68 nameVals[index + 1] : nameVals[index]; 69 } 70 return emptyName; 71 } 72 size()73 private int size() { 74 return nameKeys.length; 75 } 76 } 77 Entities()78 private Entities() { 79 } 80 81 /** 82 * Check if the input is a known named entity 83 * 84 * @param name the possible entity name (e.g. "lt" or "amp") 85 * @return true if a known named entity 86 */ isNamedEntity(final String name)87 public static boolean isNamedEntity(final String name) { 88 return extended.codepointForName(name) != empty; 89 } 90 91 /** 92 * Check if the input is a known named entity in the base entity set. 93 * 94 * @param name the possible entity name (e.g. "lt" or "amp") 95 * @return true if a known named entity in the base set 96 * @see #isNamedEntity(String) 97 */ isBaseNamedEntity(final String name)98 public static boolean isBaseNamedEntity(final String name) { 99 return base.codepointForName(name) != empty; 100 } 101 102 /** 103 * Get the character(s) represented by the named entity 104 * 105 * @param name entity (e.g. "lt" or "amp") 106 * @return the string value of the character(s) represented by this entity, or "" if not defined 107 */ getByName(String name)108 public static String getByName(String name) { 109 String val = multipoints.get(name); 110 if (val != null) 111 return val; 112 int codepoint = extended.codepointForName(name); 113 if (codepoint != empty) 114 return new String(new int[]{codepoint}, 0, 1); 115 return emptyName; 116 } 117 codepointsForName(final String name, final int[] codepoints)118 public static int codepointsForName(final String name, final int[] codepoints) { 119 String val = multipoints.get(name); 120 if (val != null) { 121 codepoints[0] = val.codePointAt(0); 122 codepoints[1] = val.codePointAt(1); 123 return 2; 124 } 125 int codepoint = extended.codepointForName(name); 126 if (codepoint != empty) { 127 codepoints[0] = codepoint; 128 return 1; 129 } 130 return 0; 131 } 132 133 /** 134 * HTML escape an input string. That is, {@code <} is returned as {@code <} 135 * 136 * @param string the un-escaped string to escape 137 * @param out the output settings to use 138 * @return the escaped string 139 */ escape(String string, OutputSettings out)140 public static String escape(String string, OutputSettings out) { 141 if (string == null) 142 return ""; 143 StringBuilder accum = StringUtil.borrowBuilder(); 144 try { 145 escape(accum, string, out, false, false, false, false); 146 } catch (IOException e) { 147 throw new SerializationException(e); // doesn't happen 148 } 149 return StringUtil.releaseBuilder(accum); 150 } 151 152 /** 153 * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as 154 * {@code <} 155 * 156 * @param string the un-escaped string to escape 157 * @return the escaped string 158 */ escape(String string)159 public static String escape(String string) { 160 if (DefaultOutput == null) 161 DefaultOutput = new OutputSettings(); 162 return escape(string, DefaultOutput); 163 } 164 private static @Nullable OutputSettings DefaultOutput; // lazy-init, to break circular dependency with OutputSettings 165 166 // this method does a lot, but other breakups cause rescanning and stringbuilder generations escape(Appendable accum, String string, OutputSettings out, boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing)167 static void escape(Appendable accum, String string, OutputSettings out, 168 boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing) throws IOException { 169 170 boolean lastWasWhite = false; 171 boolean reachedNonWhite = false; 172 final EscapeMode escapeMode = out.escapeMode(); 173 final CharsetEncoder encoder = out.encoder(); 174 final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder() 175 final int length = string.length(); 176 177 int codePoint; 178 boolean skipped = false; 179 for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) { 180 codePoint = string.codePointAt(offset); 181 182 if (normaliseWhite) { 183 if (StringUtil.isWhitespace(codePoint)) { 184 if (stripLeadingWhite && !reachedNonWhite) continue; 185 if (lastWasWhite) continue; 186 if (trimTrailing) { 187 skipped = true; 188 continue; 189 } 190 accum.append(' '); 191 lastWasWhite = true; 192 continue; 193 } else { 194 lastWasWhite = false; 195 reachedNonWhite = true; 196 if (skipped) { 197 accum.append(' '); // wasn't the end, so need to place a normalized space 198 skipped = false; 199 } 200 } 201 } 202 // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]): 203 if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 204 final char c = (char) codePoint; 205 // html specific and required escapes: 206 switch (c) { 207 case '&': 208 accum.append("&"); 209 break; 210 case 0xA0: 211 if (escapeMode != EscapeMode.xhtml) 212 accum.append(" "); 213 else 214 accum.append(" "); 215 break; 216 case '<': 217 // escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val 218 if (!inAttribute || escapeMode == EscapeMode.xhtml || out.syntax() == Syntax.xml) 219 accum.append("<"); 220 else 221 accum.append(c); 222 break; 223 case '>': 224 if (!inAttribute) 225 accum.append(">"); 226 else 227 accum.append(c); 228 break; 229 case '"': 230 if (inAttribute) 231 accum.append("""); 232 else 233 accum.append(c); 234 break; 235 // we escape ascii control <x20 (other than tab, line-feed, carriage return) for XML compliance (required) and HTML ease of reading (not required) - https://www.w3.org/TR/xml/#charsets 236 case 0x9: 237 case 0xA: 238 case 0xD: 239 accum.append(c); 240 break; 241 default: 242 if (c < 0x20 || !canEncode(coreCharset, c, encoder)) 243 appendEncoded(accum, escapeMode, codePoint); 244 else 245 accum.append(c); 246 } 247 } else { 248 final String c = new String(Character.toChars(codePoint)); 249 if (encoder.canEncode(c)) // uses fallback encoder for simplicity 250 accum.append(c); 251 else 252 appendEncoded(accum, escapeMode, codePoint); 253 } 254 } 255 } 256 appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint)257 private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException { 258 final String name = escapeMode.nameForCodepoint(codePoint); 259 if (!emptyName.equals(name)) // ok for identity check 260 accum.append('&').append(name).append(';'); 261 else 262 accum.append("&#x").append(Integer.toHexString(codePoint)).append(';'); 263 } 264 265 /** 266 * Un-escape an HTML escaped string. That is, {@code <} is returned as {@code <}. 267 * 268 * @param string the HTML string to un-escape 269 * @return the unescaped string 270 */ unescape(String string)271 public static String unescape(String string) { 272 return unescape(string, false); 273 } 274 275 /** 276 * Unescape the input string. 277 * 278 * @param string to un-HTML-escape 279 * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional) 280 * @return unescaped string 281 */ unescape(String string, boolean strict)282 static String unescape(String string, boolean strict) { 283 return Parser.unescapeEntities(string, strict); 284 } 285 286 /* 287 * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean. 288 * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF, 289 * performance may be bad. We can add more encoders for common character sets that are impacted by performance 290 * issues on Android if required. 291 * 292 * Benchmarks: * 293 * OLD toHtml() impl v New (fastpath) in millis 294 * Wiki: 1895, 16 295 * CNN: 6378, 55 296 * Alterslash: 3013, 28 297 * Jsoup: 167, 2 298 */ canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback)299 private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) { 300 // todo add more charset tests if impacted by Android's bad perf in canEncode 301 switch (charset) { 302 case ascii: 303 return c < 0x80; 304 case utf: 305 return true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above 306 default: 307 return fallback.canEncode(c); 308 } 309 } 310 311 enum CoreCharset { 312 ascii, utf, fallback; 313 byName(final String name)314 static CoreCharset byName(final String name) { 315 if (name.equals("US-ASCII")) 316 return ascii; 317 if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al 318 return utf; 319 return fallback; 320 } 321 } 322 load(EscapeMode e, String pointsData, int size)323 private static void load(EscapeMode e, String pointsData, int size) { 324 e.nameKeys = new String[size]; 325 e.codeVals = new int[size]; 326 e.codeKeys = new int[size]; 327 e.nameVals = new String[size]; 328 329 int i = 0; 330 CharacterReader reader = new CharacterReader(pointsData); 331 try { 332 while (!reader.isEmpty()) { 333 // NotNestedLessLess=10913,824;1887& 334 335 final String name = reader.consumeTo('='); 336 reader.advance(); 337 final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix); 338 final char codeDelim = reader.current(); 339 reader.advance(); 340 final int cp2; 341 if (codeDelim == ',') { 342 cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix); 343 reader.advance(); 344 } else { 345 cp2 = empty; 346 } 347 final String indexS = reader.consumeTo('&'); 348 final int index = Integer.parseInt(indexS, codepointRadix); 349 reader.advance(); 350 351 e.nameKeys[i] = name; 352 e.codeVals[i] = cp1; 353 e.codeKeys[index] = cp1; 354 e.nameVals[index] = name; 355 356 if (cp2 != empty) { 357 multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2)); 358 } 359 i++; 360 } 361 362 Validate.isTrue(i == size, "Unexpected count of entities loaded"); 363 } finally { 364 reader.close(); 365 } 366 } 367 } 368