1 package org.jsoup.parser; 2 3 import org.jsoup.nodes.Document; 4 import org.jsoup.nodes.Element; 5 import org.jsoup.nodes.Node; 6 7 import java.io.Reader; 8 import java.io.StringReader; 9 import java.util.List; 10 11 /** 12 Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in 13 {@link org.jsoup.Jsoup}. 14 <p>Note that a Parser instance object is not threadsafe. To reuse a Parser configuration in a multi-threaded 15 environment, use {@link #newInstance()} to make copies. */ 16 public class Parser { 17 public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml"; 18 public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace"; 19 public static final String NamespaceMathml = "http://www.w3.org/1998/Math/MathML"; 20 public static final String NamespaceSvg = "http://www.w3.org/2000/svg"; 21 22 private TreeBuilder treeBuilder; 23 private ParseErrorList errors; 24 private ParseSettings settings; 25 private boolean trackPosition = false; 26 27 /** 28 * Create a new Parser, using the specified TreeBuilder 29 * @param treeBuilder TreeBuilder to use to parse input into Documents. 30 */ Parser(TreeBuilder treeBuilder)31 public Parser(TreeBuilder treeBuilder) { 32 this.treeBuilder = treeBuilder; 33 settings = treeBuilder.defaultSettings(); 34 errors = ParseErrorList.noTracking(); 35 } 36 37 /** 38 Creates a new Parser as a deep copy of this; including initializing a new TreeBuilder. Allows independent (multi-threaded) use. 39 @return a copied parser 40 */ newInstance()41 public Parser newInstance() { 42 return new Parser(this); 43 } 44 Parser(Parser copy)45 private Parser(Parser copy) { 46 treeBuilder = copy.treeBuilder.newInstance(); // because extended 47 errors = new ParseErrorList(copy.errors); // only copies size, not contents 48 settings = new ParseSettings(copy.settings); 49 trackPosition = copy.trackPosition; 50 } 51 parseInput(String html, String baseUri)52 public Document parseInput(String html, String baseUri) { 53 return treeBuilder.parse(new StringReader(html), baseUri, this); 54 } 55 parseInput(Reader inputHtml, String baseUri)56 public Document parseInput(Reader inputHtml, String baseUri) { 57 return treeBuilder.parse(inputHtml, baseUri, this); 58 } 59 parseFragmentInput(String fragment, Element context, String baseUri)60 public List<Node> parseFragmentInput(String fragment, Element context, String baseUri) { 61 return treeBuilder.parseFragment(fragment, context, baseUri, this); 62 } 63 // gets & sets 64 /** 65 * Get the TreeBuilder currently in use. 66 * @return current TreeBuilder. 67 */ getTreeBuilder()68 public TreeBuilder getTreeBuilder() { 69 return treeBuilder; 70 } 71 72 /** 73 * Update the TreeBuilder used when parsing content. 74 * @param treeBuilder new TreeBuilder 75 * @return this, for chaining 76 */ setTreeBuilder(TreeBuilder treeBuilder)77 public Parser setTreeBuilder(TreeBuilder treeBuilder) { 78 this.treeBuilder = treeBuilder; 79 treeBuilder.parser = this; 80 return this; 81 } 82 83 /** 84 * Check if parse error tracking is enabled. 85 * @return current track error state. 86 */ isTrackErrors()87 public boolean isTrackErrors() { 88 return errors.getMaxSize() > 0; 89 } 90 91 /** 92 * Enable or disable parse error tracking for the next parse. 93 * @param maxErrors the maximum number of errors to track. Set to 0 to disable. 94 * @return this, for chaining 95 */ setTrackErrors(int maxErrors)96 public Parser setTrackErrors(int maxErrors) { 97 errors = maxErrors > 0 ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking(); 98 return this; 99 } 100 101 /** 102 * Retrieve the parse errors, if any, from the last parse. 103 * @return list of parse errors, up to the size of the maximum errors tracked. 104 * @see #setTrackErrors(int) 105 */ getErrors()106 public ParseErrorList getErrors() { 107 return errors; 108 } 109 110 /** 111 Test if position tracking is enabled. If it is, Nodes will have a Position to track where in the original input 112 source they were created from. By default, tracking is not enabled. 113 * @return current track position setting 114 */ isTrackPosition()115 public boolean isTrackPosition() { 116 return trackPosition; 117 } 118 119 /** 120 Enable or disable source position tracking. If enabled, Nodes will have a Position to track where in the original 121 input source they were created from. 122 @param trackPosition position tracking setting; {@code true} to enable 123 @return this Parser, for chaining 124 */ setTrackPosition(boolean trackPosition)125 public Parser setTrackPosition(boolean trackPosition) { 126 this.trackPosition = trackPosition; 127 return this; 128 } 129 130 /** 131 Update the ParseSettings of this Parser, to control the case sensitivity of tags and attributes. 132 * @param settings the new settings 133 * @return this Parser 134 */ settings(ParseSettings settings)135 public Parser settings(ParseSettings settings) { 136 this.settings = settings; 137 return this; 138 } 139 140 /** 141 Gets the current ParseSettings for this Parser 142 * @return current ParseSettings 143 */ settings()144 public ParseSettings settings() { 145 return settings; 146 } 147 148 /** 149 (An internal method, visible for Element. For HTML parse, signals that script and style text should be treated as 150 Data Nodes). 151 */ isContentForTagData(String normalName)152 public boolean isContentForTagData(String normalName) { 153 return getTreeBuilder().isContentForTagData(normalName); 154 } 155 defaultNamespace()156 public String defaultNamespace() { 157 return getTreeBuilder().defaultNamespace(); 158 } 159 160 // static parse functions below 161 /** 162 * Parse HTML into a Document. 163 * 164 * @param html HTML to parse 165 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 166 * 167 * @return parsed Document 168 */ parse(String html, String baseUri)169 public static Document parse(String html, String baseUri) { 170 TreeBuilder treeBuilder = new HtmlTreeBuilder(); 171 return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder)); 172 } 173 174 /** 175 * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. 176 * 177 * @param fragmentHtml the fragment of HTML to parse 178 * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This 179 * provides stack context (for implicit element creation). 180 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 181 * 182 * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. 183 */ parseFragment(String fragmentHtml, Element context, String baseUri)184 public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) { 185 HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); 186 return treeBuilder.parseFragment(fragmentHtml, context, baseUri, new Parser(treeBuilder)); 187 } 188 189 /** 190 * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. 191 * 192 * @param fragmentHtml the fragment of HTML to parse 193 * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This 194 * provides stack context (for implicit element creation). 195 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 196 * @param errorList list to add errors to 197 * 198 * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. 199 */ parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList)200 public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) { 201 HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); 202 Parser parser = new Parser(treeBuilder); 203 parser.errors = errorList; 204 return treeBuilder.parseFragment(fragmentHtml, context, baseUri, parser); 205 } 206 207 /** 208 * Parse a fragment of XML into a list of nodes. 209 * 210 * @param fragmentXml the fragment of XML to parse 211 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 212 * @return list of nodes parsed from the input XML. 213 */ parseXmlFragment(String fragmentXml, String baseUri)214 public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) { 215 XmlTreeBuilder treeBuilder = new XmlTreeBuilder(); 216 return treeBuilder.parseFragment(fragmentXml, baseUri, new Parser(treeBuilder)); 217 } 218 219 /** 220 * Parse a fragment of HTML into the {@code body} of a Document. 221 * 222 * @param bodyHtml fragment of HTML 223 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 224 * 225 * @return Document, with empty head, and HTML parsed into body 226 */ parseBodyFragment(String bodyHtml, String baseUri)227 public static Document parseBodyFragment(String bodyHtml, String baseUri) { 228 Document doc = Document.createShell(baseUri); 229 Element body = doc.body(); 230 List<Node> nodeList = parseFragment(bodyHtml, body, baseUri); 231 Node[] nodes = nodeList.toArray(new Node[0]); // the node list gets modified when re-parented 232 for (int i = nodes.length - 1; i > 0; i--) { 233 nodes[i].remove(); 234 } 235 for (Node node : nodes) { 236 body.appendChild(node); 237 } 238 return doc; 239 } 240 241 /** 242 * Utility method to unescape HTML entities from a string 243 * @param string HTML escaped string 244 * @param inAttribute if the string is to be escaped in strict mode (as attributes are) 245 * @return an unescaped string 246 */ unescapeEntities(String string, boolean inAttribute)247 public static String unescapeEntities(String string, boolean inAttribute) { 248 Parser parser = Parser.htmlParser(); 249 parser.treeBuilder.initialiseParse(new StringReader(string), "", parser); 250 Tokeniser tokeniser = new Tokeniser(parser.treeBuilder); 251 return tokeniser.unescapeEntities(inAttribute); 252 } 253 254 // builders 255 256 /** 257 * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document, 258 * based on a knowledge of the semantics of the incoming tags. 259 * @return a new HTML parser. 260 */ htmlParser()261 public static Parser htmlParser() { 262 return new Parser(new HtmlTreeBuilder()); 263 } 264 265 /** 266 * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML, 267 * rather creates a simple tree directly from the input. 268 * @return a new simple XML parser. 269 */ xmlParser()270 public static Parser xmlParser() { 271 return new Parser(new XmlTreeBuilder()); 272 } 273 } 274