xref: /aosp_15_r20/external/jsoup/src/main/java/org/jsoup/parser/Parser.java (revision 6da8f8c4bc310ad659121b84dd089062417a2ce2)
1 package org.jsoup.parser;
2 
3 import org.jsoup.nodes.Document;
4 import org.jsoup.nodes.Element;
5 import org.jsoup.nodes.Node;
6 
7 import java.io.Reader;
8 import java.io.StringReader;
9 import java.util.List;
10 
11 /**
12  Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in
13  {@link org.jsoup.Jsoup}.
14  <p>Note that a Parser instance object is not threadsafe. To reuse a Parser configuration in a multi-threaded
15  environment, use {@link #newInstance()} to make copies. */
16 public class Parser {
17     public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml";
18     public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace";
19     public static final String NamespaceMathml = "http://www.w3.org/1998/Math/MathML";
20     public static final String NamespaceSvg = "http://www.w3.org/2000/svg";
21 
22     private TreeBuilder treeBuilder;
23     private ParseErrorList errors;
24     private ParseSettings settings;
25     private boolean trackPosition = false;
26 
27     /**
28      * Create a new Parser, using the specified TreeBuilder
29      * @param treeBuilder TreeBuilder to use to parse input into Documents.
30      */
Parser(TreeBuilder treeBuilder)31     public Parser(TreeBuilder treeBuilder) {
32         this.treeBuilder = treeBuilder;
33         settings = treeBuilder.defaultSettings();
34         errors = ParseErrorList.noTracking();
35     }
36 
37     /**
38      Creates a new Parser as a deep copy of this; including initializing a new TreeBuilder. Allows independent (multi-threaded) use.
39      @return a copied parser
40      */
newInstance()41     public Parser newInstance() {
42         return new Parser(this);
43     }
44 
Parser(Parser copy)45     private Parser(Parser copy) {
46         treeBuilder = copy.treeBuilder.newInstance(); // because extended
47         errors = new ParseErrorList(copy.errors); // only copies size, not contents
48         settings = new ParseSettings(copy.settings);
49         trackPosition = copy.trackPosition;
50     }
51 
parseInput(String html, String baseUri)52     public Document parseInput(String html, String baseUri) {
53         return treeBuilder.parse(new StringReader(html), baseUri, this);
54     }
55 
parseInput(Reader inputHtml, String baseUri)56     public Document parseInput(Reader inputHtml, String baseUri) {
57         return treeBuilder.parse(inputHtml, baseUri, this);
58     }
59 
parseFragmentInput(String fragment, Element context, String baseUri)60     public List<Node> parseFragmentInput(String fragment, Element context, String baseUri) {
61         return treeBuilder.parseFragment(fragment, context, baseUri, this);
62     }
63     // gets & sets
64     /**
65      * Get the TreeBuilder currently in use.
66      * @return current TreeBuilder.
67      */
getTreeBuilder()68     public TreeBuilder getTreeBuilder() {
69         return treeBuilder;
70     }
71 
72     /**
73      * Update the TreeBuilder used when parsing content.
74      * @param treeBuilder new TreeBuilder
75      * @return this, for chaining
76      */
setTreeBuilder(TreeBuilder treeBuilder)77     public Parser setTreeBuilder(TreeBuilder treeBuilder) {
78         this.treeBuilder = treeBuilder;
79         treeBuilder.parser = this;
80         return this;
81     }
82 
83     /**
84      * Check if parse error tracking is enabled.
85      * @return current track error state.
86      */
isTrackErrors()87     public boolean isTrackErrors() {
88         return errors.getMaxSize() > 0;
89     }
90 
91     /**
92      * Enable or disable parse error tracking for the next parse.
93      * @param maxErrors the maximum number of errors to track. Set to 0 to disable.
94      * @return this, for chaining
95      */
setTrackErrors(int maxErrors)96     public Parser setTrackErrors(int maxErrors) {
97         errors = maxErrors > 0 ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
98         return this;
99     }
100 
101     /**
102      * Retrieve the parse errors, if any, from the last parse.
103      * @return list of parse errors, up to the size of the maximum errors tracked.
104      * @see #setTrackErrors(int)
105      */
getErrors()106     public ParseErrorList getErrors() {
107         return errors;
108     }
109 
110     /**
111      Test if position tracking is enabled. If it is, Nodes will have a Position to track where in the original input
112      source they were created from. By default, tracking is not enabled.
113      * @return current track position setting
114      */
isTrackPosition()115     public boolean isTrackPosition() {
116         return trackPosition;
117     }
118 
119     /**
120      Enable or disable source position tracking. If enabled, Nodes will have a Position to track where in the original
121      input source they were created from.
122      @param trackPosition position tracking setting; {@code true} to enable
123      @return this Parser, for chaining
124      */
setTrackPosition(boolean trackPosition)125     public Parser setTrackPosition(boolean trackPosition) {
126         this.trackPosition = trackPosition;
127         return this;
128     }
129 
130     /**
131      Update the ParseSettings of this Parser, to control the case sensitivity of tags and attributes.
132      * @param settings the new settings
133      * @return this Parser
134      */
settings(ParseSettings settings)135     public Parser settings(ParseSettings settings) {
136         this.settings = settings;
137         return this;
138     }
139 
140     /**
141      Gets the current ParseSettings for this Parser
142      * @return current ParseSettings
143      */
settings()144     public ParseSettings settings() {
145         return settings;
146     }
147 
148     /**
149      (An internal method, visible for Element. For HTML parse, signals that script and style text should be treated as
150      Data Nodes).
151      */
isContentForTagData(String normalName)152     public boolean isContentForTagData(String normalName) {
153         return getTreeBuilder().isContentForTagData(normalName);
154     }
155 
defaultNamespace()156     public String defaultNamespace() {
157         return getTreeBuilder().defaultNamespace();
158     }
159 
160     // static parse functions below
161     /**
162      * Parse HTML into a Document.
163      *
164      * @param html HTML to parse
165      * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
166      *
167      * @return parsed Document
168      */
parse(String html, String baseUri)169     public static Document parse(String html, String baseUri) {
170         TreeBuilder treeBuilder = new HtmlTreeBuilder();
171         return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder));
172     }
173 
174     /**
175      * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
176      *
177      * @param fragmentHtml the fragment of HTML to parse
178      * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
179      * provides stack context (for implicit element creation).
180      * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
181      *
182      * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
183      */
parseFragment(String fragmentHtml, Element context, String baseUri)184     public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) {
185         HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
186         return treeBuilder.parseFragment(fragmentHtml, context, baseUri, new Parser(treeBuilder));
187     }
188 
189     /**
190      * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
191      *
192      * @param fragmentHtml the fragment of HTML to parse
193      * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
194      * provides stack context (for implicit element creation).
195      * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
196      * @param errorList list to add errors to
197      *
198      * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
199      */
parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList)200     public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) {
201         HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
202         Parser parser = new Parser(treeBuilder);
203         parser.errors = errorList;
204         return treeBuilder.parseFragment(fragmentHtml, context, baseUri, parser);
205     }
206 
207     /**
208      * Parse a fragment of XML into a list of nodes.
209      *
210      * @param fragmentXml the fragment of XML to parse
211      * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
212      * @return list of nodes parsed from the input XML.
213      */
parseXmlFragment(String fragmentXml, String baseUri)214     public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
215         XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
216         return treeBuilder.parseFragment(fragmentXml, baseUri, new Parser(treeBuilder));
217     }
218 
219     /**
220      * Parse a fragment of HTML into the {@code body} of a Document.
221      *
222      * @param bodyHtml fragment of HTML
223      * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
224      *
225      * @return Document, with empty head, and HTML parsed into body
226      */
parseBodyFragment(String bodyHtml, String baseUri)227     public static Document parseBodyFragment(String bodyHtml, String baseUri) {
228         Document doc = Document.createShell(baseUri);
229         Element body = doc.body();
230         List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
231         Node[] nodes = nodeList.toArray(new Node[0]); // the node list gets modified when re-parented
232         for (int i = nodes.length - 1; i > 0; i--) {
233             nodes[i].remove();
234         }
235         for (Node node : nodes) {
236             body.appendChild(node);
237         }
238         return doc;
239     }
240 
241     /**
242      * Utility method to unescape HTML entities from a string
243      * @param string HTML escaped string
244      * @param inAttribute if the string is to be escaped in strict mode (as attributes are)
245      * @return an unescaped string
246      */
unescapeEntities(String string, boolean inAttribute)247     public static String unescapeEntities(String string, boolean inAttribute) {
248         Parser parser = Parser.htmlParser();
249         parser.treeBuilder.initialiseParse(new StringReader(string), "", parser);
250         Tokeniser tokeniser = new Tokeniser(parser.treeBuilder);
251         return tokeniser.unescapeEntities(inAttribute);
252     }
253 
254     // builders
255 
256     /**
257      * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document,
258      * based on a knowledge of the semantics of the incoming tags.
259      * @return a new HTML parser.
260      */
htmlParser()261     public static Parser htmlParser() {
262         return new Parser(new HtmlTreeBuilder());
263     }
264 
265     /**
266      * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML,
267      * rather creates a simple tree directly from the input.
268      * @return a new simple XML parser.
269      */
xmlParser()270     public static Parser xmlParser() {
271         return new Parser(new XmlTreeBuilder());
272     }
273 }
274