xref: /aosp_15_r20/external/jsoup/src/test/java/org/jsoup/parser/HtmlParserTest.java (revision 6da8f8c4bc310ad659121b84dd089062417a2ce2)
1 package org.jsoup.parser;
2 
3 import org.jsoup.Jsoup;
4 import org.jsoup.TextUtil;
5 import org.jsoup.integration.ParseTest;
6 import org.jsoup.internal.StringUtil;
7 import org.jsoup.nodes.*;
8 import org.jsoup.safety.Safelist;
9 import org.jsoup.select.Elements;
10 import org.junit.jupiter.api.Test;
11 import org.junit.jupiter.params.ParameterizedTest;
12 import org.junit.jupiter.params.provider.Arguments;
13 import org.junit.jupiter.params.provider.MethodSource;
14 
15 import java.io.ByteArrayInputStream;
16 import java.io.File;
17 import java.io.IOException;
18 import java.util.List;
19 import java.util.stream.Stream;
20 
21 import static org.jsoup.parser.ParseSettings.preserveCase;
22 import static org.junit.jupiter.api.Assertions.*;
23 
24 /**
25  * Tests for the Parser
26  *
27  * @author Jonathan Hedley, jonathan@hedley.net
28  */
29 public class HtmlParserTest {
30 
parsesSimpleDocument()31     @Test public void parsesSimpleDocument() {
32         String html = "<html><head><title>First!</title></head><body><p>First post! <img src=\"foo.png\" /></p></body></html>";
33         Document doc = Jsoup.parse(html);
34         // need a better way to verify these:
35         Element p = doc.body().child(0);
36         assertEquals("p", p.tagName());
37         Element img = p.child(0);
38         assertEquals("foo.png", img.attr("src"));
39         assertEquals("img", img.tagName());
40     }
41 
parsesRoughAttributes()42     @Test public void parsesRoughAttributes() {
43         String html = "<html><head><title>First!</title></head><body><p class=\"foo > bar\">First post! <img src=\"foo.png\" /></p></body></html>";
44         Document doc = Jsoup.parse(html);
45 
46         // need a better way to verify these:
47         Element p = doc.body().child(0);
48         assertEquals("p", p.tagName());
49         assertEquals("foo > bar", p.attr("class"));
50     }
51 
52     @ParameterizedTest @MethodSource("dupeAttributeData")
dropsDuplicateAttributes(String html, String expected)53     public void dropsDuplicateAttributes(String html, String expected) {
54         Parser parser = Parser.htmlParser().setTrackErrors(10);
55         Document doc = parser.parseInput(html, "");
56 
57         Element el = doc.expectFirst("body > *");
58         assertEquals(expected, el.outerHtml()); // normalized names due to lower casing
59         String tag = el.normalName();
60 
61         assertEquals(1, parser.getErrors().size());
62         assertEquals("Dropped duplicate attribute(s) in tag [" + tag + "]", parser.getErrors().get(0).getErrorMessage());
63     }
64 
dupeAttributeData()65     private static Stream<Arguments> dupeAttributeData() {
66         return Stream.of(
67             Arguments.of("<p One=One ONE=Two Two=two one=Three One=Four two=Five>Text</p>", "<p one=\"One\" two=\"two\">Text</p>"),
68             Arguments.of("<img One=One ONE=Two Two=two one=Three One=Four two=Five>", "<img one=\"One\" two=\"two\">"),
69             Arguments.of("<form One=One ONE=Two Two=two one=Three One=Four two=Five></form>", "<form one=\"One\" two=\"two\"></form>")
70         );
71     }
72 
retainsAttributesOfDifferentCaseIfSensitive()73     @Test public void retainsAttributesOfDifferentCaseIfSensitive() {
74         String html = "<p One=One One=Two one=Three two=Four two=Five Two=Six>Text</p>";
75         Parser parser = Parser.htmlParser().settings(preserveCase);
76         Document doc = parser.parseInput(html, "");
77         assertEquals("<p One=\"One\" one=\"Three\" two=\"Four\" Two=\"Six\">Text</p>", doc.selectFirst("p").outerHtml());
78     }
79 
parsesQuiteRoughAttributes()80     @Test public void parsesQuiteRoughAttributes() {
81         String html = "<p =a>One<a <p>Something</p>Else";
82         // this (used to; now gets cleaner) gets a <p> with attr '=a' and an <a tag with an attribute named '<p'; and then auto-recreated
83         Document doc = Jsoup.parse(html);
84 
85         // NOTE: per spec this should be the test case. but impacts too many ppl
86         // assertEquals("<p =a>One<a <p>Something</a></p>\n<a <p>Else</a>", doc.body().html());
87 
88         assertEquals("<p a>One<a></a></p><p><a>Something</a></p><a>Else</a>", TextUtil.stripNewlines(doc.body().html()));
89 
90         doc = Jsoup.parse("<p .....>");
91         assertEquals("<p .....></p>", doc.body().html());
92     }
93 
parsesComments()94     @Test public void parsesComments() {
95         String html = "<html><head></head><body><img src=foo><!-- <table><tr><td></table> --><p>Hello</p></body></html>";
96         Document doc = Jsoup.parse(html);
97 
98         Element body = doc.body();
99         Comment comment = (Comment) body.childNode(1); // comment should not be sub of img, as it's an empty tag
100         assertEquals(" <table><tr><td></table> ", comment.getData());
101         Element p = body.child(1);
102         TextNode text = (TextNode) p.childNode(0);
103         assertEquals("Hello", text.getWholeText());
104     }
105 
parsesUnterminatedComments()106     @Test public void parsesUnterminatedComments() {
107         String html = "<p>Hello<!-- <tr><td>";
108         Document doc = Jsoup.parse(html);
109         Element p = doc.getElementsByTag("p").get(0);
110         assertEquals("Hello", p.text());
111         TextNode text = (TextNode) p.childNode(0);
112         assertEquals("Hello", text.getWholeText());
113         Comment comment = (Comment) p.childNode(1);
114         assertEquals(" <tr><td>", comment.getData());
115     }
116 
allDashCommentsAreNotParseErrors()117     @Test void allDashCommentsAreNotParseErrors() {
118         // https://github.com/jhy/jsoup/issues/1667
119         // <!-----> is not a parse error
120         String html = "<!------>";
121         Parser parser = Parser.htmlParser().setTrackErrors(10);
122         Document doc = Jsoup.parse(html, parser);
123         Comment comment = (Comment) doc.childNode(0);
124         assertEquals("--", comment.getData());
125         assertEquals(0, parser.getErrors().size());
126     }
127 
dropsUnterminatedTag()128     @Test public void dropsUnterminatedTag() {
129         // jsoup used to parse this to <p>, but whatwg, webkit will drop.
130         String h1 = "<p";
131         Document doc = Jsoup.parse(h1);
132         assertEquals(0, doc.getElementsByTag("p").size());
133         assertEquals("", doc.text());
134 
135         String h2 = "<div id=1<p id='2'";
136         doc = Jsoup.parse(h2);
137         assertEquals("", doc.text());
138     }
139 
dropsUnterminatedAttribute()140     @Test public void dropsUnterminatedAttribute() {
141         // jsoup used to parse this to <p id="foo">, but whatwg, webkit will drop.
142         String h1 = "<p id=\"foo";
143         Document doc = Jsoup.parse(h1);
144         assertEquals("", doc.text());
145     }
146 
parsesUnterminatedTextarea()147     @Test public void parsesUnterminatedTextarea() {
148         // don't parse right to end, but break on <p>
149         Document doc = Jsoup.parse("<body><p><textarea>one<p>two");
150         Element t = doc.select("textarea").first();
151         assertEquals("one", t.text());
152         assertEquals("two", doc.select("p").get(1).text());
153     }
154 
parsesUnterminatedOption()155     @Test public void parsesUnterminatedOption() {
156         // bit weird this -- browsers and spec get stuck in select until there's a </select>
157         Document doc = Jsoup.parse("<body><p><select><option>One<option>Two</p><p>Three</p>");
158         Elements options = doc.select("option");
159         assertEquals(2, options.size());
160         assertEquals("One", options.first().text());
161         assertEquals("TwoThree", options.last().text());
162     }
163 
testSelectWithOption()164     @Test public void testSelectWithOption() {
165         Parser parser = Parser.htmlParser();
166         parser.setTrackErrors(10);
167         Document document = parser.parseInput("<select><option>Option 1</option></select>", "http://jsoup.org");
168         assertEquals(0, parser.getErrors().size());
169     }
170 
testSpaceAfterTag()171     @Test public void testSpaceAfterTag() {
172         Document doc = Jsoup.parse("<div > <a name=\"top\"></a ><p id=1 >Hello</p></div>");
173         assertEquals("<div><a name=\"top\"></a><p id=\"1\">Hello</p></div>", TextUtil.stripNewlines(doc.body().html()));
174     }
175 
createsDocumentStructure()176     @Test public void createsDocumentStructure() {
177         String html = "<meta name=keywords /><link rel=stylesheet /><title>jsoup</title><p>Hello world</p>";
178         Document doc = Jsoup.parse(html);
179         Element head = doc.head();
180         Element body = doc.body();
181 
182         assertEquals(1, doc.children().size()); // root node: contains html node
183         assertEquals(2, doc.child(0).children().size()); // html node: head and body
184         assertEquals(3, head.children().size());
185         assertEquals(1, body.children().size());
186 
187         assertEquals("keywords", head.getElementsByTag("meta").get(0).attr("name"));
188         assertEquals(0, body.getElementsByTag("meta").size());
189         assertEquals("jsoup", doc.title());
190         assertEquals("Hello world", body.text());
191         assertEquals("Hello world", body.children().get(0).text());
192     }
193 
createsStructureFromBodySnippet()194     @Test public void createsStructureFromBodySnippet() {
195         // the bar baz stuff naturally goes into the body, but the 'foo' goes into root, and the normalisation routine
196         // needs to move into the start of the body
197         String html = "foo <b>bar</b> baz";
198         Document doc = Jsoup.parse(html);
199         assertEquals("foo bar baz", doc.text());
200     }
201 
handlesEscapedData()202     @Test public void handlesEscapedData() {
203         String html = "<div title='Surf &amp; Turf'>Reef &amp; Beef</div>";
204         Document doc = Jsoup.parse(html);
205         Element div = doc.getElementsByTag("div").get(0);
206 
207         assertEquals("Surf & Turf", div.attr("title"));
208         assertEquals("Reef & Beef", div.text());
209     }
210 
handlesDataOnlyTags()211     @Test public void handlesDataOnlyTags() {
212         String t = "<style>font-family: bold</style>";
213         List<Element> tels = Jsoup.parse(t).getElementsByTag("style");
214         assertEquals("font-family: bold", tels.get(0).data());
215         assertEquals("", tels.get(0).text());
216 
217         String s = "<p>Hello</p><script>obj.insert('<a rel=\"none\" />');\ni++;</script><p>There</p>";
218         Document doc = Jsoup.parse(s);
219         assertEquals("Hello There", doc.text());
220         assertEquals("obj.insert('<a rel=\"none\" />');\ni++;", doc.data());
221     }
222 
handlesTextAfterData()223     @Test public void handlesTextAfterData() {
224         String h = "<html><body>pre <script>inner</script> aft</body></html>";
225         Document doc = Jsoup.parse(h);
226         assertEquals("<html><head></head><body>pre <script>inner</script> aft</body></html>", TextUtil.stripNewlines(doc.html()));
227     }
228 
handlesTextArea()229     @Test public void handlesTextArea() {
230         Document doc = Jsoup.parse("<textarea>Hello</textarea>");
231         Elements els = doc.select("textarea");
232         assertEquals("Hello", els.text());
233         assertEquals("Hello", els.val());
234     }
235 
preservesSpaceInTextArea()236     @Test public void preservesSpaceInTextArea() {
237         // preserve because the tag is marked as preserve white space
238         Document doc = Jsoup.parse("<textarea>\n\tOne\n\tTwo\n\tThree\n</textarea>");
239         String expect = "One\n\tTwo\n\tThree"; // the leading and trailing spaces are dropped as a convenience to authors
240         Element el = doc.select("textarea").first();
241         assertEquals(expect, el.text());
242         assertEquals(expect, el.val());
243         assertEquals(expect, el.html());
244         assertEquals("<textarea>\n\t" + expect + "\n</textarea>", el.outerHtml()); // but preserved in round-trip html
245     }
246 
preservesSpaceInScript()247     @Test public void preservesSpaceInScript() {
248         // preserve because it's content is a data node
249         Document doc = Jsoup.parse("<script>\nOne\n\tTwo\n\tThree\n</script>");
250         String expect = "\nOne\n\tTwo\n\tThree\n";
251         Element el = doc.select("script").first();
252         assertEquals(expect, el.data());
253         assertEquals("One\n\tTwo\n\tThree", el.html());
254         assertEquals("<script>" + expect + "</script>", el.outerHtml());
255     }
256 
doesNotCreateImplicitLists()257     @Test public void doesNotCreateImplicitLists() {
258         // old jsoup used to wrap this in <ul>, but that's not to spec
259         String h = "<li>Point one<li>Point two";
260         Document doc = Jsoup.parse(h);
261         Elements ol = doc.select("ul"); // should NOT have created a default ul.
262         assertEquals(0, ol.size());
263         Elements lis = doc.select("li");
264         assertEquals(2, lis.size());
265         assertEquals("body", lis.first().parent().tagName());
266 
267         // no fiddling with non-implicit lists
268         String h2 = "<ol><li><p>Point the first<li><p>Point the second";
269         Document doc2 = Jsoup.parse(h2);
270 
271         assertEquals(0, doc2.select("ul").size());
272         assertEquals(1, doc2.select("ol").size());
273         assertEquals(2, doc2.select("ol li").size());
274         assertEquals(2, doc2.select("ol li p").size());
275         assertEquals(1, doc2.select("ol li").get(0).children().size()); // one p in first li
276     }
277 
discardsNakedTds()278     @Test public void discardsNakedTds() {
279         // jsoup used to make this into an implicit table; but browsers make it into a text run
280         String h = "<td>Hello<td><p>There<p>now";
281         Document doc = Jsoup.parse(h);
282         assertEquals("Hello<p>There</p><p>now</p>", TextUtil.stripNewlines(doc.body().html()));
283         // <tbody> is introduced if no implicitly creating table, but allows tr to be directly under table
284     }
285 
handlesNestedImplicitTable()286     @Test public void handlesNestedImplicitTable() {
287         Document doc = Jsoup.parse("<table><td>1</td></tr> <td>2</td></tr> <td> <table><td>3</td> <td>4</td></table> <tr><td>5</table>");
288         assertEquals("<table><tbody><tr><td>1</td></tr><tr><td>2</td></tr><tr><td><table><tbody><tr><td>3</td><td>4</td></tr></tbody></table></td></tr><tr><td>5</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
289     }
290 
handlesWhatWgExpensesTableExample()291     @Test public void handlesWhatWgExpensesTableExample() {
292         // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#examples-0
293         Document doc = Jsoup.parse("<table> <colgroup> <col> <colgroup> <col> <col> <col> <thead> <tr> <th> <th>2008 <th>2007 <th>2006 <tbody> <tr> <th scope=rowgroup> Research and development <td> $ 1,109 <td> $ 782 <td> $ 712 <tr> <th scope=row> Percentage of net sales <td> 3.4% <td> 3.3% <td> 3.7% <tbody> <tr> <th scope=rowgroup> Selling, general, and administrative <td> $ 3,761 <td> $ 2,963 <td> $ 2,433 <tr> <th scope=row> Percentage of net sales <td> 11.6% <td> 12.3% <td> 12.6% </table>");
294         assertEquals("<table><colgroup><col></colgroup><colgroup><col><col><col></colgroup><thead><tr><th></th><th>2008</th><th>2007</th><th>2006</th></tr></thead><tbody><tr><th scope=\"rowgroup\">Research and development</th><td>$ 1,109</td><td>$ 782</td><td>$ 712</td></tr><tr><th scope=\"row\">Percentage of net sales</th><td>3.4%</td><td>3.3%</td><td>3.7%</td></tr></tbody><tbody><tr><th scope=\"rowgroup\">Selling, general, and administrative</th><td>$ 3,761</td><td>$ 2,963</td><td>$ 2,433</td></tr><tr><th scope=\"row\">Percentage of net sales</th><td>11.6%</td><td>12.3%</td><td>12.6%</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
295     }
296 
handlesTbodyTable()297     @Test public void handlesTbodyTable() {
298         Document doc = Jsoup.parse("<html><head></head><body><table><tbody><tr><td>aaa</td><td>bbb</td></tr></tbody></table></body></html>");
299         assertEquals("<table><tbody><tr><td>aaa</td><td>bbb</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
300     }
301 
handlesImplicitCaptionClose()302     @Test public void handlesImplicitCaptionClose() {
303         Document doc = Jsoup.parse("<table><caption>A caption<td>One<td>Two");
304         assertEquals("<table><caption>A caption</caption><tbody><tr><td>One</td><td>Two</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
305     }
306 
noTableDirectInTable()307     @Test public void noTableDirectInTable() {
308         Document doc = Jsoup.parse("<table> <td>One <td><table><td>Two</table> <table><td>Three");
309         assertEquals("<table><tbody><tr><td>One</td><td><table><tbody><tr><td>Two</td></tr></tbody></table><table><tbody><tr><td>Three</td></tr></tbody></table></td></tr></tbody></table>",
310             TextUtil.stripNewlines(doc.body().html()));
311     }
312 
ignoresDupeEndTrTag()313     @Test public void ignoresDupeEndTrTag() {
314         Document doc = Jsoup.parse("<table><tr><td>One</td><td><table><tr><td>Two</td></tr></tr></table></td><td>Three</td></tr></table>"); // two </tr></tr>, must ignore or will close table
315         assertEquals("<table><tbody><tr><td>One</td><td><table><tbody><tr><td>Two</td></tr></tbody></table></td><td>Three</td></tr></tbody></table>",
316             TextUtil.stripNewlines(doc.body().html()));
317     }
318 
handlesBaseTags()319     @Test public void handlesBaseTags() {
320         // only listen to the first base href
321         String h = "<a href=1>#</a><base href='/2/'><a href='3'>#</a><base href='http://bar'><a href=/4>#</a>";
322         Document doc = Jsoup.parse(h, "http://foo/");
323         assertEquals("http://foo/2/", doc.baseUri()); // gets set once, so doc and descendants have first only
324 
325         Elements anchors = doc.getElementsByTag("a");
326         assertEquals(3, anchors.size());
327 
328         assertEquals("http://foo/2/", anchors.get(0).baseUri());
329         assertEquals("http://foo/2/", anchors.get(1).baseUri());
330         assertEquals("http://foo/2/", anchors.get(2).baseUri());
331 
332         assertEquals("http://foo/2/1", anchors.get(0).absUrl("href"));
333         assertEquals("http://foo/2/3", anchors.get(1).absUrl("href"));
334         assertEquals("http://foo/4", anchors.get(2).absUrl("href"));
335     }
336 
handlesProtocolRelativeUrl()337     @Test public void handlesProtocolRelativeUrl() {
338         String base = "https://example.com/";
339         String html = "<img src='//example.net/img.jpg'>";
340         Document doc = Jsoup.parse(html, base);
341         Element el = doc.select("img").first();
342         assertEquals("https://example.net/img.jpg", el.absUrl("src"));
343     }
344 
handlesCdata()345     @Test public void handlesCdata() {
346         // todo: as this is html namespace, should actually treat as bogus comment, not cdata. keep as cdata for now
347         String h = "<div id=1><![CDATA[<html>\n <foo><&amp;]]></div>"; // the &amp; in there should remain literal
348         Document doc = Jsoup.parse(h);
349         Element div = doc.getElementById("1");
350         assertEquals("<html>\n <foo><&amp;", div.text());
351         assertEquals(0, div.children().size());
352         assertEquals(1, div.childNodeSize()); // no elements, one text node
353     }
354 
roundTripsCdata()355     @Test public void roundTripsCdata() {
356         String h = "<div id=1><![CDATA[\n<html>\n <foo><&amp;]]></div>";
357         Document doc = Jsoup.parse(h);
358         Element div = doc.getElementById("1");
359         assertEquals("<html>\n <foo><&amp;", div.text());
360         assertEquals(0, div.children().size());
361         assertEquals(1, div.childNodeSize()); // no elements, one text node
362 
363         assertEquals("<div id=\"1\"><![CDATA[\n<html>\n <foo><&amp;]]>\n</div>", div.outerHtml());
364 
365         CDataNode cdata = (CDataNode) div.textNodes().get(0);
366         assertEquals("\n<html>\n <foo><&amp;", cdata.text());
367     }
368 
handlesCdataAcrossBuffer()369     @Test public void handlesCdataAcrossBuffer() {
370         StringBuilder sb = new StringBuilder();
371         while (sb.length() <= CharacterReader.maxBufferLen) {
372             sb.append("A suitable amount of CData.\n");
373         }
374         String cdata = sb.toString();
375         String h = "<div><![CDATA[" + cdata + "]]></div>";
376         Document doc = Jsoup.parse(h);
377         Element div = doc.selectFirst("div");
378 
379         CDataNode node = (CDataNode) div.textNodes().get(0);
380         assertEquals(cdata, node.text());
381     }
382 
handlesCdataInScript()383     @Test public void handlesCdataInScript() {
384         String html = "<script type=\"text/javascript\">//<![CDATA[\n\n  foo();\n//]]></script>";
385         Document doc = Jsoup.parse(html);
386 
387         String data = "//<![CDATA[\n\n  foo();\n//]]>";
388         Element script = doc.selectFirst("script");
389         assertEquals("", script.text()); // won't be parsed as cdata because in script data section
390         assertEquals(data, script.data());
391         assertEquals(html, script.outerHtml());
392 
393         DataNode dataNode = (DataNode) script.childNode(0);
394         assertEquals(data, dataNode.getWholeData());
395         // see - not a cdata node, because in script. contrast with XmlTreeBuilder - will be cdata.
396     }
397 
handlesUnclosedCdataAtEOF()398     @Test public void handlesUnclosedCdataAtEOF() {
399         // https://github.com/jhy/jsoup/issues/349 would crash, as character reader would try to seek past EOF
400         String h = "<![CDATA[]]";
401         Document doc = Jsoup.parse(h);
402         assertEquals(1, doc.body().childNodeSize());
403     }
404 
handleCDataInText()405     @Test public void handleCDataInText() {
406         String h = "<p>One <![CDATA[Two <&]]> Three</p>";
407         Document doc = Jsoup.parse(h);
408         Element p = doc.selectFirst("p");
409 
410         List<Node> nodes = p.childNodes();
411         assertEquals("One ", ((TextNode) nodes.get(0)).getWholeText());
412         assertEquals("Two <&", ((TextNode) nodes.get(1)).getWholeText());
413         assertEquals("Two <&", ((CDataNode) nodes.get(1)).getWholeText());
414         assertEquals(" Three", ((TextNode) nodes.get(2)).getWholeText());
415 
416         assertEquals(h, p.outerHtml());
417     }
418 
cdataNodesAreTextNodes()419     @Test public void cdataNodesAreTextNodes() {
420         String h = "<p>One <![CDATA[ Two <& ]]> Three</p>";
421         Document doc = Jsoup.parse(h);
422         Element p = doc.selectFirst("p");
423 
424         List<TextNode> nodes = p.textNodes();
425         assertEquals("One ", nodes.get(0).text());
426         assertEquals(" Two <& ", nodes.get(1).text());
427         assertEquals(" Three", nodes.get(2).text());
428     }
429 
handlesInvalidStartTags()430     @Test public void handlesInvalidStartTags() {
431         String h = "<div>Hello < There <&amp;></div>"; // parse to <div {#text=Hello < There <&>}>
432         Document doc = Jsoup.parse(h);
433         assertEquals("Hello < There <&>", doc.select("div").first().text());
434     }
435 
handlesUnknownTags()436     @Test public void handlesUnknownTags() {
437         String h = "<div><foo title=bar>Hello<foo title=qux>there</foo></div>";
438         Document doc = Jsoup.parse(h);
439         Elements foos = doc.select("foo");
440         assertEquals(2, foos.size());
441         assertEquals("bar", foos.first().attr("title"));
442         assertEquals("qux", foos.last().attr("title"));
443         assertEquals("there", foos.last().text());
444     }
445 
handlesUnknownInlineTags()446     @Test public void handlesUnknownInlineTags() {
447         String h = "<p><cust>Test</cust></p><p><cust><cust>Test</cust></cust></p>";
448         Document doc = Jsoup.parseBodyFragment(h);
449         String out = doc.body().html();
450         assertEquals(h, TextUtil.stripNewlines(out));
451     }
452 
parsesBodyFragment()453     @Test public void parsesBodyFragment() {
454         String h = "<!-- comment --><p><a href='foo'>One</a></p>";
455         Document doc = Jsoup.parseBodyFragment(h, "http://example.com");
456         assertEquals("<body><!-- comment --><p><a href=\"foo\">One</a></p></body>", TextUtil.stripNewlines(doc.body().outerHtml()));
457         assertEquals("http://example.com/foo", doc.select("a").first().absUrl("href"));
458     }
459 
parseBodyIsIndexNoAttributes()460     @Test public void parseBodyIsIndexNoAttributes() {
461         // https://github.com/jhy/jsoup/issues/1404
462         String expectedHtml = "<form>\n" +
463             " <hr><label>This is a searchable index. Enter search keywords: <input name=\"isindex\"></label>\n" +
464             " <hr>\n" +
465             "</form>";
466         Document doc = Jsoup.parse("<isindex>");
467         assertEquals(expectedHtml, doc.body().html());
468 
469         doc = Jsoup.parseBodyFragment("<isindex>");
470         assertEquals(expectedHtml, doc.body().html());
471 
472         doc = Jsoup.parseBodyFragment("<table><input></table>");
473         assertEquals("<input>\n<table></table>", doc.body().html());
474     }
475 
handlesUnknownNamespaceTags()476     @Test public void handlesUnknownNamespaceTags() {
477         // note that the first foo:bar should not really be allowed to be self closing, if parsed in html mode.
478         String h = "<foo:bar id='1' /><abc:def id=2>Foo<p>Hello</p></abc:def><foo:bar>There</foo:bar>";
479         Document doc = Jsoup.parse(h);
480         assertEquals("<foo:bar id=\"1\" /><abc:def id=\"2\">Foo<p>Hello</p></abc:def><foo:bar>There</foo:bar>", TextUtil.stripNewlines(doc.body().html()));
481     }
482 
handlesKnownEmptyBlocks()483     @Test public void handlesKnownEmptyBlocks() {
484         // if a known tag, allow self closing outside of spec, but force an end tag. unknown tags can be self closing.
485         String h = "<div id='1' /><script src='/foo' /><div id=2><img /><img></div><a id=3 /><i /><foo /><foo>One</foo> <hr /> hr text <hr> hr text two";
486         Document doc = Jsoup.parse(h);
487         assertEquals("<div id=\"1\"></div><script src=\"/foo\"></script><div id=\"2\"><img><img></div><a id=\"3\"></a><i></i><foo /><foo>One</foo><hr> hr text <hr> hr text two", TextUtil.stripNewlines(doc.body().html()));
488     }
489 
handlesKnownEmptyNoFrames()490     @Test public void handlesKnownEmptyNoFrames() {
491         String h = "<html><head><noframes /><meta name=foo></head><body>One</body></html>";
492         Document doc = Jsoup.parse(h);
493         assertEquals("<html><head><noframes></noframes><meta name=\"foo\"></head><body>One</body></html>", TextUtil.stripNewlines(doc.html()));
494     }
495 
handlesKnownEmptyStyle()496     @Test public void handlesKnownEmptyStyle() {
497         String h = "<html><head><style /><meta name=foo></head><body>One</body></html>";
498         Document doc = Jsoup.parse(h);
499         assertEquals("<html><head><style></style><meta name=\"foo\"></head><body>One</body></html>", TextUtil.stripNewlines(doc.html()));
500     }
501 
handlesKnownEmptyTitle()502     @Test public void handlesKnownEmptyTitle() {
503         String h = "<html><head><title /><meta name=foo></head><body>One</body></html>";
504         Document doc = Jsoup.parse(h);
505         assertEquals("<html><head><title></title><meta name=\"foo\"></head><body>One</body></html>", TextUtil.stripNewlines(doc.html()));
506     }
507 
handlesKnownEmptyIframe()508     @Test public void handlesKnownEmptyIframe() {
509         String h = "<p>One</p><iframe id=1 /><p>Two";
510         Document doc = Jsoup.parse(h);
511         assertEquals("<html><head></head><body><p>One</p><iframe id=\"1\"></iframe><p>Two</p></body></html>", TextUtil.stripNewlines(doc.html()));
512     }
513 
handlesSolidusAtAttributeEnd()514     @Test public void handlesSolidusAtAttributeEnd() {
515         // this test makes sure [<a href=/>link</a>] is parsed as [<a href="/">link</a>], not [<a href="" /><a>link</a>]
516         String h = "<a href=/>link</a>";
517         Document doc = Jsoup.parse(h);
518         assertEquals("<a href=\"/\">link</a>", doc.body().html());
519     }
520 
handlesMultiClosingBody()521     @Test public void handlesMultiClosingBody() {
522         String h = "<body><p>Hello</body><p>there</p></body></body></html><p>now";
523         Document doc = Jsoup.parse(h);
524         assertEquals(3, doc.select("p").size());
525         assertEquals(3, doc.body().children().size());
526     }
527 
handlesUnclosedDefinitionLists()528     @Test public void handlesUnclosedDefinitionLists() {
529         // jsoup used to create a <dl>, but that's not to spec
530         String h = "<dt>Foo<dd>Bar<dt>Qux<dd>Zug";
531         Document doc = Jsoup.parse(h);
532         assertEquals(0, doc.select("dl").size()); // no auto dl
533         assertEquals(4, doc.select("dt, dd").size());
534         Elements dts = doc.select("dt");
535         assertEquals(2, dts.size());
536         assertEquals("Zug", dts.get(1).nextElementSibling().text());
537     }
538 
handlesBlocksInDefinitions()539     @Test public void handlesBlocksInDefinitions() {
540         // per the spec, dt and dd are inline, but in practise are block
541         String h = "<dl><dt><div id=1>Term</div></dt><dd><div id=2>Def</div></dd></dl>";
542         Document doc = Jsoup.parse(h);
543         assertEquals("dt", doc.select("#1").first().parent().tagName());
544         assertEquals("dd", doc.select("#2").first().parent().tagName());
545         assertEquals("<dl><dt><div id=\"1\">Term</div></dt><dd><div id=\"2\">Def</div></dd></dl>", TextUtil.stripNewlines(doc.body().html()));
546     }
547 
handlesFrames()548     @Test public void handlesFrames() {
549         String h = "<html><head><script></script><noscript></noscript></head><frameset><frame src=foo></frame><frame src=foo></frameset></html>";
550         Document doc = Jsoup.parse(h);
551         assertEquals("<html><head><script></script><noscript></noscript></head><frameset><frame src=\"foo\"><frame src=\"foo\"></frameset></html>",
552             TextUtil.stripNewlines(doc.html()));
553         // no body auto vivification
554     }
555 
ignoresContentAfterFrameset()556     @Test public void ignoresContentAfterFrameset() {
557         String h = "<html><head><title>One</title></head><frameset><frame /><frame /></frameset><table></table></html>";
558         Document doc = Jsoup.parse(h);
559         assertEquals("<html><head><title>One</title></head><frameset><frame><frame></frameset></html>", TextUtil.stripNewlines(doc.html()));
560         // no body, no table. No crash!
561     }
562 
handlesJavadocFont()563     @Test public void handlesJavadocFont() {
564         String h = "<TD BGCOLOR=\"#EEEEFF\" CLASS=\"NavBarCell1\">    <A HREF=\"deprecated-list.html\"><FONT CLASS=\"NavBarFont1\"><B>Deprecated</B></FONT></A>&nbsp;</TD>";
565         Document doc = Jsoup.parse(h);
566         Element a = doc.select("a").first();
567         assertEquals("Deprecated", a.text());
568         assertEquals("font", a.child(0).tagName());
569         assertEquals("b", a.child(0).child(0).tagName());
570     }
571 
handlesBaseWithoutHref()572     @Test public void handlesBaseWithoutHref() {
573         String h = "<head><base target='_blank'></head><body><a href=/foo>Test</a></body>";
574         Document doc = Jsoup.parse(h, "http://example.com/");
575         Element a = doc.select("a").first();
576         assertEquals("/foo", a.attr("href"));
577         assertEquals("http://example.com/foo", a.attr("abs:href"));
578     }
579 
normalisesDocument()580     @Test public void normalisesDocument() {
581         String h = "<!doctype html>One<html>Two<head>Three<link></head>Four<body>Five </body>Six </html>Seven ";
582         Document doc = Jsoup.parse(h);
583         assertEquals("<!doctype html><html><head></head><body>OneTwoThree<link>FourFive Six Seven</body></html>",
584             TextUtil.stripNewlines(doc.html()));
585     }
586 
normalisesEmptyDocument()587     @Test public void normalisesEmptyDocument() {
588         Document doc = Jsoup.parse("");
589         assertEquals("<html><head></head><body></body></html>", TextUtil.stripNewlines(doc.html()));
590     }
591 
normalisesHeadlessBody()592     @Test public void normalisesHeadlessBody() {
593         Document doc = Jsoup.parse("<html><body><span class=\"foo\">bar</span>");
594         assertEquals("<html><head></head><body><span class=\"foo\">bar</span></body></html>",
595             TextUtil.stripNewlines(doc.html()));
596     }
597 
normalisedBodyAfterContent()598     @Test public void normalisedBodyAfterContent() {
599         Document doc = Jsoup.parse("<font face=Arial><body class=name><div>One</div></body></font>");
600         assertEquals("<html><head></head><body class=\"name\"><font face=\"Arial\"><div>One</div></font></body></html>",
601             TextUtil.stripNewlines(doc.html()));
602     }
603 
findsCharsetInMalformedMeta()604     @Test public void findsCharsetInMalformedMeta() {
605         String h = "<meta http-equiv=Content-Type content=text/html; charset=gb2312>";
606         // example cited for reason of html5's <meta charset> element
607         Document doc = Jsoup.parse(h);
608         assertEquals("gb2312", doc.select("meta").attr("charset"));
609     }
610 
testHgroup()611     @Test public void testHgroup() {
612         // jsoup used to not allow hgroup in h{n}, but that's not in spec, and browsers are OK
613         Document doc = Jsoup.parse("<h1>Hello <h2>There <hgroup><h1>Another<h2>headline</hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup>");
614         assertEquals("<h1>Hello</h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup><hgroup><h1>More</h1><p>stuff</p></hgroup></h2>", TextUtil.stripNewlines(doc.body().html()));
615     }
616 
testRelaxedTags()617     @Test public void testRelaxedTags() {
618         Document doc = Jsoup.parse("<abc_def id=1>Hello</abc_def> <abc-def>There</abc-def>");
619         assertEquals("<abc_def id=\"1\">Hello</abc_def> <abc-def>There</abc-def>", TextUtil.stripNewlines(doc.body().html()));
620     }
621 
testHeaderContents()622     @Test public void testHeaderContents() {
623         // h* tags (h1 .. h9) in browsers can handle any internal content other than other h*. which is not per any
624         // spec, which defines them as containing phrasing content only. so, reality over theory.
625         Document doc = Jsoup.parse("<h1>Hello <div>There</div> now</h1> <h2>More <h3>Content</h3></h2>");
626         assertEquals("<h1>Hello <div>There</div> now</h1><h2>More</h2><h3>Content</h3>", TextUtil.stripNewlines(doc.body().html()));
627     }
628 
testSpanContents()629     @Test public void testSpanContents() {
630         // like h1 tags, the spec says SPAN is phrasing only, but browsers and publisher treat span as a block tag
631         Document doc = Jsoup.parse("<span>Hello <div>there</div> <span>now</span></span>");
632         assertEquals("<span>Hello <div>there</div><span>now</span></span>", TextUtil.stripNewlines(doc.body().html()));
633     }
634 
testNoImagesInNoScriptInHead()635     @Test public void testNoImagesInNoScriptInHead() {
636         // jsoup used to allow, but against spec if parsing with noscript
637         Document doc = Jsoup.parse("<html><head><noscript><img src='foo'></noscript></head><body><p>Hello</p></body></html>");
638         assertEquals("<html><head><noscript>&lt;img src=\"foo\"&gt;</noscript></head><body><p>Hello</p></body></html>", TextUtil.stripNewlines(doc.html()));
639     }
640 
testUnclosedNoscriptInHead()641     @Test public void testUnclosedNoscriptInHead() {
642         // Was getting "EOF" in html output, because the #anythingElse handler was calling an undefined toString, so used object.toString.
643         String[] strings = {"<noscript>", "<noscript>One"};
644         for (String html : strings) {
645             Document doc = Jsoup.parse(html);
646             assertEquals(html + "</noscript>", TextUtil.stripNewlines(doc.head().html()));
647         }
648     }
649 
testAFlowContents()650     @Test public void testAFlowContents() {
651         // html5 has <a> as either phrasing or block
652         Document doc = Jsoup.parse("<a>Hello <div>there</div> <span>now</span></a>");
653         assertEquals("<a>Hello <div>there</div><span>now</span></a>", TextUtil.stripNewlines(doc.body().html()));
654     }
655 
testFontFlowContents()656     @Test public void testFontFlowContents() {
657         // html5 has no definition of <font>; often used as flow
658         Document doc = Jsoup.parse("<font>Hello <div>there</div> <span>now</span></font>");
659         assertEquals("<font>Hello <div>there</div><span>now</span></font>", TextUtil.stripNewlines(doc.body().html()));
660     }
661 
handlesMisnestedTagsBI()662     @Test public void handlesMisnestedTagsBI() {
663         // whatwg: <b><i></b></i>
664         String h = "<p>1<b>2<i>3</b>4</i>5</p>";
665         Document doc = Jsoup.parse(h);
666         assertEquals("<p>1<b>2<i>3</i></b><i>4</i>5</p>", doc.body().html());
667         // adoption agency on </b>, reconstruction of formatters on 4.
668     }
669 
handlesMisnestedTagsBP()670     @Test public void handlesMisnestedTagsBP() {
671         //  whatwg: <b><p></b></p>
672         String h = "<b>1<p>2</b>3</p>";
673         Document doc = Jsoup.parse(h);
674         assertEquals("<b>1</b>\n<p><b>2</b>3</p>", doc.body().html());
675     }
676 
handlesMisnestedAInDivs()677     @Test public void handlesMisnestedAInDivs() {
678         String h = "<a href='#1'><div><div><a href='#2'>child</a></div</div></a>";
679         String w = "<a href=\"#1\"></a> <div> <a href=\"#1\"></a> <div> <a href=\"#1\"></a><a href=\"#2\">child</a> </div> </div>";
680         Document doc = Jsoup.parse(h);
681         assertEquals(
682             StringUtil.normaliseWhitespace(w),
683             StringUtil.normaliseWhitespace(doc.body().html()));
684     }
685 
handlesUnexpectedMarkupInTables()686     @Test public void handlesUnexpectedMarkupInTables() {
687         // whatwg - tests markers in active formatting (if they didn't work, would get in table)
688         // also tests foster parenting
689         String h = "<table><b><tr><td>aaa</td></tr>bbb</table>ccc";
690         Document doc = Jsoup.parse(h);
691         assertEquals("<b></b><b>bbb</b><table><tbody><tr><td>aaa</td></tr></tbody></table><b>ccc</b>", TextUtil.stripNewlines(doc.body().html()));
692     }
693 
handlesUnclosedFormattingElements()694     @Test public void handlesUnclosedFormattingElements() {
695         // whatwg: formatting elements get collected and applied, but excess elements are thrown away
696         String h = "<!DOCTYPE html>\n" +
697             "<p><b class=x><b class=x><b><b class=x><b class=x><b>X\n" +
698             "<p>X\n" +
699             "<p><b><b class=x><b>X\n" +
700             "<p></b></b></b></b></b></b>X";
701         Document doc = Jsoup.parse(h);
702         doc.outputSettings().indentAmount(0);
703         String want = "<!doctype html>\n" +
704             "<html>\n" +
705             "<head></head>\n" +
706             "<body>\n" +
707             "<p><b class=\"x\"><b class=\"x\"><b><b class=\"x\"><b class=\"x\"><b>X </b></b></b></b></b></b></p>\n" +
708             "<p><b class=\"x\"><b><b class=\"x\"><b class=\"x\"><b>X </b></b></b></b></b></p>\n" +
709             "<p><b class=\"x\"><b><b class=\"x\"><b class=\"x\"><b><b><b class=\"x\"><b>X </b></b></b></b></b></b></b></b></p>\n" +
710             "<p>X</p>\n" +
711             "</body>\n" +
712             "</html>";
713         assertEquals(want, doc.html());
714     }
715 
handlesUnclosedAnchors()716     @Test public void handlesUnclosedAnchors() {
717         String h = "<a href='http://example.com/'>Link<p>Error link</a>";
718         Document doc = Jsoup.parse(h);
719         String want = "<a href=\"http://example.com/\">Link</a>\n<p><a href=\"http://example.com/\">Error link</a></p>";
720         assertEquals(want, doc.body().html());
721     }
722 
reconstructFormattingElements()723     @Test public void reconstructFormattingElements() {
724         // tests attributes and multi b
725         String h = "<p><b class=one>One <i>Two <b>Three</p><p>Hello</p>";
726         Document doc = Jsoup.parse(h);
727         assertEquals("<p><b class=\"one\">One <i>Two <b>Three</b></i></b></p>\n<p><b class=\"one\"><i><b>Hello</b></i></b></p>", doc.body().html());
728     }
729 
reconstructFormattingElementsInTable()730     @Test public void reconstructFormattingElementsInTable() {
731         // tests that tables get formatting markers -- the <b> applies outside the table and does not leak in,
732         // and the <i> inside the table and does not leak out.
733         String h = "<p><b>One</p> <table><tr><td><p><i>Three<p>Four</i></td></tr></table> <p>Five</p>";
734         Document doc = Jsoup.parse(h);
735         String want = "<p><b>One</b></p><b><table><tbody><tr><td><p><i>Three</i></p><p><i>Four</i></p></td></tr></tbody></table><p>Five</p></b>";
736         assertEquals(want, TextUtil.stripNewlines(doc.body().html()));
737     }
738 
commentBeforeHtml()739     @Test public void commentBeforeHtml() {
740         String h = "<!-- comment --><!-- comment 2 --><p>One</p>";
741         Document doc = Jsoup.parse(h);
742         assertEquals("<!-- comment --><!-- comment 2 --><html><head></head><body><p>One</p></body></html>", TextUtil.stripNewlines(doc.html()));
743     }
744 
emptyTdTag()745     @Test public void emptyTdTag() {
746         String h = "<table><tr><td>One</td><td id='2' /></tr></table>";
747         Document doc = Jsoup.parse(h);
748         assertEquals("<td>One</td>\n<td id=\"2\"></td>", doc.select("tr").first().html());
749     }
750 
handlesSolidusInA()751     @Test public void handlesSolidusInA() {
752         // test for bug #66
753         String h = "<a class=lp href=/lib/14160711/>link text</a>";
754         Document doc = Jsoup.parse(h);
755         Element a = doc.select("a").first();
756         assertEquals("link text", a.text());
757         assertEquals("/lib/14160711/", a.attr("href"));
758     }
759 
handlesSpanInTbody()760     @Test public void handlesSpanInTbody() {
761         // test for bug 64
762         String h = "<table><tbody><span class='1'><tr><td>One</td></tr><tr><td>Two</td></tr></span></tbody></table>";
763         Document doc = Jsoup.parse(h);
764         assertEquals(doc.select("span").first().children().size(), 0); // the span gets closed
765         assertEquals(doc.select("table").size(), 1); // only one table
766     }
767 
handlesUnclosedTitleAtEof()768     @Test public void handlesUnclosedTitleAtEof() {
769         assertEquals("Data", Jsoup.parse("<title>Data").title());
770         assertEquals("Data<", Jsoup.parse("<title>Data<").title());
771         assertEquals("Data</", Jsoup.parse("<title>Data</").title());
772         assertEquals("Data</t", Jsoup.parse("<title>Data</t").title());
773         assertEquals("Data</ti", Jsoup.parse("<title>Data</ti").title());
774         assertEquals("Data", Jsoup.parse("<title>Data</title>").title());
775         assertEquals("Data", Jsoup.parse("<title>Data</title >").title());
776     }
777 
handlesUnclosedTitle()778     @Test public void handlesUnclosedTitle() {
779         Document one = Jsoup.parse("<title>One <b>Two <b>Three</TITLE><p>Test</p>"); // has title, so <b> is plain text
780         assertEquals("One <b>Two <b>Three", one.title());
781         assertEquals("Test", one.select("p").first().text());
782 
783         Document two = Jsoup.parse("<title>One<b>Two <p>Test</p>"); // no title, so <b> causes </title> breakout
784         assertEquals("One", two.title());
785         assertEquals("<b>Two \n <p>Test</p></b>", two.body().html());
786     }
787 
handlesUnclosedScriptAtEof()788     @Test public void handlesUnclosedScriptAtEof() {
789         assertEquals("Data", Jsoup.parse("<script>Data").select("script").first().data());
790         assertEquals("Data<", Jsoup.parse("<script>Data<").select("script").first().data());
791         assertEquals("Data</sc", Jsoup.parse("<script>Data</sc").select("script").first().data());
792         assertEquals("Data</-sc", Jsoup.parse("<script>Data</-sc").select("script").first().data());
793         assertEquals("Data</sc-", Jsoup.parse("<script>Data</sc-").select("script").first().data());
794         assertEquals("Data</sc--", Jsoup.parse("<script>Data</sc--").select("script").first().data());
795         assertEquals("Data", Jsoup.parse("<script>Data</script>").select("script").first().data());
796         assertEquals("Data</script", Jsoup.parse("<script>Data</script").select("script").first().data());
797         assertEquals("Data", Jsoup.parse("<script>Data</script ").select("script").first().data());
798         assertEquals("Data", Jsoup.parse("<script>Data</script n").select("script").first().data());
799         assertEquals("Data", Jsoup.parse("<script>Data</script n=").select("script").first().data());
800         assertEquals("Data", Jsoup.parse("<script>Data</script n=\"").select("script").first().data());
801         assertEquals("Data", Jsoup.parse("<script>Data</script n=\"p").select("script").first().data());
802     }
803 
handlesUnclosedRawtextAtEof()804     @Test public void handlesUnclosedRawtextAtEof() {
805         assertEquals("Data", Jsoup.parse("<style>Data").select("style").first().data());
806         assertEquals("Data</st", Jsoup.parse("<style>Data</st").select("style").first().data());
807         assertEquals("Data", Jsoup.parse("<style>Data</style>").select("style").first().data());
808         assertEquals("Data</style", Jsoup.parse("<style>Data</style").select("style").first().data());
809         assertEquals("Data</-style", Jsoup.parse("<style>Data</-style").select("style").first().data());
810         assertEquals("Data</style-", Jsoup.parse("<style>Data</style-").select("style").first().data());
811         assertEquals("Data</style--", Jsoup.parse("<style>Data</style--").select("style").first().data());
812     }
813 
noImplicitFormForTextAreas()814     @Test public void noImplicitFormForTextAreas() {
815         // old jsoup parser would create implicit forms for form children like <textarea>, but no more
816         Document doc = Jsoup.parse("<textarea>One</textarea>");
817         assertEquals("<textarea>One</textarea>", doc.body().html());
818     }
819 
handlesEscapedScript()820     @Test public void handlesEscapedScript() {
821         Document doc = Jsoup.parse("<script><!-- one <script>Blah</script> --></script>");
822         assertEquals("<!-- one <script>Blah</script> -->", doc.select("script").first().data());
823     }
824 
handles0CharacterAsText()825     @Test public void handles0CharacterAsText() {
826         Document doc = Jsoup.parse("0<p>0</p>");
827         assertEquals("0\n<p>0</p>", doc.body().html());
828     }
829 
handlesNullInData()830     @Test public void handlesNullInData() {
831         Document doc = Jsoup.parse("<p id=\u0000>Blah \u0000</p>");
832         assertEquals("<p id=\"\uFFFD\">Blah &#x0;</p>", doc.body().html()); // replaced in attr, NOT replaced in data (but is escaped as control char <0x20)
833     }
834 
handlesNullInComments()835     @Test public void handlesNullInComments() {
836         Document doc = Jsoup.parse("<body><!-- \u0000 \u0000 -->");
837         assertEquals("<!-- \uFFFD \uFFFD -->", doc.body().html());
838     }
839 
handlesNewlinesAndWhitespaceInTag()840     @Test public void handlesNewlinesAndWhitespaceInTag() {
841         Document doc = Jsoup.parse("<a \n href=\"one\" \r\n id=\"two\" \f >");
842         assertEquals("<a href=\"one\" id=\"two\"></a>", doc.body().html());
843     }
844 
handlesWhitespaceInoDocType()845     @Test public void handlesWhitespaceInoDocType() {
846         String html = "<!DOCTYPE html\r\n" +
847             "      PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\r\n" +
848             "      \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">";
849         Document doc = Jsoup.parse(html);
850         assertEquals("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">", doc.childNode(0).outerHtml());
851     }
852 
tracksErrorsWhenRequested()853     @Test public void tracksErrorsWhenRequested() {
854         String html = "<p>One</p href='no'>\n<!DOCTYPE html>\n&arrgh;<font />&#33 &amp &#x110000;<br /></div><foo";
855         Parser parser = Parser.htmlParser().setTrackErrors(500);
856         Document doc = Jsoup.parse(html, "http://example.com", parser);
857 
858         List<ParseError> errors = parser.getErrors();
859         assertEquals(9, errors.size());
860         assertEquals("<1:21>: Attributes incorrectly present on end tag [/p]", errors.get(0).toString());
861         assertEquals("<2:16>: Unexpected Doctype token [<!doctype html>] when in state [InBody]", errors.get(1).toString());
862         assertEquals("<3:2>: Invalid character reference: invalid named reference [arrgh]", errors.get(2).toString());
863         assertEquals("<3:16>: Tag [font] cannot be self closing; not a void tag", errors.get(3).toString());
864         assertEquals("<3:20>: Invalid character reference: missing semicolon on [&#33]", errors.get(4).toString());
865         assertEquals("<3:25>: Invalid character reference: missing semicolon on [&amp]", errors.get(5).toString());
866         assertEquals("<3:36>: Invalid character reference: character [1114112] outside of valid range", errors.get(6).toString());
867         assertEquals("<3:48>: Unexpected EndTag token [</div>] when in state [InBody]", errors.get(7).toString());
868         assertEquals("<3:53>: Unexpectedly reached end of file (EOF) in input state [TagName]", errors.get(8).toString());
869     }
870 
tracksLimitedErrorsWhenRequested()871     @Test public void tracksLimitedErrorsWhenRequested() {
872         String html = "<p>One</p href='no'>\n<!DOCTYPE html>\n&arrgh;<font /><br /><foo";
873         Parser parser = Parser.htmlParser().setTrackErrors(3);
874         Document doc = parser.parseInput(html, "http://example.com");
875 
876         List<ParseError> errors = parser.getErrors();
877         assertEquals(3, errors.size());
878         assertEquals("<1:21>: Attributes incorrectly present on end tag [/p]", errors.get(0).toString());
879         assertEquals("<2:16>: Unexpected Doctype token [<!doctype html>] when in state [InBody]", errors.get(1).toString());
880         assertEquals("<3:2>: Invalid character reference: invalid named reference [arrgh]", errors.get(2).toString());
881     }
882 
noErrorsByDefault()883     @Test public void noErrorsByDefault() {
884         String html = "<p>One</p href='no'>&arrgh;<font /><br /><foo";
885         Parser parser = Parser.htmlParser();
886         Document doc = Jsoup.parse(html, "http://example.com", parser);
887 
888         List<ParseError> errors = parser.getErrors();
889         assertEquals(0, errors.size());
890     }
891 
optionalPClosersAreNotErrors()892     @Test public void optionalPClosersAreNotErrors() {
893         String html = "<body><div><p>One<p>Two</div></body>";
894         Parser parser = Parser.htmlParser().setTrackErrors(128);
895         Document doc = Jsoup.parse(html, "", parser);
896         ParseErrorList errors = parser.getErrors();
897         assertEquals(0, errors.size());
898     }
899 
handlesCommentsInTable()900     @Test public void handlesCommentsInTable() {
901         String html = "<table><tr><td>text</td><!-- Comment --></tr></table>";
902         Document node = Jsoup.parseBodyFragment(html);
903         assertEquals("<html><head></head><body><table><tbody><tr><td>text</td><!-- Comment --></tr></tbody></table></body></html>", TextUtil.stripNewlines(node.outerHtml()));
904     }
905 
handlesQuotesInCommentsInScripts()906     @Test public void handlesQuotesInCommentsInScripts() {
907         String html = "<script>\n" +
908             "  <!--\n" +
909             "    document.write('</scr' + 'ipt>');\n" +
910             "  // -->\n" +
911             "</script>";
912         Document node = Jsoup.parseBodyFragment(html);
913         assertEquals("<script>\n" +
914             "  <!--\n" +
915             "    document.write('</scr' + 'ipt>');\n" +
916             "  // -->\n" +
917             "</script>", node.body().html());
918     }
919 
handleNullContextInParseFragment()920     @Test public void handleNullContextInParseFragment() {
921         String html = "<ol><li>One</li></ol><p>Two</p>";
922         List<Node> nodes = Parser.parseFragment(html, null, "http://example.com/");
923         assertEquals(1, nodes.size()); // returns <html> node (not document) -- no context means doc gets created
924         assertEquals("html", nodes.get(0).nodeName());
925         assertEquals("<html> <head></head> <body> <ol> <li>One</li> </ol> <p>Two</p> </body> </html>", StringUtil.normaliseWhitespace(nodes.get(0).outerHtml()));
926     }
927 
doesNotFindShortestMatchingEntity()928     @Test public void doesNotFindShortestMatchingEntity() {
929         // previous behaviour was to identify a possible entity, then chomp down the string until a match was found.
930         // (as defined in html5.) However in practise that lead to spurious matches against the author's intent.
931         String html = "One &clubsuite; &clubsuit;";
932         Document doc = Jsoup.parse(html);
933         assertEquals(StringUtil.normaliseWhitespace("One &amp;clubsuite; ♣"), doc.body().html());
934     }
935 
relaxedBaseEntityMatchAndStrictExtendedMatch()936     @Test public void relaxedBaseEntityMatchAndStrictExtendedMatch() {
937         // extended entities need a ; at the end to match, base does not
938         String html = "&amp &quot &reg &icy &hopf &icy; &hopf;";
939         Document doc = Jsoup.parse(html);
940         doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii"); // modifies output only to clarify test
941         assertEquals("&amp; \" &reg; &amp;icy &amp;hopf &icy; &hopf;", doc.body().html());
942     }
943 
handlesXmlDeclarationAsBogusComment()944     @Test public void handlesXmlDeclarationAsBogusComment() {
945         String html = "<?xml encoding='UTF-8' ?><body>One</body>";
946         Document doc = Jsoup.parse(html);
947         assertEquals("<!--?xml encoding='UTF-8' ?--> <html> <head></head> <body> One </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml()));
948     }
949 
handlesTagsInTextarea()950     @Test public void handlesTagsInTextarea() {
951         String html = "<textarea><p>Jsoup</p></textarea>";
952         Document doc = Jsoup.parse(html);
953         assertEquals("<textarea>&lt;p&gt;Jsoup&lt;/p&gt;</textarea>", doc.body().html());
954     }
955 
956     // form tests
createsFormElements()957     @Test public void createsFormElements() {
958         String html = "<body><form><input id=1><input id=2></form></body>";
959         Document doc = Jsoup.parse(html);
960         Element el = doc.select("form").first();
961 
962         assertTrue(el instanceof FormElement, "Is form element");
963         FormElement form = (FormElement) el;
964         Elements controls = form.elements();
965         assertEquals(2, controls.size());
966         assertEquals("1", controls.get(0).id());
967         assertEquals("2", controls.get(1).id());
968     }
969 
associatedFormControlsWithDisjointForms()970     @Test public void associatedFormControlsWithDisjointForms() {
971         // form gets closed, isn't parent of controls
972         String html = "<table><tr><form><input type=hidden id=1><td><input type=text id=2></td><tr></table>";
973         Document doc = Jsoup.parse(html);
974         Element el = doc.select("form").first();
975 
976         assertTrue(el instanceof FormElement, "Is form element");
977         FormElement form = (FormElement) el;
978         Elements controls = form.elements();
979         assertEquals(2, controls.size());
980         assertEquals("1", controls.get(0).id());
981         assertEquals("2", controls.get(1).id());
982 
983         assertEquals("<table><tbody><tr><form></form><input type=\"hidden\" id=\"1\"><td><input type=\"text\" id=\"2\"></td></tr><tr></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html()));
984     }
985 
handlesInputInTable()986     @Test public void handlesInputInTable() {
987         String h = "<body>\n" +
988             "<input type=\"hidden\" name=\"a\" value=\"\">\n" +
989             "<table>\n" +
990             "<input type=\"hidden\" name=\"b\" value=\"\" />\n" +
991             "</table>\n" +
992             "</body>";
993         Document doc = Jsoup.parse(h);
994         assertEquals(1, doc.select("table input").size());
995         assertEquals(2, doc.select("input").size());
996     }
997 
convertsImageToImg()998     @Test public void convertsImageToImg() {
999         // image to img, unless in a svg. old html cruft.
1000         String h = "<body><image><svg><image /></svg></body>";
1001         Document doc = Jsoup.parse(h);
1002         assertEquals("<img>\n<svg>\n <image />\n</svg>", doc.body().html());
1003     }
1004 
handlesInvalidDoctypes()1005     @Test public void handlesInvalidDoctypes() {
1006         // would previously throw invalid name exception on empty doctype
1007         Document doc = Jsoup.parse("<!DOCTYPE>");
1008         assertEquals(
1009             "<!doctype> <html> <head></head> <body></body> </html>",
1010             StringUtil.normaliseWhitespace(doc.outerHtml()));
1011 
1012         doc = Jsoup.parse("<!DOCTYPE><html><p>Foo</p></html>");
1013         assertEquals(
1014             "<!doctype> <html> <head></head> <body> <p>Foo</p> </body> </html>",
1015             StringUtil.normaliseWhitespace(doc.outerHtml()));
1016 
1017         doc = Jsoup.parse("<!DOCTYPE \u0000>");
1018         assertEquals(
1019             "<!doctype �> <html> <head></head> <body></body> </html>",
1020             StringUtil.normaliseWhitespace(doc.outerHtml()));
1021     }
1022 
handlesManyChildren()1023     @Test public void handlesManyChildren() {
1024         // Arrange
1025         StringBuilder longBody = new StringBuilder(500000);
1026         for (int i = 0; i < 25000; i++) {
1027             longBody.append(i).append("<br>");
1028         }
1029 
1030         // Act
1031         long start = System.currentTimeMillis();
1032         Document doc = Parser.parseBodyFragment(longBody.toString(), "");
1033 
1034         // Assert
1035         assertEquals(50000, doc.body().childNodeSize());
1036         assertTrue(System.currentTimeMillis() - start < 1000);
1037     }
1038 
1039     @Test
1040     public void testInvalidTableContents() throws IOException {
1041         File in = ParseTest.getFile("/htmltests/table-invalid-elements.html");
1042         Document doc = Jsoup.parse(in, "UTF-8");
1043         doc.outputSettings().prettyPrint(true);
1044         String rendered = doc.toString();
1045         int endOfEmail = rendered.indexOf("Comment");
1046         int guarantee = rendered.indexOf("Why am I here?");
1047         assertTrue(endOfEmail > -1, "Comment not found");
1048         assertTrue(guarantee > -1, "Search text not found");
1049         assertTrue(guarantee > endOfEmail, "Search text did not come after comment");
1050     }
1051 
testNormalisesIsIndex()1052     @Test public void testNormalisesIsIndex() {
1053         Document doc = Jsoup.parse("<body><isindex action='/submit'></body>");
1054         String html = doc.outerHtml();
1055         assertEquals("<form action=\"/submit\"> <hr><label>This is a searchable index. Enter search keywords: <input name=\"isindex\"></label> <hr> </form>",
1056             StringUtil.normaliseWhitespace(doc.body().html()));
1057     }
1058 
testReinsertionModeForThCelss()1059     @Test public void testReinsertionModeForThCelss() {
1060         String body = "<body> <table> <tr> <th> <table><tr><td></td></tr></table> <div> <table><tr><td></td></tr></table> </div> <div></div> <div></div> <div></div> </th> </tr> </table> </body>";
1061         Document doc = Jsoup.parse(body);
1062         assertEquals(1, doc.body().children().size());
1063     }
1064 
testUsingSingleQuotesInQueries()1065     @Test public void testUsingSingleQuotesInQueries() {
1066         String body = "<body> <div class='main'>hello</div></body>";
1067         Document doc = Jsoup.parse(body);
1068         Elements main = doc.select("div[class='main']");
1069         assertEquals("hello", main.text());
1070     }
1071 
testSupportsNonAsciiTags()1072     @Test public void testSupportsNonAsciiTags() {
1073         String body = "<a進捗推移グラフ>Yes</a進捗推移グラフ><bрусский-тэг>Correct</<bрусский-тэг>";
1074         Document doc = Jsoup.parse(body);
1075         Elements els = doc.select("a進捗推移グラフ");
1076         assertEquals("Yes", els.text());
1077         els = doc.select("bрусский-тэг");
1078         assertEquals("Correct", els.text());
1079     }
1080 
testSupportsPartiallyNonAsciiTags()1081     @Test public void testSupportsPartiallyNonAsciiTags() {
1082         String body = "<div>Check</divá>";
1083         Document doc = Jsoup.parse(body);
1084         Elements els = doc.select("div");
1085         assertEquals("Check", els.text());
1086     }
1087 
testFragment()1088     @Test public void testFragment() {
1089         // make sure when parsing a body fragment, a script tag at start goes into the body
1090         String html =
1091             "<script type=\"text/javascript\">console.log('foo');</script>\n" +
1092                 "<div id=\"somecontent\">some content</div>\n" +
1093                 "<script type=\"text/javascript\">console.log('bar');</script>";
1094 
1095         Document body = Jsoup.parseBodyFragment(html);
1096         assertEquals("<script type=\"text/javascript\">console.log('foo');</script>\n" +
1097             "<div id=\"somecontent\">\n" +
1098             " some content\n" +
1099             "</div>\n" +
1100             "<script type=\"text/javascript\">console.log('bar');</script>", body.body().html());
1101     }
1102 
testHtmlLowerCase()1103     @Test public void testHtmlLowerCase() {
1104         String html = "<!doctype HTML><DIV ID=1>One</DIV>";
1105         Document doc = Jsoup.parse(html);
1106         assertEquals("<!doctype html> <html> <head></head> <body> <div id=\"1\"> One </div> </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml()));
1107 
1108         Element div = doc.selectFirst("#1");
1109         div.after("<TaG>One</TaG>");
1110         assertEquals("<tag>One</tag>", TextUtil.stripNewlines(div.nextElementSibling().outerHtml()));
1111     }
1112 
testHtmlLowerCaseAttributesOfVoidTags()1113     @Test public void testHtmlLowerCaseAttributesOfVoidTags() {
1114         String html = "<!doctype HTML><IMG ALT=One></DIV>";
1115         Document doc = Jsoup.parse(html);
1116         assertEquals("<!doctype html> <html> <head></head> <body> <img alt=\"One\"> </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml()));
1117     }
1118 
testHtmlLowerCaseAttributesForm()1119     @Test public void testHtmlLowerCaseAttributesForm() {
1120         String html = "<form NAME=one>";
1121         Document doc = Jsoup.parse(html);
1122         assertEquals("<form name=\"one\"></form>", StringUtil.normaliseWhitespace(doc.body().html()));
1123     }
1124 
canPreserveTagCase()1125     @Test public void canPreserveTagCase() {
1126         Parser parser = Parser.htmlParser();
1127         parser.settings(new ParseSettings(true, false));
1128         Document doc = parser.parseInput("<div id=1><SPAN ID=2>", "");
1129         assertEquals("<html> <head></head> <body> <div id=\"1\"> <SPAN id=\"2\"></SPAN> </div> </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml()));
1130 
1131         Element div = doc.selectFirst("#1");
1132         div.after("<TaG ID=one>One</TaG>");
1133         assertEquals("<TaG id=\"one\">One</TaG>", TextUtil.stripNewlines(div.nextElementSibling().outerHtml()));
1134     }
1135 
canPreserveAttributeCase()1136     @Test public void canPreserveAttributeCase() {
1137         Parser parser = Parser.htmlParser();
1138         parser.settings(new ParseSettings(false, true));
1139         Document doc = parser.parseInput("<div id=1><SPAN ID=2>", "");
1140         assertEquals("<html> <head></head> <body> <div id=\"1\"> <span ID=\"2\"></span> </div> </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml()));
1141 
1142         Element div = doc.selectFirst("#1");
1143         div.after("<TaG ID=one>One</TaG>");
1144         assertEquals("<tag ID=\"one\">One</tag>", TextUtil.stripNewlines(div.nextElementSibling().outerHtml()));
1145     }
1146 
canPreserveBothCase()1147     @Test public void canPreserveBothCase() {
1148         Parser parser = Parser.htmlParser();
1149         parser.settings(new ParseSettings(true, true));
1150         Document doc = parser.parseInput("<div id=1><SPAN ID=2>", "");
1151         assertEquals("<html> <head></head> <body> <div id=\"1\"> <SPAN ID=\"2\"></SPAN> </div> </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml()));
1152 
1153         Element div = doc.selectFirst("#1");
1154         div.after("<TaG ID=one>One</TaG>");
1155         assertEquals("<TaG ID=\"one\">One</TaG>", TextUtil.stripNewlines(div.nextElementSibling().outerHtml()));
1156     }
1157 
handlesControlCodeInAttributeName()1158     @Test public void handlesControlCodeInAttributeName() {
1159         Document doc = Jsoup.parse("<p><a \06=foo>One</a><a/\06=bar><a foo\06=bar>Two</a></p>");
1160         assertEquals("<p><a>One</a><a></a><a foo=\"bar\">Two</a></p>", doc.body().html());
1161     }
1162 
caseSensitiveParseTree()1163     @Test public void caseSensitiveParseTree() {
1164         String html = "<r><X>A</X><y>B</y></r>";
1165         Parser parser = Parser.htmlParser();
1166         parser.settings(preserveCase);
1167         Document doc = parser.parseInput(html, "");
1168         assertEquals("<r> <X> A </X> <y> B </y> </r>", StringUtil.normaliseWhitespace(doc.body().html()));
1169     }
1170 
caseInsensitiveParseTree()1171     @Test public void caseInsensitiveParseTree() {
1172         String html = "<r><X>A</X><y>B</y></r>";
1173         Parser parser = Parser.htmlParser();
1174         Document doc = parser.parseInput(html, "");
1175         assertEquals("<r> <x> A </x> <y> B </y> </r>", StringUtil.normaliseWhitespace(doc.body().html()));
1176     }
1177 
preservedCaseLinksCantNest()1178     @Test public void preservedCaseLinksCantNest() {
1179         String html = "<A>ONE <A>Two</A></A>";
1180         Document doc = Parser.htmlParser()
1181             .settings(preserveCase)
1182             .parseInput(html, "");
1183         //assertEquals("<A>ONE </A><A>Two</A>", StringUtil.normaliseWhitespace(doc.body().html()));
1184         assertEquals("<A>ONE </A><A>Two</A>", doc.body().html());
1185     }
1186 
normalizesDiscordantTags()1187     @Test public void normalizesDiscordantTags() {
1188         Document document = Jsoup.parse("<div>test</DIV><p></p>");
1189         assertEquals("<div>\n test\n</div>\n<p></p>", document.body().html());
1190     }
1191 
selfClosingVoidIsNotAnError()1192     @Test public void selfClosingVoidIsNotAnError() {
1193         String html = "<p>test<br/>test<br/></p>";
1194         Parser parser = Parser.htmlParser().setTrackErrors(5);
1195         parser.parseInput(html, "");
1196         assertEquals(0, parser.getErrors().size());
1197 
1198         assertTrue(Jsoup.isValid(html, Safelist.basic()));
1199         String clean = Jsoup.clean(html, Safelist.basic());
1200         assertEquals("<p>test<br>\n test<br></p>", clean);
1201     }
1202 
selfClosingOnNonvoidIsError()1203     @Test public void selfClosingOnNonvoidIsError() {
1204         String html = "<p>test</p>\n\n<div /><div>Two</div>";
1205         Parser parser = Parser.htmlParser().setTrackErrors(5);
1206         parser.parseInput(html, "");
1207         assertEquals(1, parser.getErrors().size());
1208         assertEquals("<3:8>: Tag [div] cannot be self closing; not a void tag", parser.getErrors().get(0).toString());
1209 
1210         assertFalse(Jsoup.isValid(html, Safelist.relaxed()));
1211         String clean = Jsoup.clean(html, Safelist.relaxed());
1212         assertEquals("<p>test</p> <div></div> <div> Two </div>", StringUtil.normaliseWhitespace(clean));
1213     }
1214 
testTemplateInsideTable()1215     @Test public void testTemplateInsideTable() throws IOException {
1216         File in = ParseTest.getFile("/htmltests/table-polymer-template.html");
1217         Document doc = Jsoup.parse(in, "UTF-8");
1218         doc.outputSettings().prettyPrint(true);
1219 
1220         Elements templates = doc.body().getElementsByTag("template");
1221         for (Element template : templates) {
1222             assertTrue(template.childNodes().size() > 1);
1223         }
1224     }
1225 
testHandlesDeepSpans()1226     @Test public void testHandlesDeepSpans() {
1227         StringBuilder sb = new StringBuilder();
1228         for (int i = 0; i < 200; i++) {
1229             sb.append("<span>");
1230         }
1231 
1232         sb.append("<p>One</p>");
1233 
1234         Document doc = Jsoup.parse(sb.toString());
1235         assertEquals(200, doc.select("span").size());
1236         assertEquals(1, doc.select("p").size());
1237     }
1238 
commentAtEnd()1239     @Test public void commentAtEnd() {
1240         Document doc = Jsoup.parse("<!");
1241         assertTrue(doc.childNode(0) instanceof Comment);
1242     }
1243 
preSkipsFirstNewline()1244     @Test public void preSkipsFirstNewline() {
1245         Document doc = Jsoup.parse("<pre>\n\nOne\nTwo\n</pre>");
1246         Element pre = doc.selectFirst("pre");
1247         assertEquals("One\nTwo", pre.text());
1248         assertEquals("\nOne\nTwo\n", pre.wholeText());
1249     }
1250 
handlesXmlDeclAndCommentsBeforeDoctype()1251     @Test public void handlesXmlDeclAndCommentsBeforeDoctype() throws IOException {
1252         File in = ParseTest.getFile("/htmltests/comments.html");
1253         Document doc = Jsoup.parse(in, "UTF-8");
1254 
1255         assertEquals("<!--?xml version=\"1.0\" encoding=\"utf-8\"?--><!-- so --> <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><!-- what --> <html xml:lang=\"en\" lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"> <!-- now --> <head> <!-- then --> <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\"> <title>A Certain Kind of Test</title> </head> <body> <h1>Hello</h1>h1&gt; (There is a UTF8 hidden BOM at the top of this file.) </body> </html>",
1256             StringUtil.normaliseWhitespace(doc.html()));
1257 
1258         assertEquals("A Certain Kind of Test", doc.head().select("title").text());
1259     }
1260 
fallbackToUtfIfCantEncode()1261     @Test public void fallbackToUtfIfCantEncode() throws IOException {
1262         // that charset can't be encoded, so make sure we flip to utf
1263 
1264         String in = "<html><meta charset=\"ISO-2022-CN\"/>One</html>";
1265         Document doc = Jsoup.parse(new ByteArrayInputStream(in.getBytes()), null, "");
1266 
1267         assertEquals("UTF-8", doc.charset().name());
1268         assertEquals("One", doc.text());
1269 
1270         String html = doc.outerHtml();
1271         assertEquals("<html><head><meta charset=\"UTF-8\"></head><body>One</body></html>", TextUtil.stripNewlines(html));
1272     }
1273 
characterReaderBuffer()1274     @Test public void characterReaderBuffer() throws IOException {
1275         File in = ParseTest.getFile("/htmltests/character-reader-buffer.html.gz");
1276         Document doc = Jsoup.parse(in, "UTF-8");
1277 
1278         String expectedHref = "http://www.domain.com/path?param_one=value&param_two=value";
1279 
1280         Elements links = doc.select("a");
1281         assertEquals(2, links.size());
1282         assertEquals(expectedHref, links.get(0).attr("href")); // passes
1283         assertEquals(expectedHref, links.get(1).attr("href")); // fails, "but was:<...ath?param_one=value&[]_two-value>"
1284     }
1285 
1286     @Test
selfClosingTextAreaDoesntLeaveDroppings()1287     public void selfClosingTextAreaDoesntLeaveDroppings() {
1288         // https://github.com/jhy/jsoup/issues/1220
1289         Document doc = Jsoup.parse("<div><div><textarea/></div></div>");
1290         assertFalse(doc.body().html().contains("&lt;"));
1291         assertFalse(doc.body().html().contains("&gt;"));
1292         assertEquals("<div><div><textarea></textarea></div></div>", TextUtil.stripNewlines(doc.body().html()));
1293     }
1294 
1295     @Test
testNoSpuriousSpace()1296     public void testNoSpuriousSpace() {
1297         Document doc = Jsoup.parse("Just<a>One</a><a>Two</a>");
1298         assertEquals("Just<a>One</a><a>Two</a>", doc.body().html());
1299         assertEquals("JustOneTwo", doc.body().text());
1300     }
1301 
1302     @Test
pTagsGetIndented()1303     public void pTagsGetIndented() {
1304         String html = "<div><p><a href=one>One</a><p><a href=two>Two</a></p></div>";
1305         Document doc = Jsoup.parse(html);
1306         assertEquals("<div>\n" +
1307             " <p><a href=\"one\">One</a></p>\n" +
1308             " <p><a href=\"two\">Two</a></p>\n" +
1309             "</div>", doc.body().html());
1310     }
1311 
1312     @Test
indentRegardlessOfCase()1313     public void indentRegardlessOfCase() {
1314         String html = "<p>1</p><P>2</P>";
1315         Document doc = Jsoup.parse(html);
1316         assertEquals(
1317             "<body>\n" +
1318             " <p>1</p>\n" +
1319             " <p>2</p>\n" +
1320             "</body>", doc.body().outerHtml());
1321 
1322         Document caseDoc = Jsoup.parse(html, "", Parser.htmlParser().settings(preserveCase));
1323         assertEquals(
1324             "<body>\n" +
1325             " <p>1</p>\n" +
1326             " <P>2</P>\n" +
1327             "</body>", caseDoc.body().outerHtml());
1328     }
1329 
1330     @Test
testH20()1331     public void testH20() {
1332         // https://github.com/jhy/jsoup/issues/731
1333         String html = "H<sub>2</sub>O";
1334         String clean = Jsoup.clean(html, Safelist.basic());
1335         assertEquals("H<sub>2</sub>O", clean);
1336 
1337         Document doc = Jsoup.parse(html);
1338         assertEquals("H2O", doc.text());
1339     }
1340 
1341     @Test
testUNewlines()1342     public void testUNewlines() {
1343         // https://github.com/jhy/jsoup/issues/851
1344         String html = "t<u>es</u>t <b>on</b> <i>f</i><u>ir</u>e";
1345         String clean = Jsoup.clean(html, Safelist.basic());
1346         assertEquals("t<u>es</u>t <b>on</b> <i>f</i><u>ir</u>e", clean);
1347 
1348         Document doc = Jsoup.parse(html);
1349         assertEquals("test on fire", doc.text());
1350     }
1351 
testFarsi()1352     @Test public void testFarsi() {
1353         // https://github.com/jhy/jsoup/issues/1227
1354         String text = "نیمه\u200Cشب";
1355         Document doc = Jsoup.parse("<p>" + text);
1356         assertEquals(text, doc.text());
1357     }
1358 
testStartOptGroup()1359     @Test public void testStartOptGroup() {
1360         // https://github.com/jhy/jsoup/issues/1313
1361         String html = "<select>\n" +
1362             "  <optgroup label=\"a\">\n" +
1363             "  <option>one\n" +
1364             "  <option>two\n" +
1365             "  <option>three\n" +
1366             "  <optgroup label=\"b\">\n" +
1367             "  <option>four\n" +
1368             "  <option>fix\n" +
1369             "  <option>six\n" +
1370             "</select>";
1371         Document doc = Jsoup.parse(html);
1372         Element select = doc.selectFirst("select");
1373         assertEquals(2, select.childrenSize());
1374 
1375         assertEquals("<optgroup label=\"a\"> <option>one </option><option>two </option><option>three </option></optgroup><optgroup label=\"b\"> <option>four </option><option>fix </option><option>six </option></optgroup>", select.html());
1376     }
1377 
readerClosedAfterParse()1378     @Test public void readerClosedAfterParse() {
1379         Document doc = Jsoup.parse("Hello");
1380         TreeBuilder treeBuilder = doc.parser().getTreeBuilder();
1381         assertNull(treeBuilder.reader);
1382         assertNull(treeBuilder.tokeniser);
1383     }
1384 
scriptInDataNode()1385     @Test public void scriptInDataNode() {
1386         Document doc = Jsoup.parse("<script>Hello</script><style>There</style>");
1387         assertTrue(doc.selectFirst("script").childNode(0) instanceof DataNode);
1388         assertTrue(doc.selectFirst("style").childNode(0) instanceof DataNode);
1389 
1390         doc = Jsoup.parse("<SCRIPT>Hello</SCRIPT><STYLE>There</STYLE>", "", Parser.htmlParser().settings(preserveCase));
1391         assertTrue(doc.selectFirst("script").childNode(0) instanceof DataNode);
1392         assertTrue(doc.selectFirst("style").childNode(0) instanceof DataNode);
1393     }
1394 
textareaValue()1395     @Test public void textareaValue() {
1396         String html = "<TEXTAREA>YES YES</TEXTAREA>";
1397         Document doc = Jsoup.parse(html);
1398         assertEquals("YES YES", doc.selectFirst("textarea").val());
1399 
1400         doc = Jsoup.parse(html, "", Parser.htmlParser().settings(preserveCase));
1401         assertEquals("YES YES", doc.selectFirst("textarea").val());
1402     }
1403 
preserveWhitespaceInHead()1404     @Test public void preserveWhitespaceInHead() {
1405         String html = "\n<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n</body>\n</html>\n";
1406         Document doc = Jsoup.parse(html);
1407         doc.outputSettings().prettyPrint(false);
1408         assertEquals("<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n</body>\n</html>\n", doc.outerHtml());
1409     }
1410 
handleContentAfterBody()1411     @Test public void handleContentAfterBody() {
1412         String html = "<body>One</body>  <p>Hello!</p></html> <p>There</p>";
1413         Document doc = Jsoup.parse(html);
1414         doc.outputSettings().prettyPrint(false);
1415         assertEquals("<html><head></head><body>One<p>Hello!</p><p>There</p></body>  </html> ", doc.outerHtml());
1416     }
1417 
preservesTabs()1418     @Test public void preservesTabs() {
1419         // testcase to demonstrate tab retention - https://github.com/jhy/jsoup/issues/1240
1420         String html = "<pre>One\tTwo</pre><span>\tThree\tFour</span>";
1421         Document doc = Jsoup.parse(html);
1422 
1423         Element pre = doc.selectFirst("pre");
1424         Element span = doc.selectFirst("span");
1425 
1426         assertEquals("One\tTwo", pre.text());
1427         assertEquals("Three Four", span.text()); // normalized, including overall trim
1428         assertEquals("\tThree\tFour", span.wholeText()); // text normalizes, wholeText retains original spaces incl tabs
1429         assertEquals("One\tTwo Three Four", doc.body().text());
1430 
1431         assertEquals("<pre>One\tTwo</pre><span> Three Four</span>", doc.body().html()); // html output provides normalized space, incl tab in pre but not in span
1432 
1433         doc.outputSettings().prettyPrint(false);
1434         assertEquals(html, doc.body().html()); // disabling pretty-printing - round-trips the tab throughout, as no normalization occurs
1435     }
1436 
wholeTextTreatsBRasNewline()1437     @Test void wholeTextTreatsBRasNewline() {
1438         String html = "<div>\nOne<br>Two <p>Three<br>Four</div>";
1439         Document doc = Jsoup.parse(html);
1440         Element div = doc.selectFirst("div");
1441         assertNotNull(div);
1442         assertEquals("\nOne\nTwo Three\nFour", div.wholeText());
1443         assertEquals("\nOne\nTwo ", div.wholeOwnText());
1444     }
1445 
canDetectAutomaticallyAddedElements()1446     @Test public void canDetectAutomaticallyAddedElements() {
1447         String bare = "<script>One</script>";
1448         String full = "<html><head><title>Check</title></head><body><p>One</p></body></html>";
1449 
1450         assertTrue(didAddElements(bare));
1451         assertFalse(didAddElements(full));
1452     }
1453 
didAddElements(String input)1454     private boolean didAddElements(String input) {
1455         // two passes, one as XML and one as HTML. XML does not vivify missing/optional tags
1456         Document html = Jsoup.parse(input);
1457         Document xml = Jsoup.parse(input, "", Parser.xmlParser());
1458 
1459         int htmlElementCount = html.getAllElements().size();
1460         int xmlElementCount = xml.getAllElements().size();
1461         return htmlElementCount > xmlElementCount;
1462     }
1463 
canSetHtmlOnCreatedTableElements()1464     @Test public void canSetHtmlOnCreatedTableElements() {
1465         // https://github.com/jhy/jsoup/issues/1603
1466         Element element = new Element("tr");
1467         element.html("<tr><td>One</td></tr>");
1468         assertEquals("<tr>\n <tr>\n  <td>One</td>\n </tr>\n</tr>", element.outerHtml());
1469     }
1470 
parseFragmentOnCreatedDocument()1471     @Test public void parseFragmentOnCreatedDocument() {
1472         // https://github.com/jhy/jsoup/issues/1601
1473         String bareFragment = "<h2>text</h2>";
1474         List<Node> nodes = new Document("").parser().parseFragmentInput(bareFragment, new Element("p"), "");
1475         assertEquals(1, nodes.size());
1476         Node node = nodes.get(0);
1477         assertEquals("h2", node.nodeName());
1478         assertEquals("<p>\n <h2>text</h2></p>", node.parent().outerHtml());
1479     }
1480 
nestedPFragments()1481     @Test public void nestedPFragments() {
1482         // https://github.com/jhy/jsoup/issues/1602
1483         String bareFragment = "<p></p><a></a>";
1484         List<Node> nodes = new Document("").parser().parseFragmentInput(bareFragment, new Element("p"), "");
1485         assertEquals(2, nodes.size());
1486         Node node = nodes.get(0);
1487         assertEquals("<p>\n <p></p><a></a></p>", node.parent().outerHtml()); // mis-nested because fragment forced into the element, OK
1488     }
1489 
nestedAnchorAdoption()1490     @Test public void nestedAnchorAdoption() {
1491         // https://github.com/jhy/jsoup/issues/1608
1492         String html = "<a>\n<b>\n<div>\n<a>test</a>\n</div>\n</b>\n</a>";
1493         Document doc = Jsoup.parse(html);
1494         assertNotNull(doc);
1495         assertEquals("<a> <b> </b></a><b><div><a> </a><a>test</a></div></b>", TextUtil.stripNewlines(doc.body().html()));
1496     }
1497 
tagsMustStartWithAscii()1498     @Test public void tagsMustStartWithAscii() {
1499         // https://github.com/jhy/jsoup/issues/1006
1500         String[] valid = {"a一", "a会员挂单金额5", "table(╯°□°)╯"};
1501         String[] invalid = {"一", "会员挂单金额5", "(╯°□°)╯"};
1502 
1503         for (String tag : valid) {
1504             Document doc = Jsoup.parse("<" + tag + ">Text</" + tag + ">");
1505             Elements els = doc.getElementsByTag(tag);
1506             assertEquals(1, els.size());
1507             assertEquals(tag, els.get(0).tagName());
1508             assertEquals("Text", els.get(0).text());
1509         }
1510 
1511         for (String tag : invalid) {
1512             Document doc = Jsoup.parse("<" + tag + ">Text</" + tag + ">");
1513             Elements els = doc.getElementsByTag(tag);
1514             assertEquals(0, els.size());
1515             assertEquals("&lt;" + tag + "&gt;Text<!--/" + tag + "-->", doc.body().html());
1516         }
1517     }
1518 
htmlOutputCorrectsInvalidAttributeNames()1519     @Test void htmlOutputCorrectsInvalidAttributeNames() {
1520         String html = "<body style=\"color: red\" \" name\"><div =\"\"></div></body>";
1521         Document doc = Jsoup.parse(html);
1522         assertEquals(Document.OutputSettings.Syntax.html, doc.outputSettings().syntax());
1523 
1524         String out = doc.body().outerHtml();
1525         assertEquals("<body style=\"color: red\" name>\n <div></div>\n</body>", out);
1526     }
1527 
templateInHead()1528     @Test void templateInHead() {
1529         // https://try.jsoup.org/~EGp3UZxQe503TJDHQEQEzm8IeUc
1530         String html = "<head><template id=1><meta name=tmpl></template><title>Test</title><style>One</style></head><body><p>Two</p>";
1531         Document doc = Jsoup.parse(html);
1532 
1533         String want = "<html><head><template id=\"1\"><meta name=\"tmpl\"></template><title>Test</title><style>One</style></head><body><p>Two</p></body></html>";
1534         assertEquals(want, TextUtil.stripNewlines(doc.html()));
1535 
1536         Elements template = doc.select("template#1");
1537         template.select("meta").attr("content", "Yes");
1538         template.unwrap();
1539 
1540         want = "<html><head><meta name=\"tmpl\" content=\"Yes\"><title>Test</title><style>One</style></head><body><p>Two</p></body></html>";
1541         assertEquals(want, TextUtil.stripNewlines(doc.html()));
1542     }
1543 
nestedTemplateInBody()1544     @Test void nestedTemplateInBody() {
1545         String html = "<body><template id=1><table><tr><template id=2><td>One</td><td>Two</td></template></tr></template></body>";
1546         Document doc = Jsoup.parse(html);
1547 
1548         String want = "<html><head></head><body><template id=\"1\"><table><tbody><tr><template id=\"2\"><td>One</td><td>Two</td></template></tr></tbody></table></template></body></html>";
1549         assertEquals(want, TextUtil.stripNewlines(doc.html()));
1550 
1551         // todo - will be nice to add some simpler template element handling like clone children etc?
1552         Element tmplTbl = doc.selectFirst("template#1");
1553         Element tmplRow = doc.selectFirst("template#2");
1554         assertNotNull(tmplRow);
1555         assertNotNull(tmplTbl);
1556         tmplRow.appendChild(tmplRow.clone());
1557         doc.select("template").unwrap();
1558 
1559         want = "<html><head></head><body><table><tbody><tr><td>One</td><td>Two</td><td>One</td><td>Two</td></tr></tbody></table></body></html>";
1560         assertEquals(want, TextUtil.stripNewlines(doc.html()));
1561     }
1562 
canSelectIntoTemplate()1563     @Test void canSelectIntoTemplate() {
1564         String html = "<body><div><template><p>Hello</p>";
1565         Document doc = Jsoup.parse(html);
1566         String want = "<html><head></head><body><div><template><p>Hello</p></template></div></body></html>";
1567         assertEquals(want, TextUtil.stripNewlines(doc.html()));
1568 
1569         Element p = doc.selectFirst("div p");
1570         Element p1 = doc.selectFirst("template :containsOwn(Hello)");
1571         assertEquals("p", p.normalName());
1572         assertEquals(p, p1);
1573     }
1574 
tableRowFragment()1575     @Test void tableRowFragment() {
1576         Document doc = Jsoup.parse("<body><table></table></body");
1577         String html = "<tr><td><img></td></tr>";
1578         Element table = doc.selectFirst("table");
1579         table.html(html); // invokes the fragment parser with table as context
1580         String want = "<tbody><tr><td><img></td></tr></tbody>";
1581         assertEquals(want, TextUtil.stripNewlines(table.html()));
1582         want = "<table><tbody><tr><td><img></td></tr></tbody></table>";
1583         assertEquals(want, TextUtil.stripNewlines(doc.body().html()));
1584     }
1585 
templateTableRowFragment()1586     @Test void templateTableRowFragment() {
1587         // https://github.com/jhy/jsoup/issues/1409 (per the fragment <tr> use case)
1588         Document doc = Jsoup.parse("<body><table><template></template></table></body");
1589         String html = "<tr><td><img></td></tr>";
1590         Element tmpl = doc.selectFirst("template");
1591         tmpl.html(html); // invokes the fragment parser with template as context
1592         String want = "<tr><td><img></td></tr>";
1593         assertEquals(want, TextUtil.stripNewlines(tmpl.html()));
1594         tmpl.unwrap();
1595 
1596         want = "<html><head></head><body><table><tr><td><img></td></tr></table></body></html>";
1597         assertEquals(want, TextUtil.stripNewlines(doc.html()));
1598     }
1599 
templateNotInTableRowFragment()1600     @Test void templateNotInTableRowFragment() {
1601         // https://github.com/jhy/jsoup/issues/1409 (per the fragment <tr> use case)
1602         Document doc = Jsoup.parse("<body><template></template></body");
1603         String html = "<tr><td><img></td></tr>";
1604         Element tmpl = doc.selectFirst("template");
1605         tmpl.html(html); // invokes the fragment parser with template as context
1606         String want = "<tr><td><img></td></tr>";
1607         assertEquals(want, TextUtil.stripNewlines(tmpl.html()));
1608         tmpl.unwrap();
1609 
1610         want = "<html><head></head><body><tr><td><img></td></tr></body></html>";
1611         assertEquals(want, TextUtil.stripNewlines(doc.html()));
1612     }
1613 
templateFragment()1614     @Test void templateFragment() {
1615         // https://github.com/jhy/jsoup/issues/1315
1616         String html = "<template id=\"lorem-ipsum\"><tr><td>Lorem</td><td>Ipsum</td></tr></template>";
1617         Document frag = Jsoup.parseBodyFragment(html);
1618         String want = "<template id=\"lorem-ipsum\"><tr><td>Lorem</td><td>Ipsum</td></tr></template>";
1619         assertEquals(want, TextUtil.stripNewlines(frag.body().html()));
1620     }
1621 
templateInferredForm()1622     @Test void templateInferredForm() {
1623         // https://github.com/jhy/jsoup/issues/1637 | https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=38987
1624         Document doc = Jsoup.parse("<template><isindex action>");
1625         assertNotNull(doc);
1626         assertEquals("<template><form><hr><label>This is a searchable index. Enter search keywords: <input name=\"isindex\"></label><hr></form></template>",
1627             TextUtil.stripNewlines(doc.head().html()));
1628     }
1629 
trimNormalizeElementNamesInBuilder()1630     @Test void trimNormalizeElementNamesInBuilder() {
1631         // https://github.com/jhy/jsoup/issues/1637 | https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=38983
1632         // This is interesting - in TB state, the element name was "template\u001E", so no name checks matched. Then,
1633         // when the Element is created, the name got normalized to "template" and so looked like there should be a
1634         // template on the stack during resetInsertionMode for the select.
1635         // The issue was that the normalization in Tag.valueOf did a trim which the Token.Tag did not
1636         Document doc = Jsoup.parse("<template\u001E<select<input<");
1637         assertNotNull(doc);
1638         assertEquals("<template><select></select><input>&lt;</template>",
1639             TextUtil.stripNewlines(doc.head().html()));
1640     }
1641 
errorsBeforeHtml()1642     @Test void errorsBeforeHtml() {
1643         Parser parser = Parser.htmlParser();
1644         parser.setTrackErrors(10);
1645         Document doc = Jsoup.parse("<!doctype html><!doctype something></div>", parser);
1646         ParseErrorList errors = parser.getErrors();
1647         assertEquals(2, errors.size());
1648         assertEquals("<1:36>: Unexpected Doctype token [<!doctype something>] when in state [BeforeHtml]", errors.get(0).toString());
1649         assertEquals("<1:42>: Unexpected EndTag token [</div>] when in state [BeforeHtml]", errors.get(1).toString());
1650         assertEquals("<!doctype html><html><head></head><body></body></html>", TextUtil.stripNewlines(doc.html()));
1651     }
1652 
afterHeadReAdds()1653     @Test void afterHeadReAdds() {
1654         Parser parser = Parser.htmlParser();
1655         parser.setTrackErrors(10);
1656         Document doc = Jsoup.parse("<head></head><meta charset=UTF8><p>Hello", parser);
1657         ParseErrorList errors = parser.getErrors();
1658         assertEquals(1, errors.size());
1659         assertEquals("<1:33>: Unexpected StartTag token [<meta  charset=\"UTF8\">] when in state [AfterHead]", errors.get(0).toString());
1660         assertEquals("<html><head><meta charset=\"UTF8\"></head><body><p>Hello</p></body></html>", TextUtil.stripNewlines(doc.html()));
1661         // meta gets added back into head
1662     }
1663 
mergeHtmlAttributesFromBody()1664     @Test void mergeHtmlAttributesFromBody() {
1665         Document doc = Jsoup.parse("<html id=1 class=foo><body><html class=bar data=x><p>One");
1666         assertEquals("<html id=\"1\" class=\"foo\" data=\"x\"><head></head><body><p>One</p></body></html>", TextUtil.stripNewlines(doc.html()));
1667     }
1668 
mergeHtmlNoAttributesFromBody()1669     @Test void mergeHtmlNoAttributesFromBody() {
1670         Document doc = Jsoup.parse("<html id=1 class=foo><body><html><p>One");
1671         assertEquals("<html id=\"1\" class=\"foo\"><head></head><body><p>One</p></body></html>", TextUtil.stripNewlines(doc.html()));
1672     }
1673 
supportsRuby()1674     @Test void supportsRuby() {
1675         String html = "<ruby><rbc><rb>10</rb><rb>31</rb><rb>2002</rb></rbc><rtc><rt>Month</rt><rt>Day</rt><rt>Year</rt></rtc><rtc><rt>Expiration Date</rt><rp>(*)</rtc></ruby>";
1676         Parser parser = Parser.htmlParser();
1677         parser.setTrackErrors(10);
1678         Document doc = Jsoup.parse(html, parser);
1679         ParseErrorList errors = parser.getErrors();
1680         assertEquals(3, errors.size());
1681         Element ruby = doc.expectFirst("ruby");
1682         assertEquals(
1683             "<ruby><rbc><rb>10</rb><rb>31</rb><rb>2002</rb></rbc><rtc><rt>Month</rt><rt>Day</rt><rt>Year</rt></rtc><rtc><rt>Expiration Date</rt><rp>(*)</rp></rtc></ruby>",
1684             TextUtil.stripNewlines(ruby.outerHtml()));
1685         assertEquals("<1:38>: Unexpected StartTag token [<rb>] when in state [InBody]", errors.get(2).toString()); // 3 errors from rb in rtc as undefined
1686     }
1687 
rubyRpRtImplicitClose()1688     @Test void rubyRpRtImplicitClose() {
1689         String html = "<ruby><rp>(<rt>Hello<rt>Hello<rp>)</ruby>\n";
1690         Parser parser = Parser.htmlParser();
1691         parser.setTrackErrors(10);
1692         Document doc = Jsoup.parse(html, parser);
1693         assertEquals(0, parser.getErrors().size());
1694         Element ruby = doc.expectFirst("ruby");
1695         assertEquals(
1696             "<ruby><rp>(</rp><rt>Hello</rt><rt>Hello</rt><rp>)</rp></ruby>",
1697             TextUtil.stripNewlines(ruby.outerHtml()));
1698     }
1699 
rubyScopeError()1700     @Test void rubyScopeError() {
1701         String html = "<ruby><div><rp>Hello";
1702         Parser parser = Parser.htmlParser();
1703         parser.setTrackErrors(10);
1704         Document doc = Jsoup.parse(html, parser);
1705         ParseErrorList errors = parser.getErrors();
1706         assertEquals(2, errors.size());
1707         Element ruby = doc.expectFirst("ruby");
1708         assertEquals(
1709             "<ruby><div><rp>Hello</rp></div></ruby>",
1710             TextUtil.stripNewlines(ruby.outerHtml()));
1711         assertEquals("<1:16>: Unexpected StartTag token [<rp>] when in state [InBody]", errors.get(0).toString());
1712     }
1713 
errorOnEofIfOpen()1714     @Test void errorOnEofIfOpen() {
1715         String html = "<div>";
1716         Parser parser = Parser.htmlParser();
1717         parser.setTrackErrors(10);
1718         Document doc = Jsoup.parse(html, parser);
1719         ParseErrorList errors = parser.getErrors();
1720         assertEquals(1, errors.size());
1721         assertEquals("Unexpected EOF token [] when in state [InBody]", errors.get(0).getErrorMessage());
1722     }
1723 
NoErrorOnEofIfBodyOpen()1724     @Test void NoErrorOnEofIfBodyOpen() {
1725         String html = "<body>";
1726         Parser parser = Parser.htmlParser();
1727         parser.setTrackErrors(10);
1728         Document doc = Jsoup.parse(html, parser);
1729         ParseErrorList errors = parser.getErrors();
1730         assertEquals(0, errors.size());
1731     }
1732 
htmlClose()1733     @Test void htmlClose() {
1734         // https://github.com/jhy/jsoup/issues/1851
1735         String html = "<body><div>One</html>Two</div></body>";
1736         Document doc = Jsoup.parse(html);
1737         assertEquals("OneTwo", doc.expectFirst("body > div").text());
1738     }
1739 
largeTextareaContents()1740     @Test void largeTextareaContents() {
1741         // https://github.com/jhy/jsoup/issues/1929
1742         StringBuilder sb = new StringBuilder();
1743         int num = 2000;
1744         for (int i = 0; i <= num; i++) {
1745             sb.append("\n<text>foo</text>\n");
1746         }
1747         String textContent = sb.toString();
1748         String sourceHtml = "<textarea>" + textContent + "</textarea>";
1749 
1750         Document doc = Jsoup.parse(sourceHtml);
1751         Element textArea = doc.expectFirst("textarea");
1752 
1753         assertEquals(textContent, textArea.wholeText());
1754     }
1755 
svgParseTest()1756     @Test void svgParseTest() {
1757         String html = "<div><svg viewBox=2><foreignObject><p>One</p></foreignObject></svg></div>";
1758         Document doc = Jsoup.parse(html);
1759 
1760         assertHtmlNamespace(doc);
1761         Element div = doc.expectFirst("div");
1762         assertHtmlNamespace(div);
1763 
1764         Element svg = doc.expectFirst("svg");
1765         assertTrue(svg.attributes().hasKey("viewBox"));
1766         assertSvgNamespace(svg);
1767         assertSvgNamespace(doc.expectFirst("foreignObject"));
1768         assertHtmlNamespace(doc.expectFirst("p"));
1769 
1770         String serialized = div.html();
1771         assertEquals("<svg viewBox=\"2\">\n" +
1772             " <foreignObject>\n" +
1773             "  <p>One</p>\n" +
1774             " </foreignObject>\n" +
1775             "</svg>", serialized);
1776     }
1777 
mathParseText()1778     @Test void mathParseText() {
1779         String html = "<div><math><mi><p>One</p><svg><text>Blah</text></svg></mi><ms></ms></div>";
1780         Document doc = Jsoup.parse(html);
1781 
1782         assertHtmlNamespace(doc.expectFirst("div"));
1783         assertMathNamespace(doc.expectFirst("math"));
1784         assertMathNamespace(doc.expectFirst("mi"));
1785         assertHtmlNamespace(doc.expectFirst("p"));
1786         assertSvgNamespace(doc.expectFirst("svg"));
1787         assertSvgNamespace(doc.expectFirst("text"));
1788         assertMathNamespace(doc.expectFirst("ms"));
1789 
1790         String serialized = doc.expectFirst("div").html();
1791         assertEquals("<math>\n" +
1792             " <mi>\n" +
1793             "  <p>One</p>\n" +
1794             "  <svg>\n" +
1795             "   <text>Blah</text>\n" +
1796             "  </svg></mi><ms></ms>\n" +
1797             "</math>", serialized);
1798     }
1799 
assertHtmlNamespace(Element el)1800     private static void assertHtmlNamespace(Element el) {
1801         assertEquals(Parser.NamespaceHtml, el.tag().namespace());
1802     }
1803 
assertSvgNamespace(Element el)1804     private static void assertSvgNamespace(Element el) {
1805         assertEquals(Parser.NamespaceSvg, el.tag().namespace());
1806     }
1807 
assertMathNamespace(Element el)1808     private static void assertMathNamespace(Element el) {
1809         assertEquals(Parser.NamespaceMathml, el.tag().namespace());
1810     }
1811 
mathSvgStyleTest()1812     @Test void mathSvgStyleTest() {
1813         String html = "<style><img></style><math><svg><style><img></img></style></svg></math>";
1814         Document doc = Jsoup.parse(html);
1815 
1816         Element htmlStyle = doc.expectFirst("style");
1817         assertHtmlNamespace(htmlStyle);
1818         assertEquals("<img>", htmlStyle.data()); // that's not an element, it's data (textish)
1819 
1820         Element svgStyle = doc.expectFirst("svg style");
1821         assertMathNamespace(svgStyle); // in inherited math namespace as not an HTML integration point
1822         Element styleImg = svgStyle.expectFirst("img");
1823         assertHtmlNamespace(styleImg); // this one is an img tag - in foreign to html elements
1824 
1825         assertMathNamespace(doc.expectFirst("svg"));
1826         assertMathNamespace(doc.expectFirst("math"));
1827     }
1828 
xmlnsAttributeError()1829     @Test void xmlnsAttributeError() {
1830         String html = "<p><svg></svg></body>";
1831         Parser parser = Parser.htmlParser().setTrackErrors(10);
1832         Document doc = Jsoup.parse(html, parser);
1833         assertEquals(0, doc.parser().getErrors().size());
1834 
1835         String html2 = "<html xmlns='http://www.w3.org/1999/xhtml'><p xmlns='http://www.w3.org/1999/xhtml'><i xmlns='xhtml'></i></body>";
1836         Document doc2 = Jsoup.parse(html2, parser);
1837         assertEquals(1, doc2.parser().getErrors().size());
1838         assertEquals("Invalid xmlns attribute [xhtml] on tag [i]", parser.getErrors().get(0).getErrorMessage());
1839     }
1840 
mathAnnotationSvg()1841     @Test void mathAnnotationSvg() {
1842         String html = "<math><svg>"; // not in annotation, svg will be in math ns
1843         Document doc = Jsoup.parse(html);
1844         assertMathNamespace(doc.expectFirst("math"));
1845         assertMathNamespace(doc.expectFirst("svg"));
1846 
1847         String html2 = "<math><annotation-xml><svg>"; // svg will be in svg ns
1848         Document doc2 = Jsoup.parse(html2);
1849         assertMathNamespace(doc2.expectFirst("math"));
1850         assertMathNamespace(doc2.expectFirst("annotation-xml"));
1851         assertSvgNamespace(doc2.expectFirst("svg"));
1852     }
1853 
mathHtmlIntegrationPoint()1854     @Test void mathHtmlIntegrationPoint() {
1855         String html = "<math><div>Hello";
1856         Document doc = Jsoup.parse(html);
1857         assertMathNamespace(doc.expectFirst("math"));
1858         assertHtmlNamespace(doc.expectFirst("div"));
1859 
1860         String html2 = "<math><divv>Hello";
1861         Document doc2 = Jsoup.parse(html2);
1862         assertMathNamespace(doc2.expectFirst("math"));
1863         assertMathNamespace(doc2.expectFirst("divv"));
1864 
1865         String html3 = "<math><annotation-xml><divv>Hello";
1866         Document doc3 = Jsoup.parse(html3);
1867         assertMathNamespace(doc3.expectFirst("math"));
1868         assertMathNamespace(doc3.expectFirst("annotation-xml"));
1869         assertMathNamespace(doc3.expectFirst("divv"));
1870 
1871         String html4 = "<math><annotation-xml encoding=text/html><divv>Hello";
1872         Document doc4 = Jsoup.parse(html4);
1873         assertMathNamespace(doc4.expectFirst("math"));
1874         assertMathNamespace(doc4.expectFirst("annotation-xml"));
1875         assertHtmlNamespace(doc4.expectFirst("divv"));
1876     }
1877 
parseEmojiFromMultipointEncoded()1878     @Test void parseEmojiFromMultipointEncoded() {
1879         String html = "<img multi='&#55357;&#56495;' single='&#128175;' hexsingle='&#x1f4af;'>";
1880         Document document = Jsoup.parse(html);
1881         Element img = document.expectFirst("img");
1882         assertEquals("\uD83D\uDCAF", img.attr("multi"));
1883         assertEquals("\uD83D\uDCAF", img.attr("single"));
1884         assertEquals("\uD83D\uDCAF", img.attr("hexsingle"));
1885 
1886         assertEquals("<img multi=\"\uD83D\uDCAF\" single=\"\uD83D\uDCAF\" hexsingle=\"\uD83D\uDCAF\">", img.outerHtml());
1887 
1888         img.ownerDocument().outputSettings().charset("ascii");
1889         assertEquals("<img multi=\"&#x1f4af;\" single=\"&#x1f4af;\" hexsingle=\"&#x1f4af;\">", img.outerHtml());
1890     }
1891 }
1892