1 package org.jsoup.parser; 2 3 import org.jsoup.Jsoup; 4 import org.jsoup.TextUtil; 5 import org.jsoup.integration.ParseTest; 6 import org.jsoup.internal.StringUtil; 7 import org.jsoup.nodes.*; 8 import org.jsoup.safety.Safelist; 9 import org.jsoup.select.Elements; 10 import org.junit.jupiter.api.Test; 11 import org.junit.jupiter.params.ParameterizedTest; 12 import org.junit.jupiter.params.provider.Arguments; 13 import org.junit.jupiter.params.provider.MethodSource; 14 15 import java.io.ByteArrayInputStream; 16 import java.io.File; 17 import java.io.IOException; 18 import java.util.List; 19 import java.util.stream.Stream; 20 21 import static org.jsoup.parser.ParseSettings.preserveCase; 22 import static org.junit.jupiter.api.Assertions.*; 23 24 /** 25 * Tests for the Parser 26 * 27 * @author Jonathan Hedley, [email protected] 28 */ 29 public class HtmlParserTest { 30 parsesSimpleDocument()31 @Test public void parsesSimpleDocument() { 32 String html = "<html><head><title>First!</title></head><body><p>First post! <img src=\"foo.png\" /></p></body></html>"; 33 Document doc = Jsoup.parse(html); 34 // need a better way to verify these: 35 Element p = doc.body().child(0); 36 assertEquals("p", p.tagName()); 37 Element img = p.child(0); 38 assertEquals("foo.png", img.attr("src")); 39 assertEquals("img", img.tagName()); 40 } 41 parsesRoughAttributes()42 @Test public void parsesRoughAttributes() { 43 String html = "<html><head><title>First!</title></head><body><p class=\"foo > bar\">First post! <img src=\"foo.png\" /></p></body></html>"; 44 Document doc = Jsoup.parse(html); 45 46 // need a better way to verify these: 47 Element p = doc.body().child(0); 48 assertEquals("p", p.tagName()); 49 assertEquals("foo > bar", p.attr("class")); 50 } 51 52 @ParameterizedTest @MethodSource("dupeAttributeData") dropsDuplicateAttributes(String html, String expected)53 public void dropsDuplicateAttributes(String html, String expected) { 54 Parser parser = Parser.htmlParser().setTrackErrors(10); 55 Document doc = parser.parseInput(html, ""); 56 57 Element el = doc.expectFirst("body > *"); 58 assertEquals(expected, el.outerHtml()); // normalized names due to lower casing 59 String tag = el.normalName(); 60 61 assertEquals(1, parser.getErrors().size()); 62 assertEquals("Dropped duplicate attribute(s) in tag [" + tag + "]", parser.getErrors().get(0).getErrorMessage()); 63 } 64 dupeAttributeData()65 private static Stream<Arguments> dupeAttributeData() { 66 return Stream.of( 67 Arguments.of("<p One=One ONE=Two Two=two one=Three One=Four two=Five>Text</p>", "<p one=\"One\" two=\"two\">Text</p>"), 68 Arguments.of("<img One=One ONE=Two Two=two one=Three One=Four two=Five>", "<img one=\"One\" two=\"two\">"), 69 Arguments.of("<form One=One ONE=Two Two=two one=Three One=Four two=Five></form>", "<form one=\"One\" two=\"two\"></form>") 70 ); 71 } 72 retainsAttributesOfDifferentCaseIfSensitive()73 @Test public void retainsAttributesOfDifferentCaseIfSensitive() { 74 String html = "<p One=One One=Two one=Three two=Four two=Five Two=Six>Text</p>"; 75 Parser parser = Parser.htmlParser().settings(preserveCase); 76 Document doc = parser.parseInput(html, ""); 77 assertEquals("<p One=\"One\" one=\"Three\" two=\"Four\" Two=\"Six\">Text</p>", doc.selectFirst("p").outerHtml()); 78 } 79 parsesQuiteRoughAttributes()80 @Test public void parsesQuiteRoughAttributes() { 81 String html = "<p =a>One<a <p>Something</p>Else"; 82 // this (used to; now gets cleaner) gets a <p> with attr '=a' and an <a tag with an attribute named '<p'; and then auto-recreated 83 Document doc = Jsoup.parse(html); 84 85 // NOTE: per spec this should be the test case. but impacts too many ppl 86 // assertEquals("<p =a>One<a <p>Something</a></p>\n<a <p>Else</a>", doc.body().html()); 87 88 assertEquals("<p a>One<a></a></p><p><a>Something</a></p><a>Else</a>", TextUtil.stripNewlines(doc.body().html())); 89 90 doc = Jsoup.parse("<p .....>"); 91 assertEquals("<p .....></p>", doc.body().html()); 92 } 93 parsesComments()94 @Test public void parsesComments() { 95 String html = "<html><head></head><body><img src=foo><!-- <table><tr><td></table> --><p>Hello</p></body></html>"; 96 Document doc = Jsoup.parse(html); 97 98 Element body = doc.body(); 99 Comment comment = (Comment) body.childNode(1); // comment should not be sub of img, as it's an empty tag 100 assertEquals(" <table><tr><td></table> ", comment.getData()); 101 Element p = body.child(1); 102 TextNode text = (TextNode) p.childNode(0); 103 assertEquals("Hello", text.getWholeText()); 104 } 105 parsesUnterminatedComments()106 @Test public void parsesUnterminatedComments() { 107 String html = "<p>Hello<!-- <tr><td>"; 108 Document doc = Jsoup.parse(html); 109 Element p = doc.getElementsByTag("p").get(0); 110 assertEquals("Hello", p.text()); 111 TextNode text = (TextNode) p.childNode(0); 112 assertEquals("Hello", text.getWholeText()); 113 Comment comment = (Comment) p.childNode(1); 114 assertEquals(" <tr><td>", comment.getData()); 115 } 116 allDashCommentsAreNotParseErrors()117 @Test void allDashCommentsAreNotParseErrors() { 118 // https://github.com/jhy/jsoup/issues/1667 119 // <!-----> is not a parse error 120 String html = "<!------>"; 121 Parser parser = Parser.htmlParser().setTrackErrors(10); 122 Document doc = Jsoup.parse(html, parser); 123 Comment comment = (Comment) doc.childNode(0); 124 assertEquals("--", comment.getData()); 125 assertEquals(0, parser.getErrors().size()); 126 } 127 dropsUnterminatedTag()128 @Test public void dropsUnterminatedTag() { 129 // jsoup used to parse this to <p>, but whatwg, webkit will drop. 130 String h1 = "<p"; 131 Document doc = Jsoup.parse(h1); 132 assertEquals(0, doc.getElementsByTag("p").size()); 133 assertEquals("", doc.text()); 134 135 String h2 = "<div id=1<p id='2'"; 136 doc = Jsoup.parse(h2); 137 assertEquals("", doc.text()); 138 } 139 dropsUnterminatedAttribute()140 @Test public void dropsUnterminatedAttribute() { 141 // jsoup used to parse this to <p id="foo">, but whatwg, webkit will drop. 142 String h1 = "<p id=\"foo"; 143 Document doc = Jsoup.parse(h1); 144 assertEquals("", doc.text()); 145 } 146 parsesUnterminatedTextarea()147 @Test public void parsesUnterminatedTextarea() { 148 // don't parse right to end, but break on <p> 149 Document doc = Jsoup.parse("<body><p><textarea>one<p>two"); 150 Element t = doc.select("textarea").first(); 151 assertEquals("one", t.text()); 152 assertEquals("two", doc.select("p").get(1).text()); 153 } 154 parsesUnterminatedOption()155 @Test public void parsesUnterminatedOption() { 156 // bit weird this -- browsers and spec get stuck in select until there's a </select> 157 Document doc = Jsoup.parse("<body><p><select><option>One<option>Two</p><p>Three</p>"); 158 Elements options = doc.select("option"); 159 assertEquals(2, options.size()); 160 assertEquals("One", options.first().text()); 161 assertEquals("TwoThree", options.last().text()); 162 } 163 testSelectWithOption()164 @Test public void testSelectWithOption() { 165 Parser parser = Parser.htmlParser(); 166 parser.setTrackErrors(10); 167 Document document = parser.parseInput("<select><option>Option 1</option></select>", "http://jsoup.org"); 168 assertEquals(0, parser.getErrors().size()); 169 } 170 testSpaceAfterTag()171 @Test public void testSpaceAfterTag() { 172 Document doc = Jsoup.parse("<div > <a name=\"top\"></a ><p id=1 >Hello</p></div>"); 173 assertEquals("<div><a name=\"top\"></a><p id=\"1\">Hello</p></div>", TextUtil.stripNewlines(doc.body().html())); 174 } 175 createsDocumentStructure()176 @Test public void createsDocumentStructure() { 177 String html = "<meta name=keywords /><link rel=stylesheet /><title>jsoup</title><p>Hello world</p>"; 178 Document doc = Jsoup.parse(html); 179 Element head = doc.head(); 180 Element body = doc.body(); 181 182 assertEquals(1, doc.children().size()); // root node: contains html node 183 assertEquals(2, doc.child(0).children().size()); // html node: head and body 184 assertEquals(3, head.children().size()); 185 assertEquals(1, body.children().size()); 186 187 assertEquals("keywords", head.getElementsByTag("meta").get(0).attr("name")); 188 assertEquals(0, body.getElementsByTag("meta").size()); 189 assertEquals("jsoup", doc.title()); 190 assertEquals("Hello world", body.text()); 191 assertEquals("Hello world", body.children().get(0).text()); 192 } 193 createsStructureFromBodySnippet()194 @Test public void createsStructureFromBodySnippet() { 195 // the bar baz stuff naturally goes into the body, but the 'foo' goes into root, and the normalisation routine 196 // needs to move into the start of the body 197 String html = "foo <b>bar</b> baz"; 198 Document doc = Jsoup.parse(html); 199 assertEquals("foo bar baz", doc.text()); 200 } 201 handlesEscapedData()202 @Test public void handlesEscapedData() { 203 String html = "<div title='Surf & Turf'>Reef & Beef</div>"; 204 Document doc = Jsoup.parse(html); 205 Element div = doc.getElementsByTag("div").get(0); 206 207 assertEquals("Surf & Turf", div.attr("title")); 208 assertEquals("Reef & Beef", div.text()); 209 } 210 handlesDataOnlyTags()211 @Test public void handlesDataOnlyTags() { 212 String t = "<style>font-family: bold</style>"; 213 List<Element> tels = Jsoup.parse(t).getElementsByTag("style"); 214 assertEquals("font-family: bold", tels.get(0).data()); 215 assertEquals("", tels.get(0).text()); 216 217 String s = "<p>Hello</p><script>obj.insert('<a rel=\"none\" />');\ni++;</script><p>There</p>"; 218 Document doc = Jsoup.parse(s); 219 assertEquals("Hello There", doc.text()); 220 assertEquals("obj.insert('<a rel=\"none\" />');\ni++;", doc.data()); 221 } 222 handlesTextAfterData()223 @Test public void handlesTextAfterData() { 224 String h = "<html><body>pre <script>inner</script> aft</body></html>"; 225 Document doc = Jsoup.parse(h); 226 assertEquals("<html><head></head><body>pre <script>inner</script> aft</body></html>", TextUtil.stripNewlines(doc.html())); 227 } 228 handlesTextArea()229 @Test public void handlesTextArea() { 230 Document doc = Jsoup.parse("<textarea>Hello</textarea>"); 231 Elements els = doc.select("textarea"); 232 assertEquals("Hello", els.text()); 233 assertEquals("Hello", els.val()); 234 } 235 preservesSpaceInTextArea()236 @Test public void preservesSpaceInTextArea() { 237 // preserve because the tag is marked as preserve white space 238 Document doc = Jsoup.parse("<textarea>\n\tOne\n\tTwo\n\tThree\n</textarea>"); 239 String expect = "One\n\tTwo\n\tThree"; // the leading and trailing spaces are dropped as a convenience to authors 240 Element el = doc.select("textarea").first(); 241 assertEquals(expect, el.text()); 242 assertEquals(expect, el.val()); 243 assertEquals(expect, el.html()); 244 assertEquals("<textarea>\n\t" + expect + "\n</textarea>", el.outerHtml()); // but preserved in round-trip html 245 } 246 preservesSpaceInScript()247 @Test public void preservesSpaceInScript() { 248 // preserve because it's content is a data node 249 Document doc = Jsoup.parse("<script>\nOne\n\tTwo\n\tThree\n</script>"); 250 String expect = "\nOne\n\tTwo\n\tThree\n"; 251 Element el = doc.select("script").first(); 252 assertEquals(expect, el.data()); 253 assertEquals("One\n\tTwo\n\tThree", el.html()); 254 assertEquals("<script>" + expect + "</script>", el.outerHtml()); 255 } 256 doesNotCreateImplicitLists()257 @Test public void doesNotCreateImplicitLists() { 258 // old jsoup used to wrap this in <ul>, but that's not to spec 259 String h = "<li>Point one<li>Point two"; 260 Document doc = Jsoup.parse(h); 261 Elements ol = doc.select("ul"); // should NOT have created a default ul. 262 assertEquals(0, ol.size()); 263 Elements lis = doc.select("li"); 264 assertEquals(2, lis.size()); 265 assertEquals("body", lis.first().parent().tagName()); 266 267 // no fiddling with non-implicit lists 268 String h2 = "<ol><li><p>Point the first<li><p>Point the second"; 269 Document doc2 = Jsoup.parse(h2); 270 271 assertEquals(0, doc2.select("ul").size()); 272 assertEquals(1, doc2.select("ol").size()); 273 assertEquals(2, doc2.select("ol li").size()); 274 assertEquals(2, doc2.select("ol li p").size()); 275 assertEquals(1, doc2.select("ol li").get(0).children().size()); // one p in first li 276 } 277 discardsNakedTds()278 @Test public void discardsNakedTds() { 279 // jsoup used to make this into an implicit table; but browsers make it into a text run 280 String h = "<td>Hello<td><p>There<p>now"; 281 Document doc = Jsoup.parse(h); 282 assertEquals("Hello<p>There</p><p>now</p>", TextUtil.stripNewlines(doc.body().html())); 283 // <tbody> is introduced if no implicitly creating table, but allows tr to be directly under table 284 } 285 handlesNestedImplicitTable()286 @Test public void handlesNestedImplicitTable() { 287 Document doc = Jsoup.parse("<table><td>1</td></tr> <td>2</td></tr> <td> <table><td>3</td> <td>4</td></table> <tr><td>5</table>"); 288 assertEquals("<table><tbody><tr><td>1</td></tr><tr><td>2</td></tr><tr><td><table><tbody><tr><td>3</td><td>4</td></tr></tbody></table></td></tr><tr><td>5</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html())); 289 } 290 handlesWhatWgExpensesTableExample()291 @Test public void handlesWhatWgExpensesTableExample() { 292 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#examples-0 293 Document doc = Jsoup.parse("<table> <colgroup> <col> <colgroup> <col> <col> <col> <thead> <tr> <th> <th>2008 <th>2007 <th>2006 <tbody> <tr> <th scope=rowgroup> Research and development <td> $ 1,109 <td> $ 782 <td> $ 712 <tr> <th scope=row> Percentage of net sales <td> 3.4% <td> 3.3% <td> 3.7% <tbody> <tr> <th scope=rowgroup> Selling, general, and administrative <td> $ 3,761 <td> $ 2,963 <td> $ 2,433 <tr> <th scope=row> Percentage of net sales <td> 11.6% <td> 12.3% <td> 12.6% </table>"); 294 assertEquals("<table><colgroup><col></colgroup><colgroup><col><col><col></colgroup><thead><tr><th></th><th>2008</th><th>2007</th><th>2006</th></tr></thead><tbody><tr><th scope=\"rowgroup\">Research and development</th><td>$ 1,109</td><td>$ 782</td><td>$ 712</td></tr><tr><th scope=\"row\">Percentage of net sales</th><td>3.4%</td><td>3.3%</td><td>3.7%</td></tr></tbody><tbody><tr><th scope=\"rowgroup\">Selling, general, and administrative</th><td>$ 3,761</td><td>$ 2,963</td><td>$ 2,433</td></tr><tr><th scope=\"row\">Percentage of net sales</th><td>11.6%</td><td>12.3%</td><td>12.6%</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html())); 295 } 296 handlesTbodyTable()297 @Test public void handlesTbodyTable() { 298 Document doc = Jsoup.parse("<html><head></head><body><table><tbody><tr><td>aaa</td><td>bbb</td></tr></tbody></table></body></html>"); 299 assertEquals("<table><tbody><tr><td>aaa</td><td>bbb</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html())); 300 } 301 handlesImplicitCaptionClose()302 @Test public void handlesImplicitCaptionClose() { 303 Document doc = Jsoup.parse("<table><caption>A caption<td>One<td>Two"); 304 assertEquals("<table><caption>A caption</caption><tbody><tr><td>One</td><td>Two</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html())); 305 } 306 noTableDirectInTable()307 @Test public void noTableDirectInTable() { 308 Document doc = Jsoup.parse("<table> <td>One <td><table><td>Two</table> <table><td>Three"); 309 assertEquals("<table><tbody><tr><td>One</td><td><table><tbody><tr><td>Two</td></tr></tbody></table><table><tbody><tr><td>Three</td></tr></tbody></table></td></tr></tbody></table>", 310 TextUtil.stripNewlines(doc.body().html())); 311 } 312 ignoresDupeEndTrTag()313 @Test public void ignoresDupeEndTrTag() { 314 Document doc = Jsoup.parse("<table><tr><td>One</td><td><table><tr><td>Two</td></tr></tr></table></td><td>Three</td></tr></table>"); // two </tr></tr>, must ignore or will close table 315 assertEquals("<table><tbody><tr><td>One</td><td><table><tbody><tr><td>Two</td></tr></tbody></table></td><td>Three</td></tr></tbody></table>", 316 TextUtil.stripNewlines(doc.body().html())); 317 } 318 handlesBaseTags()319 @Test public void handlesBaseTags() { 320 // only listen to the first base href 321 String h = "<a href=1>#</a><base href='/2/'><a href='3'>#</a><base href='http://bar'><a href=/4>#</a>"; 322 Document doc = Jsoup.parse(h, "http://foo/"); 323 assertEquals("http://foo/2/", doc.baseUri()); // gets set once, so doc and descendants have first only 324 325 Elements anchors = doc.getElementsByTag("a"); 326 assertEquals(3, anchors.size()); 327 328 assertEquals("http://foo/2/", anchors.get(0).baseUri()); 329 assertEquals("http://foo/2/", anchors.get(1).baseUri()); 330 assertEquals("http://foo/2/", anchors.get(2).baseUri()); 331 332 assertEquals("http://foo/2/1", anchors.get(0).absUrl("href")); 333 assertEquals("http://foo/2/3", anchors.get(1).absUrl("href")); 334 assertEquals("http://foo/4", anchors.get(2).absUrl("href")); 335 } 336 handlesProtocolRelativeUrl()337 @Test public void handlesProtocolRelativeUrl() { 338 String base = "https://example.com/"; 339 String html = "<img src='//example.net/img.jpg'>"; 340 Document doc = Jsoup.parse(html, base); 341 Element el = doc.select("img").first(); 342 assertEquals("https://example.net/img.jpg", el.absUrl("src")); 343 } 344 handlesCdata()345 @Test public void handlesCdata() { 346 // todo: as this is html namespace, should actually treat as bogus comment, not cdata. keep as cdata for now 347 String h = "<div id=1><![CDATA[<html>\n <foo><&]]></div>"; // the & in there should remain literal 348 Document doc = Jsoup.parse(h); 349 Element div = doc.getElementById("1"); 350 assertEquals("<html>\n <foo><&", div.text()); 351 assertEquals(0, div.children().size()); 352 assertEquals(1, div.childNodeSize()); // no elements, one text node 353 } 354 roundTripsCdata()355 @Test public void roundTripsCdata() { 356 String h = "<div id=1><![CDATA[\n<html>\n <foo><&]]></div>"; 357 Document doc = Jsoup.parse(h); 358 Element div = doc.getElementById("1"); 359 assertEquals("<html>\n <foo><&", div.text()); 360 assertEquals(0, div.children().size()); 361 assertEquals(1, div.childNodeSize()); // no elements, one text node 362 363 assertEquals("<div id=\"1\"><![CDATA[\n<html>\n <foo><&]]>\n</div>", div.outerHtml()); 364 365 CDataNode cdata = (CDataNode) div.textNodes().get(0); 366 assertEquals("\n<html>\n <foo><&", cdata.text()); 367 } 368 handlesCdataAcrossBuffer()369 @Test public void handlesCdataAcrossBuffer() { 370 StringBuilder sb = new StringBuilder(); 371 while (sb.length() <= CharacterReader.maxBufferLen) { 372 sb.append("A suitable amount of CData.\n"); 373 } 374 String cdata = sb.toString(); 375 String h = "<div><![CDATA[" + cdata + "]]></div>"; 376 Document doc = Jsoup.parse(h); 377 Element div = doc.selectFirst("div"); 378 379 CDataNode node = (CDataNode) div.textNodes().get(0); 380 assertEquals(cdata, node.text()); 381 } 382 handlesCdataInScript()383 @Test public void handlesCdataInScript() { 384 String html = "<script type=\"text/javascript\">//<![CDATA[\n\n foo();\n//]]></script>"; 385 Document doc = Jsoup.parse(html); 386 387 String data = "//<![CDATA[\n\n foo();\n//]]>"; 388 Element script = doc.selectFirst("script"); 389 assertEquals("", script.text()); // won't be parsed as cdata because in script data section 390 assertEquals(data, script.data()); 391 assertEquals(html, script.outerHtml()); 392 393 DataNode dataNode = (DataNode) script.childNode(0); 394 assertEquals(data, dataNode.getWholeData()); 395 // see - not a cdata node, because in script. contrast with XmlTreeBuilder - will be cdata. 396 } 397 handlesUnclosedCdataAtEOF()398 @Test public void handlesUnclosedCdataAtEOF() { 399 // https://github.com/jhy/jsoup/issues/349 would crash, as character reader would try to seek past EOF 400 String h = "<![CDATA[]]"; 401 Document doc = Jsoup.parse(h); 402 assertEquals(1, doc.body().childNodeSize()); 403 } 404 handleCDataInText()405 @Test public void handleCDataInText() { 406 String h = "<p>One <![CDATA[Two <&]]> Three</p>"; 407 Document doc = Jsoup.parse(h); 408 Element p = doc.selectFirst("p"); 409 410 List<Node> nodes = p.childNodes(); 411 assertEquals("One ", ((TextNode) nodes.get(0)).getWholeText()); 412 assertEquals("Two <&", ((TextNode) nodes.get(1)).getWholeText()); 413 assertEquals("Two <&", ((CDataNode) nodes.get(1)).getWholeText()); 414 assertEquals(" Three", ((TextNode) nodes.get(2)).getWholeText()); 415 416 assertEquals(h, p.outerHtml()); 417 } 418 cdataNodesAreTextNodes()419 @Test public void cdataNodesAreTextNodes() { 420 String h = "<p>One <![CDATA[ Two <& ]]> Three</p>"; 421 Document doc = Jsoup.parse(h); 422 Element p = doc.selectFirst("p"); 423 424 List<TextNode> nodes = p.textNodes(); 425 assertEquals("One ", nodes.get(0).text()); 426 assertEquals(" Two <& ", nodes.get(1).text()); 427 assertEquals(" Three", nodes.get(2).text()); 428 } 429 handlesInvalidStartTags()430 @Test public void handlesInvalidStartTags() { 431 String h = "<div>Hello < There <&></div>"; // parse to <div {#text=Hello < There <&>}> 432 Document doc = Jsoup.parse(h); 433 assertEquals("Hello < There <&>", doc.select("div").first().text()); 434 } 435 handlesUnknownTags()436 @Test public void handlesUnknownTags() { 437 String h = "<div><foo title=bar>Hello<foo title=qux>there</foo></div>"; 438 Document doc = Jsoup.parse(h); 439 Elements foos = doc.select("foo"); 440 assertEquals(2, foos.size()); 441 assertEquals("bar", foos.first().attr("title")); 442 assertEquals("qux", foos.last().attr("title")); 443 assertEquals("there", foos.last().text()); 444 } 445 handlesUnknownInlineTags()446 @Test public void handlesUnknownInlineTags() { 447 String h = "<p><cust>Test</cust></p><p><cust><cust>Test</cust></cust></p>"; 448 Document doc = Jsoup.parseBodyFragment(h); 449 String out = doc.body().html(); 450 assertEquals(h, TextUtil.stripNewlines(out)); 451 } 452 parsesBodyFragment()453 @Test public void parsesBodyFragment() { 454 String h = "<!-- comment --><p><a href='foo'>One</a></p>"; 455 Document doc = Jsoup.parseBodyFragment(h, "http://example.com"); 456 assertEquals("<body><!-- comment --><p><a href=\"foo\">One</a></p></body>", TextUtil.stripNewlines(doc.body().outerHtml())); 457 assertEquals("http://example.com/foo", doc.select("a").first().absUrl("href")); 458 } 459 parseBodyIsIndexNoAttributes()460 @Test public void parseBodyIsIndexNoAttributes() { 461 // https://github.com/jhy/jsoup/issues/1404 462 String expectedHtml = "<form>\n" + 463 " <hr><label>This is a searchable index. Enter search keywords: <input name=\"isindex\"></label>\n" + 464 " <hr>\n" + 465 "</form>"; 466 Document doc = Jsoup.parse("<isindex>"); 467 assertEquals(expectedHtml, doc.body().html()); 468 469 doc = Jsoup.parseBodyFragment("<isindex>"); 470 assertEquals(expectedHtml, doc.body().html()); 471 472 doc = Jsoup.parseBodyFragment("<table><input></table>"); 473 assertEquals("<input>\n<table></table>", doc.body().html()); 474 } 475 handlesUnknownNamespaceTags()476 @Test public void handlesUnknownNamespaceTags() { 477 // note that the first foo:bar should not really be allowed to be self closing, if parsed in html mode. 478 String h = "<foo:bar id='1' /><abc:def id=2>Foo<p>Hello</p></abc:def><foo:bar>There</foo:bar>"; 479 Document doc = Jsoup.parse(h); 480 assertEquals("<foo:bar id=\"1\" /><abc:def id=\"2\">Foo<p>Hello</p></abc:def><foo:bar>There</foo:bar>", TextUtil.stripNewlines(doc.body().html())); 481 } 482 handlesKnownEmptyBlocks()483 @Test public void handlesKnownEmptyBlocks() { 484 // if a known tag, allow self closing outside of spec, but force an end tag. unknown tags can be self closing. 485 String h = "<div id='1' /><script src='/foo' /><div id=2><img /><img></div><a id=3 /><i /><foo /><foo>One</foo> <hr /> hr text <hr> hr text two"; 486 Document doc = Jsoup.parse(h); 487 assertEquals("<div id=\"1\"></div><script src=\"/foo\"></script><div id=\"2\"><img><img></div><a id=\"3\"></a><i></i><foo /><foo>One</foo><hr> hr text <hr> hr text two", TextUtil.stripNewlines(doc.body().html())); 488 } 489 handlesKnownEmptyNoFrames()490 @Test public void handlesKnownEmptyNoFrames() { 491 String h = "<html><head><noframes /><meta name=foo></head><body>One</body></html>"; 492 Document doc = Jsoup.parse(h); 493 assertEquals("<html><head><noframes></noframes><meta name=\"foo\"></head><body>One</body></html>", TextUtil.stripNewlines(doc.html())); 494 } 495 handlesKnownEmptyStyle()496 @Test public void handlesKnownEmptyStyle() { 497 String h = "<html><head><style /><meta name=foo></head><body>One</body></html>"; 498 Document doc = Jsoup.parse(h); 499 assertEquals("<html><head><style></style><meta name=\"foo\"></head><body>One</body></html>", TextUtil.stripNewlines(doc.html())); 500 } 501 handlesKnownEmptyTitle()502 @Test public void handlesKnownEmptyTitle() { 503 String h = "<html><head><title /><meta name=foo></head><body>One</body></html>"; 504 Document doc = Jsoup.parse(h); 505 assertEquals("<html><head><title></title><meta name=\"foo\"></head><body>One</body></html>", TextUtil.stripNewlines(doc.html())); 506 } 507 handlesKnownEmptyIframe()508 @Test public void handlesKnownEmptyIframe() { 509 String h = "<p>One</p><iframe id=1 /><p>Two"; 510 Document doc = Jsoup.parse(h); 511 assertEquals("<html><head></head><body><p>One</p><iframe id=\"1\"></iframe><p>Two</p></body></html>", TextUtil.stripNewlines(doc.html())); 512 } 513 handlesSolidusAtAttributeEnd()514 @Test public void handlesSolidusAtAttributeEnd() { 515 // this test makes sure [<a href=/>link</a>] is parsed as [<a href="/">link</a>], not [<a href="" /><a>link</a>] 516 String h = "<a href=/>link</a>"; 517 Document doc = Jsoup.parse(h); 518 assertEquals("<a href=\"/\">link</a>", doc.body().html()); 519 } 520 handlesMultiClosingBody()521 @Test public void handlesMultiClosingBody() { 522 String h = "<body><p>Hello</body><p>there</p></body></body></html><p>now"; 523 Document doc = Jsoup.parse(h); 524 assertEquals(3, doc.select("p").size()); 525 assertEquals(3, doc.body().children().size()); 526 } 527 handlesUnclosedDefinitionLists()528 @Test public void handlesUnclosedDefinitionLists() { 529 // jsoup used to create a <dl>, but that's not to spec 530 String h = "<dt>Foo<dd>Bar<dt>Qux<dd>Zug"; 531 Document doc = Jsoup.parse(h); 532 assertEquals(0, doc.select("dl").size()); // no auto dl 533 assertEquals(4, doc.select("dt, dd").size()); 534 Elements dts = doc.select("dt"); 535 assertEquals(2, dts.size()); 536 assertEquals("Zug", dts.get(1).nextElementSibling().text()); 537 } 538 handlesBlocksInDefinitions()539 @Test public void handlesBlocksInDefinitions() { 540 // per the spec, dt and dd are inline, but in practise are block 541 String h = "<dl><dt><div id=1>Term</div></dt><dd><div id=2>Def</div></dd></dl>"; 542 Document doc = Jsoup.parse(h); 543 assertEquals("dt", doc.select("#1").first().parent().tagName()); 544 assertEquals("dd", doc.select("#2").first().parent().tagName()); 545 assertEquals("<dl><dt><div id=\"1\">Term</div></dt><dd><div id=\"2\">Def</div></dd></dl>", TextUtil.stripNewlines(doc.body().html())); 546 } 547 handlesFrames()548 @Test public void handlesFrames() { 549 String h = "<html><head><script></script><noscript></noscript></head><frameset><frame src=foo></frame><frame src=foo></frameset></html>"; 550 Document doc = Jsoup.parse(h); 551 assertEquals("<html><head><script></script><noscript></noscript></head><frameset><frame src=\"foo\"><frame src=\"foo\"></frameset></html>", 552 TextUtil.stripNewlines(doc.html())); 553 // no body auto vivification 554 } 555 ignoresContentAfterFrameset()556 @Test public void ignoresContentAfterFrameset() { 557 String h = "<html><head><title>One</title></head><frameset><frame /><frame /></frameset><table></table></html>"; 558 Document doc = Jsoup.parse(h); 559 assertEquals("<html><head><title>One</title></head><frameset><frame><frame></frameset></html>", TextUtil.stripNewlines(doc.html())); 560 // no body, no table. No crash! 561 } 562 handlesJavadocFont()563 @Test public void handlesJavadocFont() { 564 String h = "<TD BGCOLOR=\"#EEEEFF\" CLASS=\"NavBarCell1\"> <A HREF=\"deprecated-list.html\"><FONT CLASS=\"NavBarFont1\"><B>Deprecated</B></FONT></A> </TD>"; 565 Document doc = Jsoup.parse(h); 566 Element a = doc.select("a").first(); 567 assertEquals("Deprecated", a.text()); 568 assertEquals("font", a.child(0).tagName()); 569 assertEquals("b", a.child(0).child(0).tagName()); 570 } 571 handlesBaseWithoutHref()572 @Test public void handlesBaseWithoutHref() { 573 String h = "<head><base target='_blank'></head><body><a href=/foo>Test</a></body>"; 574 Document doc = Jsoup.parse(h, "http://example.com/"); 575 Element a = doc.select("a").first(); 576 assertEquals("/foo", a.attr("href")); 577 assertEquals("http://example.com/foo", a.attr("abs:href")); 578 } 579 normalisesDocument()580 @Test public void normalisesDocument() { 581 String h = "<!doctype html>One<html>Two<head>Three<link></head>Four<body>Five </body>Six </html>Seven "; 582 Document doc = Jsoup.parse(h); 583 assertEquals("<!doctype html><html><head></head><body>OneTwoThree<link>FourFive Six Seven</body></html>", 584 TextUtil.stripNewlines(doc.html())); 585 } 586 normalisesEmptyDocument()587 @Test public void normalisesEmptyDocument() { 588 Document doc = Jsoup.parse(""); 589 assertEquals("<html><head></head><body></body></html>", TextUtil.stripNewlines(doc.html())); 590 } 591 normalisesHeadlessBody()592 @Test public void normalisesHeadlessBody() { 593 Document doc = Jsoup.parse("<html><body><span class=\"foo\">bar</span>"); 594 assertEquals("<html><head></head><body><span class=\"foo\">bar</span></body></html>", 595 TextUtil.stripNewlines(doc.html())); 596 } 597 normalisedBodyAfterContent()598 @Test public void normalisedBodyAfterContent() { 599 Document doc = Jsoup.parse("<font face=Arial><body class=name><div>One</div></body></font>"); 600 assertEquals("<html><head></head><body class=\"name\"><font face=\"Arial\"><div>One</div></font></body></html>", 601 TextUtil.stripNewlines(doc.html())); 602 } 603 findsCharsetInMalformedMeta()604 @Test public void findsCharsetInMalformedMeta() { 605 String h = "<meta http-equiv=Content-Type content=text/html; charset=gb2312>"; 606 // example cited for reason of html5's <meta charset> element 607 Document doc = Jsoup.parse(h); 608 assertEquals("gb2312", doc.select("meta").attr("charset")); 609 } 610 testHgroup()611 @Test public void testHgroup() { 612 // jsoup used to not allow hgroup in h{n}, but that's not in spec, and browsers are OK 613 Document doc = Jsoup.parse("<h1>Hello <h2>There <hgroup><h1>Another<h2>headline</hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup>"); 614 assertEquals("<h1>Hello</h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup><hgroup><h1>More</h1><p>stuff</p></hgroup></h2>", TextUtil.stripNewlines(doc.body().html())); 615 } 616 testRelaxedTags()617 @Test public void testRelaxedTags() { 618 Document doc = Jsoup.parse("<abc_def id=1>Hello</abc_def> <abc-def>There</abc-def>"); 619 assertEquals("<abc_def id=\"1\">Hello</abc_def> <abc-def>There</abc-def>", TextUtil.stripNewlines(doc.body().html())); 620 } 621 testHeaderContents()622 @Test public void testHeaderContents() { 623 // h* tags (h1 .. h9) in browsers can handle any internal content other than other h*. which is not per any 624 // spec, which defines them as containing phrasing content only. so, reality over theory. 625 Document doc = Jsoup.parse("<h1>Hello <div>There</div> now</h1> <h2>More <h3>Content</h3></h2>"); 626 assertEquals("<h1>Hello <div>There</div> now</h1><h2>More</h2><h3>Content</h3>", TextUtil.stripNewlines(doc.body().html())); 627 } 628 testSpanContents()629 @Test public void testSpanContents() { 630 // like h1 tags, the spec says SPAN is phrasing only, but browsers and publisher treat span as a block tag 631 Document doc = Jsoup.parse("<span>Hello <div>there</div> <span>now</span></span>"); 632 assertEquals("<span>Hello <div>there</div><span>now</span></span>", TextUtil.stripNewlines(doc.body().html())); 633 } 634 testNoImagesInNoScriptInHead()635 @Test public void testNoImagesInNoScriptInHead() { 636 // jsoup used to allow, but against spec if parsing with noscript 637 Document doc = Jsoup.parse("<html><head><noscript><img src='foo'></noscript></head><body><p>Hello</p></body></html>"); 638 assertEquals("<html><head><noscript><img src=\"foo\"></noscript></head><body><p>Hello</p></body></html>", TextUtil.stripNewlines(doc.html())); 639 } 640 testUnclosedNoscriptInHead()641 @Test public void testUnclosedNoscriptInHead() { 642 // Was getting "EOF" in html output, because the #anythingElse handler was calling an undefined toString, so used object.toString. 643 String[] strings = {"<noscript>", "<noscript>One"}; 644 for (String html : strings) { 645 Document doc = Jsoup.parse(html); 646 assertEquals(html + "</noscript>", TextUtil.stripNewlines(doc.head().html())); 647 } 648 } 649 testAFlowContents()650 @Test public void testAFlowContents() { 651 // html5 has <a> as either phrasing or block 652 Document doc = Jsoup.parse("<a>Hello <div>there</div> <span>now</span></a>"); 653 assertEquals("<a>Hello <div>there</div><span>now</span></a>", TextUtil.stripNewlines(doc.body().html())); 654 } 655 testFontFlowContents()656 @Test public void testFontFlowContents() { 657 // html5 has no definition of <font>; often used as flow 658 Document doc = Jsoup.parse("<font>Hello <div>there</div> <span>now</span></font>"); 659 assertEquals("<font>Hello <div>there</div><span>now</span></font>", TextUtil.stripNewlines(doc.body().html())); 660 } 661 handlesMisnestedTagsBI()662 @Test public void handlesMisnestedTagsBI() { 663 // whatwg: <b><i></b></i> 664 String h = "<p>1<b>2<i>3</b>4</i>5</p>"; 665 Document doc = Jsoup.parse(h); 666 assertEquals("<p>1<b>2<i>3</i></b><i>4</i>5</p>", doc.body().html()); 667 // adoption agency on </b>, reconstruction of formatters on 4. 668 } 669 handlesMisnestedTagsBP()670 @Test public void handlesMisnestedTagsBP() { 671 // whatwg: <b><p></b></p> 672 String h = "<b>1<p>2</b>3</p>"; 673 Document doc = Jsoup.parse(h); 674 assertEquals("<b>1</b>\n<p><b>2</b>3</p>", doc.body().html()); 675 } 676 handlesMisnestedAInDivs()677 @Test public void handlesMisnestedAInDivs() { 678 String h = "<a href='#1'><div><div><a href='#2'>child</a></div</div></a>"; 679 String w = "<a href=\"#1\"></a> <div> <a href=\"#1\"></a> <div> <a href=\"#1\"></a><a href=\"#2\">child</a> </div> </div>"; 680 Document doc = Jsoup.parse(h); 681 assertEquals( 682 StringUtil.normaliseWhitespace(w), 683 StringUtil.normaliseWhitespace(doc.body().html())); 684 } 685 handlesUnexpectedMarkupInTables()686 @Test public void handlesUnexpectedMarkupInTables() { 687 // whatwg - tests markers in active formatting (if they didn't work, would get in table) 688 // also tests foster parenting 689 String h = "<table><b><tr><td>aaa</td></tr>bbb</table>ccc"; 690 Document doc = Jsoup.parse(h); 691 assertEquals("<b></b><b>bbb</b><table><tbody><tr><td>aaa</td></tr></tbody></table><b>ccc</b>", TextUtil.stripNewlines(doc.body().html())); 692 } 693 handlesUnclosedFormattingElements()694 @Test public void handlesUnclosedFormattingElements() { 695 // whatwg: formatting elements get collected and applied, but excess elements are thrown away 696 String h = "<!DOCTYPE html>\n" + 697 "<p><b class=x><b class=x><b><b class=x><b class=x><b>X\n" + 698 "<p>X\n" + 699 "<p><b><b class=x><b>X\n" + 700 "<p></b></b></b></b></b></b>X"; 701 Document doc = Jsoup.parse(h); 702 doc.outputSettings().indentAmount(0); 703 String want = "<!doctype html>\n" + 704 "<html>\n" + 705 "<head></head>\n" + 706 "<body>\n" + 707 "<p><b class=\"x\"><b class=\"x\"><b><b class=\"x\"><b class=\"x\"><b>X </b></b></b></b></b></b></p>\n" + 708 "<p><b class=\"x\"><b><b class=\"x\"><b class=\"x\"><b>X </b></b></b></b></b></p>\n" + 709 "<p><b class=\"x\"><b><b class=\"x\"><b class=\"x\"><b><b><b class=\"x\"><b>X </b></b></b></b></b></b></b></b></p>\n" + 710 "<p>X</p>\n" + 711 "</body>\n" + 712 "</html>"; 713 assertEquals(want, doc.html()); 714 } 715 handlesUnclosedAnchors()716 @Test public void handlesUnclosedAnchors() { 717 String h = "<a href='http://example.com/'>Link<p>Error link</a>"; 718 Document doc = Jsoup.parse(h); 719 String want = "<a href=\"http://example.com/\">Link</a>\n<p><a href=\"http://example.com/\">Error link</a></p>"; 720 assertEquals(want, doc.body().html()); 721 } 722 reconstructFormattingElements()723 @Test public void reconstructFormattingElements() { 724 // tests attributes and multi b 725 String h = "<p><b class=one>One <i>Two <b>Three</p><p>Hello</p>"; 726 Document doc = Jsoup.parse(h); 727 assertEquals("<p><b class=\"one\">One <i>Two <b>Three</b></i></b></p>\n<p><b class=\"one\"><i><b>Hello</b></i></b></p>", doc.body().html()); 728 } 729 reconstructFormattingElementsInTable()730 @Test public void reconstructFormattingElementsInTable() { 731 // tests that tables get formatting markers -- the <b> applies outside the table and does not leak in, 732 // and the <i> inside the table and does not leak out. 733 String h = "<p><b>One</p> <table><tr><td><p><i>Three<p>Four</i></td></tr></table> <p>Five</p>"; 734 Document doc = Jsoup.parse(h); 735 String want = "<p><b>One</b></p><b><table><tbody><tr><td><p><i>Three</i></p><p><i>Four</i></p></td></tr></tbody></table><p>Five</p></b>"; 736 assertEquals(want, TextUtil.stripNewlines(doc.body().html())); 737 } 738 commentBeforeHtml()739 @Test public void commentBeforeHtml() { 740 String h = "<!-- comment --><!-- comment 2 --><p>One</p>"; 741 Document doc = Jsoup.parse(h); 742 assertEquals("<!-- comment --><!-- comment 2 --><html><head></head><body><p>One</p></body></html>", TextUtil.stripNewlines(doc.html())); 743 } 744 emptyTdTag()745 @Test public void emptyTdTag() { 746 String h = "<table><tr><td>One</td><td id='2' /></tr></table>"; 747 Document doc = Jsoup.parse(h); 748 assertEquals("<td>One</td>\n<td id=\"2\"></td>", doc.select("tr").first().html()); 749 } 750 handlesSolidusInA()751 @Test public void handlesSolidusInA() { 752 // test for bug #66 753 String h = "<a class=lp href=/lib/14160711/>link text</a>"; 754 Document doc = Jsoup.parse(h); 755 Element a = doc.select("a").first(); 756 assertEquals("link text", a.text()); 757 assertEquals("/lib/14160711/", a.attr("href")); 758 } 759 handlesSpanInTbody()760 @Test public void handlesSpanInTbody() { 761 // test for bug 64 762 String h = "<table><tbody><span class='1'><tr><td>One</td></tr><tr><td>Two</td></tr></span></tbody></table>"; 763 Document doc = Jsoup.parse(h); 764 assertEquals(doc.select("span").first().children().size(), 0); // the span gets closed 765 assertEquals(doc.select("table").size(), 1); // only one table 766 } 767 handlesUnclosedTitleAtEof()768 @Test public void handlesUnclosedTitleAtEof() { 769 assertEquals("Data", Jsoup.parse("<title>Data").title()); 770 assertEquals("Data<", Jsoup.parse("<title>Data<").title()); 771 assertEquals("Data</", Jsoup.parse("<title>Data</").title()); 772 assertEquals("Data</t", Jsoup.parse("<title>Data</t").title()); 773 assertEquals("Data</ti", Jsoup.parse("<title>Data</ti").title()); 774 assertEquals("Data", Jsoup.parse("<title>Data</title>").title()); 775 assertEquals("Data", Jsoup.parse("<title>Data</title >").title()); 776 } 777 handlesUnclosedTitle()778 @Test public void handlesUnclosedTitle() { 779 Document one = Jsoup.parse("<title>One <b>Two <b>Three</TITLE><p>Test</p>"); // has title, so <b> is plain text 780 assertEquals("One <b>Two <b>Three", one.title()); 781 assertEquals("Test", one.select("p").first().text()); 782 783 Document two = Jsoup.parse("<title>One<b>Two <p>Test</p>"); // no title, so <b> causes </title> breakout 784 assertEquals("One", two.title()); 785 assertEquals("<b>Two \n <p>Test</p></b>", two.body().html()); 786 } 787 handlesUnclosedScriptAtEof()788 @Test public void handlesUnclosedScriptAtEof() { 789 assertEquals("Data", Jsoup.parse("<script>Data").select("script").first().data()); 790 assertEquals("Data<", Jsoup.parse("<script>Data<").select("script").first().data()); 791 assertEquals("Data</sc", Jsoup.parse("<script>Data</sc").select("script").first().data()); 792 assertEquals("Data</-sc", Jsoup.parse("<script>Data</-sc").select("script").first().data()); 793 assertEquals("Data</sc-", Jsoup.parse("<script>Data</sc-").select("script").first().data()); 794 assertEquals("Data</sc--", Jsoup.parse("<script>Data</sc--").select("script").first().data()); 795 assertEquals("Data", Jsoup.parse("<script>Data</script>").select("script").first().data()); 796 assertEquals("Data</script", Jsoup.parse("<script>Data</script").select("script").first().data()); 797 assertEquals("Data", Jsoup.parse("<script>Data</script ").select("script").first().data()); 798 assertEquals("Data", Jsoup.parse("<script>Data</script n").select("script").first().data()); 799 assertEquals("Data", Jsoup.parse("<script>Data</script n=").select("script").first().data()); 800 assertEquals("Data", Jsoup.parse("<script>Data</script n=\"").select("script").first().data()); 801 assertEquals("Data", Jsoup.parse("<script>Data</script n=\"p").select("script").first().data()); 802 } 803 handlesUnclosedRawtextAtEof()804 @Test public void handlesUnclosedRawtextAtEof() { 805 assertEquals("Data", Jsoup.parse("<style>Data").select("style").first().data()); 806 assertEquals("Data</st", Jsoup.parse("<style>Data</st").select("style").first().data()); 807 assertEquals("Data", Jsoup.parse("<style>Data</style>").select("style").first().data()); 808 assertEquals("Data</style", Jsoup.parse("<style>Data</style").select("style").first().data()); 809 assertEquals("Data</-style", Jsoup.parse("<style>Data</-style").select("style").first().data()); 810 assertEquals("Data</style-", Jsoup.parse("<style>Data</style-").select("style").first().data()); 811 assertEquals("Data</style--", Jsoup.parse("<style>Data</style--").select("style").first().data()); 812 } 813 noImplicitFormForTextAreas()814 @Test public void noImplicitFormForTextAreas() { 815 // old jsoup parser would create implicit forms for form children like <textarea>, but no more 816 Document doc = Jsoup.parse("<textarea>One</textarea>"); 817 assertEquals("<textarea>One</textarea>", doc.body().html()); 818 } 819 handlesEscapedScript()820 @Test public void handlesEscapedScript() { 821 Document doc = Jsoup.parse("<script><!-- one <script>Blah</script> --></script>"); 822 assertEquals("<!-- one <script>Blah</script> -->", doc.select("script").first().data()); 823 } 824 handles0CharacterAsText()825 @Test public void handles0CharacterAsText() { 826 Document doc = Jsoup.parse("0<p>0</p>"); 827 assertEquals("0\n<p>0</p>", doc.body().html()); 828 } 829 handlesNullInData()830 @Test public void handlesNullInData() { 831 Document doc = Jsoup.parse("<p id=\u0000>Blah \u0000</p>"); 832 assertEquals("<p id=\"\uFFFD\">Blah �</p>", doc.body().html()); // replaced in attr, NOT replaced in data (but is escaped as control char <0x20) 833 } 834 handlesNullInComments()835 @Test public void handlesNullInComments() { 836 Document doc = Jsoup.parse("<body><!-- \u0000 \u0000 -->"); 837 assertEquals("<!-- \uFFFD \uFFFD -->", doc.body().html()); 838 } 839 handlesNewlinesAndWhitespaceInTag()840 @Test public void handlesNewlinesAndWhitespaceInTag() { 841 Document doc = Jsoup.parse("<a \n href=\"one\" \r\n id=\"two\" \f >"); 842 assertEquals("<a href=\"one\" id=\"two\"></a>", doc.body().html()); 843 } 844 handlesWhitespaceInoDocType()845 @Test public void handlesWhitespaceInoDocType() { 846 String html = "<!DOCTYPE html\r\n" + 847 " PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\r\n" + 848 " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">"; 849 Document doc = Jsoup.parse(html); 850 assertEquals("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">", doc.childNode(0).outerHtml()); 851 } 852 tracksErrorsWhenRequested()853 @Test public void tracksErrorsWhenRequested() { 854 String html = "<p>One</p href='no'>\n<!DOCTYPE html>\n&arrgh;<font />! & �<br /></div><foo"; 855 Parser parser = Parser.htmlParser().setTrackErrors(500); 856 Document doc = Jsoup.parse(html, "http://example.com", parser); 857 858 List<ParseError> errors = parser.getErrors(); 859 assertEquals(9, errors.size()); 860 assertEquals("<1:21>: Attributes incorrectly present on end tag [/p]", errors.get(0).toString()); 861 assertEquals("<2:16>: Unexpected Doctype token [<!doctype html>] when in state [InBody]", errors.get(1).toString()); 862 assertEquals("<3:2>: Invalid character reference: invalid named reference [arrgh]", errors.get(2).toString()); 863 assertEquals("<3:16>: Tag [font] cannot be self closing; not a void tag", errors.get(3).toString()); 864 assertEquals("<3:20>: Invalid character reference: missing semicolon on [!]", errors.get(4).toString()); 865 assertEquals("<3:25>: Invalid character reference: missing semicolon on [&]", errors.get(5).toString()); 866 assertEquals("<3:36>: Invalid character reference: character [1114112] outside of valid range", errors.get(6).toString()); 867 assertEquals("<3:48>: Unexpected EndTag token [</div>] when in state [InBody]", errors.get(7).toString()); 868 assertEquals("<3:53>: Unexpectedly reached end of file (EOF) in input state [TagName]", errors.get(8).toString()); 869 } 870 tracksLimitedErrorsWhenRequested()871 @Test public void tracksLimitedErrorsWhenRequested() { 872 String html = "<p>One</p href='no'>\n<!DOCTYPE html>\n&arrgh;<font /><br /><foo"; 873 Parser parser = Parser.htmlParser().setTrackErrors(3); 874 Document doc = parser.parseInput(html, "http://example.com"); 875 876 List<ParseError> errors = parser.getErrors(); 877 assertEquals(3, errors.size()); 878 assertEquals("<1:21>: Attributes incorrectly present on end tag [/p]", errors.get(0).toString()); 879 assertEquals("<2:16>: Unexpected Doctype token [<!doctype html>] when in state [InBody]", errors.get(1).toString()); 880 assertEquals("<3:2>: Invalid character reference: invalid named reference [arrgh]", errors.get(2).toString()); 881 } 882 noErrorsByDefault()883 @Test public void noErrorsByDefault() { 884 String html = "<p>One</p href='no'>&arrgh;<font /><br /><foo"; 885 Parser parser = Parser.htmlParser(); 886 Document doc = Jsoup.parse(html, "http://example.com", parser); 887 888 List<ParseError> errors = parser.getErrors(); 889 assertEquals(0, errors.size()); 890 } 891 optionalPClosersAreNotErrors()892 @Test public void optionalPClosersAreNotErrors() { 893 String html = "<body><div><p>One<p>Two</div></body>"; 894 Parser parser = Parser.htmlParser().setTrackErrors(128); 895 Document doc = Jsoup.parse(html, "", parser); 896 ParseErrorList errors = parser.getErrors(); 897 assertEquals(0, errors.size()); 898 } 899 handlesCommentsInTable()900 @Test public void handlesCommentsInTable() { 901 String html = "<table><tr><td>text</td><!-- Comment --></tr></table>"; 902 Document node = Jsoup.parseBodyFragment(html); 903 assertEquals("<html><head></head><body><table><tbody><tr><td>text</td><!-- Comment --></tr></tbody></table></body></html>", TextUtil.stripNewlines(node.outerHtml())); 904 } 905 handlesQuotesInCommentsInScripts()906 @Test public void handlesQuotesInCommentsInScripts() { 907 String html = "<script>\n" + 908 " <!--\n" + 909 " document.write('</scr' + 'ipt>');\n" + 910 " // -->\n" + 911 "</script>"; 912 Document node = Jsoup.parseBodyFragment(html); 913 assertEquals("<script>\n" + 914 " <!--\n" + 915 " document.write('</scr' + 'ipt>');\n" + 916 " // -->\n" + 917 "</script>", node.body().html()); 918 } 919 handleNullContextInParseFragment()920 @Test public void handleNullContextInParseFragment() { 921 String html = "<ol><li>One</li></ol><p>Two</p>"; 922 List<Node> nodes = Parser.parseFragment(html, null, "http://example.com/"); 923 assertEquals(1, nodes.size()); // returns <html> node (not document) -- no context means doc gets created 924 assertEquals("html", nodes.get(0).nodeName()); 925 assertEquals("<html> <head></head> <body> <ol> <li>One</li> </ol> <p>Two</p> </body> </html>", StringUtil.normaliseWhitespace(nodes.get(0).outerHtml())); 926 } 927 doesNotFindShortestMatchingEntity()928 @Test public void doesNotFindShortestMatchingEntity() { 929 // previous behaviour was to identify a possible entity, then chomp down the string until a match was found. 930 // (as defined in html5.) However in practise that lead to spurious matches against the author's intent. 931 String html = "One &clubsuite; ♣"; 932 Document doc = Jsoup.parse(html); 933 assertEquals(StringUtil.normaliseWhitespace("One &clubsuite; ♣"), doc.body().html()); 934 } 935 relaxedBaseEntityMatchAndStrictExtendedMatch()936 @Test public void relaxedBaseEntityMatchAndStrictExtendedMatch() { 937 // extended entities need a ; at the end to match, base does not 938 String html = "& " ® &icy &hopf и 𝕙"; 939 Document doc = Jsoup.parse(html); 940 doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii"); // modifies output only to clarify test 941 assertEquals("& \" ® &icy &hopf и 𝕙", doc.body().html()); 942 } 943 handlesXmlDeclarationAsBogusComment()944 @Test public void handlesXmlDeclarationAsBogusComment() { 945 String html = "<?xml encoding='UTF-8' ?><body>One</body>"; 946 Document doc = Jsoup.parse(html); 947 assertEquals("<!--?xml encoding='UTF-8' ?--> <html> <head></head> <body> One </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml())); 948 } 949 handlesTagsInTextarea()950 @Test public void handlesTagsInTextarea() { 951 String html = "<textarea><p>Jsoup</p></textarea>"; 952 Document doc = Jsoup.parse(html); 953 assertEquals("<textarea><p>Jsoup</p></textarea>", doc.body().html()); 954 } 955 956 // form tests createsFormElements()957 @Test public void createsFormElements() { 958 String html = "<body><form><input id=1><input id=2></form></body>"; 959 Document doc = Jsoup.parse(html); 960 Element el = doc.select("form").first(); 961 962 assertTrue(el instanceof FormElement, "Is form element"); 963 FormElement form = (FormElement) el; 964 Elements controls = form.elements(); 965 assertEquals(2, controls.size()); 966 assertEquals("1", controls.get(0).id()); 967 assertEquals("2", controls.get(1).id()); 968 } 969 associatedFormControlsWithDisjointForms()970 @Test public void associatedFormControlsWithDisjointForms() { 971 // form gets closed, isn't parent of controls 972 String html = "<table><tr><form><input type=hidden id=1><td><input type=text id=2></td><tr></table>"; 973 Document doc = Jsoup.parse(html); 974 Element el = doc.select("form").first(); 975 976 assertTrue(el instanceof FormElement, "Is form element"); 977 FormElement form = (FormElement) el; 978 Elements controls = form.elements(); 979 assertEquals(2, controls.size()); 980 assertEquals("1", controls.get(0).id()); 981 assertEquals("2", controls.get(1).id()); 982 983 assertEquals("<table><tbody><tr><form></form><input type=\"hidden\" id=\"1\"><td><input type=\"text\" id=\"2\"></td></tr><tr></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html())); 984 } 985 handlesInputInTable()986 @Test public void handlesInputInTable() { 987 String h = "<body>\n" + 988 "<input type=\"hidden\" name=\"a\" value=\"\">\n" + 989 "<table>\n" + 990 "<input type=\"hidden\" name=\"b\" value=\"\" />\n" + 991 "</table>\n" + 992 "</body>"; 993 Document doc = Jsoup.parse(h); 994 assertEquals(1, doc.select("table input").size()); 995 assertEquals(2, doc.select("input").size()); 996 } 997 convertsImageToImg()998 @Test public void convertsImageToImg() { 999 // image to img, unless in a svg. old html cruft. 1000 String h = "<body><image><svg><image /></svg></body>"; 1001 Document doc = Jsoup.parse(h); 1002 assertEquals("<img>\n<svg>\n <image />\n</svg>", doc.body().html()); 1003 } 1004 handlesInvalidDoctypes()1005 @Test public void handlesInvalidDoctypes() { 1006 // would previously throw invalid name exception on empty doctype 1007 Document doc = Jsoup.parse("<!DOCTYPE>"); 1008 assertEquals( 1009 "<!doctype> <html> <head></head> <body></body> </html>", 1010 StringUtil.normaliseWhitespace(doc.outerHtml())); 1011 1012 doc = Jsoup.parse("<!DOCTYPE><html><p>Foo</p></html>"); 1013 assertEquals( 1014 "<!doctype> <html> <head></head> <body> <p>Foo</p> </body> </html>", 1015 StringUtil.normaliseWhitespace(doc.outerHtml())); 1016 1017 doc = Jsoup.parse("<!DOCTYPE \u0000>"); 1018 assertEquals( 1019 "<!doctype �> <html> <head></head> <body></body> </html>", 1020 StringUtil.normaliseWhitespace(doc.outerHtml())); 1021 } 1022 handlesManyChildren()1023 @Test public void handlesManyChildren() { 1024 // Arrange 1025 StringBuilder longBody = new StringBuilder(500000); 1026 for (int i = 0; i < 25000; i++) { 1027 longBody.append(i).append("<br>"); 1028 } 1029 1030 // Act 1031 long start = System.currentTimeMillis(); 1032 Document doc = Parser.parseBodyFragment(longBody.toString(), ""); 1033 1034 // Assert 1035 assertEquals(50000, doc.body().childNodeSize()); 1036 assertTrue(System.currentTimeMillis() - start < 1000); 1037 } 1038 1039 @Test 1040 public void testInvalidTableContents() throws IOException { 1041 File in = ParseTest.getFile("/htmltests/table-invalid-elements.html"); 1042 Document doc = Jsoup.parse(in, "UTF-8"); 1043 doc.outputSettings().prettyPrint(true); 1044 String rendered = doc.toString(); 1045 int endOfEmail = rendered.indexOf("Comment"); 1046 int guarantee = rendered.indexOf("Why am I here?"); 1047 assertTrue(endOfEmail > -1, "Comment not found"); 1048 assertTrue(guarantee > -1, "Search text not found"); 1049 assertTrue(guarantee > endOfEmail, "Search text did not come after comment"); 1050 } 1051 testNormalisesIsIndex()1052 @Test public void testNormalisesIsIndex() { 1053 Document doc = Jsoup.parse("<body><isindex action='/submit'></body>"); 1054 String html = doc.outerHtml(); 1055 assertEquals("<form action=\"/submit\"> <hr><label>This is a searchable index. Enter search keywords: <input name=\"isindex\"></label> <hr> </form>", 1056 StringUtil.normaliseWhitespace(doc.body().html())); 1057 } 1058 testReinsertionModeForThCelss()1059 @Test public void testReinsertionModeForThCelss() { 1060 String body = "<body> <table> <tr> <th> <table><tr><td></td></tr></table> <div> <table><tr><td></td></tr></table> </div> <div></div> <div></div> <div></div> </th> </tr> </table> </body>"; 1061 Document doc = Jsoup.parse(body); 1062 assertEquals(1, doc.body().children().size()); 1063 } 1064 testUsingSingleQuotesInQueries()1065 @Test public void testUsingSingleQuotesInQueries() { 1066 String body = "<body> <div class='main'>hello</div></body>"; 1067 Document doc = Jsoup.parse(body); 1068 Elements main = doc.select("div[class='main']"); 1069 assertEquals("hello", main.text()); 1070 } 1071 testSupportsNonAsciiTags()1072 @Test public void testSupportsNonAsciiTags() { 1073 String body = "<a進捗推移グラフ>Yes</a進捗推移グラフ><bрусский-тэг>Correct</<bрусский-тэг>"; 1074 Document doc = Jsoup.parse(body); 1075 Elements els = doc.select("a進捗推移グラフ"); 1076 assertEquals("Yes", els.text()); 1077 els = doc.select("bрусский-тэг"); 1078 assertEquals("Correct", els.text()); 1079 } 1080 testSupportsPartiallyNonAsciiTags()1081 @Test public void testSupportsPartiallyNonAsciiTags() { 1082 String body = "<div>Check</divá>"; 1083 Document doc = Jsoup.parse(body); 1084 Elements els = doc.select("div"); 1085 assertEquals("Check", els.text()); 1086 } 1087 testFragment()1088 @Test public void testFragment() { 1089 // make sure when parsing a body fragment, a script tag at start goes into the body 1090 String html = 1091 "<script type=\"text/javascript\">console.log('foo');</script>\n" + 1092 "<div id=\"somecontent\">some content</div>\n" + 1093 "<script type=\"text/javascript\">console.log('bar');</script>"; 1094 1095 Document body = Jsoup.parseBodyFragment(html); 1096 assertEquals("<script type=\"text/javascript\">console.log('foo');</script>\n" + 1097 "<div id=\"somecontent\">\n" + 1098 " some content\n" + 1099 "</div>\n" + 1100 "<script type=\"text/javascript\">console.log('bar');</script>", body.body().html()); 1101 } 1102 testHtmlLowerCase()1103 @Test public void testHtmlLowerCase() { 1104 String html = "<!doctype HTML><DIV ID=1>One</DIV>"; 1105 Document doc = Jsoup.parse(html); 1106 assertEquals("<!doctype html> <html> <head></head> <body> <div id=\"1\"> One </div> </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml())); 1107 1108 Element div = doc.selectFirst("#1"); 1109 div.after("<TaG>One</TaG>"); 1110 assertEquals("<tag>One</tag>", TextUtil.stripNewlines(div.nextElementSibling().outerHtml())); 1111 } 1112 testHtmlLowerCaseAttributesOfVoidTags()1113 @Test public void testHtmlLowerCaseAttributesOfVoidTags() { 1114 String html = "<!doctype HTML><IMG ALT=One></DIV>"; 1115 Document doc = Jsoup.parse(html); 1116 assertEquals("<!doctype html> <html> <head></head> <body> <img alt=\"One\"> </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml())); 1117 } 1118 testHtmlLowerCaseAttributesForm()1119 @Test public void testHtmlLowerCaseAttributesForm() { 1120 String html = "<form NAME=one>"; 1121 Document doc = Jsoup.parse(html); 1122 assertEquals("<form name=\"one\"></form>", StringUtil.normaliseWhitespace(doc.body().html())); 1123 } 1124 canPreserveTagCase()1125 @Test public void canPreserveTagCase() { 1126 Parser parser = Parser.htmlParser(); 1127 parser.settings(new ParseSettings(true, false)); 1128 Document doc = parser.parseInput("<div id=1><SPAN ID=2>", ""); 1129 assertEquals("<html> <head></head> <body> <div id=\"1\"> <SPAN id=\"2\"></SPAN> </div> </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml())); 1130 1131 Element div = doc.selectFirst("#1"); 1132 div.after("<TaG ID=one>One</TaG>"); 1133 assertEquals("<TaG id=\"one\">One</TaG>", TextUtil.stripNewlines(div.nextElementSibling().outerHtml())); 1134 } 1135 canPreserveAttributeCase()1136 @Test public void canPreserveAttributeCase() { 1137 Parser parser = Parser.htmlParser(); 1138 parser.settings(new ParseSettings(false, true)); 1139 Document doc = parser.parseInput("<div id=1><SPAN ID=2>", ""); 1140 assertEquals("<html> <head></head> <body> <div id=\"1\"> <span ID=\"2\"></span> </div> </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml())); 1141 1142 Element div = doc.selectFirst("#1"); 1143 div.after("<TaG ID=one>One</TaG>"); 1144 assertEquals("<tag ID=\"one\">One</tag>", TextUtil.stripNewlines(div.nextElementSibling().outerHtml())); 1145 } 1146 canPreserveBothCase()1147 @Test public void canPreserveBothCase() { 1148 Parser parser = Parser.htmlParser(); 1149 parser.settings(new ParseSettings(true, true)); 1150 Document doc = parser.parseInput("<div id=1><SPAN ID=2>", ""); 1151 assertEquals("<html> <head></head> <body> <div id=\"1\"> <SPAN ID=\"2\"></SPAN> </div> </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml())); 1152 1153 Element div = doc.selectFirst("#1"); 1154 div.after("<TaG ID=one>One</TaG>"); 1155 assertEquals("<TaG ID=\"one\">One</TaG>", TextUtil.stripNewlines(div.nextElementSibling().outerHtml())); 1156 } 1157 handlesControlCodeInAttributeName()1158 @Test public void handlesControlCodeInAttributeName() { 1159 Document doc = Jsoup.parse("<p><a \06=foo>One</a><a/\06=bar><a foo\06=bar>Two</a></p>"); 1160 assertEquals("<p><a>One</a><a></a><a foo=\"bar\">Two</a></p>", doc.body().html()); 1161 } 1162 caseSensitiveParseTree()1163 @Test public void caseSensitiveParseTree() { 1164 String html = "<r><X>A</X><y>B</y></r>"; 1165 Parser parser = Parser.htmlParser(); 1166 parser.settings(preserveCase); 1167 Document doc = parser.parseInput(html, ""); 1168 assertEquals("<r> <X> A </X> <y> B </y> </r>", StringUtil.normaliseWhitespace(doc.body().html())); 1169 } 1170 caseInsensitiveParseTree()1171 @Test public void caseInsensitiveParseTree() { 1172 String html = "<r><X>A</X><y>B</y></r>"; 1173 Parser parser = Parser.htmlParser(); 1174 Document doc = parser.parseInput(html, ""); 1175 assertEquals("<r> <x> A </x> <y> B </y> </r>", StringUtil.normaliseWhitespace(doc.body().html())); 1176 } 1177 preservedCaseLinksCantNest()1178 @Test public void preservedCaseLinksCantNest() { 1179 String html = "<A>ONE <A>Two</A></A>"; 1180 Document doc = Parser.htmlParser() 1181 .settings(preserveCase) 1182 .parseInput(html, ""); 1183 //assertEquals("<A>ONE </A><A>Two</A>", StringUtil.normaliseWhitespace(doc.body().html())); 1184 assertEquals("<A>ONE </A><A>Two</A>", doc.body().html()); 1185 } 1186 normalizesDiscordantTags()1187 @Test public void normalizesDiscordantTags() { 1188 Document document = Jsoup.parse("<div>test</DIV><p></p>"); 1189 assertEquals("<div>\n test\n</div>\n<p></p>", document.body().html()); 1190 } 1191 selfClosingVoidIsNotAnError()1192 @Test public void selfClosingVoidIsNotAnError() { 1193 String html = "<p>test<br/>test<br/></p>"; 1194 Parser parser = Parser.htmlParser().setTrackErrors(5); 1195 parser.parseInput(html, ""); 1196 assertEquals(0, parser.getErrors().size()); 1197 1198 assertTrue(Jsoup.isValid(html, Safelist.basic())); 1199 String clean = Jsoup.clean(html, Safelist.basic()); 1200 assertEquals("<p>test<br>\n test<br></p>", clean); 1201 } 1202 selfClosingOnNonvoidIsError()1203 @Test public void selfClosingOnNonvoidIsError() { 1204 String html = "<p>test</p>\n\n<div /><div>Two</div>"; 1205 Parser parser = Parser.htmlParser().setTrackErrors(5); 1206 parser.parseInput(html, ""); 1207 assertEquals(1, parser.getErrors().size()); 1208 assertEquals("<3:8>: Tag [div] cannot be self closing; not a void tag", parser.getErrors().get(0).toString()); 1209 1210 assertFalse(Jsoup.isValid(html, Safelist.relaxed())); 1211 String clean = Jsoup.clean(html, Safelist.relaxed()); 1212 assertEquals("<p>test</p> <div></div> <div> Two </div>", StringUtil.normaliseWhitespace(clean)); 1213 } 1214 testTemplateInsideTable()1215 @Test public void testTemplateInsideTable() throws IOException { 1216 File in = ParseTest.getFile("/htmltests/table-polymer-template.html"); 1217 Document doc = Jsoup.parse(in, "UTF-8"); 1218 doc.outputSettings().prettyPrint(true); 1219 1220 Elements templates = doc.body().getElementsByTag("template"); 1221 for (Element template : templates) { 1222 assertTrue(template.childNodes().size() > 1); 1223 } 1224 } 1225 testHandlesDeepSpans()1226 @Test public void testHandlesDeepSpans() { 1227 StringBuilder sb = new StringBuilder(); 1228 for (int i = 0; i < 200; i++) { 1229 sb.append("<span>"); 1230 } 1231 1232 sb.append("<p>One</p>"); 1233 1234 Document doc = Jsoup.parse(sb.toString()); 1235 assertEquals(200, doc.select("span").size()); 1236 assertEquals(1, doc.select("p").size()); 1237 } 1238 commentAtEnd()1239 @Test public void commentAtEnd() { 1240 Document doc = Jsoup.parse("<!"); 1241 assertTrue(doc.childNode(0) instanceof Comment); 1242 } 1243 preSkipsFirstNewline()1244 @Test public void preSkipsFirstNewline() { 1245 Document doc = Jsoup.parse("<pre>\n\nOne\nTwo\n</pre>"); 1246 Element pre = doc.selectFirst("pre"); 1247 assertEquals("One\nTwo", pre.text()); 1248 assertEquals("\nOne\nTwo\n", pre.wholeText()); 1249 } 1250 handlesXmlDeclAndCommentsBeforeDoctype()1251 @Test public void handlesXmlDeclAndCommentsBeforeDoctype() throws IOException { 1252 File in = ParseTest.getFile("/htmltests/comments.html"); 1253 Document doc = Jsoup.parse(in, "UTF-8"); 1254 1255 assertEquals("<!--?xml version=\"1.0\" encoding=\"utf-8\"?--><!-- so --> <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><!-- what --> <html xml:lang=\"en\" lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"> <!-- now --> <head> <!-- then --> <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\"> <title>A Certain Kind of Test</title> </head> <body> <h1>Hello</h1>h1> (There is a UTF8 hidden BOM at the top of this file.) </body> </html>", 1256 StringUtil.normaliseWhitespace(doc.html())); 1257 1258 assertEquals("A Certain Kind of Test", doc.head().select("title").text()); 1259 } 1260 fallbackToUtfIfCantEncode()1261 @Test public void fallbackToUtfIfCantEncode() throws IOException { 1262 // that charset can't be encoded, so make sure we flip to utf 1263 1264 String in = "<html><meta charset=\"ISO-2022-CN\"/>One</html>"; 1265 Document doc = Jsoup.parse(new ByteArrayInputStream(in.getBytes()), null, ""); 1266 1267 assertEquals("UTF-8", doc.charset().name()); 1268 assertEquals("One", doc.text()); 1269 1270 String html = doc.outerHtml(); 1271 assertEquals("<html><head><meta charset=\"UTF-8\"></head><body>One</body></html>", TextUtil.stripNewlines(html)); 1272 } 1273 characterReaderBuffer()1274 @Test public void characterReaderBuffer() throws IOException { 1275 File in = ParseTest.getFile("/htmltests/character-reader-buffer.html.gz"); 1276 Document doc = Jsoup.parse(in, "UTF-8"); 1277 1278 String expectedHref = "http://www.domain.com/path?param_one=value¶m_two=value"; 1279 1280 Elements links = doc.select("a"); 1281 assertEquals(2, links.size()); 1282 assertEquals(expectedHref, links.get(0).attr("href")); // passes 1283 assertEquals(expectedHref, links.get(1).attr("href")); // fails, "but was:<...ath?param_one=value&[]_two-value>" 1284 } 1285 1286 @Test selfClosingTextAreaDoesntLeaveDroppings()1287 public void selfClosingTextAreaDoesntLeaveDroppings() { 1288 // https://github.com/jhy/jsoup/issues/1220 1289 Document doc = Jsoup.parse("<div><div><textarea/></div></div>"); 1290 assertFalse(doc.body().html().contains("<")); 1291 assertFalse(doc.body().html().contains(">")); 1292 assertEquals("<div><div><textarea></textarea></div></div>", TextUtil.stripNewlines(doc.body().html())); 1293 } 1294 1295 @Test testNoSpuriousSpace()1296 public void testNoSpuriousSpace() { 1297 Document doc = Jsoup.parse("Just<a>One</a><a>Two</a>"); 1298 assertEquals("Just<a>One</a><a>Two</a>", doc.body().html()); 1299 assertEquals("JustOneTwo", doc.body().text()); 1300 } 1301 1302 @Test pTagsGetIndented()1303 public void pTagsGetIndented() { 1304 String html = "<div><p><a href=one>One</a><p><a href=two>Two</a></p></div>"; 1305 Document doc = Jsoup.parse(html); 1306 assertEquals("<div>\n" + 1307 " <p><a href=\"one\">One</a></p>\n" + 1308 " <p><a href=\"two\">Two</a></p>\n" + 1309 "</div>", doc.body().html()); 1310 } 1311 1312 @Test indentRegardlessOfCase()1313 public void indentRegardlessOfCase() { 1314 String html = "<p>1</p><P>2</P>"; 1315 Document doc = Jsoup.parse(html); 1316 assertEquals( 1317 "<body>\n" + 1318 " <p>1</p>\n" + 1319 " <p>2</p>\n" + 1320 "</body>", doc.body().outerHtml()); 1321 1322 Document caseDoc = Jsoup.parse(html, "", Parser.htmlParser().settings(preserveCase)); 1323 assertEquals( 1324 "<body>\n" + 1325 " <p>1</p>\n" + 1326 " <P>2</P>\n" + 1327 "</body>", caseDoc.body().outerHtml()); 1328 } 1329 1330 @Test testH20()1331 public void testH20() { 1332 // https://github.com/jhy/jsoup/issues/731 1333 String html = "H<sub>2</sub>O"; 1334 String clean = Jsoup.clean(html, Safelist.basic()); 1335 assertEquals("H<sub>2</sub>O", clean); 1336 1337 Document doc = Jsoup.parse(html); 1338 assertEquals("H2O", doc.text()); 1339 } 1340 1341 @Test testUNewlines()1342 public void testUNewlines() { 1343 // https://github.com/jhy/jsoup/issues/851 1344 String html = "t<u>es</u>t <b>on</b> <i>f</i><u>ir</u>e"; 1345 String clean = Jsoup.clean(html, Safelist.basic()); 1346 assertEquals("t<u>es</u>t <b>on</b> <i>f</i><u>ir</u>e", clean); 1347 1348 Document doc = Jsoup.parse(html); 1349 assertEquals("test on fire", doc.text()); 1350 } 1351 testFarsi()1352 @Test public void testFarsi() { 1353 // https://github.com/jhy/jsoup/issues/1227 1354 String text = "نیمه\u200Cشب"; 1355 Document doc = Jsoup.parse("<p>" + text); 1356 assertEquals(text, doc.text()); 1357 } 1358 testStartOptGroup()1359 @Test public void testStartOptGroup() { 1360 // https://github.com/jhy/jsoup/issues/1313 1361 String html = "<select>\n" + 1362 " <optgroup label=\"a\">\n" + 1363 " <option>one\n" + 1364 " <option>two\n" + 1365 " <option>three\n" + 1366 " <optgroup label=\"b\">\n" + 1367 " <option>four\n" + 1368 " <option>fix\n" + 1369 " <option>six\n" + 1370 "</select>"; 1371 Document doc = Jsoup.parse(html); 1372 Element select = doc.selectFirst("select"); 1373 assertEquals(2, select.childrenSize()); 1374 1375 assertEquals("<optgroup label=\"a\"> <option>one </option><option>two </option><option>three </option></optgroup><optgroup label=\"b\"> <option>four </option><option>fix </option><option>six </option></optgroup>", select.html()); 1376 } 1377 readerClosedAfterParse()1378 @Test public void readerClosedAfterParse() { 1379 Document doc = Jsoup.parse("Hello"); 1380 TreeBuilder treeBuilder = doc.parser().getTreeBuilder(); 1381 assertNull(treeBuilder.reader); 1382 assertNull(treeBuilder.tokeniser); 1383 } 1384 scriptInDataNode()1385 @Test public void scriptInDataNode() { 1386 Document doc = Jsoup.parse("<script>Hello</script><style>There</style>"); 1387 assertTrue(doc.selectFirst("script").childNode(0) instanceof DataNode); 1388 assertTrue(doc.selectFirst("style").childNode(0) instanceof DataNode); 1389 1390 doc = Jsoup.parse("<SCRIPT>Hello</SCRIPT><STYLE>There</STYLE>", "", Parser.htmlParser().settings(preserveCase)); 1391 assertTrue(doc.selectFirst("script").childNode(0) instanceof DataNode); 1392 assertTrue(doc.selectFirst("style").childNode(0) instanceof DataNode); 1393 } 1394 textareaValue()1395 @Test public void textareaValue() { 1396 String html = "<TEXTAREA>YES YES</TEXTAREA>"; 1397 Document doc = Jsoup.parse(html); 1398 assertEquals("YES YES", doc.selectFirst("textarea").val()); 1399 1400 doc = Jsoup.parse(html, "", Parser.htmlParser().settings(preserveCase)); 1401 assertEquals("YES YES", doc.selectFirst("textarea").val()); 1402 } 1403 preserveWhitespaceInHead()1404 @Test public void preserveWhitespaceInHead() { 1405 String html = "\n<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n</body>\n</html>\n"; 1406 Document doc = Jsoup.parse(html); 1407 doc.outputSettings().prettyPrint(false); 1408 assertEquals("<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n</body>\n</html>\n", doc.outerHtml()); 1409 } 1410 handleContentAfterBody()1411 @Test public void handleContentAfterBody() { 1412 String html = "<body>One</body> <p>Hello!</p></html> <p>There</p>"; 1413 Document doc = Jsoup.parse(html); 1414 doc.outputSettings().prettyPrint(false); 1415 assertEquals("<html><head></head><body>One<p>Hello!</p><p>There</p></body> </html> ", doc.outerHtml()); 1416 } 1417 preservesTabs()1418 @Test public void preservesTabs() { 1419 // testcase to demonstrate tab retention - https://github.com/jhy/jsoup/issues/1240 1420 String html = "<pre>One\tTwo</pre><span>\tThree\tFour</span>"; 1421 Document doc = Jsoup.parse(html); 1422 1423 Element pre = doc.selectFirst("pre"); 1424 Element span = doc.selectFirst("span"); 1425 1426 assertEquals("One\tTwo", pre.text()); 1427 assertEquals("Three Four", span.text()); // normalized, including overall trim 1428 assertEquals("\tThree\tFour", span.wholeText()); // text normalizes, wholeText retains original spaces incl tabs 1429 assertEquals("One\tTwo Three Four", doc.body().text()); 1430 1431 assertEquals("<pre>One\tTwo</pre><span> Three Four</span>", doc.body().html()); // html output provides normalized space, incl tab in pre but not in span 1432 1433 doc.outputSettings().prettyPrint(false); 1434 assertEquals(html, doc.body().html()); // disabling pretty-printing - round-trips the tab throughout, as no normalization occurs 1435 } 1436 wholeTextTreatsBRasNewline()1437 @Test void wholeTextTreatsBRasNewline() { 1438 String html = "<div>\nOne<br>Two <p>Three<br>Four</div>"; 1439 Document doc = Jsoup.parse(html); 1440 Element div = doc.selectFirst("div"); 1441 assertNotNull(div); 1442 assertEquals("\nOne\nTwo Three\nFour", div.wholeText()); 1443 assertEquals("\nOne\nTwo ", div.wholeOwnText()); 1444 } 1445 canDetectAutomaticallyAddedElements()1446 @Test public void canDetectAutomaticallyAddedElements() { 1447 String bare = "<script>One</script>"; 1448 String full = "<html><head><title>Check</title></head><body><p>One</p></body></html>"; 1449 1450 assertTrue(didAddElements(bare)); 1451 assertFalse(didAddElements(full)); 1452 } 1453 didAddElements(String input)1454 private boolean didAddElements(String input) { 1455 // two passes, one as XML and one as HTML. XML does not vivify missing/optional tags 1456 Document html = Jsoup.parse(input); 1457 Document xml = Jsoup.parse(input, "", Parser.xmlParser()); 1458 1459 int htmlElementCount = html.getAllElements().size(); 1460 int xmlElementCount = xml.getAllElements().size(); 1461 return htmlElementCount > xmlElementCount; 1462 } 1463 canSetHtmlOnCreatedTableElements()1464 @Test public void canSetHtmlOnCreatedTableElements() { 1465 // https://github.com/jhy/jsoup/issues/1603 1466 Element element = new Element("tr"); 1467 element.html("<tr><td>One</td></tr>"); 1468 assertEquals("<tr>\n <tr>\n <td>One</td>\n </tr>\n</tr>", element.outerHtml()); 1469 } 1470 parseFragmentOnCreatedDocument()1471 @Test public void parseFragmentOnCreatedDocument() { 1472 // https://github.com/jhy/jsoup/issues/1601 1473 String bareFragment = "<h2>text</h2>"; 1474 List<Node> nodes = new Document("").parser().parseFragmentInput(bareFragment, new Element("p"), ""); 1475 assertEquals(1, nodes.size()); 1476 Node node = nodes.get(0); 1477 assertEquals("h2", node.nodeName()); 1478 assertEquals("<p>\n <h2>text</h2></p>", node.parent().outerHtml()); 1479 } 1480 nestedPFragments()1481 @Test public void nestedPFragments() { 1482 // https://github.com/jhy/jsoup/issues/1602 1483 String bareFragment = "<p></p><a></a>"; 1484 List<Node> nodes = new Document("").parser().parseFragmentInput(bareFragment, new Element("p"), ""); 1485 assertEquals(2, nodes.size()); 1486 Node node = nodes.get(0); 1487 assertEquals("<p>\n <p></p><a></a></p>", node.parent().outerHtml()); // mis-nested because fragment forced into the element, OK 1488 } 1489 nestedAnchorAdoption()1490 @Test public void nestedAnchorAdoption() { 1491 // https://github.com/jhy/jsoup/issues/1608 1492 String html = "<a>\n<b>\n<div>\n<a>test</a>\n</div>\n</b>\n</a>"; 1493 Document doc = Jsoup.parse(html); 1494 assertNotNull(doc); 1495 assertEquals("<a> <b> </b></a><b><div><a> </a><a>test</a></div></b>", TextUtil.stripNewlines(doc.body().html())); 1496 } 1497 tagsMustStartWithAscii()1498 @Test public void tagsMustStartWithAscii() { 1499 // https://github.com/jhy/jsoup/issues/1006 1500 String[] valid = {"a一", "a会员挂单金额5", "table(╯°□°)╯"}; 1501 String[] invalid = {"一", "会员挂单金额5", "(╯°□°)╯"}; 1502 1503 for (String tag : valid) { 1504 Document doc = Jsoup.parse("<" + tag + ">Text</" + tag + ">"); 1505 Elements els = doc.getElementsByTag(tag); 1506 assertEquals(1, els.size()); 1507 assertEquals(tag, els.get(0).tagName()); 1508 assertEquals("Text", els.get(0).text()); 1509 } 1510 1511 for (String tag : invalid) { 1512 Document doc = Jsoup.parse("<" + tag + ">Text</" + tag + ">"); 1513 Elements els = doc.getElementsByTag(tag); 1514 assertEquals(0, els.size()); 1515 assertEquals("<" + tag + ">Text<!--/" + tag + "-->", doc.body().html()); 1516 } 1517 } 1518 htmlOutputCorrectsInvalidAttributeNames()1519 @Test void htmlOutputCorrectsInvalidAttributeNames() { 1520 String html = "<body style=\"color: red\" \" name\"><div =\"\"></div></body>"; 1521 Document doc = Jsoup.parse(html); 1522 assertEquals(Document.OutputSettings.Syntax.html, doc.outputSettings().syntax()); 1523 1524 String out = doc.body().outerHtml(); 1525 assertEquals("<body style=\"color: red\" name>\n <div></div>\n</body>", out); 1526 } 1527 templateInHead()1528 @Test void templateInHead() { 1529 // https://try.jsoup.org/~EGp3UZxQe503TJDHQEQEzm8IeUc 1530 String html = "<head><template id=1><meta name=tmpl></template><title>Test</title><style>One</style></head><body><p>Two</p>"; 1531 Document doc = Jsoup.parse(html); 1532 1533 String want = "<html><head><template id=\"1\"><meta name=\"tmpl\"></template><title>Test</title><style>One</style></head><body><p>Two</p></body></html>"; 1534 assertEquals(want, TextUtil.stripNewlines(doc.html())); 1535 1536 Elements template = doc.select("template#1"); 1537 template.select("meta").attr("content", "Yes"); 1538 template.unwrap(); 1539 1540 want = "<html><head><meta name=\"tmpl\" content=\"Yes\"><title>Test</title><style>One</style></head><body><p>Two</p></body></html>"; 1541 assertEquals(want, TextUtil.stripNewlines(doc.html())); 1542 } 1543 nestedTemplateInBody()1544 @Test void nestedTemplateInBody() { 1545 String html = "<body><template id=1><table><tr><template id=2><td>One</td><td>Two</td></template></tr></template></body>"; 1546 Document doc = Jsoup.parse(html); 1547 1548 String want = "<html><head></head><body><template id=\"1\"><table><tbody><tr><template id=\"2\"><td>One</td><td>Two</td></template></tr></tbody></table></template></body></html>"; 1549 assertEquals(want, TextUtil.stripNewlines(doc.html())); 1550 1551 // todo - will be nice to add some simpler template element handling like clone children etc? 1552 Element tmplTbl = doc.selectFirst("template#1"); 1553 Element tmplRow = doc.selectFirst("template#2"); 1554 assertNotNull(tmplRow); 1555 assertNotNull(tmplTbl); 1556 tmplRow.appendChild(tmplRow.clone()); 1557 doc.select("template").unwrap(); 1558 1559 want = "<html><head></head><body><table><tbody><tr><td>One</td><td>Two</td><td>One</td><td>Two</td></tr></tbody></table></body></html>"; 1560 assertEquals(want, TextUtil.stripNewlines(doc.html())); 1561 } 1562 canSelectIntoTemplate()1563 @Test void canSelectIntoTemplate() { 1564 String html = "<body><div><template><p>Hello</p>"; 1565 Document doc = Jsoup.parse(html); 1566 String want = "<html><head></head><body><div><template><p>Hello</p></template></div></body></html>"; 1567 assertEquals(want, TextUtil.stripNewlines(doc.html())); 1568 1569 Element p = doc.selectFirst("div p"); 1570 Element p1 = doc.selectFirst("template :containsOwn(Hello)"); 1571 assertEquals("p", p.normalName()); 1572 assertEquals(p, p1); 1573 } 1574 tableRowFragment()1575 @Test void tableRowFragment() { 1576 Document doc = Jsoup.parse("<body><table></table></body"); 1577 String html = "<tr><td><img></td></tr>"; 1578 Element table = doc.selectFirst("table"); 1579 table.html(html); // invokes the fragment parser with table as context 1580 String want = "<tbody><tr><td><img></td></tr></tbody>"; 1581 assertEquals(want, TextUtil.stripNewlines(table.html())); 1582 want = "<table><tbody><tr><td><img></td></tr></tbody></table>"; 1583 assertEquals(want, TextUtil.stripNewlines(doc.body().html())); 1584 } 1585 templateTableRowFragment()1586 @Test void templateTableRowFragment() { 1587 // https://github.com/jhy/jsoup/issues/1409 (per the fragment <tr> use case) 1588 Document doc = Jsoup.parse("<body><table><template></template></table></body"); 1589 String html = "<tr><td><img></td></tr>"; 1590 Element tmpl = doc.selectFirst("template"); 1591 tmpl.html(html); // invokes the fragment parser with template as context 1592 String want = "<tr><td><img></td></tr>"; 1593 assertEquals(want, TextUtil.stripNewlines(tmpl.html())); 1594 tmpl.unwrap(); 1595 1596 want = "<html><head></head><body><table><tr><td><img></td></tr></table></body></html>"; 1597 assertEquals(want, TextUtil.stripNewlines(doc.html())); 1598 } 1599 templateNotInTableRowFragment()1600 @Test void templateNotInTableRowFragment() { 1601 // https://github.com/jhy/jsoup/issues/1409 (per the fragment <tr> use case) 1602 Document doc = Jsoup.parse("<body><template></template></body"); 1603 String html = "<tr><td><img></td></tr>"; 1604 Element tmpl = doc.selectFirst("template"); 1605 tmpl.html(html); // invokes the fragment parser with template as context 1606 String want = "<tr><td><img></td></tr>"; 1607 assertEquals(want, TextUtil.stripNewlines(tmpl.html())); 1608 tmpl.unwrap(); 1609 1610 want = "<html><head></head><body><tr><td><img></td></tr></body></html>"; 1611 assertEquals(want, TextUtil.stripNewlines(doc.html())); 1612 } 1613 templateFragment()1614 @Test void templateFragment() { 1615 // https://github.com/jhy/jsoup/issues/1315 1616 String html = "<template id=\"lorem-ipsum\"><tr><td>Lorem</td><td>Ipsum</td></tr></template>"; 1617 Document frag = Jsoup.parseBodyFragment(html); 1618 String want = "<template id=\"lorem-ipsum\"><tr><td>Lorem</td><td>Ipsum</td></tr></template>"; 1619 assertEquals(want, TextUtil.stripNewlines(frag.body().html())); 1620 } 1621 templateInferredForm()1622 @Test void templateInferredForm() { 1623 // https://github.com/jhy/jsoup/issues/1637 | https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=38987 1624 Document doc = Jsoup.parse("<template><isindex action>"); 1625 assertNotNull(doc); 1626 assertEquals("<template><form><hr><label>This is a searchable index. Enter search keywords: <input name=\"isindex\"></label><hr></form></template>", 1627 TextUtil.stripNewlines(doc.head().html())); 1628 } 1629 trimNormalizeElementNamesInBuilder()1630 @Test void trimNormalizeElementNamesInBuilder() { 1631 // https://github.com/jhy/jsoup/issues/1637 | https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=38983 1632 // This is interesting - in TB state, the element name was "template\u001E", so no name checks matched. Then, 1633 // when the Element is created, the name got normalized to "template" and so looked like there should be a 1634 // template on the stack during resetInsertionMode for the select. 1635 // The issue was that the normalization in Tag.valueOf did a trim which the Token.Tag did not 1636 Document doc = Jsoup.parse("<template\u001E<select<input<"); 1637 assertNotNull(doc); 1638 assertEquals("<template><select></select><input><</template>", 1639 TextUtil.stripNewlines(doc.head().html())); 1640 } 1641 errorsBeforeHtml()1642 @Test void errorsBeforeHtml() { 1643 Parser parser = Parser.htmlParser(); 1644 parser.setTrackErrors(10); 1645 Document doc = Jsoup.parse("<!doctype html><!doctype something></div>", parser); 1646 ParseErrorList errors = parser.getErrors(); 1647 assertEquals(2, errors.size()); 1648 assertEquals("<1:36>: Unexpected Doctype token [<!doctype something>] when in state [BeforeHtml]", errors.get(0).toString()); 1649 assertEquals("<1:42>: Unexpected EndTag token [</div>] when in state [BeforeHtml]", errors.get(1).toString()); 1650 assertEquals("<!doctype html><html><head></head><body></body></html>", TextUtil.stripNewlines(doc.html())); 1651 } 1652 afterHeadReAdds()1653 @Test void afterHeadReAdds() { 1654 Parser parser = Parser.htmlParser(); 1655 parser.setTrackErrors(10); 1656 Document doc = Jsoup.parse("<head></head><meta charset=UTF8><p>Hello", parser); 1657 ParseErrorList errors = parser.getErrors(); 1658 assertEquals(1, errors.size()); 1659 assertEquals("<1:33>: Unexpected StartTag token [<meta charset=\"UTF8\">] when in state [AfterHead]", errors.get(0).toString()); 1660 assertEquals("<html><head><meta charset=\"UTF8\"></head><body><p>Hello</p></body></html>", TextUtil.stripNewlines(doc.html())); 1661 // meta gets added back into head 1662 } 1663 mergeHtmlAttributesFromBody()1664 @Test void mergeHtmlAttributesFromBody() { 1665 Document doc = Jsoup.parse("<html id=1 class=foo><body><html class=bar data=x><p>One"); 1666 assertEquals("<html id=\"1\" class=\"foo\" data=\"x\"><head></head><body><p>One</p></body></html>", TextUtil.stripNewlines(doc.html())); 1667 } 1668 mergeHtmlNoAttributesFromBody()1669 @Test void mergeHtmlNoAttributesFromBody() { 1670 Document doc = Jsoup.parse("<html id=1 class=foo><body><html><p>One"); 1671 assertEquals("<html id=\"1\" class=\"foo\"><head></head><body><p>One</p></body></html>", TextUtil.stripNewlines(doc.html())); 1672 } 1673 supportsRuby()1674 @Test void supportsRuby() { 1675 String html = "<ruby><rbc><rb>10</rb><rb>31</rb><rb>2002</rb></rbc><rtc><rt>Month</rt><rt>Day</rt><rt>Year</rt></rtc><rtc><rt>Expiration Date</rt><rp>(*)</rtc></ruby>"; 1676 Parser parser = Parser.htmlParser(); 1677 parser.setTrackErrors(10); 1678 Document doc = Jsoup.parse(html, parser); 1679 ParseErrorList errors = parser.getErrors(); 1680 assertEquals(3, errors.size()); 1681 Element ruby = doc.expectFirst("ruby"); 1682 assertEquals( 1683 "<ruby><rbc><rb>10</rb><rb>31</rb><rb>2002</rb></rbc><rtc><rt>Month</rt><rt>Day</rt><rt>Year</rt></rtc><rtc><rt>Expiration Date</rt><rp>(*)</rp></rtc></ruby>", 1684 TextUtil.stripNewlines(ruby.outerHtml())); 1685 assertEquals("<1:38>: Unexpected StartTag token [<rb>] when in state [InBody]", errors.get(2).toString()); // 3 errors from rb in rtc as undefined 1686 } 1687 rubyRpRtImplicitClose()1688 @Test void rubyRpRtImplicitClose() { 1689 String html = "<ruby><rp>(<rt>Hello<rt>Hello<rp>)</ruby>\n"; 1690 Parser parser = Parser.htmlParser(); 1691 parser.setTrackErrors(10); 1692 Document doc = Jsoup.parse(html, parser); 1693 assertEquals(0, parser.getErrors().size()); 1694 Element ruby = doc.expectFirst("ruby"); 1695 assertEquals( 1696 "<ruby><rp>(</rp><rt>Hello</rt><rt>Hello</rt><rp>)</rp></ruby>", 1697 TextUtil.stripNewlines(ruby.outerHtml())); 1698 } 1699 rubyScopeError()1700 @Test void rubyScopeError() { 1701 String html = "<ruby><div><rp>Hello"; 1702 Parser parser = Parser.htmlParser(); 1703 parser.setTrackErrors(10); 1704 Document doc = Jsoup.parse(html, parser); 1705 ParseErrorList errors = parser.getErrors(); 1706 assertEquals(2, errors.size()); 1707 Element ruby = doc.expectFirst("ruby"); 1708 assertEquals( 1709 "<ruby><div><rp>Hello</rp></div></ruby>", 1710 TextUtil.stripNewlines(ruby.outerHtml())); 1711 assertEquals("<1:16>: Unexpected StartTag token [<rp>] when in state [InBody]", errors.get(0).toString()); 1712 } 1713 errorOnEofIfOpen()1714 @Test void errorOnEofIfOpen() { 1715 String html = "<div>"; 1716 Parser parser = Parser.htmlParser(); 1717 parser.setTrackErrors(10); 1718 Document doc = Jsoup.parse(html, parser); 1719 ParseErrorList errors = parser.getErrors(); 1720 assertEquals(1, errors.size()); 1721 assertEquals("Unexpected EOF token [] when in state [InBody]", errors.get(0).getErrorMessage()); 1722 } 1723 NoErrorOnEofIfBodyOpen()1724 @Test void NoErrorOnEofIfBodyOpen() { 1725 String html = "<body>"; 1726 Parser parser = Parser.htmlParser(); 1727 parser.setTrackErrors(10); 1728 Document doc = Jsoup.parse(html, parser); 1729 ParseErrorList errors = parser.getErrors(); 1730 assertEquals(0, errors.size()); 1731 } 1732 htmlClose()1733 @Test void htmlClose() { 1734 // https://github.com/jhy/jsoup/issues/1851 1735 String html = "<body><div>One</html>Two</div></body>"; 1736 Document doc = Jsoup.parse(html); 1737 assertEquals("OneTwo", doc.expectFirst("body > div").text()); 1738 } 1739 largeTextareaContents()1740 @Test void largeTextareaContents() { 1741 // https://github.com/jhy/jsoup/issues/1929 1742 StringBuilder sb = new StringBuilder(); 1743 int num = 2000; 1744 for (int i = 0; i <= num; i++) { 1745 sb.append("\n<text>foo</text>\n"); 1746 } 1747 String textContent = sb.toString(); 1748 String sourceHtml = "<textarea>" + textContent + "</textarea>"; 1749 1750 Document doc = Jsoup.parse(sourceHtml); 1751 Element textArea = doc.expectFirst("textarea"); 1752 1753 assertEquals(textContent, textArea.wholeText()); 1754 } 1755 svgParseTest()1756 @Test void svgParseTest() { 1757 String html = "<div><svg viewBox=2><foreignObject><p>One</p></foreignObject></svg></div>"; 1758 Document doc = Jsoup.parse(html); 1759 1760 assertHtmlNamespace(doc); 1761 Element div = doc.expectFirst("div"); 1762 assertHtmlNamespace(div); 1763 1764 Element svg = doc.expectFirst("svg"); 1765 assertTrue(svg.attributes().hasKey("viewBox")); 1766 assertSvgNamespace(svg); 1767 assertSvgNamespace(doc.expectFirst("foreignObject")); 1768 assertHtmlNamespace(doc.expectFirst("p")); 1769 1770 String serialized = div.html(); 1771 assertEquals("<svg viewBox=\"2\">\n" + 1772 " <foreignObject>\n" + 1773 " <p>One</p>\n" + 1774 " </foreignObject>\n" + 1775 "</svg>", serialized); 1776 } 1777 mathParseText()1778 @Test void mathParseText() { 1779 String html = "<div><math><mi><p>One</p><svg><text>Blah</text></svg></mi><ms></ms></div>"; 1780 Document doc = Jsoup.parse(html); 1781 1782 assertHtmlNamespace(doc.expectFirst("div")); 1783 assertMathNamespace(doc.expectFirst("math")); 1784 assertMathNamespace(doc.expectFirst("mi")); 1785 assertHtmlNamespace(doc.expectFirst("p")); 1786 assertSvgNamespace(doc.expectFirst("svg")); 1787 assertSvgNamespace(doc.expectFirst("text")); 1788 assertMathNamespace(doc.expectFirst("ms")); 1789 1790 String serialized = doc.expectFirst("div").html(); 1791 assertEquals("<math>\n" + 1792 " <mi>\n" + 1793 " <p>One</p>\n" + 1794 " <svg>\n" + 1795 " <text>Blah</text>\n" + 1796 " </svg></mi><ms></ms>\n" + 1797 "</math>", serialized); 1798 } 1799 assertHtmlNamespace(Element el)1800 private static void assertHtmlNamespace(Element el) { 1801 assertEquals(Parser.NamespaceHtml, el.tag().namespace()); 1802 } 1803 assertSvgNamespace(Element el)1804 private static void assertSvgNamespace(Element el) { 1805 assertEquals(Parser.NamespaceSvg, el.tag().namespace()); 1806 } 1807 assertMathNamespace(Element el)1808 private static void assertMathNamespace(Element el) { 1809 assertEquals(Parser.NamespaceMathml, el.tag().namespace()); 1810 } 1811 mathSvgStyleTest()1812 @Test void mathSvgStyleTest() { 1813 String html = "<style><img></style><math><svg><style><img></img></style></svg></math>"; 1814 Document doc = Jsoup.parse(html); 1815 1816 Element htmlStyle = doc.expectFirst("style"); 1817 assertHtmlNamespace(htmlStyle); 1818 assertEquals("<img>", htmlStyle.data()); // that's not an element, it's data (textish) 1819 1820 Element svgStyle = doc.expectFirst("svg style"); 1821 assertMathNamespace(svgStyle); // in inherited math namespace as not an HTML integration point 1822 Element styleImg = svgStyle.expectFirst("img"); 1823 assertHtmlNamespace(styleImg); // this one is an img tag - in foreign to html elements 1824 1825 assertMathNamespace(doc.expectFirst("svg")); 1826 assertMathNamespace(doc.expectFirst("math")); 1827 } 1828 xmlnsAttributeError()1829 @Test void xmlnsAttributeError() { 1830 String html = "<p><svg></svg></body>"; 1831 Parser parser = Parser.htmlParser().setTrackErrors(10); 1832 Document doc = Jsoup.parse(html, parser); 1833 assertEquals(0, doc.parser().getErrors().size()); 1834 1835 String html2 = "<html xmlns='http://www.w3.org/1999/xhtml'><p xmlns='http://www.w3.org/1999/xhtml'><i xmlns='xhtml'></i></body>"; 1836 Document doc2 = Jsoup.parse(html2, parser); 1837 assertEquals(1, doc2.parser().getErrors().size()); 1838 assertEquals("Invalid xmlns attribute [xhtml] on tag [i]", parser.getErrors().get(0).getErrorMessage()); 1839 } 1840 mathAnnotationSvg()1841 @Test void mathAnnotationSvg() { 1842 String html = "<math><svg>"; // not in annotation, svg will be in math ns 1843 Document doc = Jsoup.parse(html); 1844 assertMathNamespace(doc.expectFirst("math")); 1845 assertMathNamespace(doc.expectFirst("svg")); 1846 1847 String html2 = "<math><annotation-xml><svg>"; // svg will be in svg ns 1848 Document doc2 = Jsoup.parse(html2); 1849 assertMathNamespace(doc2.expectFirst("math")); 1850 assertMathNamespace(doc2.expectFirst("annotation-xml")); 1851 assertSvgNamespace(doc2.expectFirst("svg")); 1852 } 1853 mathHtmlIntegrationPoint()1854 @Test void mathHtmlIntegrationPoint() { 1855 String html = "<math><div>Hello"; 1856 Document doc = Jsoup.parse(html); 1857 assertMathNamespace(doc.expectFirst("math")); 1858 assertHtmlNamespace(doc.expectFirst("div")); 1859 1860 String html2 = "<math><divv>Hello"; 1861 Document doc2 = Jsoup.parse(html2); 1862 assertMathNamespace(doc2.expectFirst("math")); 1863 assertMathNamespace(doc2.expectFirst("divv")); 1864 1865 String html3 = "<math><annotation-xml><divv>Hello"; 1866 Document doc3 = Jsoup.parse(html3); 1867 assertMathNamespace(doc3.expectFirst("math")); 1868 assertMathNamespace(doc3.expectFirst("annotation-xml")); 1869 assertMathNamespace(doc3.expectFirst("divv")); 1870 1871 String html4 = "<math><annotation-xml encoding=text/html><divv>Hello"; 1872 Document doc4 = Jsoup.parse(html4); 1873 assertMathNamespace(doc4.expectFirst("math")); 1874 assertMathNamespace(doc4.expectFirst("annotation-xml")); 1875 assertHtmlNamespace(doc4.expectFirst("divv")); 1876 } 1877 parseEmojiFromMultipointEncoded()1878 @Test void parseEmojiFromMultipointEncoded() { 1879 String html = "<img multi='��' single='💯' hexsingle='💯'>"; 1880 Document document = Jsoup.parse(html); 1881 Element img = document.expectFirst("img"); 1882 assertEquals("\uD83D\uDCAF", img.attr("multi")); 1883 assertEquals("\uD83D\uDCAF", img.attr("single")); 1884 assertEquals("\uD83D\uDCAF", img.attr("hexsingle")); 1885 1886 assertEquals("<img multi=\"\uD83D\uDCAF\" single=\"\uD83D\uDCAF\" hexsingle=\"\uD83D\uDCAF\">", img.outerHtml()); 1887 1888 img.ownerDocument().outputSettings().charset("ascii"); 1889 assertEquals("<img multi=\"💯\" single=\"💯\" hexsingle=\"💯\">", img.outerHtml()); 1890 } 1891 } 1892