package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.internal.Normalizer;

import java.util.HashMap;
import java.util.Map;
import java.util.function.Consumer;

/**
 * Tag capabilities.
 * <p>
 * Pre-defined tags are registered once, in the static initialisers at the bottom of this class, in a registry keyed
 * by lower-case tag name. Tags that are not in the registry are created on demand by
 * {@link #valueOf(String, String, ParseSettings)} and are never registered.
 * </p>
 *
 * @author Jonathan Hedley
 */
public class Tag implements Cloneable {
    private static final Map<String, Tag> Tags = new HashMap<>(); // map of known tags, keyed by lower-case name

    private String tagName;
    private final String normalName; // always the lower case version of this tag, regardless of case preservation mode
    private String namespace;
    private boolean isBlock = true; // block
    private boolean formatAsBlock = true; // should be formatted as a block
    private boolean empty = false; // can hold nothing; e.g. img
    private boolean selfClosing = false; // can self close (<foo />). used for unknown tags that self close, without forcing them as empty.
    private boolean preserveWhitespace = false; // for pre, textarea, script etc
    private boolean formList = false; // a control that appears in forms: input, textarea, output etc
    private boolean formSubmit = false; // a control that can be submitted in a form: input etc

    /**
     * Create a tag with default capabilities (block, formatted as block, not empty).
     *
     * @param tagName the name of the tag, in the case it should be reported in
     * @param namespace the namespace the tag belongs to
     */
    private Tag(String tagName, String namespace) {
        this.tagName = tagName;
        normalName = Normalizer.lowerCase(tagName);
        this.namespace = namespace;
    }

    /**
     * Get this tag's name.
     *
     * @return the tag's name
     */
    public String getName() {
        return tagName;
    }

    /**
     * Get this tag's normalized (lowercased) name.
     * @return the tag's normal name.
     */
    public String normalName() {
        return normalName;
    }

    /**
     * Get this tag's namespace.
     * @return the tag's namespace
     */
    public String namespace() {
        return namespace;
    }

    /**
     * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything.
     * <p>
     * A pre-defined tag (p, div etc) requested by its registered lower-case name and namespace is returned as the
     * shared instance, and so will be ==. If the settings preserve tag case and the requested name differs from the
     * lower-case name, a clone carrying the requested name is returned instead. Unknown tags are not registered and
     * will only .equals().
     * </p>
     *
     * @param tagName Name of tag, e.g. "p". Case-insensitive.
     * @param namespace the namespace for the tag.
     * @param settings used to control tag name sensitivity
     * @return The tag, either defined or new generic.
     */
    public static Tag valueOf(String tagName, String namespace, ParseSettings settings) {
        Validate.notEmpty(tagName);
        Validate.notNull(namespace);

        // fast path: the name as given is already a registered key in the requested namespace
        Tag tag = Tags.get(tagName);
        if (tag != null && tag.namespace.equals(namespace))
            return tag;

        tagName = settings.normalizeTag(tagName); // the name we'll use
        Validate.notEmpty(tagName);
        String normalName = Normalizer.lowerCase(tagName); // the lower-case name to get tag settings off
        tag = Tags.get(normalName);
        if (tag != null && tag.namespace.equals(namespace)) {
            if (settings.preserveTagCase() && !tagName.equals(normalName)) {
                tag = tag.clone(); // get a new version vs the static one, so name update doesn't reset all
                tag.tagName = tagName;
            }
            return tag;
        }

        // not defined: create default; go anywhere, do anything! (incl be inside a <p>)
        tag = new Tag(tagName, namespace);
        tag.isBlock = false;

        return tag;
    }

    /**
     * Get a Tag by name, in the HTML namespace, preserving the case of the supplied name. If not previously defined
     * (unknown), returns a new generic tag, that can do anything.
     * <p>
     * Pre-defined tags requested by their lower-case name (p, div etc) will be ==. A pre-defined tag requested in
     * another case (P, DIV) is returned as a clone that keeps the supplied name. Unknown tags are not registered and
     * will only .equals().
     * </p>
     *
     * @param tagName Name of tag, e.g. "p". <b>Case sensitive</b>.
     * @return The tag, either defined or new generic.
     * @see #valueOf(String tagName, String namespace, ParseSettings settings)
     */
    public static Tag valueOf(String tagName) {
        return valueOf(tagName, Parser.NamespaceHtml, ParseSettings.preserveCase);
    }

    /**
     * Get a Tag by name, in the HTML namespace. If not previously defined (unknown), returns a new generic tag, that
     * can do anything.
     * <p>
     * Pre-defined tags requested by their lower-case name (p, div etc) will be ==. Whether a differently-cased name
     * keeps its case is decided by the supplied settings. Unknown tags are not registered and will only .equals().
     * </p>
     *
     * @param tagName Name of tag, e.g. "p".
     * @param settings used to control tag name sensitivity
     * @return The tag, either defined or new generic.
     * @see #valueOf(String tagName, String namespace, ParseSettings settings)
     */
    public static Tag valueOf(String tagName, ParseSettings settings) {
        return valueOf(tagName, Parser.NamespaceHtml, settings);
    }

    /**
     * Gets if this is a block tag.
     *
     * @return if block tag
     */
    public boolean isBlock() {
        return isBlock;
    }

    /**
     * Gets if this tag should be formatted as a block (or as inline)
     *
     * @return if should be formatted as block or inline
     */
    public boolean formatAsBlock() {
        return formatAsBlock;
    }

    /**
     * Gets if this tag is an inline tag.
     *
     * @return if this tag is an inline tag.
     */
    public boolean isInline() {
        return !isBlock;
    }

    /**
     * Get if this is an empty tag
     *
     * @return if this is an empty tag
     */
    public boolean isEmpty() {
        return empty;
    }

    /**
     * Get if this tag is self-closing. Empty tags are always reported as self-closing.
     *
     * @return if this tag should be output as self-closing.
     */
    public boolean isSelfClosing() {
        return empty || selfClosing;
    }

    /**
     * Get if this is a pre-defined tag, or was auto created on parsing.
     * <p>
     * The check uses the normalized (lower-case) name, so a case-preserved copy of a pre-defined tag (e.g. "DIV") is
     * still reported as known.
     * </p>
     *
     * @return if a known tag
     */
    public boolean isKnownTag() {
        // the registry is keyed by lower-case names; tagName may carry preserved case
        return Tags.containsKey(normalName);
    }

    /**
     * Check if this tagname is a known tag. The lookup is an exact match against the registered (lower-case) names.
     *
     * @param tagName name of tag
     * @return if known HTML tag
     */
    public static boolean isKnownTag(String tagName) {
        return Tags.containsKey(tagName);
    }

    /**
     * Get if this tag should preserve whitespace within child text nodes.
     *
     * @return if preserve whitespace
     */
    public boolean preserveWhitespace() {
        return preserveWhitespace;
    }

    /**
     * Get if this tag represents a control associated with a form. E.g. input, textarea, output
     * @return if associated with a form
     */
    public boolean isFormListed() {
        return formList;
    }

    /**
     * Get if this tag represents an element that should be submitted with a form. E.g. input, option
     * @return if submittable with a form
     */
    public boolean isFormSubmittable() {
        return formSubmit;
    }

    /**
     * Mark this tag as self-closing.
     * @return this tag, for chaining
     */
    Tag setSelfClosing() {
        selfClosing = true;
        return this;
    }

    /**
     * Two tags are equal when they have the same name (case-sensitive), the same namespace, and the same capability
     * flags.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (!(o instanceof Tag)) return false;

        Tag tag = (Tag) o;

        if (!tagName.equals(tag.tagName)) return false;
        if (!namespace.equals(tag.namespace)) return false; // same name in another namespace is a different tag
        if (empty != tag.empty) return false;
        if (formatAsBlock != tag.formatAsBlock) return false;
        if (isBlock != tag.isBlock) return false;
        if (preserveWhitespace != tag.preserveWhitespace) return false;
        if (selfClosing != tag.selfClosing) return false;
        if (formList != tag.formList) return false;
        return formSubmit == tag.formSubmit;
    }

    @Override
    public int hashCode() {
        int result = tagName.hashCode();
        result = 31 * result + namespace.hashCode(); // kept in step with equals
        result = 31 * result + (isBlock ? 1 : 0);
        result = 31 * result + (formatAsBlock ? 1 : 0);
        result = 31 * result + (empty ? 1 : 0);
        result = 31 * result + (selfClosing ? 1 : 0);
        result = 31 * result + (preserveWhitespace ? 1 : 0);
        result = 31 * result + (formList ? 1 : 0);
        result = 31 * result + (formSubmit ? 1 : 0);
        return result;
    }

    @Override
    public String toString() {
        return tagName;
    }

    /**
     * Create a copy of this tag. All fields are immutable values (strings and booleans), so the shallow copy made by
     * {@code Object.clone()} is independent of the original.
     */
    @Override
    protected Tag clone() {
        try {
            return (Tag) super.clone();
        } catch (CloneNotSupportedException e) {
            throw new RuntimeException(e);
        }
    }

    // internal static initialisers:
    // prepped from http://www.w3.org/TR/REC-html40/sgml/dtd.html and other sources
    private static final String[] blockTags = {
            "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame",
            "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6",
            "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins",
            "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th",
            "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main",
            "svg", "math", "center",
            "dir", "applet", "marquee", "listing" // deprecated but still known / special handling
    };
    private static final String[] inlineTags = {
            "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd",
            "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "rtc", "a", "img", "br", "wbr", "map", "q",
            "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "button", "optgroup",
            "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track",
            "summary", "command", "device", "basefont", "bgsound", "menuitem",
            "data", "bdi", "s", "strike", "nobr",
            "rb", // deprecated but still known / special handling
            "text", // in SVG NS
            "mi", "mo", "msup", "mn", "mtext" // in MathML NS, to ensure inline
    };
    private static final String[] emptyTags = {
            "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command",
            "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track"
    };
    // todo - rework this to format contents as inline; and update html emitter in Element. Same output, just neater.
    private static final String[] formatAsInlineTags = {
            "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style",
            "ins", "del", "s"
    };
    private static final String[] preserveWhitespaceTags = {
            "pre", "plaintext", "title", "textarea"
            // script is not here as it is a data node, which always preserve whitespace
    };
    // todo: I think we just need submit tags, and can scrub listed
    private static final String[] formListedTags = {
            "button", "fieldset", "input", "keygen", "object", "output", "select", "textarea"
    };
    private static final String[] formSubmitTags = {
            "input", "keygen", "object", "select", "textarea"
    };

    // namespace -> names of the registered tags that live in that namespace rather than in the HTML namespace
    private static final Map<String, String[]> namespaces = new HashMap<>();
    static {
        namespaces.put(Parser.NamespaceMathml, new String[]{"math", "mi", "mo", "msup", "mn", "mtext"});
        namespaces.put(Parser.NamespaceSvg, new String[]{"svg", "text"});
        // We don't need absolute coverage here as other cases will be inferred by the HtmlTreeBuilder
    }

    /**
     * Apply a modifier to each named tag in the registry, first creating and registering (in the HTML namespace) any
     * tag that is not there yet.
     *
     * @param tagNames the lower-case names of the tags to set up
     * @param tagModifier the change to apply to each of those tags
     */
    private static void setupTags(String[] tagNames, Consumer<Tag> tagModifier) {
        for (String tagName : tagNames) {
            Tag tag = Tags.get(tagName);
            if (tag == null) {
                tag = new Tag(tagName, Parser.NamespaceHtml);
                Tags.put(tag.tagName, tag);
            }
            tagModifier.accept(tag);
        }
    }

    // Registry population. Order matters: later calls override flags set by earlier ones (e.g. formatAsInlineTags
    // turns off formatAsBlock for some block tags), and namespaces are reassigned last.
    static {
        setupTags(blockTags, tag -> {
            tag.isBlock = true;
            tag.formatAsBlock = true;
        });

        setupTags(inlineTags, tag -> {
            tag.isBlock = false;
            tag.formatAsBlock = false;
        });

        setupTags(emptyTags, tag -> tag.empty = true);
        setupTags(formatAsInlineTags, tag -> tag.formatAsBlock = false);
        setupTags(preserveWhitespaceTags, tag -> tag.preserveWhitespace = true);
        setupTags(formListedTags, tag -> tag.formList = true);
        setupTags(formSubmitTags, tag -> tag.formSubmit = true);
        for (Map.Entry<String, String[]> ns : namespaces.entrySet()) {
            setupTags(ns.getValue(), tag -> tag.namespace = ns.getKey());
        }
    }
}