1 /* 2 * Summary: interface for an HTML 4.0 non-verifying parser 3 * Description: this module implements an HTML 4.0 non-verifying parser 4 * with API compatible with the XML parser ones. It should 5 * be able to parse "real world" HTML, even if severely 6 * broken from a specification point of view. 7 * 8 * Copy: See Copyright for the status of this software. 9 * 10 * Author: Daniel Veillard 11 */ 12 13 #ifndef __HTML_PARSER_H__ 14 #define __HTML_PARSER_H__ 15 #include <libxml/xmlversion.h> 16 #include <libxml/parser.h> 17 18 #ifdef LIBXML_HTML_ENABLED 19 20 #ifdef __cplusplus 21 extern "C" { 22 #endif 23 24 /* 25 * Most of the back-end structures from XML and HTML are shared. 26 */ 27 typedef xmlParserCtxt htmlParserCtxt; 28 typedef xmlParserCtxtPtr htmlParserCtxtPtr; 29 typedef xmlParserNodeInfo htmlParserNodeInfo; 30 typedef xmlSAXHandler htmlSAXHandler; 31 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; 32 typedef xmlParserInput htmlParserInput; 33 typedef xmlParserInputPtr htmlParserInputPtr; 34 typedef xmlDocPtr htmlDocPtr; 35 typedef xmlNodePtr htmlNodePtr; 36 37 /* 38 * Internal description of an HTML element, representing HTML 4.01 39 * and XHTML 1.0 (which share the same structure). 40 */ 41 typedef struct _htmlElemDesc htmlElemDesc; 42 typedef htmlElemDesc *htmlElemDescPtr; 43 struct _htmlElemDesc { 44 const char *name; /* The tag name */ 45 char startTag; /* unused */ 46 char endTag; /* Whether the end tag can be implied */ 47 char saveEndTag; /* Whether the end tag should be saved */ 48 char empty; /* Is this an empty element ? */ 49 char depr; /* unused */ 50 char dtd; /* unused */ 51 char isinline; /* is this a block 0 or inline 1 element */ 52 const char *desc; /* the description */ 53 54 const char** subelts XML_DEPRECATED_MEMBER; 55 const char* defaultsubelt XML_DEPRECATED_MEMBER; 56 const char** attrs_opt XML_DEPRECATED_MEMBER; 57 const char** attrs_depr XML_DEPRECATED_MEMBER; 58 const char** attrs_req XML_DEPRECATED_MEMBER; 59 60 int dataMode; 61 }; 62 63 /* 64 * Internal description of an HTML entity. 65 */ 66 typedef struct _htmlEntityDesc htmlEntityDesc; 67 typedef htmlEntityDesc *htmlEntityDescPtr; 68 struct _htmlEntityDesc { 69 unsigned int value; /* the UNICODE value for the character */ 70 const char *name; /* The entity name */ 71 const char *desc; /* the description */ 72 }; 73 74 #ifdef LIBXML_SAX1_ENABLED 75 76 XML_DEPRECATED 77 XMLPUBVAR const xmlSAXHandlerV1 htmlDefaultSAXHandler; 78 79 #ifdef LIBXML_THREAD_ENABLED 80 XML_DEPRECATED 81 XMLPUBFUN const xmlSAXHandlerV1 *__htmlDefaultSAXHandler(void); 82 #endif 83 84 #endif /* LIBXML_SAX1_ENABLED */ 85 86 /* 87 * There is only few public functions. 88 */ 89 XML_DEPRECATED 90 XMLPUBFUN void 91 htmlInitAutoClose (void); 92 XMLPUBFUN const htmlElemDesc * 93 htmlTagLookup (const xmlChar *tag); 94 XMLPUBFUN const htmlEntityDesc * 95 htmlEntityLookup(const xmlChar *name); 96 XMLPUBFUN const htmlEntityDesc * 97 htmlEntityValueLookup(unsigned int value); 98 99 XML_DEPRECATED 100 XMLPUBFUN int 101 htmlIsAutoClosed(htmlDocPtr doc, 102 htmlNodePtr elem); 103 XML_DEPRECATED 104 XMLPUBFUN int 105 htmlAutoCloseTag(htmlDocPtr doc, 106 const xmlChar *name, 107 htmlNodePtr elem); 108 XML_DEPRECATED 109 XMLPUBFUN const htmlEntityDesc * 110 htmlParseEntityRef(htmlParserCtxtPtr ctxt, 111 const xmlChar **str); 112 XML_DEPRECATED 113 XMLPUBFUN int 114 htmlParseCharRef(htmlParserCtxtPtr ctxt); 115 XML_DEPRECATED 116 XMLPUBFUN void 117 htmlParseElement(htmlParserCtxtPtr ctxt); 118 119 XMLPUBFUN htmlParserCtxtPtr 120 htmlNewParserCtxt(void); 121 XMLPUBFUN htmlParserCtxtPtr 122 htmlNewSAXParserCtxt(const htmlSAXHandler *sax, 123 void *userData); 124 125 XMLPUBFUN htmlParserCtxtPtr 126 htmlCreateMemoryParserCtxt(const char *buffer, 127 int size); 128 129 XMLPUBFUN int 130 htmlParseDocument(htmlParserCtxtPtr ctxt); 131 XML_DEPRECATED 132 XMLPUBFUN htmlDocPtr 133 htmlSAXParseDoc (const xmlChar *cur, 134 const char *encoding, 135 htmlSAXHandlerPtr sax, 136 void *userData); 137 XMLPUBFUN htmlDocPtr 138 htmlParseDoc (const xmlChar *cur, 139 const char *encoding); 140 XMLPUBFUN htmlParserCtxtPtr 141 htmlCreateFileParserCtxt(const char *filename, 142 const char *encoding); 143 XML_DEPRECATED 144 XMLPUBFUN htmlDocPtr 145 htmlSAXParseFile(const char *filename, 146 const char *encoding, 147 htmlSAXHandlerPtr sax, 148 void *userData); 149 XMLPUBFUN htmlDocPtr 150 htmlParseFile (const char *filename, 151 const char *encoding); 152 XMLPUBFUN int 153 UTF8ToHtml (unsigned char *out, 154 int *outlen, 155 const unsigned char *in, 156 int *inlen); 157 XMLPUBFUN int 158 htmlEncodeEntities(unsigned char *out, 159 int *outlen, 160 const unsigned char *in, 161 int *inlen, int quoteChar); 162 XMLPUBFUN int 163 htmlIsScriptAttribute(const xmlChar *name); 164 XML_DEPRECATED 165 XMLPUBFUN int 166 htmlHandleOmittedElem(int val); 167 168 #ifdef LIBXML_PUSH_ENABLED 169 /** 170 * Interfaces for the Push mode. 171 */ 172 XMLPUBFUN htmlParserCtxtPtr 173 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, 174 void *user_data, 175 const char *chunk, 176 int size, 177 const char *filename, 178 xmlCharEncoding enc); 179 XMLPUBFUN int 180 htmlParseChunk (htmlParserCtxtPtr ctxt, 181 const char *chunk, 182 int size, 183 int terminate); 184 #endif /* LIBXML_PUSH_ENABLED */ 185 186 XMLPUBFUN void 187 htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); 188 189 /* 190 * New set of simpler/more flexible APIs 191 */ 192 /** 193 * xmlParserOption: 194 * 195 * This is the set of XML parser options that can be passed down 196 * to the xmlReadDoc() and similar calls. 197 */ 198 typedef enum { 199 HTML_PARSE_RECOVER = 1<<0, /* No effect */ 200 HTML_PARSE_HTML5 = 1<<1, /* HTML5 support */ 201 HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */ 202 HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */ 203 HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */ 204 HTML_PARSE_PEDANTIC = 1<<7, /* No effect */ 205 HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */ 206 HTML_PARSE_NONET = 1<<11,/* No effect */ 207 HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */ 208 HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */ 209 HTML_PARSE_HUGE = 1<<19,/* relax any hardcoded limit from the parser */ 210 HTML_PARSE_IGNORE_ENC=1<<21,/* ignore internal document encoding hint */ 211 HTML_PARSE_BIG_LINES= 1<<22 /* Store big lines numbers in text PSVI field */ 212 } htmlParserOption; 213 214 XMLPUBFUN void 215 htmlCtxtReset (htmlParserCtxtPtr ctxt); 216 XMLPUBFUN int 217 htmlCtxtSetOptions (htmlParserCtxtPtr ctxt, 218 int options); 219 XMLPUBFUN int 220 htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, 221 int options); 222 XMLPUBFUN htmlDocPtr 223 htmlReadDoc (const xmlChar *cur, 224 const char *URL, 225 const char *encoding, 226 int options); 227 XMLPUBFUN htmlDocPtr 228 htmlReadFile (const char *URL, 229 const char *encoding, 230 int options); 231 XMLPUBFUN htmlDocPtr 232 htmlReadMemory (const char *buffer, 233 int size, 234 const char *URL, 235 const char *encoding, 236 int options); 237 XMLPUBFUN htmlDocPtr 238 htmlReadFd (int fd, 239 const char *URL, 240 const char *encoding, 241 int options); 242 XMLPUBFUN htmlDocPtr 243 htmlReadIO (xmlInputReadCallback ioread, 244 xmlInputCloseCallback ioclose, 245 void *ioctx, 246 const char *URL, 247 const char *encoding, 248 int options); 249 XMLPUBFUN htmlDocPtr 250 htmlCtxtParseDocument (htmlParserCtxtPtr ctxt, 251 xmlParserInputPtr input); 252 XMLPUBFUN htmlDocPtr 253 htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, 254 const xmlChar *cur, 255 const char *URL, 256 const char *encoding, 257 int options); 258 XMLPUBFUN htmlDocPtr 259 htmlCtxtReadFile (xmlParserCtxtPtr ctxt, 260 const char *filename, 261 const char *encoding, 262 int options); 263 XMLPUBFUN htmlDocPtr 264 htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, 265 const char *buffer, 266 int size, 267 const char *URL, 268 const char *encoding, 269 int options); 270 XMLPUBFUN htmlDocPtr 271 htmlCtxtReadFd (xmlParserCtxtPtr ctxt, 272 int fd, 273 const char *URL, 274 const char *encoding, 275 int options); 276 XMLPUBFUN htmlDocPtr 277 htmlCtxtReadIO (xmlParserCtxtPtr ctxt, 278 xmlInputReadCallback ioread, 279 xmlInputCloseCallback ioclose, 280 void *ioctx, 281 const char *URL, 282 const char *encoding, 283 int options); 284 285 /* deprecated content model 286 */ 287 typedef enum { 288 HTML_NA = 0 , /* something we don't check at all */ 289 HTML_INVALID = 0x1 , 290 HTML_DEPRECATED = 0x2 , 291 HTML_VALID = 0x4 , 292 HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */ 293 } htmlStatus ; 294 295 /* Using htmlElemDesc rather than name here, to emphasise the fact 296 that otherwise there's a lookup overhead 297 */ 298 XML_DEPRECATED 299 XMLPUBFUN htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; 300 XML_DEPRECATED 301 XMLPUBFUN int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; 302 XML_DEPRECATED 303 XMLPUBFUN htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; 304 XML_DEPRECATED 305 XMLPUBFUN htmlStatus htmlNodeStatus(htmlNodePtr, int) ; 306 /** 307 * htmlDefaultSubelement: 308 * @elt: HTML element 309 * 310 * Returns the default subelement for this element 311 */ 312 #define htmlDefaultSubelement(elt) elt->defaultsubelt 313 /** 314 * htmlElementAllowedHereDesc: 315 * @parent: HTML parent element 316 * @elt: HTML element 317 * 318 * Checks whether an HTML element description may be a 319 * direct child of the specified element. 320 * 321 * Returns 1 if allowed; 0 otherwise. 322 */ 323 #define htmlElementAllowedHereDesc(parent,elt) \ 324 htmlElementAllowedHere((parent), (elt)->name) 325 /** 326 * htmlRequiredAttrs: 327 * @elt: HTML element 328 * 329 * Returns the attributes required for the specified element. 330 */ 331 #define htmlRequiredAttrs(elt) (elt)->attrs_req 332 333 334 #ifdef __cplusplus 335 } 336 #endif 337 338 #endif /* LIBXML_HTML_ENABLED */ 339 #endif /* __HTML_PARSER_H__ */ 340