"""Shim module exporting the same ElementTree API for lxml and
xml.etree backends.

When lxml is installed, it is automatically preferred over the built-in
xml.etree module.
On Python 2.7, the cElementTree module is preferred over the pure-python
ElementTree module.

Besides exporting a unified interface, this also defines extra functions
or subclasses built-in ElementTree classes to add features that are
only available in lxml, like OrderedDict for attributes, pretty_print and
iterwalk.
"""

from fontTools.misc.textTools import tostr


XML_DECLARATION = """<?xml version='1.0' encoding='%s'?>"""

__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "XML",
    "XMLParser",
    "register_namespace",
]

try:
    from lxml.etree import *

    _have_lxml = True
except ImportError:
    try:
        from xml.etree.cElementTree import *

        # the cElementTree version of XML function doesn't support
        # the optional 'parser' keyword argument
        from xml.etree.ElementTree import XML
    except ImportError:  # pragma: no cover
        from xml.etree.ElementTree import *
    _have_lxml = False

    import sys

    # dict is always ordered in python >= 3.6 and on pypy
    PY36 = sys.version_info >= (3, 6)
    try:
        import __pypy__
    except ImportError:
        __pypy__ = None
    _dict_is_ordered = bool(PY36 or __pypy__)
    del PY36, __pypy__

    if _dict_is_ordered:
        _Attrib = dict
    else:
        from collections import OrderedDict as _Attrib

    if isinstance(Element, type):
        _Element = Element
    else:
        # in py27, cElementTree.Element cannot be subclassed, so
        # we need to import the pure-python class
        from xml.etree.ElementTree import Element as _Element

    class Element(_Element):
        """Element subclass that keeps the order of attributes."""

        # NOTE: the shared default ``attrib`` instance is only ever read;
        # each element gets its own fresh mapping below.
        def __init__(self, tag, attrib=_Attrib(), **extra):
            """Create an element named 'tag'.

            The attributes are copied into a new ``_Attrib`` mapping: first
            the items of 'attrib', then the 'extra' keyword arguments.
            """
            super(Element, self).__init__(tag)
            self.attrib = _Attrib()
            if attrib:
                self.attrib.update(attrib)
            if extra:
                self.attrib.update(extra)

    def SubElement(parent, tag, attrib=_Attrib(), **extra):
        """Must override SubElement as well otherwise _elementtree.SubElement
        fails if 'parent' is a subclass of Element object.

        The new element is an instance of the parent's own class; it is
        appended to 'parent' and returned.
        """
        element = parent.__class__(tag, attrib, **extra)
        parent.append(element)
        return element

    def _iterwalk(element, events, tag):
        """Recursively yield (event, element) pairs for 'element' and its
        descendants, in document order.

        Only elements whose tag equals 'tag' are reported (all of them when
        'tag' is None), and only for the event names listed in 'events'.
        """
        include = tag is None or element.tag == tag
        if include and "start" in events:
            yield ("start", element)
        for e in element:
            for item in _iterwalk(e, events, tag):
                yield item
        # Only report "end" when it was requested, mirroring the "start"
        # check above.
        if include and "end" in events:
            yield ("end", element)

    def iterwalk(element_or_tree, events=("end",), tag=None):
        """A tree walker that generates events from an existing tree as
        if it was parsing XML data with iterparse().
        Drop-in replacement for lxml.etree.iterwalk.

        'element_or_tree' is either an element or an object with a
        getroot() method. A 'tag' of "*" is treated like None, i.e. it
        matches every element.
        """
        if iselement(element_or_tree):
            element = element_or_tree
        else:
            element = element_or_tree.getroot()
        if tag == "*":
            tag = None
        for item in _iterwalk(element, events, tag):
            yield item

    _ElementTree = ElementTree

    class ElementTree(_ElementTree):
        """ElementTree subclass that adds 'pretty_print' and 'doctype'
        arguments to the 'write' method.
        Currently these are only supported for the default XML serialization
        'method', and not also for "html" or "text", for these are delegated
        to the base class.
        """

        def write(
            self,
            file_or_filename,
            encoding=None,
            xml_declaration=False,
            method=None,
            doctype=None,
            pretty_print=False,
        ):
            """Serialize the tree to 'file_or_filename'.

            file_or_filename: a file name, or an object with a write method.
            encoding: output encoding; None means "ASCII". The special value
                "unicode" (matched case-insensitively) writes text rather
                than encoded bytes.
            xml_declaration: whether to emit an XML declaration. If None, one
                is written only when an encoding other than ASCII, US-ASCII,
                UTF-8 or UTF8 was given.
            method: any value other than None or "xml" is delegated to the
                base class, in which case 'doctype' and 'pretty_print' are
                not used.
            doctype: optional string written before the root element.
            pretty_print: if true, indent the tree before writing.

            Raises ValueError if an XML declaration is requested together
            with the "unicode" encoding.
            """
            if method and method != "xml":
                # delegate to super-class
                super(ElementTree, self).write(
                    file_or_filename,
                    encoding=encoding,
                    xml_declaration=xml_declaration,
                    method=method,
                )
                return

            if encoding is not None and encoding.lower() == "unicode":
                if xml_declaration:
                    raise ValueError(
                        "Serialisation to unicode must not request an XML declaration"
                    )
                write_declaration = False
                encoding = "unicode"
            elif xml_declaration is None:
                # by default, write an XML declaration only for non-standard encodings
                write_declaration = encoding is not None and encoding.upper() not in (
                    "ASCII",
                    "UTF-8",
                    "UTF8",
                    "US-ASCII",
                )
            else:
                write_declaration = xml_declaration

            if encoding is None:
                encoding = "ASCII"

            if pretty_print:
                # NOTE this will modify the tree in-place
                _indent(self._root)

            with _get_writer(file_or_filename, encoding) as write:
                if write_declaration:
                    write(XML_DECLARATION % encoding.upper())
                    if pretty_print:
                        write("\n")
                if doctype:
                    write(_tounicode(doctype))
                    if pretty_print:
                        write("\n")

                qnames, namespaces = _namespaces(self._root)
                _serialize_xml(write, self._root, qnames, namespaces)

    import io

    def tostring(
        element,
        encoding=None,
        xml_declaration=None,
        method=None,
        doctype=None,
        pretty_print=False,
    ):
        """Custom 'tostring' function that uses our ElementTree subclass, with
        pretty_print support.

        The result is collected in an io.StringIO when 'encoding' is exactly
        "unicode" and in an io.BytesIO otherwise; the buffer's content is
        returned. All other arguments are forwarded to ElementTree.write.
        """
        stream = io.StringIO() if encoding == "unicode" else io.BytesIO()
        ElementTree(element).write(
            stream,
            encoding=encoding,
            xml_declaration=xml_declaration,
            method=method,
            doctype=doctype,
            pretty_print=pretty_print,
        )
        return stream.getvalue()

    # serialization support

    import re

    # Valid XML strings can include any Unicode character, excluding control
    # characters, the surrogate blocks, FFFE, and FFFF:
    #   Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    # Here we reversed the pattern to match only the invalid characters.
    # For the 'narrow' python builds supporting only UCS-2, which represent
    # characters beyond BMP as UTF-16 surrogate pairs, we need to pass through
    # the surrogate block. I haven't found a more elegant solution...
    UCS2 = sys.maxunicode < 0x10FFFF
    if UCS2:
        _invalid_xml_string = re.compile(
            "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uFFFE-\uFFFF]"
        )
    else:
        _invalid_xml_string = re.compile(
            "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE-\uFFFF]"
        )

    def _tounicode(s):
        """Test if a string is valid user input and decode it to unicode string
        using ASCII encoding if it's a bytes string.
        Reject all bytes/unicode input that contains non-XML characters.
        Reject all bytes input that contains non-ASCII characters.

        Raises ValueError for rejected strings; input for which the
        conversion raises AttributeError is reported as a TypeError via
        _raise_serialization_error.
        """
        try:
            s = tostr(s, encoding="ascii", errors="strict")
        except UnicodeDecodeError:
            raise ValueError(
                "Bytes strings can only contain ASCII characters. "
                "Use unicode strings for non-ASCII characters."
            )
        except AttributeError:
            _raise_serialization_error(s)
        if s and _invalid_xml_string.search(s):
            raise ValueError(
                "All strings must be XML compatible: Unicode or ASCII, "
                "no NULL bytes or control characters"
            )
        return s

    import contextlib

    @contextlib.contextmanager
    def _get_writer(file_or_filename, encoding):
        """Context manager yielding a text 'write' callable for the target.

        'file_or_filename' is either a file name (anything without a 'write'
        attribute) or a file-like object. 'encoding' is an encoding name, or
        "unicode" when the target takes text directly. Characters that cannot
        be encoded are written as XML character references.
        """
        # returns text write method and release all resources after using
        try:
            write = file_or_filename.write
        except AttributeError:
            # file_or_filename is a file name
            f = open(
                file_or_filename,
                "w",
                encoding="utf-8" if encoding == "unicode" else encoding,
                errors="xmlcharrefreplace",
            )
            with f:
                yield f.write
        else:
            # file_or_filename is a file-like object
            # encoding determines if it is a text or binary writer
            if encoding == "unicode":
                # use a text writer as is
                yield write
            else:
                # wrap a binary writer with TextIOWrapper
                detach_buffer = False
                if isinstance(file_or_filename, io.BufferedIOBase):
                    buf = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    buf = io.BufferedWriter(file_or_filename)
                    detach_buffer = True
                else:
                    # This is to handle passed objects that aren't in the
                    # IOBase hierarchy, but just have a write method
                    buf = io.BufferedIOBase()
                    buf.writable = lambda: True
                    buf.write = write
                    try:
                        # TextIOWrapper uses this methods to determine
                        # if BOM (for UTF-16, etc) should be added
                        buf.seekable = file_or_filename.seekable
                        buf.tell = file_or_filename.tell
                    except AttributeError:
                        pass
                wrapper = io.TextIOWrapper(
                    buf,
                    encoding=encoding,
                    errors="xmlcharrefreplace",
                    newline="\n",
                )
                try:
                    yield wrapper.write
                finally:
                    # Keep the original file open when the TextIOWrapper and
                    # the BufferedWriter are destroyed
                    wrapper.detach()
                    if detach_buffer:
                        buf.detach()

    from xml.etree.ElementTree import _namespace_map

    def _namespaces(elem):
        """Collect the qualified names and namespaces used in the tree rooted
        at 'elem'.

        Returns a (qnames, namespaces) pair: 'qnames' maps each tag,
        attribute name or QName value to its serialized "prefix:local" form,
        and 'namespaces' maps namespace URIs to prefixes.
        Raises TypeError (via _raise_serialization_error) for tags that
        cannot be serialized.
        """
        # identify namespaces used in this tree

        # maps qnames to *encoded* prefix:local names
        qnames = {None: None}

        # maps uri:s to prefixes
        namespaces = {}

        def add_qname(qname):
            # calculate serialized qname representation
            try:
                qname = _tounicode(qname)
                if qname[:1] == "{":
                    uri, tag = qname[1:].rsplit("}", 1)
                    prefix = namespaces.get(uri)
                    if prefix is None:
                        prefix = _namespace_map.get(uri)
                        if prefix is None:
                            prefix = "ns%d" % len(namespaces)
                        else:
                            prefix = _tounicode(prefix)
                        if prefix != "xml":
                            namespaces[uri] = prefix
                    if prefix:
                        qnames[qname] = "%s:%s" % (prefix, tag)
                    else:
                        qnames[qname] = tag  # default element
                else:
                    qnames[qname] = qname
            except TypeError:
                _raise_serialization_error(qname)

        # populate qname and namespaces table
        for elem in elem.iter():
            tag = elem.tag
            if isinstance(tag, QName):
                if tag.text not in qnames:
                    add_qname(tag.text)
            elif isinstance(tag, str):
                if tag not in qnames:
                    add_qname(tag)
            elif tag is not None and tag is not Comment and tag is not PI:
                _raise_serialization_error(tag)
            for key, value in elem.items():
                if isinstance(key, QName):
                    key = key.text
                if key not in qnames:
                    add_qname(key)
                if isinstance(value, QName) and value.text not in qnames:
                    add_qname(value.text)
            text = elem.text
            if isinstance(text, QName) and text.text not in qnames:
                add_qname(text.text)
        return qnames, namespaces

    def _serialize_xml(write, elem, qnames, namespaces, **kwargs):
        """Write 'elem', its subtree and its tail as XML using 'write'.

        'qnames' is the name table built by _namespaces. 'namespaces' is the
        URI-to-prefix mapping to declare on this element; recursive calls
        pass None so that declarations are only emitted once.
        """
        tag = elem.tag
        text = elem.text
        if tag is Comment:
            write("<!--%s-->" % _tounicode(text))
        elif tag is ProcessingInstruction:
            write("<?%s?>" % _tounicode(text))
        else:
            tag = qnames[_tounicode(tag) if tag is not None else None]
            if tag is None:
                # element without a tag: only its content is written
                if text:
                    write(_escape_cdata(text))
                for e in elem:
                    _serialize_xml(write, e, qnames, None)
            else:
                write("<" + tag)
                if namespaces:
                    for uri, prefix in sorted(
                        namespaces.items(), key=lambda x: x[1]
                    ):  # sort on prefix
                        if prefix:
                            prefix = ":" + prefix
                        write(' xmlns%s="%s"' % (prefix, _escape_attrib(uri)))
                attrs = elem.attrib
                if attrs:
                    # try to keep existing attrib order
                    if len(attrs) <= 1 or type(attrs) is _Attrib:
                        items = attrs.items()
                    else:
                        # if plain dict, use lexical order
                        items = sorted(attrs.items())
                    for k, v in items:
                        if isinstance(k, QName):
                            k = _tounicode(k.text)
                        else:
                            k = _tounicode(k)
                        if isinstance(v, QName):
                            v = qnames[_tounicode(v.text)]
                        else:
                            v = _escape_attrib(v)
                        write(' %s="%s"' % (qnames[k], v))
                if text is not None or len(elem):
                    write(">")
                    if text:
                        write(_escape_cdata(text))
                    for e in elem:
                        _serialize_xml(write, e, qnames, None)
                    write("</" + tag + ">")
                else:
                    write("/>")
        if elem.tail:
            write(_escape_cdata(elem.tail))

    def _raise_serialization_error(text):
        """Raise a TypeError reporting that 'text' cannot be serialized."""
        raise TypeError("cannot serialize %r (type %s)" % (text, type(text).__name__))

    def _escape_cdata(text):
        """Return 'text' with '&', '<' and '>' replaced by XML entities.

        Raises TypeError (via _raise_serialization_error) if 'text' cannot
        be processed as a string.
        """
        # escape character data
        try:
            text = _tounicode(text)
            # it's worth avoiding do-nothing calls for short strings
            # '&' must be replaced first so that the entities inserted by
            # the following replacements are not escaped a second time
            if "&" in text:
                text = text.replace("&", "&amp;")
            if "<" in text:
                text = text.replace("<", "&lt;")
            if ">" in text:
                text = text.replace(">", "&gt;")
            return text
        except (TypeError, AttributeError):
            _raise_serialization_error(text)

    def _escape_attrib(text):
        """Return 'text' escaped for use inside a double-quoted attribute.

        Besides '&', '<' and '>', double quotes are replaced by an entity
        and newlines by a character reference.
        Raises TypeError (via _raise_serialization_error) if 'text' cannot
        be processed as a string.
        """
        # escape attribute value
        try:
            text = _tounicode(text)
            # '&' must be replaced first, see _escape_cdata
            if "&" in text:
                text = text.replace("&", "&amp;")
            if "<" in text:
                text = text.replace("<", "&lt;")
            if ">" in text:
                text = text.replace(">", "&gt;")
            if '"' in text:
                text = text.replace('"', "&quot;")
            if "\n" in text:
                text = text.replace("\n", "&#10;")
            return text
        except (TypeError, AttributeError):
            _raise_serialization_error(text)

    def _indent(elem, level=0):
        """Indent the tree rooted at 'elem' in place.

        The whitespace-only (or missing) 'text' and 'tail' of each element
        are replaced with a newline followed by two spaces per nesting
        level; text and tails with other content are left untouched.
        """
        # From http://effbot.org/zone/element-lib.htm#prettyprint
        i = "\n" + level * "  "
        if len(elem):
            if not elem.text or not elem.text.strip():
                elem.text = i + "  "
            if not elem.tail or not elem.tail.strip():
                elem.tail = i
            # NOTE: the loop deliberately rebinds 'elem'; after it finishes,
            # 'elem' is the last child, whose tail is dedented so that the
            # parent's closing tag lines up with its opening tag.
            for elem in elem:
                _indent(elem, level + 1)
            if not elem.tail or not elem.tail.strip():
                elem.tail = i
        else:
            if level and (not elem.tail or not elem.tail.strip()):
                elem.tail = i