jpayne@68: """Lightweight XML support for Python. jpayne@68: jpayne@68: XML is an inherently hierarchical data format, and the most natural way to jpayne@68: represent it is with a tree. This module has two classes for this purpose: jpayne@68: jpayne@68: 1. ElementTree represents the whole XML document as a tree and jpayne@68: jpayne@68: 2. Element represents a single node in this tree. jpayne@68: jpayne@68: Interactions with the whole document (reading and writing to/from files) are jpayne@68: usually done on the ElementTree level. Interactions with a single XML element jpayne@68: and its sub-elements are done on the Element level. jpayne@68: jpayne@68: Element is a flexible container object designed to store hierarchical data jpayne@68: structures in memory. It can be described as a cross between a list and a jpayne@68: dictionary. Each Element has a number of properties associated with it: jpayne@68: jpayne@68: 'tag' - a string containing the element's name. jpayne@68: jpayne@68: 'attributes' - a Python dictionary storing the element's attributes. jpayne@68: jpayne@68: 'text' - a string containing the element's text content. jpayne@68: jpayne@68: 'tail' - an optional string containing text after the element's end tag. jpayne@68: jpayne@68: And a number of child elements stored in a Python sequence. jpayne@68: jpayne@68: To create an element instance, use the Element constructor, jpayne@68: or the SubElement factory function. jpayne@68: jpayne@68: You can also use the ElementTree class to wrap an element structure jpayne@68: and convert it to and from XML. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: #--------------------------------------------------------------------- jpayne@68: # Licensed to PSF under a Contributor Agreement. jpayne@68: # See http://www.python.org/psf/license for licensing details. jpayne@68: # jpayne@68: # ElementTree jpayne@68: # Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved. jpayne@68: # jpayne@68: # fredrik@pythonware.com jpayne@68: # http://www.pythonware.com jpayne@68: # -------------------------------------------------------------------- jpayne@68: # The ElementTree toolkit is jpayne@68: # jpayne@68: # Copyright (c) 1999-2008 by Fredrik Lundh jpayne@68: # jpayne@68: # By obtaining, using, and/or copying this software and/or its jpayne@68: # associated documentation, you agree that you have read, understood, jpayne@68: # and will comply with the following terms and conditions: jpayne@68: # jpayne@68: # Permission to use, copy, modify, and distribute this software and jpayne@68: # its associated documentation for any purpose and without fee is jpayne@68: # hereby granted, provided that the above copyright notice appears in jpayne@68: # all copies, and that both that copyright notice and this permission jpayne@68: # notice appear in supporting documentation, and that the name of jpayne@68: # Secret Labs AB or the author not be used in advertising or publicity jpayne@68: # pertaining to distribution of the software without specific, written jpayne@68: # prior permission. jpayne@68: # jpayne@68: # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD jpayne@68: # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- jpayne@68: # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR jpayne@68: # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY jpayne@68: # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, jpayne@68: # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS jpayne@68: # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE jpayne@68: # OF THIS SOFTWARE. jpayne@68: # -------------------------------------------------------------------- jpayne@68: jpayne@68: __all__ = [ jpayne@68: # public symbols jpayne@68: "Comment", jpayne@68: "dump", jpayne@68: "Element", "ElementTree", jpayne@68: "fromstring", "fromstringlist", jpayne@68: "iselement", "iterparse", jpayne@68: "parse", "ParseError", jpayne@68: "PI", "ProcessingInstruction", jpayne@68: "QName", jpayne@68: "SubElement", jpayne@68: "tostring", "tostringlist", jpayne@68: "TreeBuilder", jpayne@68: "VERSION", jpayne@68: "XML", "XMLID", jpayne@68: "XMLParser", "XMLPullParser", jpayne@68: "register_namespace", jpayne@68: "canonicalize", "C14NWriterTarget", jpayne@68: ] jpayne@68: jpayne@68: VERSION = "1.3.0" jpayne@68: jpayne@68: import sys jpayne@68: import re jpayne@68: import warnings jpayne@68: import io jpayne@68: import collections jpayne@68: import collections.abc jpayne@68: import contextlib jpayne@68: jpayne@68: from . import ElementPath jpayne@68: jpayne@68: jpayne@68: class ParseError(SyntaxError): jpayne@68: """An error when parsing an XML document. jpayne@68: jpayne@68: In addition to its exception value, a ParseError contains jpayne@68: two extra attributes: jpayne@68: 'code' - the specific exception code jpayne@68: 'position' - the line and column of the error jpayne@68: jpayne@68: """ jpayne@68: pass jpayne@68: jpayne@68: # -------------------------------------------------------------------- jpayne@68: jpayne@68: jpayne@68: def iselement(element): jpayne@68: """Return True if *element* appears to be an Element.""" jpayne@68: return hasattr(element, 'tag') jpayne@68: jpayne@68: jpayne@68: class Element: jpayne@68: """An XML element. jpayne@68: jpayne@68: This class is the reference implementation of the Element interface. jpayne@68: jpayne@68: An element's length is its number of subelements. That means if you jpayne@68: want to check if an element is truly empty, you should check BOTH jpayne@68: its length AND its text attribute. jpayne@68: jpayne@68: The element tag, attribute names, and attribute values can be either jpayne@68: bytes or strings. jpayne@68: jpayne@68: *tag* is the element name. *attrib* is an optional dictionary containing jpayne@68: element attributes. *extra* are additional element attributes given as jpayne@68: keyword arguments. jpayne@68: jpayne@68: Example form: jpayne@68: text...tail jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: tag = None jpayne@68: """The element's name.""" jpayne@68: jpayne@68: attrib = None jpayne@68: """Dictionary of the element's attributes.""" jpayne@68: jpayne@68: text = None jpayne@68: """ jpayne@68: Text before first subelement. This is either a string or the value None. jpayne@68: Note that if there is no text, this attribute may be either jpayne@68: None or the empty string, depending on the parser. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: tail = None jpayne@68: """ jpayne@68: Text after this element's end tag, but before the next sibling element's jpayne@68: start tag. This is either a string or the value None. Note that if there jpayne@68: was no text, this attribute may be either None or an empty string, jpayne@68: depending on the parser. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, tag, attrib={}, **extra): jpayne@68: if not isinstance(attrib, dict): jpayne@68: raise TypeError("attrib must be dict, not %s" % ( jpayne@68: attrib.__class__.__name__,)) jpayne@68: self.tag = tag jpayne@68: self.attrib = {**attrib, **extra} jpayne@68: self._children = [] jpayne@68: jpayne@68: def __repr__(self): jpayne@68: return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self)) jpayne@68: jpayne@68: def makeelement(self, tag, attrib): jpayne@68: """Create a new element with the same type. jpayne@68: jpayne@68: *tag* is a string containing the element name. jpayne@68: *attrib* is a dictionary containing the element attributes. jpayne@68: jpayne@68: Do not call this method, use the SubElement factory function instead. jpayne@68: jpayne@68: """ jpayne@68: return self.__class__(tag, attrib) jpayne@68: jpayne@68: def copy(self): jpayne@68: """Return copy of current element. jpayne@68: jpayne@68: This creates a shallow copy. Subelements will be shared with the jpayne@68: original tree. jpayne@68: jpayne@68: """ jpayne@68: elem = self.makeelement(self.tag, self.attrib) jpayne@68: elem.text = self.text jpayne@68: elem.tail = self.tail jpayne@68: elem[:] = self jpayne@68: return elem jpayne@68: jpayne@68: def __len__(self): jpayne@68: return len(self._children) jpayne@68: jpayne@68: def __bool__(self): jpayne@68: warnings.warn( jpayne@68: "The behavior of this method will change in future versions. " jpayne@68: "Use specific 'len(elem)' or 'elem is not None' test instead.", jpayne@68: FutureWarning, stacklevel=2 jpayne@68: ) jpayne@68: return len(self._children) != 0 # emulate old behaviour, for now jpayne@68: jpayne@68: def __getitem__(self, index): jpayne@68: return self._children[index] jpayne@68: jpayne@68: def __setitem__(self, index, element): jpayne@68: if isinstance(index, slice): jpayne@68: for elt in element: jpayne@68: self._assert_is_element(elt) jpayne@68: else: jpayne@68: self._assert_is_element(element) jpayne@68: self._children[index] = element jpayne@68: jpayne@68: def __delitem__(self, index): jpayne@68: del self._children[index] jpayne@68: jpayne@68: def append(self, subelement): jpayne@68: """Add *subelement* to the end of this element. jpayne@68: jpayne@68: The new element will appear in document order after the last existing jpayne@68: subelement (or directly after the text, if it's the first subelement), jpayne@68: but before the end tag for this element. jpayne@68: jpayne@68: """ jpayne@68: self._assert_is_element(subelement) jpayne@68: self._children.append(subelement) jpayne@68: jpayne@68: def extend(self, elements): jpayne@68: """Append subelements from a sequence. jpayne@68: jpayne@68: *elements* is a sequence with zero or more elements. jpayne@68: jpayne@68: """ jpayne@68: for element in elements: jpayne@68: self._assert_is_element(element) jpayne@68: self._children.extend(elements) jpayne@68: jpayne@68: def insert(self, index, subelement): jpayne@68: """Insert *subelement* at position *index*.""" jpayne@68: self._assert_is_element(subelement) jpayne@68: self._children.insert(index, subelement) jpayne@68: jpayne@68: def _assert_is_element(self, e): jpayne@68: # Need to refer to the actual Python implementation, not the jpayne@68: # shadowing C implementation. jpayne@68: if not isinstance(e, _Element_Py): jpayne@68: raise TypeError('expected an Element, not %s' % type(e).__name__) jpayne@68: jpayne@68: def remove(self, subelement): jpayne@68: """Remove matching subelement. jpayne@68: jpayne@68: Unlike the find methods, this method compares elements based on jpayne@68: identity, NOT ON tag value or contents. To remove subelements by jpayne@68: other means, the easiest way is to use a list comprehension to jpayne@68: select what elements to keep, and then use slice assignment to update jpayne@68: the parent element. jpayne@68: jpayne@68: ValueError is raised if a matching element could not be found. jpayne@68: jpayne@68: """ jpayne@68: # assert iselement(element) jpayne@68: self._children.remove(subelement) jpayne@68: jpayne@68: def getchildren(self): jpayne@68: """(Deprecated) Return all subelements. jpayne@68: jpayne@68: Elements are returned in document order. jpayne@68: jpayne@68: """ jpayne@68: warnings.warn( jpayne@68: "This method will be removed in future versions. " jpayne@68: "Use 'list(elem)' or iteration over elem instead.", jpayne@68: DeprecationWarning, stacklevel=2 jpayne@68: ) jpayne@68: return self._children jpayne@68: jpayne@68: def find(self, path, namespaces=None): jpayne@68: """Find first matching element by tag name or path. jpayne@68: jpayne@68: *path* is a string having either an element tag or an XPath, jpayne@68: *namespaces* is an optional mapping from namespace prefix to full name. jpayne@68: jpayne@68: Return the first matching element, or None if no element was found. jpayne@68: jpayne@68: """ jpayne@68: return ElementPath.find(self, path, namespaces) jpayne@68: jpayne@68: def findtext(self, path, default=None, namespaces=None): jpayne@68: """Find text for first matching element by tag name or path. jpayne@68: jpayne@68: *path* is a string having either an element tag or an XPath, jpayne@68: *default* is the value to return if the element was not found, jpayne@68: *namespaces* is an optional mapping from namespace prefix to full name. jpayne@68: jpayne@68: Return text content of first matching element, or default value if jpayne@68: none was found. Note that if an element is found having no text jpayne@68: content, the empty string is returned. jpayne@68: jpayne@68: """ jpayne@68: return ElementPath.findtext(self, path, default, namespaces) jpayne@68: jpayne@68: def findall(self, path, namespaces=None): jpayne@68: """Find all matching subelements by tag name or path. jpayne@68: jpayne@68: *path* is a string having either an element tag or an XPath, jpayne@68: *namespaces* is an optional mapping from namespace prefix to full name. jpayne@68: jpayne@68: Returns list containing all matching elements in document order. jpayne@68: jpayne@68: """ jpayne@68: return ElementPath.findall(self, path, namespaces) jpayne@68: jpayne@68: def iterfind(self, path, namespaces=None): jpayne@68: """Find all matching subelements by tag name or path. jpayne@68: jpayne@68: *path* is a string having either an element tag or an XPath, jpayne@68: *namespaces* is an optional mapping from namespace prefix to full name. jpayne@68: jpayne@68: Return an iterable yielding all matching elements in document order. jpayne@68: jpayne@68: """ jpayne@68: return ElementPath.iterfind(self, path, namespaces) jpayne@68: jpayne@68: def clear(self): jpayne@68: """Reset element. jpayne@68: jpayne@68: This function removes all subelements, clears all attributes, and sets jpayne@68: the text and tail attributes to None. jpayne@68: jpayne@68: """ jpayne@68: self.attrib.clear() jpayne@68: self._children = [] jpayne@68: self.text = self.tail = None jpayne@68: jpayne@68: def get(self, key, default=None): jpayne@68: """Get element attribute. jpayne@68: jpayne@68: Equivalent to attrib.get, but some implementations may handle this a jpayne@68: bit more efficiently. *key* is what attribute to look for, and jpayne@68: *default* is what to return if the attribute was not found. jpayne@68: jpayne@68: Returns a string containing the attribute value, or the default if jpayne@68: attribute was not found. jpayne@68: jpayne@68: """ jpayne@68: return self.attrib.get(key, default) jpayne@68: jpayne@68: def set(self, key, value): jpayne@68: """Set element attribute. jpayne@68: jpayne@68: Equivalent to attrib[key] = value, but some implementations may handle jpayne@68: this a bit more efficiently. *key* is what attribute to set, and jpayne@68: *value* is the attribute value to set it to. jpayne@68: jpayne@68: """ jpayne@68: self.attrib[key] = value jpayne@68: jpayne@68: def keys(self): jpayne@68: """Get list of attribute names. jpayne@68: jpayne@68: Names are returned in an arbitrary order, just like an ordinary jpayne@68: Python dict. Equivalent to attrib.keys() jpayne@68: jpayne@68: """ jpayne@68: return self.attrib.keys() jpayne@68: jpayne@68: def items(self): jpayne@68: """Get element attributes as a sequence. jpayne@68: jpayne@68: The attributes are returned in arbitrary order. Equivalent to jpayne@68: attrib.items(). jpayne@68: jpayne@68: Return a list of (name, value) tuples. jpayne@68: jpayne@68: """ jpayne@68: return self.attrib.items() jpayne@68: jpayne@68: def iter(self, tag=None): jpayne@68: """Create tree iterator. jpayne@68: jpayne@68: The iterator loops over the element and all subelements in document jpayne@68: order, returning all elements with a matching tag. jpayne@68: jpayne@68: If the tree structure is modified during iteration, new or removed jpayne@68: elements may or may not be included. To get a stable set, use the jpayne@68: list() function on the iterator, and loop over the resulting list. jpayne@68: jpayne@68: *tag* is what tags to look for (default is to return all elements) jpayne@68: jpayne@68: Return an iterator containing all the matching elements. jpayne@68: jpayne@68: """ jpayne@68: if tag == "*": jpayne@68: tag = None jpayne@68: if tag is None or self.tag == tag: jpayne@68: yield self jpayne@68: for e in self._children: jpayne@68: yield from e.iter(tag) jpayne@68: jpayne@68: # compatibility jpayne@68: def getiterator(self, tag=None): jpayne@68: warnings.warn( jpayne@68: "This method will be removed in future versions. " jpayne@68: "Use 'elem.iter()' or 'list(elem.iter())' instead.", jpayne@68: DeprecationWarning, stacklevel=2 jpayne@68: ) jpayne@68: return list(self.iter(tag)) jpayne@68: jpayne@68: def itertext(self): jpayne@68: """Create text iterator. jpayne@68: jpayne@68: The iterator loops over the element and all subelements in document jpayne@68: order, returning all inner text. jpayne@68: jpayne@68: """ jpayne@68: tag = self.tag jpayne@68: if not isinstance(tag, str) and tag is not None: jpayne@68: return jpayne@68: t = self.text jpayne@68: if t: jpayne@68: yield t jpayne@68: for e in self: jpayne@68: yield from e.itertext() jpayne@68: t = e.tail jpayne@68: if t: jpayne@68: yield t jpayne@68: jpayne@68: jpayne@68: def SubElement(parent, tag, attrib={}, **extra): jpayne@68: """Subelement factory which creates an element instance, and appends it jpayne@68: to an existing parent. jpayne@68: jpayne@68: The element tag, attribute names, and attribute values can be either jpayne@68: bytes or Unicode strings. jpayne@68: jpayne@68: *parent* is the parent element, *tag* is the subelements name, *attrib* is jpayne@68: an optional directory containing element attributes, *extra* are jpayne@68: additional attributes given as keyword arguments. jpayne@68: jpayne@68: """ jpayne@68: attrib = {**attrib, **extra} jpayne@68: element = parent.makeelement(tag, attrib) jpayne@68: parent.append(element) jpayne@68: return element jpayne@68: jpayne@68: jpayne@68: def Comment(text=None): jpayne@68: """Comment element factory. jpayne@68: jpayne@68: This function creates a special element which the standard serializer jpayne@68: serializes as an XML comment. jpayne@68: jpayne@68: *text* is a string containing the comment string. jpayne@68: jpayne@68: """ jpayne@68: element = Element(Comment) jpayne@68: element.text = text jpayne@68: return element jpayne@68: jpayne@68: jpayne@68: def ProcessingInstruction(target, text=None): jpayne@68: """Processing Instruction element factory. jpayne@68: jpayne@68: This function creates a special element which the standard serializer jpayne@68: serializes as an XML comment. jpayne@68: jpayne@68: *target* is a string containing the processing instruction, *text* is a jpayne@68: string containing the processing instruction contents, if any. jpayne@68: jpayne@68: """ jpayne@68: element = Element(ProcessingInstruction) jpayne@68: element.text = target jpayne@68: if text: jpayne@68: element.text = element.text + " " + text jpayne@68: return element jpayne@68: jpayne@68: PI = ProcessingInstruction jpayne@68: jpayne@68: jpayne@68: class QName: jpayne@68: """Qualified name wrapper. jpayne@68: jpayne@68: This class can be used to wrap a QName attribute value in order to get jpayne@68: proper namespace handing on output. jpayne@68: jpayne@68: *text_or_uri* is a string containing the QName value either in the form jpayne@68: {uri}local, or if the tag argument is given, the URI part of a QName. jpayne@68: jpayne@68: *tag* is an optional argument which if given, will make the first jpayne@68: argument (text_or_uri) be interpreted as a URI, and this argument (tag) jpayne@68: be interpreted as a local name. jpayne@68: jpayne@68: """ jpayne@68: def __init__(self, text_or_uri, tag=None): jpayne@68: if tag: jpayne@68: text_or_uri = "{%s}%s" % (text_or_uri, tag) jpayne@68: self.text = text_or_uri jpayne@68: def __str__(self): jpayne@68: return self.text jpayne@68: def __repr__(self): jpayne@68: return '<%s %r>' % (self.__class__.__name__, self.text) jpayne@68: def __hash__(self): jpayne@68: return hash(self.text) jpayne@68: def __le__(self, other): jpayne@68: if isinstance(other, QName): jpayne@68: return self.text <= other.text jpayne@68: return self.text <= other jpayne@68: def __lt__(self, other): jpayne@68: if isinstance(other, QName): jpayne@68: return self.text < other.text jpayne@68: return self.text < other jpayne@68: def __ge__(self, other): jpayne@68: if isinstance(other, QName): jpayne@68: return self.text >= other.text jpayne@68: return self.text >= other jpayne@68: def __gt__(self, other): jpayne@68: if isinstance(other, QName): jpayne@68: return self.text > other.text jpayne@68: return self.text > other jpayne@68: def __eq__(self, other): jpayne@68: if isinstance(other, QName): jpayne@68: return self.text == other.text jpayne@68: return self.text == other jpayne@68: jpayne@68: # -------------------------------------------------------------------- jpayne@68: jpayne@68: jpayne@68: class ElementTree: jpayne@68: """An XML element hierarchy. jpayne@68: jpayne@68: This class also provides support for serialization to and from jpayne@68: standard XML. jpayne@68: jpayne@68: *element* is an optional root element node, jpayne@68: *file* is an optional file handle or file name of an XML file whose jpayne@68: contents will be used to initialize the tree with. jpayne@68: jpayne@68: """ jpayne@68: def __init__(self, element=None, file=None): jpayne@68: # assert element is None or iselement(element) jpayne@68: self._root = element # first node jpayne@68: if file: jpayne@68: self.parse(file) jpayne@68: jpayne@68: def getroot(self): jpayne@68: """Return root element of this tree.""" jpayne@68: return self._root jpayne@68: jpayne@68: def _setroot(self, element): jpayne@68: """Replace root element of this tree. jpayne@68: jpayne@68: This will discard the current contents of the tree and replace it jpayne@68: with the given element. Use with care! jpayne@68: jpayne@68: """ jpayne@68: # assert iselement(element) jpayne@68: self._root = element jpayne@68: jpayne@68: def parse(self, source, parser=None): jpayne@68: """Load external XML document into element tree. jpayne@68: jpayne@68: *source* is a file name or file object, *parser* is an optional parser jpayne@68: instance that defaults to XMLParser. jpayne@68: jpayne@68: ParseError is raised if the parser fails to parse the document. jpayne@68: jpayne@68: Returns the root element of the given source document. jpayne@68: jpayne@68: """ jpayne@68: close_source = False jpayne@68: if not hasattr(source, "read"): jpayne@68: source = open(source, "rb") jpayne@68: close_source = True jpayne@68: try: jpayne@68: if parser is None: jpayne@68: # If no parser was specified, create a default XMLParser jpayne@68: parser = XMLParser() jpayne@68: if hasattr(parser, '_parse_whole'): jpayne@68: # The default XMLParser, when it comes from an accelerator, jpayne@68: # can define an internal _parse_whole API for efficiency. jpayne@68: # It can be used to parse the whole source without feeding jpayne@68: # it with chunks. jpayne@68: self._root = parser._parse_whole(source) jpayne@68: return self._root jpayne@68: while True: jpayne@68: data = source.read(65536) jpayne@68: if not data: jpayne@68: break jpayne@68: parser.feed(data) jpayne@68: self._root = parser.close() jpayne@68: return self._root jpayne@68: finally: jpayne@68: if close_source: jpayne@68: source.close() jpayne@68: jpayne@68: def iter(self, tag=None): jpayne@68: """Create and return tree iterator for the root element. jpayne@68: jpayne@68: The iterator loops over all elements in this tree, in document order. jpayne@68: jpayne@68: *tag* is a string with the tag name to iterate over jpayne@68: (default is to return all elements). jpayne@68: jpayne@68: """ jpayne@68: # assert self._root is not None jpayne@68: return self._root.iter(tag) jpayne@68: jpayne@68: # compatibility jpayne@68: def getiterator(self, tag=None): jpayne@68: warnings.warn( jpayne@68: "This method will be removed in future versions. " jpayne@68: "Use 'tree.iter()' or 'list(tree.iter())' instead.", jpayne@68: DeprecationWarning, stacklevel=2 jpayne@68: ) jpayne@68: return list(self.iter(tag)) jpayne@68: jpayne@68: def find(self, path, namespaces=None): jpayne@68: """Find first matching element by tag name or path. jpayne@68: jpayne@68: Same as getroot().find(path), which is Element.find() jpayne@68: jpayne@68: *path* is a string having either an element tag or an XPath, jpayne@68: *namespaces* is an optional mapping from namespace prefix to full name. jpayne@68: jpayne@68: Return the first matching element, or None if no element was found. jpayne@68: jpayne@68: """ jpayne@68: # assert self._root is not None jpayne@68: if path[:1] == "/": jpayne@68: path = "." + path jpayne@68: warnings.warn( jpayne@68: "This search is broken in 1.3 and earlier, and will be " jpayne@68: "fixed in a future version. If you rely on the current " jpayne@68: "behaviour, change it to %r" % path, jpayne@68: FutureWarning, stacklevel=2 jpayne@68: ) jpayne@68: return self._root.find(path, namespaces) jpayne@68: jpayne@68: def findtext(self, path, default=None, namespaces=None): jpayne@68: """Find first matching element by tag name or path. jpayne@68: jpayne@68: Same as getroot().findtext(path), which is Element.findtext() jpayne@68: jpayne@68: *path* is a string having either an element tag or an XPath, jpayne@68: *namespaces* is an optional mapping from namespace prefix to full name. jpayne@68: jpayne@68: Return the first matching element, or None if no element was found. jpayne@68: jpayne@68: """ jpayne@68: # assert self._root is not None jpayne@68: if path[:1] == "/": jpayne@68: path = "." + path jpayne@68: warnings.warn( jpayne@68: "This search is broken in 1.3 and earlier, and will be " jpayne@68: "fixed in a future version. If you rely on the current " jpayne@68: "behaviour, change it to %r" % path, jpayne@68: FutureWarning, stacklevel=2 jpayne@68: ) jpayne@68: return self._root.findtext(path, default, namespaces) jpayne@68: jpayne@68: def findall(self, path, namespaces=None): jpayne@68: """Find all matching subelements by tag name or path. jpayne@68: jpayne@68: Same as getroot().findall(path), which is Element.findall(). jpayne@68: jpayne@68: *path* is a string having either an element tag or an XPath, jpayne@68: *namespaces* is an optional mapping from namespace prefix to full name. jpayne@68: jpayne@68: Return list containing all matching elements in document order. jpayne@68: jpayne@68: """ jpayne@68: # assert self._root is not None jpayne@68: if path[:1] == "/": jpayne@68: path = "." + path jpayne@68: warnings.warn( jpayne@68: "This search is broken in 1.3 and earlier, and will be " jpayne@68: "fixed in a future version. If you rely on the current " jpayne@68: "behaviour, change it to %r" % path, jpayne@68: FutureWarning, stacklevel=2 jpayne@68: ) jpayne@68: return self._root.findall(path, namespaces) jpayne@68: jpayne@68: def iterfind(self, path, namespaces=None): jpayne@68: """Find all matching subelements by tag name or path. jpayne@68: jpayne@68: Same as getroot().iterfind(path), which is element.iterfind() jpayne@68: jpayne@68: *path* is a string having either an element tag or an XPath, jpayne@68: *namespaces* is an optional mapping from namespace prefix to full name. jpayne@68: jpayne@68: Return an iterable yielding all matching elements in document order. jpayne@68: jpayne@68: """ jpayne@68: # assert self._root is not None jpayne@68: if path[:1] == "/": jpayne@68: path = "." + path jpayne@68: warnings.warn( jpayne@68: "This search is broken in 1.3 and earlier, and will be " jpayne@68: "fixed in a future version. If you rely on the current " jpayne@68: "behaviour, change it to %r" % path, jpayne@68: FutureWarning, stacklevel=2 jpayne@68: ) jpayne@68: return self._root.iterfind(path, namespaces) jpayne@68: jpayne@68: def write(self, file_or_filename, jpayne@68: encoding=None, jpayne@68: xml_declaration=None, jpayne@68: default_namespace=None, jpayne@68: method=None, *, jpayne@68: short_empty_elements=True): jpayne@68: """Write element tree to a file as XML. jpayne@68: jpayne@68: Arguments: jpayne@68: *file_or_filename* -- file name or a file object opened for writing jpayne@68: jpayne@68: *encoding* -- the output encoding (default: US-ASCII) jpayne@68: jpayne@68: *xml_declaration* -- bool indicating if an XML declaration should be jpayne@68: added to the output. If None, an XML declaration jpayne@68: is added if encoding IS NOT either of: jpayne@68: US-ASCII, UTF-8, or Unicode jpayne@68: jpayne@68: *default_namespace* -- sets the default XML namespace (for "xmlns") jpayne@68: jpayne@68: *method* -- either "xml" (default), "html, "text", or "c14n" jpayne@68: jpayne@68: *short_empty_elements* -- controls the formatting of elements jpayne@68: that contain no content. If True (default) jpayne@68: they are emitted as a single self-closed jpayne@68: tag, otherwise they are emitted as a pair jpayne@68: of start/end tags jpayne@68: jpayne@68: """ jpayne@68: if not method: jpayne@68: method = "xml" jpayne@68: elif method not in _serialize: jpayne@68: raise ValueError("unknown method %r" % method) jpayne@68: if not encoding: jpayne@68: if method == "c14n": jpayne@68: encoding = "utf-8" jpayne@68: else: jpayne@68: encoding = "us-ascii" jpayne@68: enc_lower = encoding.lower() jpayne@68: with _get_writer(file_or_filename, enc_lower) as write: jpayne@68: if method == "xml" and (xml_declaration or jpayne@68: (xml_declaration is None and jpayne@68: enc_lower not in ("utf-8", "us-ascii", "unicode"))): jpayne@68: declared_encoding = encoding jpayne@68: if enc_lower == "unicode": jpayne@68: # Retrieve the default encoding for the xml declaration jpayne@68: import locale jpayne@68: declared_encoding = locale.getpreferredencoding() jpayne@68: write("\n" % ( jpayne@68: declared_encoding,)) jpayne@68: if method == "text": jpayne@68: _serialize_text(write, self._root) jpayne@68: else: jpayne@68: qnames, namespaces = _namespaces(self._root, default_namespace) jpayne@68: serialize = _serialize[method] jpayne@68: serialize(write, self._root, qnames, namespaces, jpayne@68: short_empty_elements=short_empty_elements) jpayne@68: jpayne@68: def write_c14n(self, file): jpayne@68: # lxml.etree compatibility. use output method instead jpayne@68: return self.write(file, method="c14n") jpayne@68: jpayne@68: # -------------------------------------------------------------------- jpayne@68: # serialization support jpayne@68: jpayne@68: @contextlib.contextmanager jpayne@68: def _get_writer(file_or_filename, encoding): jpayne@68: # returns text write method and release all resources after using jpayne@68: try: jpayne@68: write = file_or_filename.write jpayne@68: except AttributeError: jpayne@68: # file_or_filename is a file name jpayne@68: if encoding == "unicode": jpayne@68: file = open(file_or_filename, "w") jpayne@68: else: jpayne@68: file = open(file_or_filename, "w", encoding=encoding, jpayne@68: errors="xmlcharrefreplace") jpayne@68: with file: jpayne@68: yield file.write jpayne@68: else: jpayne@68: # file_or_filename is a file-like object jpayne@68: # encoding determines if it is a text or binary writer jpayne@68: if encoding == "unicode": jpayne@68: # use a text writer as is jpayne@68: yield write jpayne@68: else: jpayne@68: # wrap a binary writer with TextIOWrapper jpayne@68: with contextlib.ExitStack() as stack: jpayne@68: if isinstance(file_or_filename, io.BufferedIOBase): jpayne@68: file = file_or_filename jpayne@68: elif isinstance(file_or_filename, io.RawIOBase): jpayne@68: file = io.BufferedWriter(file_or_filename) jpayne@68: # Keep the original file open when the BufferedWriter is jpayne@68: # destroyed jpayne@68: stack.callback(file.detach) jpayne@68: else: jpayne@68: # This is to handle passed objects that aren't in the jpayne@68: # IOBase hierarchy, but just have a write method jpayne@68: file = io.BufferedIOBase() jpayne@68: file.writable = lambda: True jpayne@68: file.write = write jpayne@68: try: jpayne@68: # TextIOWrapper uses this methods to determine jpayne@68: # if BOM (for UTF-16, etc) should be added jpayne@68: file.seekable = file_or_filename.seekable jpayne@68: file.tell = file_or_filename.tell jpayne@68: except AttributeError: jpayne@68: pass jpayne@68: file = io.TextIOWrapper(file, jpayne@68: encoding=encoding, jpayne@68: errors="xmlcharrefreplace", jpayne@68: newline="\n") jpayne@68: # Keep the original file open when the TextIOWrapper is jpayne@68: # destroyed jpayne@68: stack.callback(file.detach) jpayne@68: yield file.write jpayne@68: jpayne@68: def _namespaces(elem, default_namespace=None): jpayne@68: # identify namespaces used in this tree jpayne@68: jpayne@68: # maps qnames to *encoded* prefix:local names jpayne@68: qnames = {None: None} jpayne@68: jpayne@68: # maps uri:s to prefixes jpayne@68: namespaces = {} jpayne@68: if default_namespace: jpayne@68: namespaces[default_namespace] = "" jpayne@68: jpayne@68: def add_qname(qname): jpayne@68: # calculate serialized qname representation jpayne@68: try: jpayne@68: if qname[:1] == "{": jpayne@68: uri, tag = qname[1:].rsplit("}", 1) jpayne@68: prefix = namespaces.get(uri) jpayne@68: if prefix is None: jpayne@68: prefix = _namespace_map.get(uri) jpayne@68: if prefix is None: jpayne@68: prefix = "ns%d" % len(namespaces) jpayne@68: if prefix != "xml": jpayne@68: namespaces[uri] = prefix jpayne@68: if prefix: jpayne@68: qnames[qname] = "%s:%s" % (prefix, tag) jpayne@68: else: jpayne@68: qnames[qname] = tag # default element jpayne@68: else: jpayne@68: if default_namespace: jpayne@68: # FIXME: can this be handled in XML 1.0? jpayne@68: raise ValueError( jpayne@68: "cannot use non-qualified names with " jpayne@68: "default_namespace option" jpayne@68: ) jpayne@68: qnames[qname] = qname jpayne@68: except TypeError: jpayne@68: _raise_serialization_error(qname) jpayne@68: jpayne@68: # populate qname and namespaces table jpayne@68: for elem in elem.iter(): jpayne@68: tag = elem.tag jpayne@68: if isinstance(tag, QName): jpayne@68: if tag.text not in qnames: jpayne@68: add_qname(tag.text) jpayne@68: elif isinstance(tag, str): jpayne@68: if tag not in qnames: jpayne@68: add_qname(tag) jpayne@68: elif tag is not None and tag is not Comment and tag is not PI: jpayne@68: _raise_serialization_error(tag) jpayne@68: for key, value in elem.items(): jpayne@68: if isinstance(key, QName): jpayne@68: key = key.text jpayne@68: if key not in qnames: jpayne@68: add_qname(key) jpayne@68: if isinstance(value, QName) and value.text not in qnames: jpayne@68: add_qname(value.text) jpayne@68: text = elem.text jpayne@68: if isinstance(text, QName) and text.text not in qnames: jpayne@68: add_qname(text.text) jpayne@68: return qnames, namespaces jpayne@68: jpayne@68: def _serialize_xml(write, elem, qnames, namespaces, jpayne@68: short_empty_elements, **kwargs): jpayne@68: tag = elem.tag jpayne@68: text = elem.text jpayne@68: if tag is Comment: jpayne@68: write("" % text) jpayne@68: elif tag is ProcessingInstruction: jpayne@68: write("" % text) jpayne@68: else: jpayne@68: tag = qnames[tag] jpayne@68: if tag is None: jpayne@68: if text: jpayne@68: write(_escape_cdata(text)) jpayne@68: for e in elem: jpayne@68: _serialize_xml(write, e, qnames, None, jpayne@68: short_empty_elements=short_empty_elements) jpayne@68: else: jpayne@68: write("<" + tag) jpayne@68: items = list(elem.items()) jpayne@68: if items or namespaces: jpayne@68: if namespaces: jpayne@68: for v, k in sorted(namespaces.items(), jpayne@68: key=lambda x: x[1]): # sort on prefix jpayne@68: if k: jpayne@68: k = ":" + k jpayne@68: write(" xmlns%s=\"%s\"" % ( jpayne@68: k, jpayne@68: _escape_attrib(v) jpayne@68: )) jpayne@68: for k, v in items: jpayne@68: if isinstance(k, QName): jpayne@68: k = k.text jpayne@68: if isinstance(v, QName): jpayne@68: v = qnames[v.text] jpayne@68: else: jpayne@68: v = _escape_attrib(v) jpayne@68: write(" %s=\"%s\"" % (qnames[k], v)) jpayne@68: if text or len(elem) or not short_empty_elements: jpayne@68: write(">") jpayne@68: if text: jpayne@68: write(_escape_cdata(text)) jpayne@68: for e in elem: jpayne@68: _serialize_xml(write, e, qnames, None, jpayne@68: short_empty_elements=short_empty_elements) jpayne@68: write("") jpayne@68: else: jpayne@68: write(" />") jpayne@68: if elem.tail: jpayne@68: write(_escape_cdata(elem.tail)) jpayne@68: jpayne@68: HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr", jpayne@68: "img", "input", "isindex", "link", "meta", "param") jpayne@68: jpayne@68: try: jpayne@68: HTML_EMPTY = set(HTML_EMPTY) jpayne@68: except NameError: jpayne@68: pass jpayne@68: jpayne@68: def _serialize_html(write, elem, qnames, namespaces, **kwargs): jpayne@68: tag = elem.tag jpayne@68: text = elem.text jpayne@68: if tag is Comment: jpayne@68: write("" % _escape_cdata(text)) jpayne@68: elif tag is ProcessingInstruction: jpayne@68: write("" % _escape_cdata(text)) jpayne@68: else: jpayne@68: tag = qnames[tag] jpayne@68: if tag is None: jpayne@68: if text: jpayne@68: write(_escape_cdata(text)) jpayne@68: for e in elem: jpayne@68: _serialize_html(write, e, qnames, None) jpayne@68: else: jpayne@68: write("<" + tag) jpayne@68: items = list(elem.items()) jpayne@68: if items or namespaces: jpayne@68: if namespaces: jpayne@68: for v, k in sorted(namespaces.items(), jpayne@68: key=lambda x: x[1]): # sort on prefix jpayne@68: if k: jpayne@68: k = ":" + k jpayne@68: write(" xmlns%s=\"%s\"" % ( jpayne@68: k, jpayne@68: _escape_attrib(v) jpayne@68: )) jpayne@68: for k, v in items: jpayne@68: if isinstance(k, QName): jpayne@68: k = k.text jpayne@68: if isinstance(v, QName): jpayne@68: v = qnames[v.text] jpayne@68: else: jpayne@68: v = _escape_attrib_html(v) jpayne@68: # FIXME: handle boolean attributes jpayne@68: write(" %s=\"%s\"" % (qnames[k], v)) jpayne@68: write(">") jpayne@68: ltag = tag.lower() jpayne@68: if text: jpayne@68: if ltag == "script" or ltag == "style": jpayne@68: write(text) jpayne@68: else: jpayne@68: write(_escape_cdata(text)) jpayne@68: for e in elem: jpayne@68: _serialize_html(write, e, qnames, None) jpayne@68: if ltag not in HTML_EMPTY: jpayne@68: write("") jpayne@68: if elem.tail: jpayne@68: write(_escape_cdata(elem.tail)) jpayne@68: jpayne@68: def _serialize_text(write, elem): jpayne@68: for part in elem.itertext(): jpayne@68: write(part) jpayne@68: if elem.tail: jpayne@68: write(elem.tail) jpayne@68: jpayne@68: _serialize = { jpayne@68: "xml": _serialize_xml, jpayne@68: "html": _serialize_html, jpayne@68: "text": _serialize_text, jpayne@68: # this optional method is imported at the end of the module jpayne@68: # "c14n": _serialize_c14n, jpayne@68: } jpayne@68: jpayne@68: jpayne@68: def register_namespace(prefix, uri): jpayne@68: """Register a namespace prefix. jpayne@68: jpayne@68: The registry is global, and any existing mapping for either the jpayne@68: given prefix or the namespace URI will be removed. jpayne@68: jpayne@68: *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and jpayne@68: attributes in this namespace will be serialized with prefix if possible. jpayne@68: jpayne@68: ValueError is raised if prefix is reserved or is invalid. jpayne@68: jpayne@68: """ jpayne@68: if re.match(r"ns\d+$", prefix): jpayne@68: raise ValueError("Prefix format reserved for internal use") jpayne@68: for k, v in list(_namespace_map.items()): jpayne@68: if k == uri or v == prefix: jpayne@68: del _namespace_map[k] jpayne@68: _namespace_map[uri] = prefix jpayne@68: jpayne@68: _namespace_map = { jpayne@68: # "well-known" namespace prefixes jpayne@68: "http://www.w3.org/XML/1998/namespace": "xml", jpayne@68: "http://www.w3.org/1999/xhtml": "html", jpayne@68: "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", jpayne@68: "http://schemas.xmlsoap.org/wsdl/": "wsdl", jpayne@68: # xml schema jpayne@68: "http://www.w3.org/2001/XMLSchema": "xs", jpayne@68: "http://www.w3.org/2001/XMLSchema-instance": "xsi", jpayne@68: # dublin core jpayne@68: "http://purl.org/dc/elements/1.1/": "dc", jpayne@68: } jpayne@68: # For tests and troubleshooting jpayne@68: register_namespace._namespace_map = _namespace_map jpayne@68: jpayne@68: def _raise_serialization_error(text): jpayne@68: raise TypeError( jpayne@68: "cannot serialize %r (type %s)" % (text, type(text).__name__) jpayne@68: ) jpayne@68: jpayne@68: def _escape_cdata(text): jpayne@68: # escape character data jpayne@68: try: jpayne@68: # it's worth avoiding do-nothing calls for strings that are jpayne@68: # shorter than 500 characters, or so. assume that's, by far, jpayne@68: # the most common case in most applications. jpayne@68: if "&" in text: jpayne@68: text = text.replace("&", "&") jpayne@68: if "<" in text: jpayne@68: text = text.replace("<", "<") jpayne@68: if ">" in text: jpayne@68: text = text.replace(">", ">") jpayne@68: return text jpayne@68: except (TypeError, AttributeError): jpayne@68: _raise_serialization_error(text) jpayne@68: jpayne@68: def _escape_attrib(text): jpayne@68: # escape attribute value jpayne@68: try: jpayne@68: if "&" in text: jpayne@68: text = text.replace("&", "&") jpayne@68: if "<" in text: jpayne@68: text = text.replace("<", "<") jpayne@68: if ">" in text: jpayne@68: text = text.replace(">", ">") jpayne@68: if "\"" in text: jpayne@68: text = text.replace("\"", """) jpayne@68: # The following business with carriage returns is to satisfy jpayne@68: # Section 2.11 of the XML specification, stating that jpayne@68: # CR or CR LN should be replaced with just LN jpayne@68: # http://www.w3.org/TR/REC-xml/#sec-line-ends jpayne@68: if "\r\n" in text: jpayne@68: text = text.replace("\r\n", "\n") jpayne@68: if "\r" in text: jpayne@68: text = text.replace("\r", "\n") jpayne@68: #The following four lines are issue 17582 jpayne@68: if "\n" in text: jpayne@68: text = text.replace("\n", " ") jpayne@68: if "\t" in text: jpayne@68: text = text.replace("\t", " ") jpayne@68: return text jpayne@68: except (TypeError, AttributeError): jpayne@68: _raise_serialization_error(text) jpayne@68: jpayne@68: def _escape_attrib_html(text): jpayne@68: # escape attribute value jpayne@68: try: jpayne@68: if "&" in text: jpayne@68: text = text.replace("&", "&") jpayne@68: if ">" in text: jpayne@68: text = text.replace(">", ">") jpayne@68: if "\"" in text: jpayne@68: text = text.replace("\"", """) jpayne@68: return text jpayne@68: except (TypeError, AttributeError): jpayne@68: _raise_serialization_error(text) jpayne@68: jpayne@68: # -------------------------------------------------------------------- jpayne@68: jpayne@68: def tostring(element, encoding=None, method=None, *, jpayne@68: xml_declaration=None, default_namespace=None, jpayne@68: short_empty_elements=True): jpayne@68: """Generate string representation of XML element. jpayne@68: jpayne@68: All subelements are included. If encoding is "unicode", a string jpayne@68: is returned. Otherwise a bytestring is returned. jpayne@68: jpayne@68: *element* is an Element instance, *encoding* is an optional output jpayne@68: encoding defaulting to US-ASCII, *method* is an optional output which can jpayne@68: be one of "xml" (default), "html", "text" or "c14n", *default_namespace* jpayne@68: sets the default XML namespace (for "xmlns"). jpayne@68: jpayne@68: Returns an (optionally) encoded string containing the XML data. jpayne@68: jpayne@68: """ jpayne@68: stream = io.StringIO() if encoding == 'unicode' else io.BytesIO() jpayne@68: ElementTree(element).write(stream, encoding, jpayne@68: xml_declaration=xml_declaration, jpayne@68: default_namespace=default_namespace, jpayne@68: method=method, jpayne@68: short_empty_elements=short_empty_elements) jpayne@68: return stream.getvalue() jpayne@68: jpayne@68: class _ListDataStream(io.BufferedIOBase): jpayne@68: """An auxiliary stream accumulating into a list reference.""" jpayne@68: def __init__(self, lst): jpayne@68: self.lst = lst jpayne@68: jpayne@68: def writable(self): jpayne@68: return True jpayne@68: jpayne@68: def seekable(self): jpayne@68: return True jpayne@68: jpayne@68: def write(self, b): jpayne@68: self.lst.append(b) jpayne@68: jpayne@68: def tell(self): jpayne@68: return len(self.lst) jpayne@68: jpayne@68: def tostringlist(element, encoding=None, method=None, *, jpayne@68: xml_declaration=None, default_namespace=None, jpayne@68: short_empty_elements=True): jpayne@68: lst = [] jpayne@68: stream = _ListDataStream(lst) jpayne@68: ElementTree(element).write(stream, encoding, jpayne@68: xml_declaration=xml_declaration, jpayne@68: default_namespace=default_namespace, jpayne@68: method=method, jpayne@68: short_empty_elements=short_empty_elements) jpayne@68: return lst jpayne@68: jpayne@68: jpayne@68: def dump(elem): jpayne@68: """Write element tree or element structure to sys.stdout. jpayne@68: jpayne@68: This function should be used for debugging only. jpayne@68: jpayne@68: *elem* is either an ElementTree, or a single Element. The exact output jpayne@68: format is implementation dependent. In this version, it's written as an jpayne@68: ordinary XML file. jpayne@68: jpayne@68: """ jpayne@68: # debugging jpayne@68: if not isinstance(elem, ElementTree): jpayne@68: elem = ElementTree(elem) jpayne@68: elem.write(sys.stdout, encoding="unicode") jpayne@68: tail = elem.getroot().tail jpayne@68: if not tail or tail[-1] != "\n": jpayne@68: sys.stdout.write("\n") jpayne@68: jpayne@68: # -------------------------------------------------------------------- jpayne@68: # parsing jpayne@68: jpayne@68: jpayne@68: def parse(source, parser=None): jpayne@68: """Parse XML document into element tree. jpayne@68: jpayne@68: *source* is a filename or file object containing XML data, jpayne@68: *parser* is an optional parser instance defaulting to XMLParser. jpayne@68: jpayne@68: Return an ElementTree instance. jpayne@68: jpayne@68: """ jpayne@68: tree = ElementTree() jpayne@68: tree.parse(source, parser) jpayne@68: return tree jpayne@68: jpayne@68: jpayne@68: def iterparse(source, events=None, parser=None): jpayne@68: """Incrementally parse XML document into ElementTree. jpayne@68: jpayne@68: This class also reports what's going on to the user based on the jpayne@68: *events* it is initialized with. The supported events are the strings jpayne@68: "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get jpayne@68: detailed namespace information). If *events* is omitted, only jpayne@68: "end" events are reported. jpayne@68: jpayne@68: *source* is a filename or file object containing XML data, *events* is jpayne@68: a list of events to report back, *parser* is an optional parser instance. jpayne@68: jpayne@68: Returns an iterator providing (event, elem) pairs. jpayne@68: jpayne@68: """ jpayne@68: # Use the internal, undocumented _parser argument for now; When the jpayne@68: # parser argument of iterparse is removed, this can be killed. jpayne@68: pullparser = XMLPullParser(events=events, _parser=parser) jpayne@68: def iterator(): jpayne@68: try: jpayne@68: while True: jpayne@68: yield from pullparser.read_events() jpayne@68: # load event buffer jpayne@68: data = source.read(16 * 1024) jpayne@68: if not data: jpayne@68: break jpayne@68: pullparser.feed(data) jpayne@68: root = pullparser._close_and_return_root() jpayne@68: yield from pullparser.read_events() jpayne@68: it.root = root jpayne@68: finally: jpayne@68: if close_source: jpayne@68: source.close() jpayne@68: jpayne@68: class IterParseIterator(collections.abc.Iterator): jpayne@68: __next__ = iterator().__next__ jpayne@68: it = IterParseIterator() jpayne@68: it.root = None jpayne@68: del iterator, IterParseIterator jpayne@68: jpayne@68: close_source = False jpayne@68: if not hasattr(source, "read"): jpayne@68: source = open(source, "rb") jpayne@68: close_source = True jpayne@68: jpayne@68: return it jpayne@68: jpayne@68: jpayne@68: class XMLPullParser: jpayne@68: jpayne@68: def __init__(self, events=None, *, _parser=None): jpayne@68: # The _parser argument is for internal use only and must not be relied jpayne@68: # upon in user code. It will be removed in a future release. jpayne@68: # See http://bugs.python.org/issue17741 for more details. jpayne@68: jpayne@68: self._events_queue = collections.deque() jpayne@68: self._parser = _parser or XMLParser(target=TreeBuilder()) jpayne@68: # wire up the parser for event reporting jpayne@68: if events is None: jpayne@68: events = ("end",) jpayne@68: self._parser._setevents(self._events_queue, events) jpayne@68: jpayne@68: def feed(self, data): jpayne@68: """Feed encoded data to parser.""" jpayne@68: if self._parser is None: jpayne@68: raise ValueError("feed() called after end of stream") jpayne@68: if data: jpayne@68: try: jpayne@68: self._parser.feed(data) jpayne@68: except SyntaxError as exc: jpayne@68: self._events_queue.append(exc) jpayne@68: jpayne@68: def _close_and_return_root(self): jpayne@68: # iterparse needs this to set its root attribute properly :( jpayne@68: root = self._parser.close() jpayne@68: self._parser = None jpayne@68: return root jpayne@68: jpayne@68: def close(self): jpayne@68: """Finish feeding data to parser. jpayne@68: jpayne@68: Unlike XMLParser, does not return the root element. Use jpayne@68: read_events() to consume elements from XMLPullParser. jpayne@68: """ jpayne@68: self._close_and_return_root() jpayne@68: jpayne@68: def read_events(self): jpayne@68: """Return an iterator over currently available (event, elem) pairs. jpayne@68: jpayne@68: Events are consumed from the internal event queue as they are jpayne@68: retrieved from the iterator. jpayne@68: """ jpayne@68: events = self._events_queue jpayne@68: while events: jpayne@68: event = events.popleft() jpayne@68: if isinstance(event, Exception): jpayne@68: raise event jpayne@68: else: jpayne@68: yield event jpayne@68: jpayne@68: jpayne@68: def XML(text, parser=None): jpayne@68: """Parse XML document from string constant. jpayne@68: jpayne@68: This function can be used to embed "XML Literals" in Python code. jpayne@68: jpayne@68: *text* is a string containing XML data, *parser* is an jpayne@68: optional parser instance, defaulting to the standard XMLParser. jpayne@68: jpayne@68: Returns an Element instance. jpayne@68: jpayne@68: """ jpayne@68: if not parser: jpayne@68: parser = XMLParser(target=TreeBuilder()) jpayne@68: parser.feed(text) jpayne@68: return parser.close() jpayne@68: jpayne@68: jpayne@68: def XMLID(text, parser=None): jpayne@68: """Parse XML document from string constant for its IDs. jpayne@68: jpayne@68: *text* is a string containing XML data, *parser* is an jpayne@68: optional parser instance, defaulting to the standard XMLParser. jpayne@68: jpayne@68: Returns an (Element, dict) tuple, in which the jpayne@68: dict maps element id:s to elements. jpayne@68: jpayne@68: """ jpayne@68: if not parser: jpayne@68: parser = XMLParser(target=TreeBuilder()) jpayne@68: parser.feed(text) jpayne@68: tree = parser.close() jpayne@68: ids = {} jpayne@68: for elem in tree.iter(): jpayne@68: id = elem.get("id") jpayne@68: if id: jpayne@68: ids[id] = elem jpayne@68: return tree, ids jpayne@68: jpayne@68: # Parse XML document from string constant. Alias for XML(). jpayne@68: fromstring = XML jpayne@68: jpayne@68: def fromstringlist(sequence, parser=None): jpayne@68: """Parse XML document from sequence of string fragments. jpayne@68: jpayne@68: *sequence* is a list of other sequence, *parser* is an optional parser jpayne@68: instance, defaulting to the standard XMLParser. jpayne@68: jpayne@68: Returns an Element instance. jpayne@68: jpayne@68: """ jpayne@68: if not parser: jpayne@68: parser = XMLParser(target=TreeBuilder()) jpayne@68: for text in sequence: jpayne@68: parser.feed(text) jpayne@68: return parser.close() jpayne@68: jpayne@68: # -------------------------------------------------------------------- jpayne@68: jpayne@68: jpayne@68: class TreeBuilder: jpayne@68: """Generic element structure builder. jpayne@68: jpayne@68: This builder converts a sequence of start, data, and end method jpayne@68: calls to a well-formed element structure. jpayne@68: jpayne@68: You can use this class to build an element structure using a custom XML jpayne@68: parser, or a parser for some other XML-like format. jpayne@68: jpayne@68: *element_factory* is an optional element factory which is called jpayne@68: to create new Element instances, as necessary. jpayne@68: jpayne@68: *comment_factory* is a factory to create comments to be used instead of jpayne@68: the standard factory. If *insert_comments* is false (the default), jpayne@68: comments will not be inserted into the tree. jpayne@68: jpayne@68: *pi_factory* is a factory to create processing instructions to be used jpayne@68: instead of the standard factory. If *insert_pis* is false (the default), jpayne@68: processing instructions will not be inserted into the tree. jpayne@68: """ jpayne@68: def __init__(self, element_factory=None, *, jpayne@68: comment_factory=None, pi_factory=None, jpayne@68: insert_comments=False, insert_pis=False): jpayne@68: self._data = [] # data collector jpayne@68: self._elem = [] # element stack jpayne@68: self._last = None # last element jpayne@68: self._root = None # root element jpayne@68: self._tail = None # true if we're after an end tag jpayne@68: if comment_factory is None: jpayne@68: comment_factory = Comment jpayne@68: self._comment_factory = comment_factory jpayne@68: self.insert_comments = insert_comments jpayne@68: if pi_factory is None: jpayne@68: pi_factory = ProcessingInstruction jpayne@68: self._pi_factory = pi_factory jpayne@68: self.insert_pis = insert_pis jpayne@68: if element_factory is None: jpayne@68: element_factory = Element jpayne@68: self._factory = element_factory jpayne@68: jpayne@68: def close(self): jpayne@68: """Flush builder buffers and return toplevel document Element.""" jpayne@68: assert len(self._elem) == 0, "missing end tags" jpayne@68: assert self._root is not None, "missing toplevel element" jpayne@68: return self._root jpayne@68: jpayne@68: def _flush(self): jpayne@68: if self._data: jpayne@68: if self._last is not None: jpayne@68: text = "".join(self._data) jpayne@68: if self._tail: jpayne@68: assert self._last.tail is None, "internal error (tail)" jpayne@68: self._last.tail = text jpayne@68: else: jpayne@68: assert self._last.text is None, "internal error (text)" jpayne@68: self._last.text = text jpayne@68: self._data = [] jpayne@68: jpayne@68: def data(self, data): jpayne@68: """Add text to current element.""" jpayne@68: self._data.append(data) jpayne@68: jpayne@68: def start(self, tag, attrs): jpayne@68: """Open new element and return it. jpayne@68: jpayne@68: *tag* is the element name, *attrs* is a dict containing element jpayne@68: attributes. jpayne@68: jpayne@68: """ jpayne@68: self._flush() jpayne@68: self._last = elem = self._factory(tag, attrs) jpayne@68: if self._elem: jpayne@68: self._elem[-1].append(elem) jpayne@68: elif self._root is None: jpayne@68: self._root = elem jpayne@68: self._elem.append(elem) jpayne@68: self._tail = 0 jpayne@68: return elem jpayne@68: jpayne@68: def end(self, tag): jpayne@68: """Close and return current Element. jpayne@68: jpayne@68: *tag* is the element name. jpayne@68: jpayne@68: """ jpayne@68: self._flush() jpayne@68: self._last = self._elem.pop() jpayne@68: assert self._last.tag == tag,\ jpayne@68: "end tag mismatch (expected %s, got %s)" % ( jpayne@68: self._last.tag, tag) jpayne@68: self._tail = 1 jpayne@68: return self._last jpayne@68: jpayne@68: def comment(self, text): jpayne@68: """Create a comment using the comment_factory. jpayne@68: jpayne@68: *text* is the text of the comment. jpayne@68: """ jpayne@68: return self._handle_single( jpayne@68: self._comment_factory, self.insert_comments, text) jpayne@68: jpayne@68: def pi(self, target, text=None): jpayne@68: """Create a processing instruction using the pi_factory. jpayne@68: jpayne@68: *target* is the target name of the processing instruction. jpayne@68: *text* is the data of the processing instruction, or ''. jpayne@68: """ jpayne@68: return self._handle_single( jpayne@68: self._pi_factory, self.insert_pis, target, text) jpayne@68: jpayne@68: def _handle_single(self, factory, insert, *args): jpayne@68: elem = factory(*args) jpayne@68: if insert: jpayne@68: self._flush() jpayne@68: self._last = elem jpayne@68: if self._elem: jpayne@68: self._elem[-1].append(elem) jpayne@68: self._tail = 1 jpayne@68: return elem jpayne@68: jpayne@68: jpayne@68: # also see ElementTree and TreeBuilder jpayne@68: class XMLParser: jpayne@68: """Element structure builder for XML source data based on the expat parser. jpayne@68: jpayne@68: *target* is an optional target object which defaults to an instance of the jpayne@68: standard TreeBuilder class, *encoding* is an optional encoding string jpayne@68: which if given, overrides the encoding specified in the XML file: jpayne@68: http://www.iana.org/assignments/character-sets jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, *, target=None, encoding=None): jpayne@68: try: jpayne@68: from xml.parsers import expat jpayne@68: except ImportError: jpayne@68: try: jpayne@68: import pyexpat as expat jpayne@68: except ImportError: jpayne@68: raise ImportError( jpayne@68: "No module named expat; use SimpleXMLTreeBuilder instead" jpayne@68: ) jpayne@68: parser = expat.ParserCreate(encoding, "}") jpayne@68: if target is None: jpayne@68: target = TreeBuilder() jpayne@68: # underscored names are provided for compatibility only jpayne@68: self.parser = self._parser = parser jpayne@68: self.target = self._target = target jpayne@68: self._error = expat.error jpayne@68: self._names = {} # name memo cache jpayne@68: # main callbacks jpayne@68: parser.DefaultHandlerExpand = self._default jpayne@68: if hasattr(target, 'start'): jpayne@68: parser.StartElementHandler = self._start jpayne@68: if hasattr(target, 'end'): jpayne@68: parser.EndElementHandler = self._end jpayne@68: if hasattr(target, 'start_ns'): jpayne@68: parser.StartNamespaceDeclHandler = self._start_ns jpayne@68: if hasattr(target, 'end_ns'): jpayne@68: parser.EndNamespaceDeclHandler = self._end_ns jpayne@68: if hasattr(target, 'data'): jpayne@68: parser.CharacterDataHandler = target.data jpayne@68: # miscellaneous callbacks jpayne@68: if hasattr(target, 'comment'): jpayne@68: parser.CommentHandler = target.comment jpayne@68: if hasattr(target, 'pi'): jpayne@68: parser.ProcessingInstructionHandler = target.pi jpayne@68: # Configure pyexpat: buffering, new-style attribute handling. jpayne@68: parser.buffer_text = 1 jpayne@68: parser.ordered_attributes = 1 jpayne@68: parser.specified_attributes = 1 jpayne@68: self._doctype = None jpayne@68: self.entity = {} jpayne@68: try: jpayne@68: self.version = "Expat %d.%d.%d" % expat.version_info jpayne@68: except AttributeError: jpayne@68: pass # unknown jpayne@68: jpayne@68: def _setevents(self, events_queue, events_to_report): jpayne@68: # Internal API for XMLPullParser jpayne@68: # events_to_report: a list of events to report during parsing (same as jpayne@68: # the *events* of XMLPullParser's constructor. jpayne@68: # events_queue: a list of actual parsing events that will be populated jpayne@68: # by the underlying parser. jpayne@68: # jpayne@68: parser = self._parser jpayne@68: append = events_queue.append jpayne@68: for event_name in events_to_report: jpayne@68: if event_name == "start": jpayne@68: parser.ordered_attributes = 1 jpayne@68: parser.specified_attributes = 1 jpayne@68: def handler(tag, attrib_in, event=event_name, append=append, jpayne@68: start=self._start): jpayne@68: append((event, start(tag, attrib_in))) jpayne@68: parser.StartElementHandler = handler jpayne@68: elif event_name == "end": jpayne@68: def handler(tag, event=event_name, append=append, jpayne@68: end=self._end): jpayne@68: append((event, end(tag))) jpayne@68: parser.EndElementHandler = handler jpayne@68: elif event_name == "start-ns": jpayne@68: # TreeBuilder does not implement .start_ns() jpayne@68: if hasattr(self.target, "start_ns"): jpayne@68: def handler(prefix, uri, event=event_name, append=append, jpayne@68: start_ns=self._start_ns): jpayne@68: append((event, start_ns(prefix, uri))) jpayne@68: else: jpayne@68: def handler(prefix, uri, event=event_name, append=append): jpayne@68: append((event, (prefix or '', uri or ''))) jpayne@68: parser.StartNamespaceDeclHandler = handler jpayne@68: elif event_name == "end-ns": jpayne@68: # TreeBuilder does not implement .end_ns() jpayne@68: if hasattr(self.target, "end_ns"): jpayne@68: def handler(prefix, event=event_name, append=append, jpayne@68: end_ns=self._end_ns): jpayne@68: append((event, end_ns(prefix))) jpayne@68: else: jpayne@68: def handler(prefix, event=event_name, append=append): jpayne@68: append((event, None)) jpayne@68: parser.EndNamespaceDeclHandler = handler jpayne@68: elif event_name == 'comment': jpayne@68: def handler(text, event=event_name, append=append, self=self): jpayne@68: append((event, self.target.comment(text))) jpayne@68: parser.CommentHandler = handler jpayne@68: elif event_name == 'pi': jpayne@68: def handler(pi_target, data, event=event_name, append=append, jpayne@68: self=self): jpayne@68: append((event, self.target.pi(pi_target, data))) jpayne@68: parser.ProcessingInstructionHandler = handler jpayne@68: else: jpayne@68: raise ValueError("unknown event %r" % event_name) jpayne@68: jpayne@68: def _raiseerror(self, value): jpayne@68: err = ParseError(value) jpayne@68: err.code = value.code jpayne@68: err.position = value.lineno, value.offset jpayne@68: raise err jpayne@68: jpayne@68: def _fixname(self, key): jpayne@68: # expand qname, and convert name string to ascii, if possible jpayne@68: try: jpayne@68: name = self._names[key] jpayne@68: except KeyError: jpayne@68: name = key jpayne@68: if "}" in name: jpayne@68: name = "{" + name jpayne@68: self._names[key] = name jpayne@68: return name jpayne@68: jpayne@68: def _start_ns(self, prefix, uri): jpayne@68: return self.target.start_ns(prefix or '', uri or '') jpayne@68: jpayne@68: def _end_ns(self, prefix): jpayne@68: return self.target.end_ns(prefix or '') jpayne@68: jpayne@68: def _start(self, tag, attr_list): jpayne@68: # Handler for expat's StartElementHandler. Since ordered_attributes jpayne@68: # is set, the attributes are reported as a list of alternating jpayne@68: # attribute name,value. jpayne@68: fixname = self._fixname jpayne@68: tag = fixname(tag) jpayne@68: attrib = {} jpayne@68: if attr_list: jpayne@68: for i in range(0, len(attr_list), 2): jpayne@68: attrib[fixname(attr_list[i])] = attr_list[i+1] jpayne@68: return self.target.start(tag, attrib) jpayne@68: jpayne@68: def _end(self, tag): jpayne@68: return self.target.end(self._fixname(tag)) jpayne@68: jpayne@68: def _default(self, text): jpayne@68: prefix = text[:1] jpayne@68: if prefix == "&": jpayne@68: # deal with undefined entities jpayne@68: try: jpayne@68: data_handler = self.target.data jpayne@68: except AttributeError: jpayne@68: return jpayne@68: try: jpayne@68: data_handler(self.entity[text[1:-1]]) jpayne@68: except KeyError: jpayne@68: from xml.parsers import expat jpayne@68: err = expat.error( jpayne@68: "undefined entity %s: line %d, column %d" % jpayne@68: (text, self.parser.ErrorLineNumber, jpayne@68: self.parser.ErrorColumnNumber) jpayne@68: ) jpayne@68: err.code = 11 # XML_ERROR_UNDEFINED_ENTITY jpayne@68: err.lineno = self.parser.ErrorLineNumber jpayne@68: err.offset = self.parser.ErrorColumnNumber jpayne@68: raise err jpayne@68: elif prefix == "<" and text[:9] == "": jpayne@68: self._doctype = None jpayne@68: return jpayne@68: text = text.strip() jpayne@68: if not text: jpayne@68: return jpayne@68: self._doctype.append(text) jpayne@68: n = len(self._doctype) jpayne@68: if n > 2: jpayne@68: type = self._doctype[1] jpayne@68: if type == "PUBLIC" and n == 4: jpayne@68: name, type, pubid, system = self._doctype jpayne@68: if pubid: jpayne@68: pubid = pubid[1:-1] jpayne@68: elif type == "SYSTEM" and n == 3: jpayne@68: name, type, system = self._doctype jpayne@68: pubid = None jpayne@68: else: jpayne@68: return jpayne@68: if hasattr(self.target, "doctype"): jpayne@68: self.target.doctype(name, pubid, system[1:-1]) jpayne@68: elif hasattr(self, "doctype"): jpayne@68: warnings.warn( jpayne@68: "The doctype() method of XMLParser is ignored. " jpayne@68: "Define doctype() method on the TreeBuilder target.", jpayne@68: RuntimeWarning) jpayne@68: jpayne@68: self._doctype = None jpayne@68: jpayne@68: def feed(self, data): jpayne@68: """Feed encoded data to parser.""" jpayne@68: try: jpayne@68: self.parser.Parse(data, 0) jpayne@68: except self._error as v: jpayne@68: self._raiseerror(v) jpayne@68: jpayne@68: def close(self): jpayne@68: """Finish feeding data to parser and return element structure.""" jpayne@68: try: jpayne@68: self.parser.Parse("", 1) # end of data jpayne@68: except self._error as v: jpayne@68: self._raiseerror(v) jpayne@68: try: jpayne@68: close_handler = self.target.close jpayne@68: except AttributeError: jpayne@68: pass jpayne@68: else: jpayne@68: return close_handler() jpayne@68: finally: jpayne@68: # get rid of circular references jpayne@68: del self.parser, self._parser jpayne@68: del self.target, self._target jpayne@68: jpayne@68: jpayne@68: # -------------------------------------------------------------------- jpayne@68: # C14N 2.0 jpayne@68: jpayne@68: def canonicalize(xml_data=None, *, out=None, from_file=None, **options): jpayne@68: """Convert XML to its C14N 2.0 serialised form. jpayne@68: jpayne@68: If *out* is provided, it must be a file or file-like object that receives jpayne@68: the serialised canonical XML output (text, not bytes) through its ``.write()`` jpayne@68: method. To write to a file, open it in text mode with encoding "utf-8". jpayne@68: If *out* is not provided, this function returns the output as text string. jpayne@68: jpayne@68: Either *xml_data* (an XML string) or *from_file* (a file path or jpayne@68: file-like object) must be provided as input. jpayne@68: jpayne@68: The configuration options are the same as for the ``C14NWriterTarget``. jpayne@68: """ jpayne@68: if xml_data is None and from_file is None: jpayne@68: raise ValueError("Either 'xml_data' or 'from_file' must be provided as input") jpayne@68: sio = None jpayne@68: if out is None: jpayne@68: sio = out = io.StringIO() jpayne@68: jpayne@68: parser = XMLParser(target=C14NWriterTarget(out.write, **options)) jpayne@68: jpayne@68: if xml_data is not None: jpayne@68: parser.feed(xml_data) jpayne@68: parser.close() jpayne@68: elif from_file is not None: jpayne@68: parse(from_file, parser=parser) jpayne@68: jpayne@68: return sio.getvalue() if sio is not None else None jpayne@68: jpayne@68: jpayne@68: _looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match jpayne@68: jpayne@68: jpayne@68: class C14NWriterTarget: jpayne@68: """ jpayne@68: Canonicalization writer target for the XMLParser. jpayne@68: jpayne@68: Serialises parse events to XML C14N 2.0. jpayne@68: jpayne@68: The *write* function is used for writing out the resulting data stream jpayne@68: as text (not bytes). To write to a file, open it in text mode with encoding jpayne@68: "utf-8" and pass its ``.write`` method. jpayne@68: jpayne@68: Configuration options: jpayne@68: jpayne@68: - *with_comments*: set to true to include comments jpayne@68: - *strip_text*: set to true to strip whitespace before and after text content jpayne@68: - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}" jpayne@68: - *qname_aware_tags*: a set of qname aware tag names in which prefixes jpayne@68: should be replaced in text content jpayne@68: - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes jpayne@68: should be replaced in text content jpayne@68: - *exclude_attrs*: a set of attribute names that should not be serialised jpayne@68: - *exclude_tags*: a set of tag names that should not be serialised jpayne@68: """ jpayne@68: def __init__(self, write, *, jpayne@68: with_comments=False, strip_text=False, rewrite_prefixes=False, jpayne@68: qname_aware_tags=None, qname_aware_attrs=None, jpayne@68: exclude_attrs=None, exclude_tags=None): jpayne@68: self._write = write jpayne@68: self._data = [] jpayne@68: self._with_comments = with_comments jpayne@68: self._strip_text = strip_text jpayne@68: self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None jpayne@68: self._exclude_tags = set(exclude_tags) if exclude_tags else None jpayne@68: jpayne@68: self._rewrite_prefixes = rewrite_prefixes jpayne@68: if qname_aware_tags: jpayne@68: self._qname_aware_tags = set(qname_aware_tags) jpayne@68: else: jpayne@68: self._qname_aware_tags = None jpayne@68: if qname_aware_attrs: jpayne@68: self._find_qname_aware_attrs = set(qname_aware_attrs).intersection jpayne@68: else: jpayne@68: self._find_qname_aware_attrs = None jpayne@68: jpayne@68: # Stack with globally and newly declared namespaces as (uri, prefix) pairs. jpayne@68: self._declared_ns_stack = [[ jpayne@68: ("http://www.w3.org/XML/1998/namespace", "xml"), jpayne@68: ]] jpayne@68: # Stack with user declared namespace prefixes as (uri, prefix) pairs. jpayne@68: self._ns_stack = [] jpayne@68: if not rewrite_prefixes: jpayne@68: self._ns_stack.append(list(_namespace_map.items())) jpayne@68: self._ns_stack.append([]) jpayne@68: self._prefix_map = {} jpayne@68: self._preserve_space = [False] jpayne@68: self._pending_start = None jpayne@68: self._root_seen = False jpayne@68: self._root_done = False jpayne@68: self._ignored_depth = 0 jpayne@68: jpayne@68: def _iter_namespaces(self, ns_stack, _reversed=reversed): jpayne@68: for namespaces in _reversed(ns_stack): jpayne@68: if namespaces: # almost no element declares new namespaces jpayne@68: yield from namespaces jpayne@68: jpayne@68: def _resolve_prefix_name(self, prefixed_name): jpayne@68: prefix, name = prefixed_name.split(':', 1) jpayne@68: for uri, p in self._iter_namespaces(self._ns_stack): jpayne@68: if p == prefix: jpayne@68: return f'{{{uri}}}{name}' jpayne@68: raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope') jpayne@68: jpayne@68: def _qname(self, qname, uri=None): jpayne@68: if uri is None: jpayne@68: uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname) jpayne@68: else: jpayne@68: tag = qname jpayne@68: jpayne@68: prefixes_seen = set() jpayne@68: for u, prefix in self._iter_namespaces(self._declared_ns_stack): jpayne@68: if u == uri and prefix not in prefixes_seen: jpayne@68: return f'{prefix}:{tag}' if prefix else tag, tag, uri jpayne@68: prefixes_seen.add(prefix) jpayne@68: jpayne@68: # Not declared yet => add new declaration. jpayne@68: if self._rewrite_prefixes: jpayne@68: if uri in self._prefix_map: jpayne@68: prefix = self._prefix_map[uri] jpayne@68: else: jpayne@68: prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}' jpayne@68: self._declared_ns_stack[-1].append((uri, prefix)) jpayne@68: return f'{prefix}:{tag}', tag, uri jpayne@68: jpayne@68: if not uri and '' not in prefixes_seen: jpayne@68: # No default namespace declared => no prefix needed. jpayne@68: return tag, tag, uri jpayne@68: jpayne@68: for u, prefix in self._iter_namespaces(self._ns_stack): jpayne@68: if u == uri: jpayne@68: self._declared_ns_stack[-1].append((uri, prefix)) jpayne@68: return f'{prefix}:{tag}' if prefix else tag, tag, uri jpayne@68: jpayne@68: raise ValueError(f'Namespace "{uri}" is not declared in scope') jpayne@68: jpayne@68: def data(self, data): jpayne@68: if not self._ignored_depth: jpayne@68: self._data.append(data) jpayne@68: jpayne@68: def _flush(self, _join_text=''.join): jpayne@68: data = _join_text(self._data) jpayne@68: del self._data[:] jpayne@68: if self._strip_text and not self._preserve_space[-1]: jpayne@68: data = data.strip() jpayne@68: if self._pending_start is not None: jpayne@68: args, self._pending_start = self._pending_start, None jpayne@68: qname_text = data if data and _looks_like_prefix_name(data) else None jpayne@68: self._start(*args, qname_text) jpayne@68: if qname_text is not None: jpayne@68: return jpayne@68: if data and self._root_seen: jpayne@68: self._write(_escape_cdata_c14n(data)) jpayne@68: jpayne@68: def start_ns(self, prefix, uri): jpayne@68: if self._ignored_depth: jpayne@68: return jpayne@68: # we may have to resolve qnames in text content jpayne@68: if self._data: jpayne@68: self._flush() jpayne@68: self._ns_stack[-1].append((uri, prefix)) jpayne@68: jpayne@68: def start(self, tag, attrs): jpayne@68: if self._exclude_tags is not None and ( jpayne@68: self._ignored_depth or tag in self._exclude_tags): jpayne@68: self._ignored_depth += 1 jpayne@68: return jpayne@68: if self._data: jpayne@68: self._flush() jpayne@68: jpayne@68: new_namespaces = [] jpayne@68: self._declared_ns_stack.append(new_namespaces) jpayne@68: jpayne@68: if self._qname_aware_tags is not None and tag in self._qname_aware_tags: jpayne@68: # Need to parse text first to see if it requires a prefix declaration. jpayne@68: self._pending_start = (tag, attrs, new_namespaces) jpayne@68: return jpayne@68: self._start(tag, attrs, new_namespaces) jpayne@68: jpayne@68: def _start(self, tag, attrs, new_namespaces, qname_text=None): jpayne@68: if self._exclude_attrs is not None and attrs: jpayne@68: attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs} jpayne@68: jpayne@68: qnames = {tag, *attrs} jpayne@68: resolved_names = {} jpayne@68: jpayne@68: # Resolve prefixes in attribute and tag text. jpayne@68: if qname_text is not None: jpayne@68: qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text) jpayne@68: qnames.add(qname) jpayne@68: if self._find_qname_aware_attrs is not None and attrs: jpayne@68: qattrs = self._find_qname_aware_attrs(attrs) jpayne@68: if qattrs: jpayne@68: for attr_name in qattrs: jpayne@68: value = attrs[attr_name] jpayne@68: if _looks_like_prefix_name(value): jpayne@68: qname = resolved_names[value] = self._resolve_prefix_name(value) jpayne@68: qnames.add(qname) jpayne@68: else: jpayne@68: qattrs = None jpayne@68: else: jpayne@68: qattrs = None jpayne@68: jpayne@68: # Assign prefixes in lexicographical order of used URIs. jpayne@68: parse_qname = self._qname jpayne@68: parsed_qnames = {n: parse_qname(n) for n in sorted( jpayne@68: qnames, key=lambda n: n.split('}', 1))} jpayne@68: jpayne@68: # Write namespace declarations in prefix order ... jpayne@68: if new_namespaces: jpayne@68: attr_list = [ jpayne@68: ('xmlns:' + prefix if prefix else 'xmlns', uri) jpayne@68: for uri, prefix in new_namespaces jpayne@68: ] jpayne@68: attr_list.sort() jpayne@68: else: jpayne@68: # almost always empty jpayne@68: attr_list = [] jpayne@68: jpayne@68: # ... followed by attributes in URI+name order jpayne@68: if attrs: jpayne@68: for k, v in sorted(attrs.items()): jpayne@68: if qattrs is not None and k in qattrs and v in resolved_names: jpayne@68: v = parsed_qnames[resolved_names[v]][0] jpayne@68: attr_qname, attr_name, uri = parsed_qnames[k] jpayne@68: # No prefix for attributes in default ('') namespace. jpayne@68: attr_list.append((attr_qname if uri else attr_name, v)) jpayne@68: jpayne@68: # Honour xml:space attributes. jpayne@68: space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space') jpayne@68: self._preserve_space.append( jpayne@68: space_behaviour == 'preserve' if space_behaviour jpayne@68: else self._preserve_space[-1]) jpayne@68: jpayne@68: # Write the tag. jpayne@68: write = self._write jpayne@68: write('<' + parsed_qnames[tag][0]) jpayne@68: if attr_list: jpayne@68: write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list])) jpayne@68: write('>') jpayne@68: jpayne@68: # Write the resolved qname text content. jpayne@68: if qname_text is not None: jpayne@68: write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0])) jpayne@68: jpayne@68: self._root_seen = True jpayne@68: self._ns_stack.append([]) jpayne@68: jpayne@68: def end(self, tag): jpayne@68: if self._ignored_depth: jpayne@68: self._ignored_depth -= 1 jpayne@68: return jpayne@68: if self._data: jpayne@68: self._flush() jpayne@68: self._write(f'') jpayne@68: self._preserve_space.pop() jpayne@68: self._root_done = len(self._preserve_space) == 1 jpayne@68: self._declared_ns_stack.pop() jpayne@68: self._ns_stack.pop() jpayne@68: jpayne@68: def comment(self, text): jpayne@68: if not self._with_comments: jpayne@68: return jpayne@68: if self._ignored_depth: jpayne@68: return jpayne@68: if self._root_done: jpayne@68: self._write('\n') jpayne@68: elif self._root_seen and self._data: jpayne@68: self._flush() jpayne@68: self._write(f'') jpayne@68: if not self._root_seen: jpayne@68: self._write('\n') jpayne@68: jpayne@68: def pi(self, target, data): jpayne@68: if self._ignored_depth: jpayne@68: return jpayne@68: if self._root_done: jpayne@68: self._write('\n') jpayne@68: elif self._root_seen and self._data: jpayne@68: self._flush() jpayne@68: self._write( jpayne@68: f'' if data else f'') jpayne@68: if not self._root_seen: jpayne@68: self._write('\n') jpayne@68: jpayne@68: jpayne@68: def _escape_cdata_c14n(text): jpayne@68: # escape character data jpayne@68: try: jpayne@68: # it's worth avoiding do-nothing calls for strings that are jpayne@68: # shorter than 500 character, or so. assume that's, by far, jpayne@68: # the most common case in most applications. jpayne@68: if '&' in text: jpayne@68: text = text.replace('&', '&') jpayne@68: if '<' in text: jpayne@68: text = text.replace('<', '<') jpayne@68: if '>' in text: jpayne@68: text = text.replace('>', '>') jpayne@68: if '\r' in text: jpayne@68: text = text.replace('\r', ' ') jpayne@68: return text jpayne@68: except (TypeError, AttributeError): jpayne@68: _raise_serialization_error(text) jpayne@68: jpayne@68: jpayne@68: def _escape_attrib_c14n(text): jpayne@68: # escape attribute value jpayne@68: try: jpayne@68: if '&' in text: jpayne@68: text = text.replace('&', '&') jpayne@68: if '<' in text: jpayne@68: text = text.replace('<', '<') jpayne@68: if '"' in text: jpayne@68: text = text.replace('"', '"') jpayne@68: if '\t' in text: jpayne@68: text = text.replace('\t', ' ') jpayne@68: if '\n' in text: jpayne@68: text = text.replace('\n', ' ') jpayne@68: if '\r' in text: jpayne@68: text = text.replace('\r', ' ') jpayne@68: return text jpayne@68: except (TypeError, AttributeError): jpayne@68: _raise_serialization_error(text) jpayne@68: jpayne@68: jpayne@68: # -------------------------------------------------------------------- jpayne@68: jpayne@68: # Import the C accelerators jpayne@68: try: jpayne@68: # Element is going to be shadowed by the C implementation. We need to keep jpayne@68: # the Python version of it accessible for some "creative" by external code jpayne@68: # (see tests) jpayne@68: _Element_Py = Element jpayne@68: jpayne@68: # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories jpayne@68: from _elementtree import * jpayne@68: from _elementtree import _set_factories jpayne@68: except ImportError: jpayne@68: pass jpayne@68: else: jpayne@68: _set_factories(Comment, ProcessingInstruction)