jpayne@68: """Lightweight XML support for Python.
jpayne@68: 
jpayne@68:  XML is an inherently hierarchical data format, and the most natural way to
jpayne@68:  represent it is with a tree.  This module has two classes for this purpose:
jpayne@68: 
jpayne@68:     1. ElementTree represents the whole XML document as a tree and
jpayne@68: 
jpayne@68:     2. Element represents a single node in this tree.
jpayne@68: 
jpayne@68:  Interactions with the whole document (reading and writing to/from files) are
jpayne@68:  usually done on the ElementTree level.  Interactions with a single XML element
jpayne@68:  and its sub-elements are done on the Element level.
jpayne@68: 
jpayne@68:  Element is a flexible container object designed to store hierarchical data
jpayne@68:  structures in memory. It can be described as a cross between a list and a
jpayne@68:  dictionary.  Each Element has a number of properties associated with it:
jpayne@68: 
jpayne@68:     'tag' - a string containing the element's name.
jpayne@68: 
jpayne@68:     'attributes' - a Python dictionary storing the element's attributes.
jpayne@68: 
jpayne@68:     'text' - a string containing the element's text content.
jpayne@68: 
jpayne@68:     'tail' - an optional string containing text after the element's end tag.
jpayne@68: 
jpayne@68:     And a number of child elements stored in a Python sequence.
jpayne@68: 
jpayne@68:  To create an element instance, use the Element constructor,
jpayne@68:  or the SubElement factory function.
jpayne@68: 
jpayne@68:  You can also use the ElementTree class to wrap an element structure
jpayne@68:  and convert it to and from XML.
jpayne@68: 
jpayne@68: """
jpayne@68: 
jpayne@68: #---------------------------------------------------------------------
jpayne@68: # Licensed to PSF under a Contributor Agreement.
jpayne@68: # See http://www.python.org/psf/license for licensing details.
jpayne@68: #
jpayne@68: # ElementTree
jpayne@68: # Copyright (c) 1999-2008 by Fredrik Lundh.  All rights reserved.
jpayne@68: #
jpayne@68: # fredrik@pythonware.com
jpayne@68: # http://www.pythonware.com
jpayne@68: # --------------------------------------------------------------------
jpayne@68: # The ElementTree toolkit is
jpayne@68: #
jpayne@68: # Copyright (c) 1999-2008 by Fredrik Lundh
jpayne@68: #
jpayne@68: # By obtaining, using, and/or copying this software and/or its
jpayne@68: # associated documentation, you agree that you have read, understood,
jpayne@68: # and will comply with the following terms and conditions:
jpayne@68: #
jpayne@68: # Permission to use, copy, modify, and distribute this software and
jpayne@68: # its associated documentation for any purpose and without fee is
jpayne@68: # hereby granted, provided that the above copyright notice appears in
jpayne@68: # all copies, and that both that copyright notice and this permission
jpayne@68: # notice appear in supporting documentation, and that the name of
jpayne@68: # Secret Labs AB or the author not be used in advertising or publicity
jpayne@68: # pertaining to distribution of the software without specific, written
jpayne@68: # prior permission.
jpayne@68: #
jpayne@68: # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
jpayne@68: # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
jpayne@68: # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
jpayne@68: # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
jpayne@68: # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
jpayne@68: # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
jpayne@68: # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
jpayne@68: # OF THIS SOFTWARE.
jpayne@68: # --------------------------------------------------------------------
jpayne@68: 
# Names exported by a star-import of this module.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring", "fromstringlist",
    "iselement", "iterparse",
    "parse", "ParseError",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring", "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML", "XMLID",
    "XMLParser", "XMLPullParser",
    "register_namespace",
    "canonicalize", "C14NWriterTarget",
    ]

# Version string of this module; it is part of the public API (see __all__).
VERSION = "1.3.0"
jpayne@68: 
jpayne@68: import sys
jpayne@68: import re
jpayne@68: import warnings
jpayne@68: import io
jpayne@68: import collections
jpayne@68: import collections.abc
jpayne@68: import contextlib
jpayne@68: 
jpayne@68: from . import ElementPath
jpayne@68: 
jpayne@68: 
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, instances are expected to carry
    two additional attributes set by whoever raises the error:
        'code'     - the specific exception code
        'position' - the line and column of the error

    """
jpayne@68: 
jpayne@68: # --------------------------------------------------------------------
jpayne@68: 
jpayne@68: 
def iselement(element):
    """Return True if *element* looks like an Element (has a 'tag' attribute)."""
    try:
        element.tag
    except AttributeError:
        return False
    return True
jpayne@68: 
jpayne@68: 
jpayne@68: class Element:
jpayne@68:     """An XML element.
jpayne@68: 
jpayne@68:     This class is the reference implementation of the Element interface.
jpayne@68: 
jpayne@68:     An element's length is its number of subelements.  That means if you
jpayne@68:     want to check if an element is truly empty, you should check BOTH
jpayne@68:     its length AND its text attribute.
jpayne@68: 
jpayne@68:     The element tag, attribute names, and attribute values can be either
jpayne@68:     bytes or strings.
jpayne@68: 
jpayne@68:     *tag* is the element name.  *attrib* is an optional dictionary containing
jpayne@68:     element attributes. *extra* are additional element attributes given as
jpayne@68:     keyword arguments.
jpayne@68: 
jpayne@68:     Example form:
jpayne@68:         <tag attrib>text<child/>...</tag>tail
jpayne@68: 
jpayne@68:     """
jpayne@68: 
jpayne@68:     tag = None
jpayne@68:     """The element's name."""
jpayne@68: 
jpayne@68:     attrib = None
jpayne@68:     """Dictionary of the element's attributes."""
jpayne@68: 
jpayne@68:     text = None
jpayne@68:     """
jpayne@68:     Text before first subelement. This is either a string or the value None.
jpayne@68:     Note that if there is no text, this attribute may be either
jpayne@68:     None or the empty string, depending on the parser.
jpayne@68: 
jpayne@68:     """
jpayne@68: 
jpayne@68:     tail = None
jpayne@68:     """
jpayne@68:     Text after this element's end tag, but before the next sibling element's
jpayne@68:     start tag.  This is either a string or the value None.  Note that if there
jpayne@68:     was no text, this attribute may be either None or an empty string,
jpayne@68:     depending on the parser.
jpayne@68: 
jpayne@68:     """
jpayne@68: 
jpayne@68:     def __init__(self, tag, attrib={}, **extra):
jpayne@68:         if not isinstance(attrib, dict):
jpayne@68:             raise TypeError("attrib must be dict, not %s" % (
jpayne@68:                 attrib.__class__.__name__,))
jpayne@68:         self.tag = tag
jpayne@68:         self.attrib = {**attrib, **extra}
jpayne@68:         self._children = []
jpayne@68: 
jpayne@68:     def __repr__(self):
jpayne@68:         return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))
jpayne@68: 
jpayne@68:     def makeelement(self, tag, attrib):
jpayne@68:         """Create a new element with the same type.
jpayne@68: 
jpayne@68:         *tag* is a string containing the element name.
jpayne@68:         *attrib* is a dictionary containing the element attributes.
jpayne@68: 
jpayne@68:         Do not call this method, use the SubElement factory function instead.
jpayne@68: 
jpayne@68:         """
jpayne@68:         return self.__class__(tag, attrib)
jpayne@68: 
jpayne@68:     def copy(self):
jpayne@68:         """Return copy of current element.
jpayne@68: 
jpayne@68:         This creates a shallow copy. Subelements will be shared with the
jpayne@68:         original tree.
jpayne@68: 
jpayne@68:         """
jpayne@68:         elem = self.makeelement(self.tag, self.attrib)
jpayne@68:         elem.text = self.text
jpayne@68:         elem.tail = self.tail
jpayne@68:         elem[:] = self
jpayne@68:         return elem
jpayne@68: 
jpayne@68:     def __len__(self):
jpayne@68:         return len(self._children)
jpayne@68: 
jpayne@68:     def __bool__(self):
jpayne@68:         warnings.warn(
jpayne@68:             "The behavior of this method will change in future versions.  "
jpayne@68:             "Use specific 'len(elem)' or 'elem is not None' test instead.",
jpayne@68:             FutureWarning, stacklevel=2
jpayne@68:             )
jpayne@68:         return len(self._children) != 0 # emulate old behaviour, for now
jpayne@68: 
jpayne@68:     def __getitem__(self, index):
jpayne@68:         return self._children[index]
jpayne@68: 
jpayne@68:     def __setitem__(self, index, element):
jpayne@68:         if isinstance(index, slice):
jpayne@68:             for elt in element:
jpayne@68:                 self._assert_is_element(elt)
jpayne@68:         else:
jpayne@68:             self._assert_is_element(element)
jpayne@68:         self._children[index] = element
jpayne@68: 
jpayne@68:     def __delitem__(self, index):
jpayne@68:         del self._children[index]
jpayne@68: 
jpayne@68:     def append(self, subelement):
jpayne@68:         """Add *subelement* to the end of this element.
jpayne@68: 
jpayne@68:         The new element will appear in document order after the last existing
jpayne@68:         subelement (or directly after the text, if it's the first subelement),
jpayne@68:         but before the end tag for this element.
jpayne@68: 
jpayne@68:         """
jpayne@68:         self._assert_is_element(subelement)
jpayne@68:         self._children.append(subelement)
jpayne@68: 
jpayne@68:     def extend(self, elements):
jpayne@68:         """Append subelements from a sequence.
jpayne@68: 
jpayne@68:         *elements* is a sequence with zero or more elements.
jpayne@68: 
jpayne@68:         """
jpayne@68:         for element in elements:
jpayne@68:             self._assert_is_element(element)
jpayne@68:         self._children.extend(elements)
jpayne@68: 
jpayne@68:     def insert(self, index, subelement):
jpayne@68:         """Insert *subelement* at position *index*."""
jpayne@68:         self._assert_is_element(subelement)
jpayne@68:         self._children.insert(index, subelement)
jpayne@68: 
jpayne@68:     def _assert_is_element(self, e):
jpayne@68:         # Need to refer to the actual Python implementation, not the
jpayne@68:         # shadowing C implementation.
jpayne@68:         if not isinstance(e, _Element_Py):
jpayne@68:             raise TypeError('expected an Element, not %s' % type(e).__name__)
jpayne@68: 
jpayne@68:     def remove(self, subelement):
jpayne@68:         """Remove matching subelement.
jpayne@68: 
jpayne@68:         Unlike the find methods, this method compares elements based on
jpayne@68:         identity, NOT ON tag value or contents.  To remove subelements by
jpayne@68:         other means, the easiest way is to use a list comprehension to
jpayne@68:         select what elements to keep, and then use slice assignment to update
jpayne@68:         the parent element.
jpayne@68: 
jpayne@68:         ValueError is raised if a matching element could not be found.
jpayne@68: 
jpayne@68:         """
jpayne@68:         # assert iselement(element)
jpayne@68:         self._children.remove(subelement)
jpayne@68: 
jpayne@68:     def getchildren(self):
jpayne@68:         """(Deprecated) Return all subelements.
jpayne@68: 
jpayne@68:         Elements are returned in document order.
jpayne@68: 
jpayne@68:         """
jpayne@68:         warnings.warn(
jpayne@68:             "This method will be removed in future versions.  "
jpayne@68:             "Use 'list(elem)' or iteration over elem instead.",
jpayne@68:             DeprecationWarning, stacklevel=2
jpayne@68:             )
jpayne@68:         return self._children
jpayne@68: 
jpayne@68:     def find(self, path, namespaces=None):
jpayne@68:         """Find first matching element by tag name or path.
jpayne@68: 
jpayne@68:         *path* is a string having either an element tag or an XPath,
jpayne@68:         *namespaces* is an optional mapping from namespace prefix to full name.
jpayne@68: 
jpayne@68:         Return the first matching element, or None if no element was found.
jpayne@68: 
jpayne@68:         """
jpayne@68:         return ElementPath.find(self, path, namespaces)
jpayne@68: 
jpayne@68:     def findtext(self, path, default=None, namespaces=None):
jpayne@68:         """Find text for first matching element by tag name or path.
jpayne@68: 
jpayne@68:         *path* is a string having either an element tag or an XPath,
jpayne@68:         *default* is the value to return if the element was not found,
jpayne@68:         *namespaces* is an optional mapping from namespace prefix to full name.
jpayne@68: 
jpayne@68:         Return text content of first matching element, or default value if
jpayne@68:         none was found.  Note that if an element is found having no text
jpayne@68:         content, the empty string is returned.
jpayne@68: 
jpayne@68:         """
jpayne@68:         return ElementPath.findtext(self, path, default, namespaces)
jpayne@68: 
jpayne@68:     def findall(self, path, namespaces=None):
jpayne@68:         """Find all matching subelements by tag name or path.
jpayne@68: 
jpayne@68:         *path* is a string having either an element tag or an XPath,
jpayne@68:         *namespaces* is an optional mapping from namespace prefix to full name.
jpayne@68: 
jpayne@68:         Returns list containing all matching elements in document order.
jpayne@68: 
jpayne@68:         """
jpayne@68:         return ElementPath.findall(self, path, namespaces)
jpayne@68: 
jpayne@68:     def iterfind(self, path, namespaces=None):
jpayne@68:         """Find all matching subelements by tag name or path.
jpayne@68: 
jpayne@68:         *path* is a string having either an element tag or an XPath,
jpayne@68:         *namespaces* is an optional mapping from namespace prefix to full name.
jpayne@68: 
jpayne@68:         Return an iterable yielding all matching elements in document order.
jpayne@68: 
jpayne@68:         """
jpayne@68:         return ElementPath.iterfind(self, path, namespaces)
jpayne@68: 
jpayne@68:     def clear(self):
jpayne@68:         """Reset element.
jpayne@68: 
jpayne@68:         This function removes all subelements, clears all attributes, and sets
jpayne@68:         the text and tail attributes to None.
jpayne@68: 
jpayne@68:         """
jpayne@68:         self.attrib.clear()
jpayne@68:         self._children = []
jpayne@68:         self.text = self.tail = None
jpayne@68: 
jpayne@68:     def get(self, key, default=None):
jpayne@68:         """Get element attribute.
jpayne@68: 
jpayne@68:         Equivalent to attrib.get, but some implementations may handle this a
jpayne@68:         bit more efficiently.  *key* is what attribute to look for, and
jpayne@68:         *default* is what to return if the attribute was not found.
jpayne@68: 
jpayne@68:         Returns a string containing the attribute value, or the default if
jpayne@68:         attribute was not found.
jpayne@68: 
jpayne@68:         """
jpayne@68:         return self.attrib.get(key, default)
jpayne@68: 
jpayne@68:     def set(self, key, value):
jpayne@68:         """Set element attribute.
jpayne@68: 
jpayne@68:         Equivalent to attrib[key] = value, but some implementations may handle
jpayne@68:         this a bit more efficiently.  *key* is what attribute to set, and
jpayne@68:         *value* is the attribute value to set it to.
jpayne@68: 
jpayne@68:         """
jpayne@68:         self.attrib[key] = value
jpayne@68: 
jpayne@68:     def keys(self):
jpayne@68:         """Get list of attribute names.
jpayne@68: 
jpayne@68:         Names are returned in an arbitrary order, just like an ordinary
jpayne@68:         Python dict.  Equivalent to attrib.keys()
jpayne@68: 
jpayne@68:         """
jpayne@68:         return self.attrib.keys()
jpayne@68: 
jpayne@68:     def items(self):
jpayne@68:         """Get element attributes as a sequence.
jpayne@68: 
jpayne@68:         The attributes are returned in arbitrary order.  Equivalent to
jpayne@68:         attrib.items().
jpayne@68: 
jpayne@68:         Return a list of (name, value) tuples.
jpayne@68: 
jpayne@68:         """
jpayne@68:         return self.attrib.items()
jpayne@68: 
jpayne@68:     def iter(self, tag=None):
jpayne@68:         """Create tree iterator.
jpayne@68: 
jpayne@68:         The iterator loops over the element and all subelements in document
jpayne@68:         order, returning all elements with a matching tag.
jpayne@68: 
jpayne@68:         If the tree structure is modified during iteration, new or removed
jpayne@68:         elements may or may not be included.  To get a stable set, use the
jpayne@68:         list() function on the iterator, and loop over the resulting list.
jpayne@68: 
jpayne@68:         *tag* is what tags to look for (default is to return all elements)
jpayne@68: 
jpayne@68:         Return an iterator containing all the matching elements.
jpayne@68: 
jpayne@68:         """
jpayne@68:         if tag == "*":
jpayne@68:             tag = None
jpayne@68:         if tag is None or self.tag == tag:
jpayne@68:             yield self
jpayne@68:         for e in self._children:
jpayne@68:             yield from e.iter(tag)
jpayne@68: 
jpayne@68:     # compatibility
jpayne@68:     def getiterator(self, tag=None):
jpayne@68:         warnings.warn(
jpayne@68:             "This method will be removed in future versions.  "
jpayne@68:             "Use 'elem.iter()' or 'list(elem.iter())' instead.",
jpayne@68:             DeprecationWarning, stacklevel=2
jpayne@68:         )
jpayne@68:         return list(self.iter(tag))
jpayne@68: 
jpayne@68:     def itertext(self):
jpayne@68:         """Create text iterator.
jpayne@68: 
jpayne@68:         The iterator loops over the element and all subelements in document
jpayne@68:         order, returning all inner text.
jpayne@68: 
jpayne@68:         """
jpayne@68:         tag = self.tag
jpayne@68:         if not isinstance(tag, str) and tag is not None:
jpayne@68:             return
jpayne@68:         t = self.text
jpayne@68:         if t:
jpayne@68:             yield t
jpayne@68:         for e in self:
jpayne@68:             yield from e.itertext()
jpayne@68:             t = e.tail
jpayne@68:             if t:
jpayne@68:                 yield t
jpayne@68: 
jpayne@68: 
def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name,
    *attrib* is an optional dictionary containing element attributes, and
    *extra* are additional attributes given as keyword arguments (they
    take precedence over same-named keys in *attrib*).

    The new element is built with parent.makeelement() and returned.

    """
    # Merge into a fresh dict so neither the caller's mapping nor the
    # shared default is modified.
    merged = {**attrib, **extra}
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
jpayne@68: 
jpayne@68: 
def Comment(text=None):
    """Build an element that stands for an XML comment.

    The returned element uses this factory function itself as its tag,
    which is how the standard serializer recognises it and writes it out
    as an XML comment.

    *text* is a string containing the comment string.

    """
    comment = Element(Comment)
    comment.text = text
    return comment
jpayne@68: 
jpayne@68: 
def ProcessingInstruction(target, text=None):
    """Build an element that stands for an XML processing instruction.

    The returned element uses this factory function itself as its tag so
    that it can be told apart from ordinary elements.

    *target* is a string containing the processing instruction target and
    *text* is a string containing the processing instruction contents, if
    any.  The element's text is the target, followed by a space and *text*
    when *text* is non-empty.

    """
    content = target
    if text:
        content = content + " " + text
    pi = Element(ProcessingInstruction)
    pi.text = content
    return pi

PI = ProcessingInstruction
jpayne@68: 
jpayne@68: 
class QName:
    """Wrapper for a qualified (namespaced) name.

    Wrap a QName attribute value in this class to get proper namespace
    handling on output.

    *text_or_uri* is a string holding either the complete QName in the
    form {uri}local or, when *tag* is supplied, only the URI part.

    *tag* is optional; when given, *text_or_uri* is interpreted as a URI
    and *tag* as the local name, and the two are combined as {uri}local.

    Instances compare and hash by their text, and may be compared
    directly against other QName objects or against plain values.

    """

    def __init__(self, text_or_uri, tag=None):
        self.text = ("{%s}%s" % (text_or_uri, tag)) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    # For every comparison, another QName is unwrapped to its text; any
    # other operand is compared against the text as-is.

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs
jpayne@68: 
jpayne@68: # --------------------------------------------------------------------
jpayne@68: 
jpayne@68: 
jpayne@68: class ElementTree:
jpayne@68:     """An XML element hierarchy.
jpayne@68: 
jpayne@68:     This class also provides support for serialization to and from
jpayne@68:     standard XML.
jpayne@68: 
jpayne@68:     *element* is an optional root element node,
jpayne@68:     *file* is an optional file handle or file name of an XML file whose
jpayne@68:     contents will be used to initialize the tree with.
jpayne@68: 
jpayne@68:     """
jpayne@68:     def __init__(self, element=None, file=None):
jpayne@68:         # assert element is None or iselement(element)
jpayne@68:         self._root = element # first node
jpayne@68:         if file:
jpayne@68:             self.parse(file)
jpayne@68: 
jpayne@68:     def getroot(self):
jpayne@68:         """Return root element of this tree."""
jpayne@68:         return self._root
jpayne@68: 
jpayne@68:     def _setroot(self, element):
jpayne@68:         """Replace root element of this tree.
jpayne@68: 
jpayne@68:         This will discard the current contents of the tree and replace it
jpayne@68:         with the given element.  Use with care!
jpayne@68: 
jpayne@68:         """
jpayne@68:         # assert iselement(element)
jpayne@68:         self._root = element
jpayne@68: 
jpayne@68:     def parse(self, source, parser=None):
jpayne@68:         """Load external XML document into element tree.
jpayne@68: 
jpayne@68:         *source* is a file name or file object, *parser* is an optional parser
jpayne@68:         instance that defaults to XMLParser.
jpayne@68: 
jpayne@68:         ParseError is raised if the parser fails to parse the document.
jpayne@68: 
jpayne@68:         Returns the root element of the given source document.
jpayne@68: 
jpayne@68:         """
jpayne@68:         close_source = False
jpayne@68:         if not hasattr(source, "read"):
jpayne@68:             source = open(source, "rb")
jpayne@68:             close_source = True
jpayne@68:         try:
jpayne@68:             if parser is None:
jpayne@68:                 # If no parser was specified, create a default XMLParser
jpayne@68:                 parser = XMLParser()
jpayne@68:                 if hasattr(parser, '_parse_whole'):
jpayne@68:                     # The default XMLParser, when it comes from an accelerator,
jpayne@68:                     # can define an internal _parse_whole API for efficiency.
jpayne@68:                     # It can be used to parse the whole source without feeding
jpayne@68:                     # it with chunks.
jpayne@68:                     self._root = parser._parse_whole(source)
jpayne@68:                     return self._root
jpayne@68:             while True:
jpayne@68:                 data = source.read(65536)
jpayne@68:                 if not data:
jpayne@68:                     break
jpayne@68:                 parser.feed(data)
jpayne@68:             self._root = parser.close()
jpayne@68:             return self._root
jpayne@68:         finally:
jpayne@68:             if close_source:
jpayne@68:                 source.close()
jpayne@68: 
jpayne@68:     def iter(self, tag=None):
jpayne@68:         """Create and return tree iterator for the root element.
jpayne@68: 
jpayne@68:         The iterator loops over all elements in this tree, in document order.
jpayne@68: 
jpayne@68:         *tag* is a string with the tag name to iterate over
jpayne@68:         (default is to return all elements).
jpayne@68: 
jpayne@68:         """
jpayne@68:         # assert self._root is not None
jpayne@68:         return self._root.iter(tag)
jpayne@68: 
jpayne@68:     # compatibility
jpayne@68:     def getiterator(self, tag=None):
jpayne@68:         warnings.warn(
jpayne@68:             "This method will be removed in future versions.  "
jpayne@68:             "Use 'tree.iter()' or 'list(tree.iter())' instead.",
jpayne@68:             DeprecationWarning, stacklevel=2
jpayne@68:         )
jpayne@68:         return list(self.iter(tag))
jpayne@68: 
jpayne@68:     def find(self, path, namespaces=None):
jpayne@68:         """Find first matching element by tag name or path.
jpayne@68: 
jpayne@68:         Same as getroot().find(path), which is Element.find()
jpayne@68: 
jpayne@68:         *path* is a string having either an element tag or an XPath,
jpayne@68:         *namespaces* is an optional mapping from namespace prefix to full name.
jpayne@68: 
jpayne@68:         Return the first matching element, or None if no element was found.
jpayne@68: 
jpayne@68:         """
jpayne@68:         # assert self._root is not None
jpayne@68:         if path[:1] == "/":
jpayne@68:             path = "." + path
jpayne@68:             warnings.warn(
jpayne@68:                 "This search is broken in 1.3 and earlier, and will be "
jpayne@68:                 "fixed in a future version.  If you rely on the current "
jpayne@68:                 "behaviour, change it to %r" % path,
jpayne@68:                 FutureWarning, stacklevel=2
jpayne@68:                 )
jpayne@68:         return self._root.find(path, namespaces)
jpayne@68: 
jpayne@68:     def findtext(self, path, default=None, namespaces=None):
jpayne@68:         """Find first matching element by tag name or path.
jpayne@68: 
jpayne@68:         Same as getroot().findtext(path),  which is Element.findtext()
jpayne@68: 
jpayne@68:         *path* is a string having either an element tag or an XPath,
jpayne@68:         *namespaces* is an optional mapping from namespace prefix to full name.
jpayne@68: 
jpayne@68:         Return the first matching element, or None if no element was found.
jpayne@68: 
jpayne@68:         """
jpayne@68:         # assert self._root is not None
jpayne@68:         if path[:1] == "/":
jpayne@68:             path = "." + path
jpayne@68:             warnings.warn(
jpayne@68:                 "This search is broken in 1.3 and earlier, and will be "
jpayne@68:                 "fixed in a future version.  If you rely on the current "
jpayne@68:                 "behaviour, change it to %r" % path,
jpayne@68:                 FutureWarning, stacklevel=2
jpayne@68:                 )
jpayne@68:         return self._root.findtext(path, default, namespaces)
jpayne@68: 
jpayne@68:     def findall(self, path, namespaces=None):
jpayne@68:         """Find all matching subelements by tag name or path.
jpayne@68: 
jpayne@68:         Same as getroot().findall(path), which is Element.findall().
jpayne@68: 
jpayne@68:         *path* is a string having either an element tag or an XPath,
jpayne@68:         *namespaces* is an optional mapping from namespace prefix to full name.
jpayne@68: 
jpayne@68:         Return list containing all matching elements in document order.
jpayne@68: 
jpayne@68:         """
jpayne@68:         # assert self._root is not None
jpayne@68:         if path[:1] == "/":
jpayne@68:             path = "." + path
jpayne@68:             warnings.warn(
jpayne@68:                 "This search is broken in 1.3 and earlier, and will be "
jpayne@68:                 "fixed in a future version.  If you rely on the current "
jpayne@68:                 "behaviour, change it to %r" % path,
jpayne@68:                 FutureWarning, stacklevel=2
jpayne@68:                 )
jpayne@68:         return self._root.findall(path, namespaces)
jpayne@68: 
jpayne@68:     def iterfind(self, path, namespaces=None):
jpayne@68:         """Find all matching subelements by tag name or path.
jpayne@68: 
jpayne@68:         Same as getroot().iterfind(path), which is element.iterfind()
jpayne@68: 
jpayne@68:         *path* is a string having either an element tag or an XPath,
jpayne@68:         *namespaces* is an optional mapping from namespace prefix to full name.
jpayne@68: 
jpayne@68:         Return an iterable yielding all matching elements in document order.
jpayne@68: 
jpayne@68:         """
jpayne@68:         # assert self._root is not None
jpayne@68:         if path[:1] == "/":
jpayne@68:             path = "." + path
jpayne@68:             warnings.warn(
jpayne@68:                 "This search is broken in 1.3 and earlier, and will be "
jpayne@68:                 "fixed in a future version.  If you rely on the current "
jpayne@68:                 "behaviour, change it to %r" % path,
jpayne@68:                 FutureWarning, stacklevel=2
jpayne@68:                 )
jpayne@68:         return self._root.iterfind(path, namespaces)
jpayne@68: 
jpayne@68:     def write(self, file_or_filename,
jpayne@68:               encoding=None,
jpayne@68:               xml_declaration=None,
jpayne@68:               default_namespace=None,
jpayne@68:               method=None, *,
jpayne@68:               short_empty_elements=True):
jpayne@68:         """Write element tree to a file as XML.
jpayne@68: 
jpayne@68:         Arguments:
jpayne@68:           *file_or_filename* -- file name or a file object opened for writing
jpayne@68: 
jpayne@68:           *encoding* -- the output encoding (default: US-ASCII)
jpayne@68: 
jpayne@68:           *xml_declaration* -- bool indicating if an XML declaration should be
jpayne@68:                                added to the output. If None, an XML declaration
jpayne@68:                                is added if encoding IS NOT either of:
jpayne@68:                                US-ASCII, UTF-8, or Unicode
jpayne@68: 
jpayne@68:           *default_namespace* -- sets the default XML namespace (for "xmlns")
jpayne@68: 
jpayne@68:           *method* -- either "xml" (default), "html, "text", or "c14n"
jpayne@68: 
jpayne@68:           *short_empty_elements* -- controls the formatting of elements
jpayne@68:                                     that contain no content. If True (default)
jpayne@68:                                     they are emitted as a single self-closed
jpayne@68:                                     tag, otherwise they are emitted as a pair
jpayne@68:                                     of start/end tags
jpayne@68: 
jpayne@68:         """
jpayne@68:         if not method:
jpayne@68:             method = "xml"
jpayne@68:         elif method not in _serialize:
jpayne@68:             raise ValueError("unknown method %r" % method)
jpayne@68:         if not encoding:
jpayne@68:             if method == "c14n":
jpayne@68:                 encoding = "utf-8"
jpayne@68:             else:
jpayne@68:                 encoding = "us-ascii"
jpayne@68:         enc_lower = encoding.lower()
jpayne@68:         with _get_writer(file_or_filename, enc_lower) as write:
jpayne@68:             if method == "xml" and (xml_declaration or
jpayne@68:                     (xml_declaration is None and
jpayne@68:                      enc_lower not in ("utf-8", "us-ascii", "unicode"))):
jpayne@68:                 declared_encoding = encoding
jpayne@68:                 if enc_lower == "unicode":
jpayne@68:                     # Retrieve the default encoding for the xml declaration
jpayne@68:                     import locale
jpayne@68:                     declared_encoding = locale.getpreferredencoding()
jpayne@68:                 write("<?xml version='1.0' encoding='%s'?>\n" % (
jpayne@68:                     declared_encoding,))
jpayne@68:             if method == "text":
jpayne@68:                 _serialize_text(write, self._root)
jpayne@68:             else:
jpayne@68:                 qnames, namespaces = _namespaces(self._root, default_namespace)
jpayne@68:                 serialize = _serialize[method]
jpayne@68:                 serialize(write, self._root, qnames, namespaces,
jpayne@68:                           short_empty_elements=short_empty_elements)
jpayne@68: 
jpayne@68:     def write_c14n(self, file):
jpayne@68:         """Write the tree to *file* using the "c14n" output method.
jpayne@68: 
jpayne@68:         Equivalent to write(file, method="c14n").
jpayne@68: 
jpayne@68:         """
jpayne@68:         # lxml.etree compatibility.  use output method instead
jpayne@68:         return self.write(file, method="c14n")
jpayne@68: 
jpayne@68: # --------------------------------------------------------------------
jpayne@68: # serialization support
jpayne@68: 
jpayne@68: @contextlib.contextmanager
jpayne@68: def _get_writer(file_or_filename, encoding):
jpayne@68:     """Context manager yielding a write() callable that accepts text.
jpayne@68: 
jpayne@68:     *file_or_filename* is either an object with a write attribute or
jpayne@68:     something that can be passed to open().  *encoding* is the lower-cased
jpayne@68:     encoding name; the special value "unicode" means that the target
jpayne@68:     already accepts text and no encoding layer is added.
jpayne@68: 
jpayne@68:     Files opened here are closed on exit.  File objects passed in by the
jpayne@68:     caller are left open: any wrapper created around them is detached
jpayne@68:     rather than closed.
jpayne@68: 
jpayne@68:     """
jpayne@68:     # returns text write method and release all resources after using
jpayne@68:     try:
jpayne@68:         write = file_or_filename.write
jpayne@68:     except AttributeError:
jpayne@68:         # file_or_filename is a file name
jpayne@68:         if encoding == "unicode":
jpayne@68:             file = open(file_or_filename, "w")
jpayne@68:         else:
jpayne@68:             # characters the encoding cannot represent are written as
jpayne@68:             # XML character references instead of raising
jpayne@68:             file = open(file_or_filename, "w", encoding=encoding,
jpayne@68:                         errors="xmlcharrefreplace")
jpayne@68:         with file:
jpayne@68:             yield file.write
jpayne@68:     else:
jpayne@68:         # file_or_filename is a file-like object
jpayne@68:         # encoding determines if it is a text or binary writer
jpayne@68:         if encoding == "unicode":
jpayne@68:             # use a text writer as is
jpayne@68:             yield write
jpayne@68:         else:
jpayne@68:             # wrap a binary writer with TextIOWrapper
jpayne@68:             # (callbacks registered on the stack run in reverse order on
jpayne@68:             # exit, so the TextIOWrapper is detached before any
jpayne@68:             # BufferedWriter created below)
jpayne@68:             with contextlib.ExitStack() as stack:
jpayne@68:                 if isinstance(file_or_filename, io.BufferedIOBase):
jpayne@68:                     file = file_or_filename
jpayne@68:                 elif isinstance(file_or_filename, io.RawIOBase):
jpayne@68:                     file = io.BufferedWriter(file_or_filename)
jpayne@68:                     # Keep the original file open when the BufferedWriter is
jpayne@68:                     # destroyed
jpayne@68:                     stack.callback(file.detach)
jpayne@68:                 else:
jpayne@68:                     # This is to handle passed objects that aren't in the
jpayne@68:                     # IOBase hierarchy, but just have a write method
jpayne@68:                     file = io.BufferedIOBase()
jpayne@68:                     file.writable = lambda: True
jpayne@68:                     file.write = write
jpayne@68:                     try:
jpayne@68:                         # TextIOWrapper uses these methods to determine
jpayne@68:                         # if BOM (for UTF-16, etc) should be added
jpayne@68:                         file.seekable = file_or_filename.seekable
jpayne@68:                         file.tell = file_or_filename.tell
jpayne@68:                     except AttributeError:
jpayne@68:                         # the object lacks seekable()/tell(); the stand-in
jpayne@68:                         # keeps whatever was not overridden above
jpayne@68:                         pass
jpayne@68:                 file = io.TextIOWrapper(file,
jpayne@68:                                         encoding=encoding,
jpayne@68:                                         errors="xmlcharrefreplace",
jpayne@68:                                         newline="\n")
jpayne@68:                 # Keep the original file open when the TextIOWrapper is
jpayne@68:                 # destroyed
jpayne@68:                 stack.callback(file.detach)
jpayne@68:                 yield file.write
jpayne@68: 
jpayne@68: def _namespaces(elem, default_namespace=None):
jpayne@68:     """Collect the qualified names and namespaces used below *elem*.
jpayne@68: 
jpayne@68:     Walks elem.iter() and looks at every tag, attribute name, and any
jpayne@68:     QName found as an attribute value or element text.
jpayne@68: 
jpayne@68:     Returns a (qnames, namespaces) tuple: *qnames* maps each name as stored
jpayne@68:     in the tree to the "prefix:local" (or plain) form to write out, and
jpayne@68:     *namespaces* maps namespace uris to the prefixes chosen for them.  If
jpayne@68:     *default_namespace* is given it is registered with an empty prefix.
jpayne@68: 
jpayne@68:     Raises ValueError if a non-qualified name is found while
jpayne@68:     *default_namespace* is set; names of unsupported types are reported
jpayne@68:     through _raise_serialization_error().
jpayne@68: 
jpayne@68:     """
jpayne@68:     # identify namespaces used in this tree
jpayne@68: 
jpayne@68:     # maps qnames to *encoded* prefix:local names
jpayne@68:     # (a tag of None is pre-seeded so it maps to None)
jpayne@68:     qnames = {None: None}
jpayne@68: 
jpayne@68:     # maps uri:s to prefixes
jpayne@68:     namespaces = {}
jpayne@68:     if default_namespace:
jpayne@68:         namespaces[default_namespace] = ""
jpayne@68: 
jpayne@68:     def add_qname(qname):
jpayne@68:         # calculate serialized qname representation
jpayne@68:         try:
jpayne@68:             if qname[:1] == "{":
jpayne@68:                 # "{uri}local" form: split at the last "}"
jpayne@68:                 uri, tag = qname[1:].rsplit("}", 1)
jpayne@68:                 prefix = namespaces.get(uri)
jpayne@68:                 if prefix is None:
jpayne@68:                     # not seen in this tree yet: try the global registry,
jpayne@68:                     # otherwise generate an "nsN" prefix
jpayne@68:                     prefix = _namespace_map.get(uri)
jpayne@68:                     if prefix is None:
jpayne@68:                         prefix = "ns%d" % len(namespaces)
jpayne@68:                     # the "xml" prefix is used but never recorded in
jpayne@68:                     # namespaces
jpayne@68:                     if prefix != "xml":
jpayne@68:                         namespaces[uri] = prefix
jpayne@68:                 if prefix:
jpayne@68:                     qnames[qname] = "%s:%s" % (prefix, tag)
jpayne@68:                 else:
jpayne@68:                     qnames[qname] = tag # default element
jpayne@68:             else:
jpayne@68:                 if default_namespace:
jpayne@68:                     # FIXME: can this be handled in XML 1.0?
jpayne@68:                     raise ValueError(
jpayne@68:                         "cannot use non-qualified names with "
jpayne@68:                         "default_namespace option"
jpayne@68:                         )
jpayne@68:                 qnames[qname] = qname
jpayne@68:         except TypeError:
jpayne@68:             _raise_serialization_error(qname)
jpayne@68: 
jpayne@68:     # populate qname and namespaces table
jpayne@68:     # (note: the loop variable rebinds the *elem* parameter)
jpayne@68:     for elem in elem.iter():
jpayne@68:         tag = elem.tag
jpayne@68:         if isinstance(tag, QName):
jpayne@68:             if tag.text not in qnames:
jpayne@68:                 add_qname(tag.text)
jpayne@68:         elif isinstance(tag, str):
jpayne@68:             if tag not in qnames:
jpayne@68:                 add_qname(tag)
jpayne@68:         elif tag is not None and tag is not Comment and tag is not PI:
jpayne@68:             # neither a name nor one of the special marker tags
jpayne@68:             _raise_serialization_error(tag)
jpayne@68:         for key, value in elem.items():
jpayne@68:             if isinstance(key, QName):
jpayne@68:                 key = key.text
jpayne@68:             if key not in qnames:
jpayne@68:                 add_qname(key)
jpayne@68:             if isinstance(value, QName) and value.text not in qnames:
jpayne@68:                 add_qname(value.text)
jpayne@68:         text = elem.text
jpayne@68:         if isinstance(text, QName) and text.text not in qnames:
jpayne@68:             add_qname(text.text)
jpayne@68:     return qnames, namespaces
jpayne@68: 
jpayne@68: def _serialize_xml(write, elem, qnames, namespaces,
jpayne@68:                    short_empty_elements, **kwargs):
jpayne@68:     tag = elem.tag
jpayne@68:     text = elem.text
jpayne@68:     if tag is Comment:
jpayne@68:         write("<!--%s-->" % text)
jpayne@68:     elif tag is ProcessingInstruction:
jpayne@68:         write("<?%s?>" % text)
jpayne@68:     else:
jpayne@68:         tag = qnames[tag]
jpayne@68:         if tag is None:
jpayne@68:             if text:
jpayne@68:                 write(_escape_cdata(text))
jpayne@68:             for e in elem:
jpayne@68:                 _serialize_xml(write, e, qnames, None,
jpayne@68:                                short_empty_elements=short_empty_elements)
jpayne@68:         else:
jpayne@68:             write("<" + tag)
jpayne@68:             items = list(elem.items())
jpayne@68:             if items or namespaces:
jpayne@68:                 if namespaces:
jpayne@68:                     for v, k in sorted(namespaces.items(),
jpayne@68:                                        key=lambda x: x[1]):  # sort on prefix
jpayne@68:                         if k:
jpayne@68:                             k = ":" + k
jpayne@68:                         write(" xmlns%s=\"%s\"" % (
jpayne@68:                             k,
jpayne@68:                             _escape_attrib(v)
jpayne@68:                             ))
jpayne@68:                 for k, v in items:
jpayne@68:                     if isinstance(k, QName):
jpayne@68:                         k = k.text
jpayne@68:                     if isinstance(v, QName):
jpayne@68:                         v = qnames[v.text]
jpayne@68:                     else:
jpayne@68:                         v = _escape_attrib(v)
jpayne@68:                     write(" %s=\"%s\"" % (qnames[k], v))
jpayne@68:             if text or len(elem) or not short_empty_elements:
jpayne@68:                 write(">")
jpayne@68:                 if text:
jpayne@68:                     write(_escape_cdata(text))
jpayne@68:                 for e in elem:
jpayne@68:                     _serialize_xml(write, e, qnames, None,
jpayne@68:                                    short_empty_elements=short_empty_elements)
jpayne@68:                 write("</" + tag + ">")
jpayne@68:             else:
jpayne@68:                 write(" />")
jpayne@68:     if elem.tail:
jpayne@68:         write(_escape_cdata(elem.tail))
jpayne@68: 
jpayne@68: HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
jpayne@68:               "img", "input", "isindex", "link", "meta", "param")
jpayne@68: 
jpayne@68: try:
jpayne@68:     HTML_EMPTY = set(HTML_EMPTY)
jpayne@68: except NameError:
jpayne@68:     pass
jpayne@68: 
jpayne@68: def _serialize_html(write, elem, qnames, namespaces, **kwargs):
jpayne@68:     tag = elem.tag
jpayne@68:     text = elem.text
jpayne@68:     if tag is Comment:
jpayne@68:         write("<!--%s-->" % _escape_cdata(text))
jpayne@68:     elif tag is ProcessingInstruction:
jpayne@68:         write("<?%s?>" % _escape_cdata(text))
jpayne@68:     else:
jpayne@68:         tag = qnames[tag]
jpayne@68:         if tag is None:
jpayne@68:             if text:
jpayne@68:                 write(_escape_cdata(text))
jpayne@68:             for e in elem:
jpayne@68:                 _serialize_html(write, e, qnames, None)
jpayne@68:         else:
jpayne@68:             write("<" + tag)
jpayne@68:             items = list(elem.items())
jpayne@68:             if items or namespaces:
jpayne@68:                 if namespaces:
jpayne@68:                     for v, k in sorted(namespaces.items(),
jpayne@68:                                        key=lambda x: x[1]):  # sort on prefix
jpayne@68:                         if k:
jpayne@68:                             k = ":" + k
jpayne@68:                         write(" xmlns%s=\"%s\"" % (
jpayne@68:                             k,
jpayne@68:                             _escape_attrib(v)
jpayne@68:                             ))
jpayne@68:                 for k, v in items:
jpayne@68:                     if isinstance(k, QName):
jpayne@68:                         k = k.text
jpayne@68:                     if isinstance(v, QName):
jpayne@68:                         v = qnames[v.text]
jpayne@68:                     else:
jpayne@68:                         v = _escape_attrib_html(v)
jpayne@68:                     # FIXME: handle boolean attributes
jpayne@68:                     write(" %s=\"%s\"" % (qnames[k], v))
jpayne@68:             write(">")
jpayne@68:             ltag = tag.lower()
jpayne@68:             if text:
jpayne@68:                 if ltag == "script" or ltag == "style":
jpayne@68:                     write(text)
jpayne@68:                 else:
jpayne@68:                     write(_escape_cdata(text))
jpayne@68:             for e in elem:
jpayne@68:                 _serialize_html(write, e, qnames, None)
jpayne@68:             if ltag not in HTML_EMPTY:
jpayne@68:                 write("</" + tag + ">")
jpayne@68:     if elem.tail:
jpayne@68:         write(_escape_cdata(elem.tail))
jpayne@68: 
jpayne@68: def _serialize_text(write, elem):
jpayne@68:     for part in elem.itertext():
jpayne@68:         write(part)
jpayne@68:     if elem.tail:
jpayne@68:         write(elem.tail)
jpayne@68: 
jpayne@68: # Maps the name of each output method to the function that implements it.
jpayne@68: # ElementTree.write() rejects any method name that is not a key here.
jpayne@68: _serialize = {
jpayne@68:     "xml": _serialize_xml,
jpayne@68:     "html": _serialize_html,
jpayne@68:     "text": _serialize_text,
jpayne@68: # this optional method is imported at the end of the module
jpayne@68: #   "c14n": _serialize_c14n,
jpayne@68: }
jpayne@68: 
jpayne@68: 
jpayne@68: def register_namespace(prefix, uri):
jpayne@68:     """Register a namespace prefix.
jpayne@68: 
jpayne@68:     The registry is global, and any existing mapping for either the
jpayne@68:     given prefix or the namespace URI will be removed.
jpayne@68: 
jpayne@68:     *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
jpayne@68:     attributes in this namespace will be serialized with prefix if possible.
jpayne@68: 
jpayne@68:     ValueError is raised if prefix is reserved or is invalid.
jpayne@68: 
jpayne@68:     """
jpayne@68:     if re.match(r"ns\d+$", prefix):
jpayne@68:         raise ValueError("Prefix format reserved for internal use")
jpayne@68:     for k, v in list(_namespace_map.items()):
jpayne@68:         if k == uri or v == prefix:
jpayne@68:             del _namespace_map[k]
jpayne@68:     _namespace_map[uri] = prefix
jpayne@68: 
jpayne@68: # Global registry mapping namespace uris to their preferred prefixes.
jpayne@68: # register_namespace() updates it, and the serializer consults it when a
jpayne@68: # uri has not been given a prefix yet.
jpayne@68: _namespace_map = {
jpayne@68:     # "well-known" namespace prefixes
jpayne@68:     "http://www.w3.org/XML/1998/namespace": "xml",
jpayne@68:     "http://www.w3.org/1999/xhtml": "html",
jpayne@68:     "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
jpayne@68:     "http://schemas.xmlsoap.org/wsdl/": "wsdl",
jpayne@68:     # xml schema
jpayne@68:     "http://www.w3.org/2001/XMLSchema": "xs",
jpayne@68:     "http://www.w3.org/2001/XMLSchema-instance": "xsi",
jpayne@68:     # dublin core
jpayne@68:     "http://purl.org/dc/elements/1.1/": "dc",
jpayne@68: }
jpayne@68: # For tests and troubleshooting
jpayne@68: # (exposes the registry itself, not a copy, as a function attribute)
jpayne@68: register_namespace._namespace_map = _namespace_map
jpayne@68: 
jpayne@68: def _raise_serialization_error(text):
jpayne@68:     raise TypeError(
jpayne@68:         "cannot serialize %r (type %s)" % (text, type(text).__name__)
jpayne@68:         )
jpayne@68: 
jpayne@68: def _escape_cdata(text):
jpayne@68:     # escape character data
jpayne@68:     try:
jpayne@68:         # it's worth avoiding do-nothing calls for strings that are
jpayne@68:         # shorter than 500 characters, or so.  assume that's, by far,
jpayne@68:         # the most common case in most applications.
jpayne@68:         if "&" in text:
jpayne@68:             text = text.replace("&", "&amp;")
jpayne@68:         if "<" in text:
jpayne@68:             text = text.replace("<", "&lt;")
jpayne@68:         if ">" in text:
jpayne@68:             text = text.replace(">", "&gt;")
jpayne@68:         return text
jpayne@68:     except (TypeError, AttributeError):
jpayne@68:         _raise_serialization_error(text)
jpayne@68: 
jpayne@68: def _escape_attrib(text):
jpayne@68:     # escape attribute value
jpayne@68:     try:
jpayne@68:         if "&" in text:
jpayne@68:             text = text.replace("&", "&amp;")
jpayne@68:         if "<" in text:
jpayne@68:             text = text.replace("<", "&lt;")
jpayne@68:         if ">" in text:
jpayne@68:             text = text.replace(">", "&gt;")
jpayne@68:         if "\"" in text:
jpayne@68:             text = text.replace("\"", "&quot;")
jpayne@68:         # The following business with carriage returns is to satisfy
jpayne@68:         # Section 2.11 of the XML specification, stating that
jpayne@68:         # CR or CR LN should be replaced with just LN
jpayne@68:         # http://www.w3.org/TR/REC-xml/#sec-line-ends
jpayne@68:         if "\r\n" in text:
jpayne@68:             text = text.replace("\r\n", "\n")
jpayne@68:         if "\r" in text:
jpayne@68:             text = text.replace("\r", "\n")
jpayne@68:         #The following four lines are issue 17582
jpayne@68:         if "\n" in text:
jpayne@68:             text = text.replace("\n", "&#10;")
jpayne@68:         if "\t" in text:
jpayne@68:             text = text.replace("\t", "&#09;")
jpayne@68:         return text
jpayne@68:     except (TypeError, AttributeError):
jpayne@68:         _raise_serialization_error(text)
jpayne@68: 
jpayne@68: def _escape_attrib_html(text):
jpayne@68:     # escape attribute value
jpayne@68:     try:
jpayne@68:         if "&" in text:
jpayne@68:             text = text.replace("&", "&amp;")
jpayne@68:         if ">" in text:
jpayne@68:             text = text.replace(">", "&gt;")
jpayne@68:         if "\"" in text:
jpayne@68:             text = text.replace("\"", "&quot;")
jpayne@68:         return text
jpayne@68:     except (TypeError, AttributeError):
jpayne@68:         _raise_serialization_error(text)
jpayne@68: 
jpayne@68: # --------------------------------------------------------------------
jpayne@68: 
jpayne@68: def tostring(element, encoding=None, method=None, *,
jpayne@68:              xml_declaration=None, default_namespace=None,
jpayne@68:              short_empty_elements=True):
jpayne@68:     """Generate string representation of XML element.
jpayne@68: 
jpayne@68:     All subelements are included.  If encoding is "unicode", a string
jpayne@68:     is returned. Otherwise a bytestring is returned.
jpayne@68: 
jpayne@68:     *element* is an Element instance, *encoding* is an optional output
jpayne@68:     encoding defaulting to US-ASCII, *method* is an optional output which can
jpayne@68:     be one of "xml" (default), "html", "text" or "c14n", *default_namespace*
jpayne@68:     sets the default XML namespace (for "xmlns").
jpayne@68: 
jpayne@68:     Returns an (optionally) encoded string containing the XML data.
jpayne@68: 
jpayne@68:     """
jpayne@68:     stream = io.StringIO() if encoding == 'unicode' else io.BytesIO()
jpayne@68:     ElementTree(element).write(stream, encoding,
jpayne@68:                                xml_declaration=xml_declaration,
jpayne@68:                                default_namespace=default_namespace,
jpayne@68:                                method=method,
jpayne@68:                                short_empty_elements=short_empty_elements)
jpayne@68:     return stream.getvalue()
jpayne@68: 
jpayne@68: class _ListDataStream(io.BufferedIOBase):
jpayne@68:     """An auxiliary stream accumulating into a list reference."""
jpayne@68:     def __init__(self, lst):
jpayne@68:         # the caller's list is stored as is, not copied, so the caller
jpayne@68:         # sees every chunk that gets written
jpayne@68:         self.lst = lst
jpayne@68: 
jpayne@68:     def writable(self):
jpayne@68:         return True
jpayne@68: 
jpayne@68:     def seekable(self):
jpayne@68:         return True
jpayne@68: 
jpayne@68:     def write(self, b):
jpayne@68:         # each write becomes one list item; nothing is returned
jpayne@68:         self.lst.append(b)
jpayne@68: 
jpayne@68:     def tell(self):
jpayne@68:         # the number of chunks written so far, not a byte offset
jpayne@68:         return len(self.lst)
jpayne@68: 
jpayne@68: def tostringlist(element, encoding=None, method=None, *,
jpayne@68:                  xml_declaration=None, default_namespace=None,
jpayne@68:                  short_empty_elements=True):
jpayne@68:     lst = []
jpayne@68:     stream = _ListDataStream(lst)
jpayne@68:     ElementTree(element).write(stream, encoding,
jpayne@68:                                xml_declaration=xml_declaration,
jpayne@68:                                default_namespace=default_namespace,
jpayne@68:                                method=method,
jpayne@68:                                short_empty_elements=short_empty_elements)
jpayne@68:     return lst
jpayne@68: 
jpayne@68: 
jpayne@68: def dump(elem):
jpayne@68:     """Write element tree or element structure to sys.stdout.
jpayne@68: 
jpayne@68:     This function should be used for debugging only.
jpayne@68: 
jpayne@68:     *elem* is either an ElementTree, or a single Element.  The exact output
jpayne@68:     format is implementation dependent.  In this version, it's written as an
jpayne@68:     ordinary XML file.
jpayne@68: 
jpayne@68:     """
jpayne@68:     # debugging
jpayne@68:     if not isinstance(elem, ElementTree):
jpayne@68:         elem = ElementTree(elem)
jpayne@68:     elem.write(sys.stdout, encoding="unicode")
jpayne@68:     tail = elem.getroot().tail
jpayne@68:     if not tail or tail[-1] != "\n":
jpayne@68:         sys.stdout.write("\n")
jpayne@68: 
jpayne@68: # --------------------------------------------------------------------
jpayne@68: # parsing
jpayne@68: 
jpayne@68: 
jpayne@68: def parse(source, parser=None):
jpayne@68:     """Parse XML document into element tree.
jpayne@68: 
jpayne@68:     *source* is a filename or file object containing XML data,
jpayne@68:     *parser* is an optional parser instance defaulting to XMLParser.
jpayne@68: 
jpayne@68:     Return an ElementTree instance.
jpayne@68: 
jpayne@68:     """
jpayne@68:     tree = ElementTree()
jpayne@68:     tree.parse(source, parser)
jpayne@68:     return tree
jpayne@68: 
jpayne@68: 
jpayne@68: def iterparse(source, events=None, parser=None):
jpayne@68:     """Incrementally parse XML document into ElementTree.
jpayne@68: 
jpayne@68:     The returned iterator also reports what's going on to the user based
jpayne@68:     on the *events* it is initialized with.  The supported events are the
jpayne@68:     strings "start", "end", "start-ns" and "end-ns" (the "ns" events are
jpayne@68:     used to get detailed namespace information).  If *events* is omitted,
jpayne@68:     only "end" events are reported.
jpayne@68: 
jpayne@68:     *source* is a filename or file object containing XML data, *events* is
jpayne@68:     a list of events to report back, *parser* is an optional parser instance.
jpayne@68: 
jpayne@68:     Returns an iterator providing (event, elem) pairs.  Its root attribute
jpayne@68:     is None until the whole document has been parsed, and then holds the
jpayne@68:     value returned when the parser is closed.
jpayne@68: 
jpayne@68:     """
jpayne@68:     # Use the internal, undocumented _parser argument for now; When the
jpayne@68:     # parser argument of iterparse is removed, this can be killed.
jpayne@68:     pullparser = XMLPullParser(events=events, _parser=parser)
jpayne@68:     def iterator():
jpayne@68:         # Runs only once the first item is requested.  By then the code at
jpayne@68:         # the bottom of iterparse() has bound source, close_source and it.
jpayne@68:         try:
jpayne@68:             while True:
jpayne@68:                 yield from pullparser.read_events()
jpayne@68:                 # load event buffer
jpayne@68:                 data = source.read(16 * 1024)
jpayne@68:                 if not data:
jpayne@68:                     break
jpayne@68:                 pullparser.feed(data)
jpayne@68:             root = pullparser._close_and_return_root()
jpayne@68:             # closing the parser may have queued further events
jpayne@68:             yield from pullparser.read_events()
jpayne@68:             it.root = root
jpayne@68:         finally:
jpayne@68:             # only a file opened by iterparse() itself is closed here
jpayne@68:             if close_source:
jpayne@68:                 source.close()
jpayne@68: 
jpayne@68:     class IterParseIterator(collections.abc.Iterator):
jpayne@68:         # the generator is created here; its body does not start yet
jpayne@68:         __next__ = iterator().__next__
jpayne@68:     it = IterParseIterator()
jpayne@68:     it.root = None
jpayne@68:     # drop the helper names from the local namespace; the generator and
jpayne@68:     # the instance created above stay alive
jpayne@68:     del iterator, IterParseIterator
jpayne@68: 
jpayne@68:     close_source = False
jpayne@68:     if not hasattr(source, "read"):
jpayne@68:         # not a file object: treat it as a file name
jpayne@68:         source = open(source, "rb")
jpayne@68:         close_source = True
jpayne@68: 
jpayne@68:     return it
jpayne@68: 
jpayne@68: 
jpayne@68: class XMLPullParser:
jpayne@68: 
jpayne@68:     def __init__(self, events=None, *, _parser=None):
jpayne@68:         # The _parser argument is for internal use only and must not be relied
jpayne@68:         # upon in user code. It will be removed in a future release.
jpayne@68:         # See http://bugs.python.org/issue17741 for more details.
jpayne@68: 
jpayne@68:         self._events_queue = collections.deque()
jpayne@68:         self._parser = _parser or XMLParser(target=TreeBuilder())
jpayne@68:         # wire up the parser for event reporting
jpayne@68:         if events is None:
jpayne@68:             events = ("end",)
jpayne@68:         self._parser._setevents(self._events_queue, events)
jpayne@68: 
jpayne@68:     def feed(self, data):
jpayne@68:         """Feed encoded data to parser."""
jpayne@68:         if self._parser is None:
jpayne@68:             raise ValueError("feed() called after end of stream")
jpayne@68:         if data:
jpayne@68:             try:
jpayne@68:                 self._parser.feed(data)
jpayne@68:             except SyntaxError as exc:
jpayne@68:                 self._events_queue.append(exc)
jpayne@68: 
jpayne@68:     def _close_and_return_root(self):
jpayne@68:         # iterparse needs this to set its root attribute properly :(
jpayne@68:         root = self._parser.close()
jpayne@68:         self._parser = None
jpayne@68:         return root
jpayne@68: 
jpayne@68:     def close(self):
jpayne@68:         """Finish feeding data to parser.
jpayne@68: 
jpayne@68:         Unlike XMLParser, does not return the root element. Use
jpayne@68:         read_events() to consume elements from XMLPullParser.
jpayne@68:         """
jpayne@68:         self._close_and_return_root()
jpayne@68: 
jpayne@68:     def read_events(self):
jpayne@68:         """Return an iterator over currently available (event, elem) pairs.
jpayne@68: 
jpayne@68:         Events are consumed from the internal event queue as they are
jpayne@68:         retrieved from the iterator.
jpayne@68:         """
jpayne@68:         events = self._events_queue
jpayne@68:         while events:
jpayne@68:             event = events.popleft()
jpayne@68:             if isinstance(event, Exception):
jpayne@68:                 raise event
jpayne@68:             else:
jpayne@68:                 yield event
jpayne@68: 
jpayne@68: 
jpayne@68: def XML(text, parser=None):
jpayne@68:     """Parse XML document from string constant.
jpayne@68: 
jpayne@68:     This function can be used to embed "XML Literals" in Python code.
jpayne@68: 
jpayne@68:     *text* is a string containing XML data, *parser* is an
jpayne@68:     optional parser instance, defaulting to the standard XMLParser.
jpayne@68: 
jpayne@68:     Returns an Element instance.
jpayne@68: 
jpayne@68:     """
jpayne@68:     if not parser:
jpayne@68:         parser = XMLParser(target=TreeBuilder())
jpayne@68:     parser.feed(text)
jpayne@68:     return parser.close()
jpayne@68: 
jpayne@68: 
jpayne@68: def XMLID(text, parser=None):
jpayne@68:     """Parse XML document from string constant for its IDs.
jpayne@68: 
jpayne@68:     *text* is a string containing XML data, *parser* is an
jpayne@68:     optional parser instance, defaulting to the standard XMLParser.
jpayne@68: 
jpayne@68:     Returns an (Element, dict) tuple, in which the
jpayne@68:     dict maps element id:s to elements.
jpayne@68: 
jpayne@68:     """
jpayne@68:     if not parser:
jpayne@68:         parser = XMLParser(target=TreeBuilder())
jpayne@68:     parser.feed(text)
jpayne@68:     tree = parser.close()
jpayne@68:     ids = {}
jpayne@68:     for elem in tree.iter():
jpayne@68:         id = elem.get("id")
jpayne@68:         if id:
jpayne@68:             ids[id] = elem
jpayne@68:     return tree, ids
jpayne@68: 
jpayne@68: # Parse XML document from string constant.  Alias for XML(): the same
jpayne@68: # function object under a second name.
jpayne@68: fromstring = XML
jpayne@68: 
jpayne@68: def fromstringlist(sequence, parser=None):
jpayne@68:     """Parse XML document from sequence of string fragments.
jpayne@68: 
jpayne@68:     *sequence* is a list of other sequence, *parser* is an optional parser
jpayne@68:     instance, defaulting to the standard XMLParser.
jpayne@68: 
jpayne@68:     Returns an Element instance.
jpayne@68: 
jpayne@68:     """
jpayne@68:     if not parser:
jpayne@68:         parser = XMLParser(target=TreeBuilder())
jpayne@68:     for text in sequence:
jpayne@68:         parser.feed(text)
jpayne@68:     return parser.close()
jpayne@68: 
jpayne@68: # --------------------------------------------------------------------
jpayne@68: 
jpayne@68: 
jpayne@68: class TreeBuilder:
jpayne@68:     """Generic element structure builder.
jpayne@68: 
jpayne@68:     This builder converts a sequence of start, data, and end method
jpayne@68:     calls to a well-formed element structure.
jpayne@68: 
jpayne@68:     You can use this class to build an element structure using a custom XML
jpayne@68:     parser, or a parser for some other XML-like format.
jpayne@68: 
jpayne@68:     *element_factory* is an optional element factory which is called
jpayne@68:     to create new Element instances, as necessary.
jpayne@68: 
jpayne@68:     *comment_factory* is a factory to create comments to be used instead of
jpayne@68:     the standard factory.  If *insert_comments* is false (the default),
jpayne@68:     comments will not be inserted into the tree.
jpayne@68: 
jpayne@68:     *pi_factory* is a factory to create processing instructions to be used
jpayne@68:     instead of the standard factory.  If *insert_pis* is false (the default),
jpayne@68:     processing instructions will not be inserted into the tree.
jpayne@68:     """
jpayne@68:     def __init__(self, element_factory=None, *,
jpayne@68:                  comment_factory=None, pi_factory=None,
jpayne@68:                  insert_comments=False, insert_pis=False):
jpayne@68:         self._data = [] # data collector
jpayne@68:         self._elem = [] # element stack
jpayne@68:         self._last = None # last element
jpayne@68:         self._root = None # root element
jpayne@68:         self._tail = None # true if we're after an end tag
jpayne@68:         if comment_factory is None:
jpayne@68:             comment_factory = Comment
jpayne@68:         self._comment_factory = comment_factory
jpayne@68:         self.insert_comments = insert_comments
jpayne@68:         if pi_factory is None:
jpayne@68:             pi_factory = ProcessingInstruction
jpayne@68:         self._pi_factory = pi_factory
jpayne@68:         self.insert_pis = insert_pis
jpayne@68:         if element_factory is None:
jpayne@68:             element_factory = Element
jpayne@68:         self._factory = element_factory
jpayne@68: 
jpayne@68:     def close(self):
jpayne@68:         """Flush builder buffers and return toplevel document Element."""
jpayne@68:         assert len(self._elem) == 0, "missing end tags"
jpayne@68:         assert self._root is not None, "missing toplevel element"
jpayne@68:         return self._root
jpayne@68: 
jpayne@68:     def _flush(self):
jpayne@68:         if self._data:
jpayne@68:             if self._last is not None:
jpayne@68:                 text = "".join(self._data)
jpayne@68:                 if self._tail:
jpayne@68:                     assert self._last.tail is None, "internal error (tail)"
jpayne@68:                     self._last.tail = text
jpayne@68:                 else:
jpayne@68:                     assert self._last.text is None, "internal error (text)"
jpayne@68:                     self._last.text = text
jpayne@68:             self._data = []
jpayne@68: 
jpayne@68:     def data(self, data):
jpayne@68:         """Add text to current element."""
jpayne@68:         self._data.append(data)
jpayne@68: 
jpayne@68:     def start(self, tag, attrs):
jpayne@68:         """Open new element and return it.
jpayne@68: 
jpayne@68:         *tag* is the element name, *attrs* is a dict containing element
jpayne@68:         attributes.
jpayne@68: 
jpayne@68:         """
jpayne@68:         self._flush()
jpayne@68:         self._last = elem = self._factory(tag, attrs)
jpayne@68:         if self._elem:
jpayne@68:             self._elem[-1].append(elem)
jpayne@68:         elif self._root is None:
jpayne@68:             self._root = elem
jpayne@68:         self._elem.append(elem)
jpayne@68:         self._tail = 0
jpayne@68:         return elem
jpayne@68: 
jpayne@68:     def end(self, tag):
jpayne@68:         """Close and return current Element.
jpayne@68: 
jpayne@68:         *tag* is the element name.
jpayne@68: 
jpayne@68:         """
jpayne@68:         self._flush()
jpayne@68:         self._last = self._elem.pop()
jpayne@68:         assert self._last.tag == tag,\
jpayne@68:                "end tag mismatch (expected %s, got %s)" % (
jpayne@68:                    self._last.tag, tag)
jpayne@68:         self._tail = 1
jpayne@68:         return self._last
jpayne@68: 
jpayne@68:     def comment(self, text):
jpayne@68:         """Create a comment using the comment_factory.
jpayne@68: 
jpayne@68:         *text* is the text of the comment.
jpayne@68:         """
jpayne@68:         return self._handle_single(
jpayne@68:             self._comment_factory, self.insert_comments, text)
jpayne@68: 
jpayne@68:     def pi(self, target, text=None):
jpayne@68:         """Create a processing instruction using the pi_factory.
jpayne@68: 
jpayne@68:         *target* is the target name of the processing instruction.
jpayne@68:         *text* is the data of the processing instruction, or ''.
jpayne@68:         """
jpayne@68:         return self._handle_single(
jpayne@68:             self._pi_factory, self.insert_pis, target, text)
jpayne@68: 
jpayne@68:     def _handle_single(self, factory, insert, *args):
jpayne@68:         elem = factory(*args)
jpayne@68:         if insert:
jpayne@68:             self._flush()
jpayne@68:             self._last = elem
jpayne@68:             if self._elem:
jpayne@68:                 self._elem[-1].append(elem)
jpayne@68:             self._tail = 1
jpayne@68:         return elem
jpayne@68: 
jpayne@68: 
jpayne@68: # also see ElementTree and TreeBuilder
class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    Parser events are forwarded to whichever of the methods start(), end(),
    start_ns(), end_ns(), data(), comment(), pi(), doctype() and close()
    the target defines.

    """

    def __init__(self, *, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # Names in a namespace are reported by expat as "uri}localname";
        # _fixname() later prepends the missing "{".
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        # Exception class of whichever expat module was imported above.
        self._error = expat.error
        self._names = {} # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'start_ns'):
            parser.StartNamespaceDeclHandler = self._start_ns
        if hasattr(target, 'end_ns'):
            parser.EndNamespaceDeclHandler = self._end_ns
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        self._doctype = None  # list of doctype tokens while inside <!DOCTYPE
        self.entity = {}      # user-supplied replacements for entities
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor).
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # Each handler binds its state through default arguments and appends
        # an (event, value) pair to the queue, where value is what the
        # corresponding target method returned.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    def handler(prefix, uri, event=event_name, append=append,
                                start_ns=self._start_ns):
                        append((event, start_ns(prefix, uri)))
                else:
                    def handler(prefix, uri, event=event_name, append=append):
                        append((event, (prefix or '', uri or '')))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    def handler(prefix, event=event_name, append=append,
                                end_ns=self._end_ns):
                        append((event, end_ns(prefix)))
                else:
                    def handler(prefix, event=event_name, append=append):
                        append((event, None))
                parser.EndNamespaceDeclHandler = handler
            elif event_name == 'comment':
                def handler(text, event=event_name, append=append, self=self):
                    append((event, self.target.comment(text)))
                parser.CommentHandler = handler
            elif event_name == 'pi':
                def handler(pi_target, data, event=event_name, append=append,
                            self=self):
                    append((event, self.target.pi(pi_target, data)))
                parser.ProcessingInstructionHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, carrying over its error
        # code and its (line, column) position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start_ns(self, prefix, uri):
        # A missing prefix or URI is passed to the target as ''.
        return self.target.start_ns(prefix or '', uri or '')

    def _end_ns(self, prefix):
        return self.target.end_ns(prefix or '')

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            # Zipping one iterator with itself yields (name, value) pairs.
            flat = iter(attr_list)
            for name, value in zip(flat, flat):
                attrib[fixname(name)] = value
        return self.target.start(tag, attrib)

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Handler for everything expat has no dedicated callback for: here
        # that is entity references it could not resolve and the pieces of
        # a doctype declaration.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            # Only the lookup is guarded, so a KeyError raised by the
            # target's own data() is not mistaken for an unknown entity.
            try:
                replacement = self.entity[text[1:-1]]
            except KeyError:
                # Build the error from the expat module chosen in __init__
                # instead of importing xml.parsers.expat again.
                err = self._error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                    self.parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
            data_handler(replacement)
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                    if pubid:
                        pubid = pubid[1:-1]  # strip the quotes
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif hasattr(self, "doctype"):
                    warnings.warn(
                        "The doctype() method of XMLParser is ignored.  "
                        "Define doctype() method on the TreeBuilder target.",
                        RuntimeWarning)

                self._doctype = None

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, 0)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure.

        Returns the result of the target's close() method, or None when the
        target does not define one.
        """
        try:
            self.parser.Parse("", 1) # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target
jpayne@68: 
jpayne@68: 
jpayne@68: # --------------------------------------------------------------------
jpayne@68: # C14N 2.0
jpayne@68: 
def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Serialise XML input in its C14N 2.0 canonical form.

    Input comes from *xml_data* (an XML string) or, when that is not given,
    from *from_file* (a file path or file-like object); at least one of the
    two must be provided, otherwise ValueError is raised.

    The canonical output is text, not bytes.  If *out* is provided, it must
    be a file or file-like object; the output is passed to its ``.write()``
    method and the function returns None.  To write to a file, open it in
    text mode with encoding "utf-8".  If *out* is not provided, the output
    is returned as a text string.

    The configuration options are the same as for the ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Without a caller-supplied sink, collect the output in memory.
    buffer = io.StringIO() if out is None else None
    if buffer is not None:
        out = buffer

    writer = C14NWriterTarget(out.write, **options)
    parser = XMLParser(target=writer)

    if xml_data is None:
        # from_file is known to be set here because of the check above.
        parse(from_file, parser=parser)
    else:
        parser.feed(xml_data)
        parser.close()

    if buffer is None:
        return None
    return buffer.getvalue()
jpayne@68: 
jpayne@68: 
# Matcher for text shaped like a prefixed name "prefix:name": a run of word
# characters, one colon, and another run of word characters.  Returns a match
# object or None.
_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match
jpayne@68: 
jpayne@68: 
class C14NWriterTarget:
    """
    Canonicalization writer target for the XMLParser.

    Serialises parse events to XML C14N 2.0.

    The *write* function is used for writing out the resulting data stream
    as text (not bytes).  To write to a file, open it in text mode with encoding
    "utf-8" and pass its ``.write`` method.

    Configuration options:

    - *with_comments*: set to true to include comments
    - *strip_text*: set to true to strip whitespace before and after text content
    - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
    - *qname_aware_tags*: a set of qname aware tag names in which prefixes
                          should be replaced in text content
    - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
                           should be replaced in text content
    - *exclude_attrs*: a set of attribute names that should not be serialised
    - *exclude_tags*: a set of tag names that should not be serialised
    """
    def __init__(self, write, *,
                 with_comments=False, strip_text=False, rewrite_prefixes=False,
                 qname_aware_tags=None, qname_aware_attrs=None,
                 exclude_attrs=None, exclude_tags=None):
        self._write = write
        self._data = []  # text chunks collected since the last flush
        self._with_comments = with_comments
        self._strip_text = strip_text
        self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
        self._exclude_tags = set(exclude_tags) if exclude_tags else None

        self._rewrite_prefixes = rewrite_prefixes
        if qname_aware_tags:
            self._qname_aware_tags = set(qname_aware_tags)
        else:
            self._qname_aware_tags = None
        if qname_aware_attrs:
            # Bound method: called with an element's attributes, it returns
            # the names that are qname aware.
            self._find_qname_aware_attrs = set(qname_aware_attrs).intersection
        else:
            self._find_qname_aware_attrs = None

        # Stack with globally and newly declared namespaces as (uri, prefix) pairs.
        self._declared_ns_stack = [[
            ("http://www.w3.org/XML/1998/namespace", "xml"),
        ]]
        # Stack with user declared namespace prefixes as (uri, prefix) pairs.
        self._ns_stack = []
        if not rewrite_prefixes:
            self._ns_stack.append(list(_namespace_map.items()))
        self._ns_stack.append([])
        self._prefix_map = {}  # uri -> generated prefix (rewrite_prefixes)
        self._preserve_space = [False]  # one entry per open element + base
        # (tag, attrs, new_namespaces) of a qname aware element whose start
        # tag is held back until its text content is known; else None.
        self._pending_start = None
        self._root_seen = False  # a start tag has been written
        self._root_done = False  # the root element has been closed
        self._ignored_depth = 0  # nesting depth inside an excluded tag

    def _iter_namespaces(self, ns_stack, _reversed=reversed):
        # Yield (uri, prefix) pairs, innermost stack frame first.
        for namespaces in _reversed(ns_stack):
            if namespaces:  # almost no element declares new namespaces
                yield from namespaces

    def _resolve_prefix_name(self, prefixed_name):
        # Turn "prefix:name" into "{uri}name" using the user declared
        # prefixes in scope; ValueError if the prefix is unknown.
        prefix, name = prefixed_name.split(':', 1)
        for uri, p in self._iter_namespaces(self._ns_stack):
            if p == prefix:
                return f'{{{uri}}}{name}'
        raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')

    def _qname(self, qname, uri=None):
        # Return (serialised name, local name, uri) for *qname*, adding a
        # namespace declaration to the current frame when one is needed.
        if uri is None:
            uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)
        else:
            tag = qname

        prefixes_seen = set()
        for u, prefix in self._iter_namespaces(self._declared_ns_stack):
            # A prefix already seen closer to the element shadows this one.
            if u == uri and prefix not in prefixes_seen:
                return f'{prefix}:{tag}' if prefix else tag, tag, uri
            prefixes_seen.add(prefix)

        # Not declared yet => add new declaration.
        if self._rewrite_prefixes:
            if uri in self._prefix_map:
                prefix = self._prefix_map[uri]
            else:
                prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'
            self._declared_ns_stack[-1].append((uri, prefix))
            return f'{prefix}:{tag}', tag, uri

        if not uri and '' not in prefixes_seen:
            # No default namespace declared => no prefix needed.
            return tag, tag, uri

        for u, prefix in self._iter_namespaces(self._ns_stack):
            if u == uri:
                self._declared_ns_stack[-1].append((uri, prefix))
                return f'{prefix}:{tag}' if prefix else tag, tag, uri

        raise ValueError(f'Namespace "{uri}" is not declared in scope')

    def data(self, data):
        if not self._ignored_depth:
            self._data.append(data)

    def _flush(self, _join_text=''.join):
        # Write out a held-back start tag (if any) and the collected text.
        data = _join_text(self._data)
        del self._data[:]
        if self._strip_text and not self._preserve_space[-1]:
            data = data.strip()
        if self._pending_start is not None:
            args, self._pending_start = self._pending_start, None
            qname_text = data if data and _looks_like_prefix_name(data) else None
            self._start(*args, qname_text)
            if qname_text is not None:
                # _start() has already written the text as a resolved qname.
                return
        if data and self._root_seen:
            self._write(_escape_cdata_c14n(data))

    def start_ns(self, prefix, uri):
        if self._ignored_depth:
            return
        # we may have to resolve qnames in text content
        # A held-back start tag must be written too, even without text, so
        # that its namespace frame exists before this declaration is added.
        if self._data or self._pending_start is not None:
            self._flush()
        self._ns_stack[-1].append((uri, prefix))

    def start(self, tag, attrs):
        if self._exclude_tags is not None and (
                self._ignored_depth or tag in self._exclude_tags):
            self._ignored_depth += 1
            return
        # Also flush when only a start tag is pending: a qname aware parent
        # without text would otherwise never get its start tag written.
        if self._data or self._pending_start is not None:
            self._flush()

        new_namespaces = []
        self._declared_ns_stack.append(new_namespaces)

        if self._qname_aware_tags is not None and tag in self._qname_aware_tags:
            # Need to parse text first to see if it requires a prefix declaration.
            self._pending_start = (tag, attrs, new_namespaces)
            return
        self._start(tag, attrs, new_namespaces)

    def _start(self, tag, attrs, new_namespaces, qname_text=None):
        # Write the start tag of *tag*, followed by *qname_text* (the
        # element's text, when it is a prefixed name) in resolved form.
        if self._exclude_attrs is not None and attrs:
            attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}

        qnames = {tag, *attrs}
        resolved_names = {}

        # Resolve prefixes in attribute and tag text.
        if qname_text is not None:
            qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)
            qnames.add(qname)
        if self._find_qname_aware_attrs is not None and attrs:
            qattrs = self._find_qname_aware_attrs(attrs)
            if qattrs:
                for attr_name in qattrs:
                    value = attrs[attr_name]
                    if _looks_like_prefix_name(value):
                        qname = resolved_names[value] = self._resolve_prefix_name(value)
                        qnames.add(qname)
            else:
                qattrs = None
        else:
            qattrs = None

        # Assign prefixes in lexicographical order of used URIs.
        parse_qname = self._qname
        parsed_qnames = {n: parse_qname(n) for n in sorted(
            qnames, key=lambda n: n.split('}', 1))}

        # Write namespace declarations in prefix order ...
        if new_namespaces:
            attr_list = [
                ('xmlns:' + prefix if prefix else 'xmlns', uri)
                for uri, prefix in new_namespaces
            ]
            attr_list.sort()
        else:
            # almost always empty
            attr_list = []

        # ... followed by attributes in URI+name order
        if attrs:
            for k, v in sorted(attrs.items()):
                if qattrs is not None and k in qattrs and v in resolved_names:
                    v = parsed_qnames[resolved_names[v]][0]
                attr_qname, attr_name, uri = parsed_qnames[k]
                # No prefix for attributes in default ('') namespace.
                attr_list.append((attr_qname if uri else attr_name, v))

        # Honour xml:space attributes.
        space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
        self._preserve_space.append(
            space_behaviour == 'preserve' if space_behaviour
            else self._preserve_space[-1])

        # Write the tag.
        write = self._write
        write('<' + parsed_qnames[tag][0])
        if attr_list:
            write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))
        write('>')

        # Write the resolved qname text content.
        if qname_text is not None:
            write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))

        self._root_seen = True
        self._ns_stack.append([])

    def end(self, tag):
        if self._ignored_depth:
            self._ignored_depth -= 1
            return
        # An empty qname aware element has no text but still a held-back
        # start tag; it has to be written before the end tag, and the stack
        # frames popped below only exist once _start() has run.
        if self._data or self._pending_start is not None:
            self._flush()
        self._write(f'</{self._qname(tag)[0]}>')
        self._preserve_space.pop()
        # Only the base entry is left once the root element is closed.
        self._root_done = len(self._preserve_space) == 1
        self._declared_ns_stack.pop()
        self._ns_stack.pop()

    def comment(self, text):
        if not self._with_comments:
            return
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._pending_start is not None or (
                self._root_seen and self._data):
            # Preceding text and any held-back start tag come first.
            self._flush()
        self._write(f'<!--{_escape_cdata_c14n(text)}-->')
        if not self._root_seen:
            self._write('\n')

    def pi(self, target, data):
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._pending_start is not None or (
                self._root_seen and self._data):
            # Preceding text and any held-back start tag come first.
            self._flush()
        self._write(
            f'<?{target} {_escape_cdata_c14n(data)}?>' if data else f'<?{target}?>')
        if not self._root_seen:
            self._write('\n')
jpayne@68: 
jpayne@68: 
jpayne@68: def _escape_cdata_c14n(text):
jpayne@68:     # escape character data
jpayne@68:     try:
jpayne@68:         # it's worth avoiding do-nothing calls for strings that are
jpayne@68:         # shorter than 500 character, or so.  assume that's, by far,
jpayne@68:         # the most common case in most applications.
jpayne@68:         if '&' in text:
jpayne@68:             text = text.replace('&', '&amp;')
jpayne@68:         if '<' in text:
jpayne@68:             text = text.replace('<', '&lt;')
jpayne@68:         if '>' in text:
jpayne@68:             text = text.replace('>', '&gt;')
jpayne@68:         if '\r' in text:
jpayne@68:             text = text.replace('\r', '&#xD;')
jpayne@68:         return text
jpayne@68:     except (TypeError, AttributeError):
jpayne@68:         _raise_serialization_error(text)
jpayne@68: 
jpayne@68: 
jpayne@68: def _escape_attrib_c14n(text):
jpayne@68:     # escape attribute value
jpayne@68:     try:
jpayne@68:         if '&' in text:
jpayne@68:             text = text.replace('&', '&amp;')
jpayne@68:         if '<' in text:
jpayne@68:             text = text.replace('<', '&lt;')
jpayne@68:         if '"' in text:
jpayne@68:             text = text.replace('"', '&quot;')
jpayne@68:         if '\t' in text:
jpayne@68:             text = text.replace('\t', '&#x9;')
jpayne@68:         if '\n' in text:
jpayne@68:             text = text.replace('\n', '&#xA;')
jpayne@68:         if '\r' in text:
jpayne@68:             text = text.replace('\r', '&#xD;')
jpayne@68:         return text
jpayne@68:     except (TypeError, AttributeError):
jpayne@68:         _raise_serialization_error(text)
jpayne@68: 
jpayne@68: 
jpayne@68: # --------------------------------------------------------------------
jpayne@68: 
jpayne@68: # Import the C accelerators
try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" use by external
    # code (see tests).
    _Element_Py = Element

    # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories
    from _elementtree import *
    from _elementtree import _set_factories
except ImportError:
    # No C accelerator module: keep the pure Python definitions above.
    pass
else:
    # Give the C module this module's Comment and ProcessingInstruction
    # factories.
    _set_factories(Comment, ProcessingInstruction)