comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/xml/etree/ElementTree.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 """Lightweight XML support for Python.
2
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree. This module has two classes for this purpose:
5
6 1. ElementTree represents the whole XML document as a tree and
7
8 2. Element represents a single node in this tree.
9
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level. Interactions with a single XML element
12 and its sub-elements are done on the Element level.
13
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary. Each Element has a number of properties associated with it:
17
18 'tag' - a string containing the element's name.
19
20 'attributes' - a Python dictionary storing the element's attributes.
21
22 'text' - a string containing the element's text content.
23
24 'tail' - an optional string containing text after the element's end tag.
25
26 And a number of child elements stored in a Python sequence.
27
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
30
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
33
34 """
35
36 #---------------------------------------------------------------------
37 # Licensed to PSF under a Contributor Agreement.
38 # See http://www.python.org/psf/license for licensing details.
39 #
40 # ElementTree
41 # Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
42 #
43 # fredrik@pythonware.com
44 # http://www.pythonware.com
45 # --------------------------------------------------------------------
46 # The ElementTree toolkit is
47 #
48 # Copyright (c) 1999-2008 by Fredrik Lundh
49 #
50 # By obtaining, using, and/or copying this software and/or its
51 # associated documentation, you agree that you have read, understood,
52 # and will comply with the following terms and conditions:
53 #
54 # Permission to use, copy, modify, and distribute this software and
55 # its associated documentation for any purpose and without fee is
56 # hereby granted, provided that the above copyright notice appears in
57 # all copies, and that both that copyright notice and this permission
58 # notice appear in supporting documentation, and that the name of
59 # Secret Labs AB or the author not be used in advertising or publicity
60 # pertaining to distribution of the software without specific, written
61 # prior permission.
62 #
63 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65 # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70 # OF THIS SOFTWARE.
71 # --------------------------------------------------------------------
72
# Names exported by ``from xml.etree.ElementTree import *``.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "XMLPullParser",
    "register_namespace",
    "canonicalize",
    "C14NWriterTarget",
]

# Version string of the ElementTree toolkit implemented by this module.
VERSION = "1.3.0"
94
95 import sys
96 import re
97 import warnings
98 import io
99 import collections
100 import collections.abc
101 import contextlib
102
103 from . import ElementPath
104
105
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, instances are expected to carry
    two extra attributes:

        'code' - the specific exception code
        'position' - the line and column of the error

    """
116
117 # --------------------------------------------------------------------
118
119
def iselement(element):
    """Return True if *element* looks like an Element (has a 'tag')."""
    missing = object()
    return getattr(element, 'tag', missing) is not missing
123
124
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes.  *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement.  This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        # The shared default dict is only ever read: the attributes are
        # copied into a fresh dict below, so instances never alias it.
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        self.tag = tag
        self.attrib = {**attrib, **extra}
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy.  Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions.  "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the child (or slice of children) at *index*.

        For a slice, *element* may be any iterable of elements, including a
        one-shot iterator such as a generator.  TypeError is raised, and the
        children are left untouched, if any item is not an Element.

        """
        if isinstance(index, slice):
            # Materialize once: validating a one-shot iterator in place
            # would exhaust it and the assignment below would then store
            # nothing.  It also makes ``elem[:] = elem`` well defined.
            element = list(element)
            for elt in element:
                self._assert_is_element(elt)
        else:
            self._assert_is_element(element)
        self._children[index] = element

    def __delitem__(self, index):
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable with zero or more elements; one-shot
        iterators such as generators are supported.  TypeError is raised,
        and nothing is appended, if any item is not an Element.

        """
        # Snapshot first so that validation cannot consume a one-shot
        # iterator, and so that ``elem.extend(elem)`` terminates.
        elements = list(elements)
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*."""
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.

        """
        warnings.warn(
            "This method will be removed in future versions.  "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns a string containing the attribute value, or the default if
        attribute was not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Names are returned in the same order as an ordinary Python dict.
        Equivalent to attrib.keys()

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items().

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements)

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """(Deprecated) Return a list of the elements produced by iter()."""
        warnings.warn(
            "This method will be removed in future versions.  "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            DeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None (for example
        # comments and processing instructions) contribute no text.
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t
439
440
def SubElement(parent, tag, attrib={}, **extra):
    """Create a child element and attach it to *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the element that receives the new child, *tag* is the
    child's name, *attrib* is an optional dictionary of attributes and
    *extra* holds further attributes given as keyword arguments.

    The child is built with parent.makeelement(), appended to *parent*
    and returned.

    """
    # Merge into a new dict so neither the caller's dict nor the shared
    # default is modified.
    merged = {**attrib, **extra}
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
457
458
def Comment(text=None):
    """Build a comment node.

    The returned element uses this factory function itself as its tag;
    the standard serializer recognizes that tag and writes the node as
    an XML comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node
471
472
def ProcessingInstruction(target, text=None):
    """Build a processing instruction node.

    The returned element uses this factory function itself as its tag;
    the standard serializer recognizes that tag and writes the node as
    an XML processing instruction.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.  When
    *text* is given it is stored after the target, separated by a space.

    """
    node = Element(ProcessingInstruction)
    node.text = target
    if text:
        node.text = node.text + " " + text
    return node

# Short alias for ProcessingInstruction.
PI = ProcessingInstruction
490
491
class QName:
    """Qualified name wrapper.

    Wrap a QName attribute value in this class to get proper namespace
    handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Instances hash like, and compare with, their text; the other operand
    of a comparison may be another QName or a plain value.

    """

    def __init__(self, text_or_uri, tag=None):
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs
536
537 # --------------------------------------------------------------------
538
539
class ElementTree:
    """A whole XML document, held as a tree of elements.

    Besides wrapping the root element, this class knows how to read a
    document from, and write it to, standard XML.

    *element* is an optional root element node,
    *file* is an optional file handle or file name of an XML file whose
    contents will be used to initialize the tree with.

    """

    def __init__(self, element=None, file=None):
        self._root = element  # first node
        if file:
            self.parse(file)

    def getroot(self):
        """Return the root element of this tree."""
        return self._root

    def _setroot(self, element):
        """Install *element* as the new root.

        Whatever the tree held before is dropped.  Use with care!

        """
        self._root = element

    def parse(self, source, parser=None):
        """Load an external XML document into this tree.

        *source* is a file name or file object, *parser* is an optional parser
        instance that defaults to XMLParser.

        ParseError is raised if the parser fails to parse the document.

        Returns the root element of the given source document.

        """
        # Anything without a read() method is taken to be a path; a file
        # opened here is also closed here.
        opened_here = not hasattr(source, "read")
        if opened_here:
            source = open(source, "rb")
        try:
            if parser is None:
                # No parser supplied: fall back to a default XMLParser.
                parser = XMLParser()
                if hasattr(parser, '_parse_whole'):
                    # An accelerated default parser may offer an internal
                    # _parse_whole API that consumes the source in one go
                    # instead of being fed chunk by chunk.
                    root = parser._parse_whole(source)
                    self._root = root
                    return root
            chunk = source.read(65536)
            while chunk:
                parser.feed(chunk)
                chunk = source.read(65536)
            root = parser.close()
            self._root = root
            return root
        finally:
            if opened_here:
                source.close()

    def iter(self, tag=None):
        """Return a tree iterator for the root element.

        The iterator visits every element of the tree in document order.

        *tag* is a string with the tag name to iterate over
        (default is to return all elements).

        """
        root = self._root
        return root.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """(Deprecated) Return a list of the elements produced by iter()."""
        warnings.warn(
            "This method will be removed in future versions.  "
            "Use 'tree.iter()' or 'list(tree.iter())' instead.",
            DeprecationWarning, stacklevel=2
        )
        matches = self.iter(tag)
        return list(matches)

    def find(self, path, namespaces=None):
        """Find the first matching element by tag name or path.

        Same as getroot().find(path), which is Element.find()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        A *path* with a leading "/" is searched as "." + path, with a
        FutureWarning.

        Return the first matching element, or None if no element was found.

        """
        if path[:1] == "/":
            path = "." + path
            message = (
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version.  If you rely on the current "
                "behaviour, change it to %r"
            ) % path
            warnings.warn(message, FutureWarning, stacklevel=2)
        return self._root.find(path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find the text of the first matching element by tag name or path.

        Same as getroot().findtext(path), which is Element.findtext()

        *path* is a string having either an element tag or an XPath,
        *default* is passed through to Element.findtext(),
        *namespaces* is an optional mapping from namespace prefix to full name.

        A *path* with a leading "/" is searched as "." + path, with a
        FutureWarning.

        Return whatever the root element's findtext() returns.

        """
        if path[:1] == "/":
            path = "." + path
            message = (
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version.  If you rely on the current "
                "behaviour, change it to %r"
            ) % path
            warnings.warn(message, FutureWarning, stacklevel=2)
        return self._root.findtext(path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().findall(path), which is Element.findall().

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        A *path* with a leading "/" is searched as "." + path, with a
        FutureWarning.

        Return list containing all matching elements in document order.

        """
        if path[:1] == "/":
            path = "." + path
            message = (
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version.  If you rely on the current "
                "behaviour, change it to %r"
            ) % path
            warnings.warn(message, FutureWarning, stacklevel=2)
        return self._root.findall(path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Iterate over all matching subelements by tag name or path.

        Same as getroot().iterfind(path), which is element.iterfind()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        A *path* with a leading "/" is searched as "." + path, with a
        FutureWarning.

        Return an iterable yielding all matching elements in document order.

        """
        if path[:1] == "/":
            path = "." + path
            message = (
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version.  If you rely on the current "
                "behaviour, change it to %r"
            ) % path
            warnings.warn(message, FutureWarning, stacklevel=2)
        return self._root.iterfind(path, namespaces)

    def write(self, file_or_filename,
              encoding=None,
              xml_declaration=None,
              default_namespace=None,
              method=None, *,
              short_empty_elements=True):
        """Write the element tree to a file as XML.

        Arguments:
          *file_or_filename* -- file name or a file object opened for writing

          *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                        when *method* is "c14n")

          *xml_declaration* -- bool indicating if an XML declaration should be
                               added to the output. If None, an XML declaration
                               is added if encoding IS NOT either of:
                               US-ASCII, UTF-8, or Unicode

          *default_namespace* -- sets the default XML namespace (for "xmlns")

          *method* -- either "xml" (default), "html", "text", or "c14n"

          *short_empty_elements* -- controls the formatting of elements
                                    that contain no content. If True (default)
                                    they are emitted as a single self-closed
                                    tag, otherwise they are emitted as a pair
                                    of start/end tags

        ValueError is raised for a *method* that is not registered.

        """
        if not method:
            method = "xml"
        elif method not in _serialize:
            raise ValueError("unknown method %r" % method)
        if not encoding:
            encoding = "utf-8" if method == "c14n" else "us-ascii"
        enc_lower = encoding.lower()
        with _get_writer(file_or_filename, enc_lower) as write:
            if method == "xml":
                if xml_declaration is None:
                    # Undecided: declare only encodings a reader could
                    # not assume by default.
                    emit_declaration = enc_lower not in (
                        "utf-8", "us-ascii", "unicode")
                else:
                    emit_declaration = bool(xml_declaration)
                if emit_declaration:
                    declared_encoding = encoding
                    if enc_lower == "unicode":
                        # Retrieve the default encoding for the xml declaration
                        import locale
                        declared_encoding = locale.getpreferredencoding()
                    write("<?xml version='1.0' encoding='%s'?>\n" % (
                        declared_encoding,))
            if method == "text":
                _serialize_text(write, self._root)
            else:
                qnames, namespaces = _namespaces(self._root, default_namespace)
                emit = _serialize[method]
                emit(write, self._root, qnames, namespaces,
                     short_empty_elements=short_empty_elements)

    def write_c14n(self, file):
        """Write the tree to *file* using the "c14n" output method."""
        # lxml.etree compatibility.  use output method instead
        return self.write(file, method="c14n")
778
779 # --------------------------------------------------------------------
780 # serialization support
781
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Context manager yielding a text ``write`` callable for the target.

    *file_or_filename* is either an object with a ``write`` attribute or
    something accepted by open().  *encoding* is the output encoding; the
    value "unicode" means the target takes text as is.  Every resource
    acquired here is released on exit, while a caller-supplied file object
    is left open.
    """
    try:
        write = file_or_filename.write
    except AttributeError:
        # No write attribute: treat the argument as a file name.
        if encoding == "unicode":
            open_options = {}
        else:
            open_options = {"encoding": encoding,
                            "errors": "xmlcharrefreplace"}
        with open(file_or_filename, "w", **open_options) as stream:
            yield stream.write
    else:
        # A file-like object; the encoding decides whether it is used as
        # a text writer or wrapped as a binary one.
        if encoding == "unicode":
            # Text writer: hand its write method back untouched.
            yield write
        else:
            # Binary writer: put a TextIOWrapper in front of it.
            with contextlib.ExitStack() as cleanup:
                if isinstance(file_or_filename, io.BufferedIOBase):
                    binary = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    binary = io.BufferedWriter(file_or_filename)
                    # Detach on exit so the caller's raw file stays open
                    # when the BufferedWriter goes away.
                    cleanup.callback(binary.detach)
                else:
                    # Objects outside the IOBase hierarchy that merely
                    # have a write method get a thin adapter.
                    binary = io.BufferedIOBase()
                    binary.writable = lambda: True
                    binary.write = write
                    try:
                        # TextIOWrapper consults these to decide whether a
                        # BOM (for UTF-16, etc) should be added.
                        binary.seekable = file_or_filename.seekable
                        binary.tell = file_or_filename.tell
                    except AttributeError:
                        pass
                text_stream = io.TextIOWrapper(binary,
                                               encoding=encoding,
                                               errors="xmlcharrefreplace",
                                               newline="\n")
                # Detach on exit so the underlying file stays open when
                # the TextIOWrapper goes away.
                cleanup.callback(text_stream.detach)
                yield text_stream.write
833
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used in a tree.

    Walks *elem* and everything below it and returns ``(qnames,
    namespaces)``: *qnames* maps each tag, attribute name and QName value
    to its serialized "prefix:local" form, and *namespaces* maps namespace
    URIs to prefixes.  When *default_namespace* is given it is registered
    with the empty prefix, and a non-qualified name raises ValueError.
    """
    # qname -> *encoded* prefix:local name
    qnames = {None: None}

    # uri -> prefix
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # Work out, and record, the serialized form of *qname*.
        try:
            if qname[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            # An empty prefix marks the default namespace: bare local name.
            qnames[qname] = "%s:%s" % (prefix, local) if prefix else local
        except TypeError:
            _raise_serialization_error(qname)

    # Fill both tables from every node in the tree.
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif not (tag is None or tag is Comment or tag is PI):
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
894
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Write *elem* and its subtree as XML through *write*.

    *qnames* maps tags and attribute names to their serialized form and
    *namespaces* maps URIs to prefixes.  Namespace declarations are written
    on this element only; children are serialized with ``None`` in their
    place.  *short_empty_elements* selects ``<tag />`` over
    ``<tag></tag>`` for elements without text or children.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        name = qnames[tag]
        if name is None:
            # No tag of its own: only the text and the children appear.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_xml(write, child, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                # Declarations are sorted on prefix.
                declarations = sorted(namespaces.items(),
                                      key=lambda pair: pair[1])
                for uri, prefix in declarations:
                    suffix = ":" + prefix if prefix else prefix
                    write(' xmlns%s="%s"' % (
                        suffix,
                        _escape_attrib(uri)
                        ))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib(value)
                write(' %s="%s"' % (qnames[key], value))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for child in elem:
                    _serialize_xml(write, child, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + name + ">")
            else:
                write(" />")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
944
# Lower-case tag names for which the HTML serializer writes no end tag.
HTML_EMPTY = {
    "area",
    "base",
    "basefont",
    "br",
    "col",
    "frame",
    "hr",
    "img",
    "input",
    "isindex",
    "link",
    "meta",
    "param",
}
952
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Write *elem* and its subtree as HTML through *write*.

    *qnames* maps tags and attribute names to their serialized form and
    *namespaces* maps URIs to prefixes.  Namespace declarations are written
    on this element only; children are serialized with ``None`` in their
    place.  The text of ``script`` and ``style`` elements is written
    unescaped, and tags listed in HTML_EMPTY get no end tag.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        name = qnames[tag]
        if name is None:
            # No tag of its own: only the text and the children appear.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                # Declarations are sorted on prefix.
                declarations = sorted(namespaces.items(),
                                      key=lambda pair: pair[1])
                for uri, prefix in declarations:
                    suffix = ":" + prefix if prefix else prefix
                    write(' xmlns%s="%s"' % (
                        suffix,
                        _escape_attrib(uri)
                        ))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib_html(value)
                # FIXME: handle boolean attributes
                write(' %s="%s"' % (qnames[key], value))
            write(">")
            lowered = name.lower()
            if text:
                if lowered in ("script", "style"):
                    write(text)
                else:
                    write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if lowered not in HTML_EMPTY:
                write("</" + name + ">")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
1002
def _serialize_text(write, elem):
    """Write the inner text of *elem*, then its tail if any, via *write*."""
    for fragment in elem.itertext():
        write(fragment)
    tail = elem.tail
    if tail:
        write(tail)
1008
# Output method name -> serializer function.
# The optional "c14n" method (_serialize_c14n) is imported at the end of
# the module.
_serialize = dict(
    xml=_serialize_xml,
    html=_serialize_html,
    text=_serialize_text,
)
1016
1017
def register_namespace(prefix, uri):
    """Register a namespace prefix in the global registry.

    *prefix* is the namespace prefix and *uri* is the namespace URI.  Any
    existing entry that uses either the same URI or the same prefix is
    dropped before the new mapping is stored.  Tags and attributes in this
    namespace will be serialized with the prefix if possible.

    Raises ValueError if *prefix* has the reserved form "ns<digits>".

    """
    if re.match(r"ns\d+$", prefix) is not None:
        raise ValueError("Prefix format reserved for internal use")
    # Collect the conflicting entries first, then remove them, so the
    # registry is never mutated while it is being iterated.
    conflicting = [
        known_uri
        for known_uri, known_prefix in _namespace_map.items()
        if known_uri == uri or known_prefix == prefix
    ]
    for known_uri in conflicting:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix
1036
# Global registry mapping namespace URI -> prefix.  register_namespace()
# adds to and removes from this dict.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting: expose the registry as an attribute of
# the register_namespace function.
register_namespace._namespace_map = _namespace_map
1051
1052 def _raise_serialization_error(text):
1053 raise TypeError(
1054 "cannot serialize %r (type %s)" % (text, type(text).__name__)
1055 )
1056
1057 def _escape_cdata(text):
1058 # escape character data
1059 try:
1060 # it's worth avoiding do-nothing calls for strings that are
1061 # shorter than 500 characters, or so. assume that's, by far,
1062 # the most common case in most applications.
1063 if "&" in text:
1064 text = text.replace("&", "&amp;")
1065 if "<" in text:
1066 text = text.replace("<", "&lt;")
1067 if ">" in text:
1068 text = text.replace(">", "&gt;")
1069 return text
1070 except (TypeError, AttributeError):
1071 _raise_serialization_error(text)
1072
1073 def _escape_attrib(text):
1074 # escape attribute value
1075 try:
1076 if "&" in text:
1077 text = text.replace("&", "&amp;")
1078 if "<" in text:
1079 text = text.replace("<", "&lt;")
1080 if ">" in text:
1081 text = text.replace(">", "&gt;")
1082 if "\"" in text:
1083 text = text.replace("\"", "&quot;")
1084 # The following business with carriage returns is to satisfy
1085 # Section 2.11 of the XML specification, stating that
1086 # CR or CR LN should be replaced with just LN
1087 # http://www.w3.org/TR/REC-xml/#sec-line-ends
1088 if "\r\n" in text:
1089 text = text.replace("\r\n", "\n")
1090 if "\r" in text:
1091 text = text.replace("\r", "\n")
1092 #The following four lines are issue 17582
1093 if "\n" in text:
1094 text = text.replace("\n", "&#10;")
1095 if "\t" in text:
1096 text = text.replace("\t", "&#09;")
1097 return text
1098 except (TypeError, AttributeError):
1099 _raise_serialization_error(text)
1100
1101 def _escape_attrib_html(text):
1102 # escape attribute value
1103 try:
1104 if "&" in text:
1105 text = text.replace("&", "&amp;")
1106 if ">" in text:
1107 text = text.replace(">", "&gt;")
1108 if "\"" in text:
1109 text = text.replace("\"", "&quot;")
1110 return text
1111 except (TypeError, AttributeError):
1112 _raise_serialization_error(text)
1113
1114 # --------------------------------------------------------------------
1115
def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned.  Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n", *default_namespace*
    sets the default XML namespace (for "xmlns").  *xml_declaration* and
    *short_empty_elements* are passed through to ElementTree.write().

    Returns an (optionally) encoded string containing the XML data.

    """
    # Text output needs a text buffer, everything else a bytes buffer.
    if encoding == 'unicode':
        stream = io.StringIO()
    else:
        stream = io.BytesIO()
    tree = ElementTree(element)
    tree.write(
        stream,
        encoding,
        xml_declaration=xml_declaration,
        default_namespace=default_namespace,
        method=method,
        short_empty_elements=short_empty_elements,
    )
    return stream.getvalue()
1139
1140 class _ListDataStream(io.BufferedIOBase):
1141 """An auxiliary stream accumulating into a list reference."""
1142 def __init__(self, lst):
1143 self.lst = lst
1144
1145 def writable(self):
1146 return True
1147
1148 def seekable(self):
1149 return True
1150
1151 def write(self, b):
1152 self.lst.append(b)
1153
1154 def tell(self):
1155 return len(self.lst)
1156
def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Serialize *element* and return the written chunks as a list.

    The arguments are handed to ElementTree.write() unchanged; every chunk
    written by it is collected, in order, in the returned list.
    """
    chunks = []
    collector = _ListDataStream(chunks)
    tree = ElementTree(element)
    tree.write(
        collector,
        encoding,
        xml_declaration=xml_declaration,
        default_namespace=default_namespace,
        method=method,
        short_empty_elements=short_empty_elements,
    )
    return chunks
1168
1169
def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    if tail and tail[-1] == "\n":
        # The output already ends with a newline.
        return
    sys.stdout.write("\n")
1187
1188 # --------------------------------------------------------------------
1189 # parsing
1190
1191
def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return an ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1204
1205
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is called with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  The iterator has a
    ``root`` attribute that is None until the input has been fully read and
    is then set to the root returned by the parser.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)
    def iterator():
        # Generator body: nothing here runs until the first __next__() call,
        # so `it`, `source` and `close_source` (bound further down) are
        # already set when it executes.
        try:
            while True:
                yield from pullparser.read_events()
                # load event buffer
                data = source.read(16 * 1024)
                if not data:
                    break
                pullparser.feed(data)
            root = pullparser._close_and_return_root()
            # Drain events produced while closing the parser.
            yield from pullparser.read_events()
            it.root = root
        finally:
            # Only close the file if this function opened it.
            if close_source:
                source.close()

    class IterParseIterator(collections.abc.Iterator):
        # Delegate iteration to a single generator instance.
        __next__ = iterator().__next__
    it = IterParseIterator()
    it.root = None
    del iterator, IterParseIterator

    close_source = False
    if not hasattr(source, "read"):
        # *source* is treated as a filename: open it in binary mode here.
        source = open(source, "rb")
        close_source = True

    return it
1252
1253
class XMLPullParser:
    """Non-blocking parser front end: feed data in, pull events out.

    *events* is the sequence of event names to report; when omitted only
    "end" events are reported.
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        queue = collections.deque()
        self._events_queue = queue
        if not _parser:
            _parser = XMLParser(target=TreeBuilder())
        self._parser = _parser
        # wire up the parser for event reporting
        wanted = ("end",) if events is None else events
        self._parser._setevents(queue, wanted)

    def feed(self, data):
        """Feed encoded data to parser."""
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            self._parser.feed(data)
        except SyntaxError as exc:
            # Queue the error so it is raised from read_events(), in order
            # with the events that preceded it.
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        parser = self._parser
        root = parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.
        """
        pending = self._events_queue
        while pending:
            item = pending.popleft()
            if isinstance(item, Exception):
                raise item
            yield item
1305
1306
def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    active = parser or XMLParser(target=TreeBuilder())
    active.feed(text)
    root = active.close()
    return root
1322
1323
def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.

    """
    active = parser or XMLParser(target=TreeBuilder())
    active.feed(text)
    root = active.close()
    by_id = {}
    for node in root.iter():
        ident = node.get("id")
        if not ident:
            # Elements without a (non-empty) id are not indexed.
            continue
        # A later element with the same id replaces an earlier one.
        by_id[ident] = node
    return root, by_id
1344
# fromstring is an alias for XML(): parse an XML document from a string
# constant and return what XML() returns.
fromstring = XML
1347
def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence of fragments, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    active = parser or XMLParser(target=TreeBuilder())
    feed = active.feed
    for fragment in sequence:
        feed(fragment)
    return active.close()
1362
1363 # --------------------------------------------------------------------
1364
1365
class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    *comment_factory* is a factory to create comments to be used instead of
    the standard factory.  If *insert_comments* is false (the default),
    comments will not be inserted into the tree.

    *pi_factory* is a factory to create processing instructions to be used
    instead of the standard factory.  If *insert_pis* is false (the default),
    processing instructions will not be inserted into the tree.
    """

    def __init__(self, element_factory=None, *,
                 comment_factory=None, pi_factory=None,
                 insert_comments=False, insert_pis=False):
        # Parsing state.
        self._data = []    # text chunks collected since the last flush
        self._elem = []    # stack of currently open elements
        self._last = None  # most recently started or ended element
        self._root = None  # first element started at top level
        self._tail = None  # true if we're after an end tag

        # Factories, falling back to the module defaults.
        self._comment_factory = (
            Comment if comment_factory is None else comment_factory)
        self.insert_comments = insert_comments
        self._pi_factory = (
            ProcessingInstruction if pi_factory is None else pi_factory)
        self.insert_pis = insert_pis
        self._factory = Element if element_factory is None else element_factory

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert not self._elem, "missing end tags"
        assert self._root is not None, "missing toplevel element"
        return self._root

    def _flush(self):
        # Attach the collected text to the last element, as its tail when
        # we are past its end tag and as its text otherwise.
        if not self._data:
            return
        last = self._last
        if last is not None:
            text = "".join(self._data)
            if self._tail:
                assert last.tail is None, "internal error (tail)"
                last.tail = text
            else:
                assert last.text is None, "internal error (text)"
                last.text = text
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        elem = self._factory(tag, attrs)
        self._last = elem
        open_elements = self._elem
        if open_elements:
            open_elements[-1].append(elem)
        elif self._root is None:
            self._root = elem
        open_elements.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag, (
            "end tag mismatch (expected %s, got %s)" % (closed.tag, tag))
        self._tail = 1
        return closed

    def comment(self, text):
        """Create a comment using the comment_factory.

        *text* is the text of the comment.
        """
        factory = self._comment_factory
        return self._handle_single(factory, self.insert_comments, text)

    def pi(self, target, text=None):
        """Create a processing instruction using the pi_factory.

        *target* is the target name of the processing instruction.
        *text* is the data of the processing instruction, or ''.
        """
        factory = self._pi_factory
        return self._handle_single(factory, self.insert_pis, target, text)

    def _handle_single(self, factory, insert, *args):
        # Build a childless node; it only becomes part of the tree (and the
        # anchor for following tail text) when *insert* is true.
        elem = factory(*args)
        if not insert:
            return elem
        self._flush()
        self._last = elem
        if self._elem:
            self._elem[-1].append(elem)
        self._tail = 1
        return elem
1485
1486
1487 # also see ElementTree and TreeBuilder
class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    """

    def __init__(self, *, target=None, encoding=None):
        # Prefer the expat binding from the xml.parsers package and fall
        # back to the bare pyexpat module.
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is passed as the namespace separator; _fixname() later
        # prepends "{" to any name containing it.
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {} # name memo cache
        # main callbacks
        # Handlers are only installed for the methods the target provides.
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'start_ns'):
            parser.StartNamespaceDeclHandler = self._start_ns
        if hasattr(target, 'end_ns'):
            parser.EndNamespaceDeclHandler = self._end_ns
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        self._doctype = None  # list of doctype tokens while inside <!DOCTYPE
        self.entity = {}  # entity name -> replacement text, used by _default()
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor.
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # Each handler below replaces the corresponding expat handler with a
        # wrapper that calls the target and appends (event, result) to the
        # queue.  Values are bound as default arguments so every handler
        # keeps its own event name.
        # Raises ValueError for an event name that is not recognised.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    def handler(prefix, uri, event=event_name, append=append,
                                start_ns=self._start_ns):
                        append((event, start_ns(prefix, uri)))
                else:
                    def handler(prefix, uri, event=event_name, append=append):
                        append((event, (prefix or '', uri or '')))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    def handler(prefix, event=event_name, append=append,
                                end_ns=self._end_ns):
                        append((event, end_ns(prefix)))
                else:
                    def handler(prefix, event=event_name, append=append):
                        append((event, None))
                parser.EndNamespaceDeclHandler = handler
            elif event_name == 'comment':
                def handler(text, event=event_name, append=append, self=self):
                    append((event, self.target.comment(text)))
                parser.CommentHandler = handler
            elif event_name == 'pi':
                def handler(pi_target, data, event=event_name, append=append,
                            self=self):
                    append((event, self.target.pi(pi_target, data)))
                parser.ProcessingInstructionHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise the expat error *value* as a ParseError carrying its code
        # and (line, column) position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        # Results are memoized in self._names.
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start_ns(self, prefix, uri):
        # Forward to the target, replacing falsy values with ''.
        return self.target.start_ns(prefix or '', uri or '')

    def _end_ns(self, prefix):
        # Forward to the target, replacing a falsy prefix with ''.
        return self.target.end_ns(prefix or '')

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            for i in range(0, len(attr_list), 2):
                attrib[fixname(attr_list[i])] = attr_list[i+1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        # Handler for element end: forward the fixed-up name to the target.
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Default handler.  It deals with two things: entity references
        # (text starting with "&") and the pieces of a <!DOCTYPE ...>
        # declaration, which are collected in self._doctype.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                # text is "&name;": look the name up in self.entity
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                    self.parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                type = self._doctype[1]
                if type == "PUBLIC" and n == 4:
                    name, type, pubid, system = self._doctype
                    if pubid:
                        # drop the first and last character of the token
                        pubid = pubid[1:-1]
                elif type == "SYSTEM" and n == 3:
                    name, type, system = self._doctype
                    pubid = None
                else:
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif hasattr(self, "doctype"):
                    warnings.warn(
                        "The doctype() method of XMLParser is ignored. "
                        "Define doctype() method on the TreeBuilder target.",
                        RuntimeWarning)

                self._doctype = None

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, 0)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure."""
        try:
            self.parser.Parse("", 1) # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            # target has no close(): nothing is returned
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target
1713
1714
1715 # --------------------------------------------------------------------
1716 # C14N 2.0
1717
def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Convert XML to its C14N 2.0 serialised form.

    If *out* is provided, it must be a file or file-like object that receives
    the serialised canonical XML output (text, not bytes) through its ``.write()``
    method.  To write to a file, open it in text mode with encoding "utf-8".
    If *out* is not provided, this function returns the output as text string.

    Either *xml_data* (an XML string) or *from_file* (a file path or
    file-like object) must be provided as input.  When both are given,
    *xml_data* is used.

    The configuration options are the same as for the ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Collect the output in memory only when the caller gave no sink.
    buffer = io.StringIO() if out is None else None
    sink = out if buffer is None else buffer

    parser = XMLParser(target=C14NWriterTarget(sink.write, **options))

    if xml_data is not None:
        parser.feed(xml_data)
        parser.close()
    else:
        # The guard above ensures from_file is not None here.
        parse(from_file, parser=parser)

    if buffer is None:
        return None
    return buffer.getvalue()
1746
1747
# Matcher for text of the form "word:word", i.e. something that could be a
# prefixed QName; returns a match object or None.
_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match
1749
1750
1751 class C14NWriterTarget:
1752 """
1753 Canonicalization writer target for the XMLParser.
1754
1755 Serialises parse events to XML C14N 2.0.
1756
1757 The *write* function is used for writing out the resulting data stream
1758 as text (not bytes). To write to a file, open it in text mode with encoding
1759 "utf-8" and pass its ``.write`` method.
1760
1761 Configuration options:
1762
1763 - *with_comments*: set to true to include comments
1764 - *strip_text*: set to true to strip whitespace before and after text content
1765 - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
1766 - *qname_aware_tags*: a set of qname aware tag names in which prefixes
1767 should be replaced in text content
1768 - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
1769 should be replaced in text content
1770 - *exclude_attrs*: a set of attribute names that should not be serialised
1771 - *exclude_tags*: a set of tag names that should not be serialised
1772 """
1773 def __init__(self, write, *,
1774 with_comments=False, strip_text=False, rewrite_prefixes=False,
1775 qname_aware_tags=None, qname_aware_attrs=None,
1776 exclude_attrs=None, exclude_tags=None):
1777 self._write = write
1778 self._data = []
1779 self._with_comments = with_comments
1780 self._strip_text = strip_text
1781 self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
1782 self._exclude_tags = set(exclude_tags) if exclude_tags else None
1783
1784 self._rewrite_prefixes = rewrite_prefixes
1785 if qname_aware_tags:
1786 self._qname_aware_tags = set(qname_aware_tags)
1787 else:
1788 self._qname_aware_tags = None
1789 if qname_aware_attrs:
1790 self._find_qname_aware_attrs = set(qname_aware_attrs).intersection
1791 else:
1792 self._find_qname_aware_attrs = None
1793
1794 # Stack with globally and newly declared namespaces as (uri, prefix) pairs.
1795 self._declared_ns_stack = [[
1796 ("http://www.w3.org/XML/1998/namespace", "xml"),
1797 ]]
1798 # Stack with user declared namespace prefixes as (uri, prefix) pairs.
1799 self._ns_stack = []
1800 if not rewrite_prefixes:
1801 self._ns_stack.append(list(_namespace_map.items()))
1802 self._ns_stack.append([])
1803 self._prefix_map = {}
1804 self._preserve_space = [False]
1805 self._pending_start = None
1806 self._root_seen = False
1807 self._root_done = False
1808 self._ignored_depth = 0
1809
1810 def _iter_namespaces(self, ns_stack, _reversed=reversed):
1811 for namespaces in _reversed(ns_stack):
1812 if namespaces: # almost no element declares new namespaces
1813 yield from namespaces
1814
1815 def _resolve_prefix_name(self, prefixed_name):
1816 prefix, name = prefixed_name.split(':', 1)
1817 for uri, p in self._iter_namespaces(self._ns_stack):
1818 if p == prefix:
1819 return f'{{{uri}}}{name}'
1820 raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')
1821
1822 def _qname(self, qname, uri=None):
1823 if uri is None:
1824 uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)
1825 else:
1826 tag = qname
1827
1828 prefixes_seen = set()
1829 for u, prefix in self._iter_namespaces(self._declared_ns_stack):
1830 if u == uri and prefix not in prefixes_seen:
1831 return f'{prefix}:{tag}' if prefix else tag, tag, uri
1832 prefixes_seen.add(prefix)
1833
1834 # Not declared yet => add new declaration.
1835 if self._rewrite_prefixes:
1836 if uri in self._prefix_map:
1837 prefix = self._prefix_map[uri]
1838 else:
1839 prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'
1840 self._declared_ns_stack[-1].append((uri, prefix))
1841 return f'{prefix}:{tag}', tag, uri
1842
1843 if not uri and '' not in prefixes_seen:
1844 # No default namespace declared => no prefix needed.
1845 return tag, tag, uri
1846
1847 for u, prefix in self._iter_namespaces(self._ns_stack):
1848 if u == uri:
1849 self._declared_ns_stack[-1].append((uri, prefix))
1850 return f'{prefix}:{tag}' if prefix else tag, tag, uri
1851
1852 raise ValueError(f'Namespace "{uri}" is not declared in scope')
1853
1854 def data(self, data):
1855 if not self._ignored_depth:
1856 self._data.append(data)
1857
1858 def _flush(self, _join_text=''.join):
1859 data = _join_text(self._data)
1860 del self._data[:]
1861 if self._strip_text and not self._preserve_space[-1]:
1862 data = data.strip()
1863 if self._pending_start is not None:
1864 args, self._pending_start = self._pending_start, None
1865 qname_text = data if data and _looks_like_prefix_name(data) else None
1866 self._start(*args, qname_text)
1867 if qname_text is not None:
1868 return
1869 if data and self._root_seen:
1870 self._write(_escape_cdata_c14n(data))
1871
1872 def start_ns(self, prefix, uri):
1873 if self._ignored_depth:
1874 return
1875 # we may have to resolve qnames in text content
1876 if self._data:
1877 self._flush()
1878 self._ns_stack[-1].append((uri, prefix))
1879
1880 def start(self, tag, attrs):
1881 if self._exclude_tags is not None and (
1882 self._ignored_depth or tag in self._exclude_tags):
1883 self._ignored_depth += 1
1884 return
1885 if self._data:
1886 self._flush()
1887
1888 new_namespaces = []
1889 self._declared_ns_stack.append(new_namespaces)
1890
1891 if self._qname_aware_tags is not None and tag in self._qname_aware_tags:
1892 # Need to parse text first to see if it requires a prefix declaration.
1893 self._pending_start = (tag, attrs, new_namespaces)
1894 return
1895 self._start(tag, attrs, new_namespaces)
1896
1897 def _start(self, tag, attrs, new_namespaces, qname_text=None):
1898 if self._exclude_attrs is not None and attrs:
1899 attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}
1900
1901 qnames = {tag, *attrs}
1902 resolved_names = {}
1903
1904 # Resolve prefixes in attribute and tag text.
1905 if qname_text is not None:
1906 qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)
1907 qnames.add(qname)
1908 if self._find_qname_aware_attrs is not None and attrs:
1909 qattrs = self._find_qname_aware_attrs(attrs)
1910 if qattrs:
1911 for attr_name in qattrs:
1912 value = attrs[attr_name]
1913 if _looks_like_prefix_name(value):
1914 qname = resolved_names[value] = self._resolve_prefix_name(value)
1915 qnames.add(qname)
1916 else:
1917 qattrs = None
1918 else:
1919 qattrs = None
1920
1921 # Assign prefixes in lexicographical order of used URIs.
1922 parse_qname = self._qname
1923 parsed_qnames = {n: parse_qname(n) for n in sorted(
1924 qnames, key=lambda n: n.split('}', 1))}
1925
1926 # Write namespace declarations in prefix order ...
1927 if new_namespaces:
1928 attr_list = [
1929 ('xmlns:' + prefix if prefix else 'xmlns', uri)
1930 for uri, prefix in new_namespaces
1931 ]
1932 attr_list.sort()
1933 else:
1934 # almost always empty
1935 attr_list = []
1936
1937 # ... followed by attributes in URI+name order
1938 if attrs:
1939 for k, v in sorted(attrs.items()):
1940 if qattrs is not None and k in qattrs and v in resolved_names:
1941 v = parsed_qnames[resolved_names[v]][0]
1942 attr_qname, attr_name, uri = parsed_qnames[k]
1943 # No prefix for attributes in default ('') namespace.
1944 attr_list.append((attr_qname if uri else attr_name, v))
1945
1946 # Honour xml:space attributes.
1947 space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
1948 self._preserve_space.append(
1949 space_behaviour == 'preserve' if space_behaviour
1950 else self._preserve_space[-1])
1951
1952 # Write the tag.
1953 write = self._write
1954 write('<' + parsed_qnames[tag][0])
1955 if attr_list:
1956 write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))
1957 write('>')
1958
1959 # Write the resolved qname text content.
1960 if qname_text is not None:
1961 write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))
1962
1963 self._root_seen = True
1964 self._ns_stack.append([])
1965
1966 def end(self, tag):
1967 if self._ignored_depth:
1968 self._ignored_depth -= 1
1969 return
1970 if self._data:
1971 self._flush()
1972 self._write(f'</{self._qname(tag)[0]}>')
1973 self._preserve_space.pop()
1974 self._root_done = len(self._preserve_space) == 1
1975 self._declared_ns_stack.pop()
1976 self._ns_stack.pop()
1977
def comment(self, text):
    """Write *text* as an XML comment.

    Nothing is written when comments are disabled (``_with_comments`` is
    false) or while ``_ignored_depth`` is non-zero.
    """
    if not self._with_comments or self._ignored_depth:
        return

    emit = self._write
    if self._root_done:
        # After the root element: newline goes before the comment.
        emit('\n')
    elif self._root_seen and self._data:
        # Inside the root element: write out buffered text first.
        self._flush()

    emit(f'<!--{_escape_cdata_c14n(text)}-->')

    if not self._root_seen:
        # Before the root element: newline goes after the comment.
        emit('\n')
1990
def pi(self, target, data):
    """Write a processing instruction for *target* with optional *data*.

    Nothing is written while ``_ignored_depth`` is non-zero.  When *data*
    is empty or None only the target is written.
    """
    if self._ignored_depth:
        return

    emit = self._write
    if self._root_done:
        # After the root element: newline goes before the instruction.
        emit('\n')
    elif self._root_seen and self._data:
        # Inside the root element: write out buffered text first.
        self._flush()

    if data:
        markup = f'<?{target} {_escape_cdata_c14n(data)}?>'
    else:
        markup = f'<?{target}?>'
    emit(markup)

    if not self._root_seen:
        # Before the root element: newline goes after the instruction.
        emit('\n')
2002
2003
2004 def _escape_cdata_c14n(text):
2005 # escape character data
2006 try:
2007 # it's worth avoiding do-nothing calls for strings that are
2008 # shorter than 500 character, or so. assume that's, by far,
2009 # the most common case in most applications.
2010 if '&' in text:
2011 text = text.replace('&', '&amp;')
2012 if '<' in text:
2013 text = text.replace('<', '&lt;')
2014 if '>' in text:
2015 text = text.replace('>', '&gt;')
2016 if '\r' in text:
2017 text = text.replace('\r', '&#xD;')
2018 return text
2019 except (TypeError, AttributeError):
2020 _raise_serialization_error(text)
2021
2022
2023 def _escape_attrib_c14n(text):
2024 # escape attribute value
2025 try:
2026 if '&' in text:
2027 text = text.replace('&', '&amp;')
2028 if '<' in text:
2029 text = text.replace('<', '&lt;')
2030 if '"' in text:
2031 text = text.replace('"', '&quot;')
2032 if '\t' in text:
2033 text = text.replace('\t', '&#x9;')
2034 if '\n' in text:
2035 text = text.replace('\n', '&#xA;')
2036 if '\r' in text:
2037 text = text.replace('\r', '&#xD;')
2038 return text
2039 except (TypeError, AttributeError):
2040 _raise_serialization_error(text)
2041
2042
2043 # --------------------------------------------------------------------
2044
# Import the C accelerators
try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" uses by external
    # code (see tests).  The alias must be bound before the star import below.
    _Element_Py = Element

    # The star import provides Element, SubElement, ParseError, TreeBuilder
    # and XMLParser.  _set_factories is imported by name because a star
    # import normally skips underscore-prefixed names.
    from _elementtree import *
    from _elementtree import _set_factories
except ImportError:
    # The accelerator module could not be imported: keep the definitions
    # already made in this module.
    pass
else:
    # Import succeeded: pass Comment and ProcessingInstruction to the
    # accelerator module.
    _set_factories(Comment, ProcessingInstruction)