jpayne@68
|
1 """Lightweight XML support for Python.
|
jpayne@68
|
2
|
jpayne@68
|
3 XML is an inherently hierarchical data format, and the most natural way to
|
jpayne@68
|
4 represent it is with a tree. This module has two classes for this purpose:
|
jpayne@68
|
5
|
jpayne@68
|
6 1. ElementTree represents the whole XML document as a tree and
|
jpayne@68
|
7
|
jpayne@68
|
8 2. Element represents a single node in this tree.
|
jpayne@68
|
9
|
jpayne@68
|
10 Interactions with the whole document (reading and writing to/from files) are
|
jpayne@68
|
11 usually done on the ElementTree level. Interactions with a single XML element
|
jpayne@68
|
12 and its sub-elements are done on the Element level.
|
jpayne@68
|
13
|
jpayne@68
|
14 Element is a flexible container object designed to store hierarchical data
|
jpayne@68
|
15 structures in memory. It can be described as a cross between a list and a
|
jpayne@68
|
16 dictionary. Each Element has a number of properties associated with it:
|
jpayne@68
|
17
|
jpayne@68
|
18 'tag' - a string containing the element's name.
|
jpayne@68
|
19
|
jpayne@68
|
20 'attributes' - a Python dictionary storing the element's attributes.
|
jpayne@68
|
21
|
jpayne@68
|
22 'text' - a string containing the element's text content.
|
jpayne@68
|
23
|
jpayne@68
|
24 'tail' - an optional string containing text after the element's end tag.
|
jpayne@68
|
25
|
jpayne@68
|
26 And a number of child elements stored in a Python sequence.
|
jpayne@68
|
27
|
jpayne@68
|
28 To create an element instance, use the Element constructor,
|
jpayne@68
|
29 or the SubElement factory function.
|
jpayne@68
|
30
|
jpayne@68
|
31 You can also use the ElementTree class to wrap an element structure
|
jpayne@68
|
32 and convert it to and from XML.
|
jpayne@68
|
33
|
jpayne@68
|
34 """
|
jpayne@68
|
35
|
jpayne@68
|
36 #---------------------------------------------------------------------
|
jpayne@68
|
37 # Licensed to PSF under a Contributor Agreement.
|
jpayne@68
|
38 # See http://www.python.org/psf/license for licensing details.
|
jpayne@68
|
39 #
|
jpayne@68
|
40 # ElementTree
|
jpayne@68
|
41 # Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
|
jpayne@68
|
42 #
|
jpayne@68
|
43 # fredrik@pythonware.com
|
jpayne@68
|
44 # http://www.pythonware.com
|
jpayne@68
|
45 # --------------------------------------------------------------------
|
jpayne@68
|
46 # The ElementTree toolkit is
|
jpayne@68
|
47 #
|
jpayne@68
|
48 # Copyright (c) 1999-2008 by Fredrik Lundh
|
jpayne@68
|
49 #
|
jpayne@68
|
50 # By obtaining, using, and/or copying this software and/or its
|
jpayne@68
|
51 # associated documentation, you agree that you have read, understood,
|
jpayne@68
|
52 # and will comply with the following terms and conditions:
|
jpayne@68
|
53 #
|
jpayne@68
|
54 # Permission to use, copy, modify, and distribute this software and
|
jpayne@68
|
55 # its associated documentation for any purpose and without fee is
|
jpayne@68
|
56 # hereby granted, provided that the above copyright notice appears in
|
jpayne@68
|
57 # all copies, and that both that copyright notice and this permission
|
jpayne@68
|
58 # notice appear in supporting documentation, and that the name of
|
jpayne@68
|
59 # Secret Labs AB or the author not be used in advertising or publicity
|
jpayne@68
|
60 # pertaining to distribution of the software without specific, written
|
jpayne@68
|
61 # prior permission.
|
jpayne@68
|
62 #
|
jpayne@68
|
63 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
|
jpayne@68
|
64 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
|
jpayne@68
|
65 # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
|
jpayne@68
|
66 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
|
jpayne@68
|
67 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
jpayne@68
|
68 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
jpayne@68
|
69 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
jpayne@68
|
70 # OF THIS SOFTWARE.
|
jpayne@68
|
71 # --------------------------------------------------------------------
|
jpayne@68
|
72
|
jpayne@68
|
# Names made available by ``from <this module> import *``.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring", "fromstringlist",
    "iselement", "iterparse",
    "parse", "ParseError",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring", "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML", "XMLID",
    "XMLParser", "XMLPullParser",
    "register_namespace",
    "canonicalize", "C14NWriterTarget",
    ]

# Module version string; exported through __all__ above.
VERSION = "1.3.0"
|
jpayne@68
|
94
|
jpayne@68
|
95 import sys
|
jpayne@68
|
96 import re
|
jpayne@68
|
97 import warnings
|
jpayne@68
|
98 import io
|
jpayne@68
|
99 import collections
|
jpayne@68
|
100 import collections.abc
|
jpayne@68
|
101 import contextlib
|
jpayne@68
|
102
|
jpayne@68
|
103 from . import ElementPath
|
jpayne@68
|
104
|
jpayne@68
|
105
|
jpayne@68
|
class ParseError(SyntaxError):
    """Error raised for an XML document that cannot be parsed.

    Besides its exception value, a ParseError carries two extra
    attributes:

        'code'     - the specific exception code
        'position' - the line and column of the error

    """
|
jpayne@68
|
116
|
jpayne@68
|
117 # --------------------------------------------------------------------
|
jpayne@68
|
118
|
jpayne@68
|
119
|
jpayne@68
|
def iselement(element):
    """Return True if *element* appears to be an Element.

    The check is duck-typed: any object that exposes a 'tag' attribute
    qualifies.
    """
    try:
        element.tag
    except AttributeError:
        return False
    return True
|
jpayne@68
|
123
|
jpayne@68
|
124
|
jpayne@68
|
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes.  *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement.  This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag*.

        *attrib* must be a dict (TypeError otherwise); *extra* keyword
        arguments are merged on top of it.
        """
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        self.tag = tag
        # Always build a fresh dict: neither the caller's mapping nor the
        # shared default argument is stored or mutated.
        self.attrib = {**attrib, **extra}
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy.  Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of direct subelements."""
        return len(self._children)

    def __bool__(self):
        """Return True if the element has subelements (emits FutureWarning)."""
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement (or list of subelements) at *index*."""
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement(s) at *index*.

        For a slice, *element* is an iterable of elements; otherwise it is
        a single element.  TypeError is raised for anything that is not an
        Element, and in that case the children are left unchanged.
        """
        if isinstance(index, slice):
            # Materialize first: a one-shot iterable (e.g. a generator)
            # would otherwise be exhausted by the validation pass and the
            # assignment below would silently delete the slice.
            element = list(element)
            for elt in element:
                self._assert_is_element(elt)
        else:
            self._assert_is_element(element)
        self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement(s) at *index*."""
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable with zero or more elements.  All items are
        validated before any is added, so a TypeError leaves this element
        unchanged.

        """
        # Materialize first: validating a one-shot iterable (e.g. a
        # generator) in place would consume it and nothing would be added.
        elements = list(elements)
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*."""
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        """Raise TypeError unless *e* is an instance of the Python Element."""
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        # assert iselement(element)
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.  The returned object is the
        element's internal child list, not a copy.

        """
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was not
        found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(); the result is a dictionary view, not a
        list.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(); the result is a dictionary view, not a
        list.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """(Deprecated) Return list(self.iter(tag)) after a DeprecationWarning."""
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            DeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.  Empty and None text/tail values
        are skipped.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None (for example the
        # factory-function tags used by Comment and ProcessingInstruction)
        # contribute no text.
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t
|
jpayne@68
|
439
|
jpayne@68
|
440
|
jpayne@68
|
def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments (they take precedence
    over *attrib*).

    Returns the newly created element.

    """
    # Merge into a fresh dict; the caller's mapping is left untouched.
    child = parent.makeelement(tag, {**attrib, **extra})
    parent.append(child)
    return child
|
jpayne@68
|
457
|
jpayne@68
|
458
|
jpayne@68
|
def Comment(text=None):
    """Comment element factory.

    Builds a special element, tagged with this very function, which the
    standard serializer serializes as an XML comment.

    *text* is a string containing the comment string.

    """
    comment = Element(Comment)
    comment.text = text
    return comment
|
jpayne@68
|
471
|
jpayne@68
|
472
|
jpayne@68
|
def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Builds a special element, tagged with this very function, which the
    standard serializer serializes as an XML processing instruction.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.  When
    *text* is non-empty it is joined to *target* with a single space.

    """
    pi = Element(ProcessingInstruction)
    pi.text = (target + " " + text) if text else target
    return pi

PI = ProcessingInstruction
|
jpayne@68
|
490
|
jpayne@68
|
491
|
jpayne@68
|
class QName:
    """Qualified name wrapper.

    This class can be used to wrap a QName attribute value in order to get
    proper namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Comparisons and hashing are delegated to the wrapped text, so a QName
    compares equal to the plain value it wraps.

    """

    def __init__(self, text_or_uri, tag=None):
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    @staticmethod
    def _unwrap(value):
        # Comparisons accept either another QName or a plain value.
        return value.text if isinstance(value, QName) else value

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        return self.text <= self._unwrap(other)

    def __lt__(self, other):
        return self.text < self._unwrap(other)

    def __ge__(self, other):
        return self.text >= self._unwrap(other)

    def __gt__(self, other):
        return self.text > self._unwrap(other)

    def __eq__(self, other):
        return self.text == self._unwrap(other)
|
jpayne@68
|
536
|
jpayne@68
|
537 # --------------------------------------------------------------------
|
jpayne@68
|
538
|
jpayne@68
|
539
|
jpayne@68
|
540 class ElementTree:
|
jpayne@68
|
541 """An XML element hierarchy.
|
jpayne@68
|
542
|
jpayne@68
|
543 This class also provides support for serialization to and from
|
jpayne@68
|
544 standard XML.
|
jpayne@68
|
545
|
jpayne@68
|
546 *element* is an optional root element node,
|
jpayne@68
|
547 *file* is an optional file handle or file name of an XML file whose
|
jpayne@68
|
548 contents will be used to initialize the tree with.
|
jpayne@68
|
549
|
jpayne@68
|
550 """
|
jpayne@68
|
551 def __init__(self, element=None, file=None):
|
jpayne@68
|
552 # assert element is None or iselement(element)
|
jpayne@68
|
553 self._root = element # first node
|
jpayne@68
|
554 if file:
|
jpayne@68
|
555 self.parse(file)
|
jpayne@68
|
556
|
jpayne@68
|
def getroot(self):
    """Return the element currently stored as this tree's root."""
    root = self._root
    return root
|
jpayne@68
|
560
|
jpayne@68
|
561 def _setroot(self, element):
|
jpayne@68
|
562 """Replace root element of this tree.
|
jpayne@68
|
563
|
jpayne@68
|
564 This will discard the current contents of the tree and replace it
|
jpayne@68
|
565 with the given element. Use with care!
|
jpayne@68
|
566
|
jpayne@68
|
567 """
|
jpayne@68
|
568 # assert iselement(element)
|
jpayne@68
|
569 self._root = element
|
jpayne@68
|
570
|
jpayne@68
|
def parse(self, source, parser=None):
    """Load external XML document into element tree.

    *source* is a file name or file object (anything with a read()
    method), *parser* is an optional parser instance that defaults to
    XMLParser.

    ParseError is raised if the parser fails to parse the document.

    Returns the root element of the given source document, which also
    becomes the root of this tree.

    """
    # Only a source opened here is closed here; caller-owned file objects
    # are left open.
    opened_here = not hasattr(source, "read")
    if opened_here:
        source = open(source, "rb")
    try:
        if parser is None:
            # If no parser was specified, create a default XMLParser
            parser = XMLParser()
            if hasattr(parser, '_parse_whole'):
                # The default XMLParser, when it comes from an accelerator,
                # can define an internal _parse_whole API for efficiency.
                # It can be used to parse the whole source without feeding
                # it with chunks.
                self._root = parser._parse_whole(source)
                return self._root
        # Feed the parser in 64 KiB chunks until read() returns nothing.
        chunk = source.read(65536)
        while chunk:
            parser.feed(chunk)
            chunk = source.read(65536)
        self._root = parser.close()
        return self._root
    finally:
        if opened_here:
            source.close()
|
jpayne@68
|
607
|
jpayne@68
|
def iter(self, tag=None):
    """Create and return tree iterator for the root element.

    The iterator loops over all elements in this tree, in document order.
    The call is forwarded to the root element's iter().

    *tag* is a string with the tag name to iterate over
    (default is to return all elements).

    """
    # assert self._root is not None
    root = self._root
    return root.iter(tag)
|
jpayne@68
|
619
|
jpayne@68
|
620 # compatibility
|
jpayne@68
|
def getiterator(self, tag=None):
    """(Deprecated) Return list(self.iter(tag)).

    Emits a DeprecationWarning on every call.
    """
    message = (
        "This method will be removed in future versions. "
        "Use 'tree.iter()' or 'list(tree.iter())' instead."
    )
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    matches = self.iter(tag)
    return list(matches)
|
jpayne@68
|
628
|
jpayne@68
|
def find(self, path, namespaces=None):
    """Find first matching element by tag name or path.

    Same as getroot().find(path), which is Element.find()

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.
    A *path* starting with "/" is rewritten to start with "./" and a
    FutureWarning is issued.

    Return the first matching element, or None if no element was found.

    """
    # assert self._root is not None
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.find(path, namespaces)
|
jpayne@68
|
650
|
jpayne@68
|
def findtext(self, path, default=None, namespaces=None):
    """Find the text of the first matching element by tag name or path.

    Same as getroot().findtext(path), which is Element.findtext()

    *path* is a string having either an element tag or an XPath,
    *default* is the value handed through for the not-found case,
    *namespaces* is an optional mapping from namespace prefix to full name.
    A *path* starting with "/" is rewritten to start with "./" and a
    FutureWarning is issued.

    Return whatever the root element's findtext() returns.

    """
    # assert self._root is not None
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.findtext(path, default, namespaces)
|
jpayne@68
|
672
|
jpayne@68
|
def findall(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Same as getroot().findall(path), which is Element.findall().

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.
    A *path* starting with "/" is rewritten to start with "./" and a
    FutureWarning is issued.

    Return list containing all matching elements in document order.

    """
    # assert self._root is not None
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.findall(path, namespaces)
|
jpayne@68
|
694
|
jpayne@68
|
def iterfind(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Same as getroot().iterfind(path), which is element.iterfind()

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.
    A *path* starting with "/" is rewritten to start with "./" and a
    FutureWarning is issued.

    Return an iterable yielding all matching elements in document order.

    """
    # assert self._root is not None
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.iterfind(path, namespaces)
|
jpayne@68
|
716
|
jpayne@68
|
def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Write element tree to a file as XML.

    Arguments:
      *file_or_filename* -- file name or a file object opened for writing

      *encoding* -- the output encoding (default: US-ASCII)

      *xml_declaration* -- bool indicating if an XML declaration should be
                           added to the output. If None, an XML declaration
                           is added if encoding IS NOT either of:
                           US-ASCII, UTF-8, or Unicode

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n"

      *short_empty_elements* -- controls the formatting of elements
                                that contain no content. If True (default)
                                they are emitted as a single self-closed
                                tag, otherwise they are emitted as a pair
                                of start/end tags

    Raises ValueError if *method* is not a registered serialization method.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)
    if not encoding:
        if method == "c14n":
            encoding = "utf-8"
        else:
            encoding = "us-ascii"
    enc_lower = encoding.lower()
    with _get_writer(file_or_filename, enc_lower) as write:
        # The declaration is only ever written for the "xml" method: either
        # on explicit request, or implicitly for encodings other than
        # utf-8 / us-ascii / unicode.
        if method == "xml" and (xml_declaration or
                (xml_declaration is None and
                 enc_lower not in ("utf-8", "us-ascii", "unicode"))):
            declared_encoding = encoding
            if enc_lower == "unicode":
                # Retrieve the default encoding for the xml declaration
                import locale
                declared_encoding = locale.getpreferredencoding()
            write("<?xml version='1.0' encoding='%s'?>\n" % (
                declared_encoding,))
        if method == "text":
            _serialize_text(write, self._root)
        else:
            qnames, namespaces = _namespaces(self._root, default_namespace)
            serialize = _serialize[method]
            serialize(write, self._root, qnames, namespaces,
                      short_empty_elements=short_empty_elements)
|
jpayne@68
|
774
|
jpayne@68
|
def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Kept for lxml.etree compatibility; calling write() with
    method="c14n" is the preferred spelling.
    """
    # lxml.etree compatibility.  use output method instead
    return self.write(file, method="c14n")
|
jpayne@68
|
778
|
jpayne@68
|
779 # --------------------------------------------------------------------
|
jpayne@68
|
780 # serialization support
|
jpayne@68
|
781
|
jpayne@68
|
782 @contextlib.contextmanager
|
jpayne@68
|
783 def _get_writer(file_or_filename, encoding):
|
jpayne@68
|
784 # returns text write method and release all resources after using
|
jpayne@68
|
785 try:
|
jpayne@68
|
786 write = file_or_filename.write
|
jpayne@68
|
787 except AttributeError:
|
jpayne@68
|
788 # file_or_filename is a file name
|
jpayne@68
|
789 if encoding == "unicode":
|
jpayne@68
|
790 file = open(file_or_filename, "w")
|
jpayne@68
|
791 else:
|
jpayne@68
|
792 file = open(file_or_filename, "w", encoding=encoding,
|
jpayne@68
|
793 errors="xmlcharrefreplace")
|
jpayne@68
|
794 with file:
|
jpayne@68
|
795 yield file.write
|
jpayne@68
|
796 else:
|
jpayne@68
|
797 # file_or_filename is a file-like object
|
jpayne@68
|
798 # encoding determines if it is a text or binary writer
|
jpayne@68
|
799 if encoding == "unicode":
|
jpayne@68
|
800 # use a text writer as is
|
jpayne@68
|
801 yield write
|
jpayne@68
|
802 else:
|
jpayne@68
|
803 # wrap a binary writer with TextIOWrapper
|
jpayne@68
|
804 with contextlib.ExitStack() as stack:
|
jpayne@68
|
805 if isinstance(file_or_filename, io.BufferedIOBase):
|
jpayne@68
|
806 file = file_or_filename
|
jpayne@68
|
807 elif isinstance(file_or_filename, io.RawIOBase):
|
jpayne@68
|
808 file = io.BufferedWriter(file_or_filename)
|
jpayne@68
|
809 # Keep the original file open when the BufferedWriter is
|
jpayne@68
|
810 # destroyed
|
jpayne@68
|
811 stack.callback(file.detach)
|
jpayne@68
|
812 else:
|
jpayne@68
|
813 # This is to handle passed objects that aren't in the
|
jpayne@68
|
814 # IOBase hierarchy, but just have a write method
|
jpayne@68
|
815 file = io.BufferedIOBase()
|
jpayne@68
|
816 file.writable = lambda: True
|
jpayne@68
|
817 file.write = write
|
jpayne@68
|
818 try:
|
jpayne@68
|
819 # TextIOWrapper uses this methods to determine
|
jpayne@68
|
820 # if BOM (for UTF-16, etc) should be added
|
jpayne@68
|
821 file.seekable = file_or_filename.seekable
|
jpayne@68
|
822 file.tell = file_or_filename.tell
|
jpayne@68
|
823 except AttributeError:
|
jpayne@68
|
824 pass
|
jpayne@68
|
825 file = io.TextIOWrapper(file,
|
jpayne@68
|
826 encoding=encoding,
|
jpayne@68
|
827 errors="xmlcharrefreplace",
|
jpayne@68
|
828 newline="\n")
|
jpayne@68
|
829 # Keep the original file open when the TextIOWrapper is
|
jpayne@68
|
830 # destroyed
|
jpayne@68
|
831 stack.callback(file.detach)
|
jpayne@68
|
832 yield file.write
|
jpayne@68
|
833
|
jpayne@68
|
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used below *elem*.

    Returns a ``(qnames, namespaces)`` pair: *qnames* maps each tag,
    attribute name or QName value found in the tree to its serialized
    ``prefix:local`` form, and *namespaces* maps namespace URIs to the
    prefixes chosen for them.

    Raises ValueError when *default_namespace* is given and a
    non-qualified name is encountered; tags or names of unsupported types
    are reported through _raise_serialization_error().
    """
    # identify namespaces used in this tree

    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].rsplit("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    # Prefer a registered prefix; otherwise generate "nsN".
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = "%s:%s" % (prefix, tag)
                else:
                    qnames[qname] = tag  # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
|
jpayne@68
|
894
|
jpayne@68
|
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Serialize *elem* and its subtree as XML through *write*.

    *qnames* maps tags and attribute names to their serialized form and
    *namespaces* maps URIs to prefixes; namespace declarations are written
    on this element only, since recursive calls pass None.  When
    *short_empty_elements* is true, elements without text or children are
    written as a single self-closed tag.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        tag = qnames[tag]
        if tag is None:
            # No tag of its own: emit only the text and the children.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_xml(write, child, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + tag)
            items = list(elem.items())
            if namespaces:
                for uri, prefix in sorted(namespaces.items(),
                                          key=lambda x: x[1]):  # sort on prefix
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for k, v in items:
                if isinstance(k, QName):
                    k = k.text
                if isinstance(v, QName):
                    v = qnames[v.text]
                else:
                    v = _escape_attrib(v)
                write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for child in elem:
                    _serialize_xml(write, child, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + tag + ">")
            else:
                write(" />")
    if elem.tail:
        write(_escape_cdata(elem.tail))
|
jpayne@68
|
944
|
jpayne@68
|
# Lower-case tag names for which _serialize_html() writes no end tag.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}
|
jpayne@68
|
952
|
jpayne@68
|
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Serialize *elem* and its subtree as HTML through *write*.

    Differs from the XML serializer in three visible ways: the start tag
    is always closed with ">", the text of "script" and "style" elements
    is written unescaped, and no end tag is written for tags listed in
    HTML_EMPTY.  Namespace declarations are written on this element only,
    since recursive calls pass None for *namespaces*.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # No tag of its own: emit only the text and the children.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
        else:
            write("<" + tag)
            items = list(elem.items())
            if namespaces:
                for uri, prefix in sorted(namespaces.items(),
                                          key=lambda x: x[1]):  # sort on prefix
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for k, v in items:
                if isinstance(k, QName):
                    k = k.text
                if isinstance(v, QName):
                    v = qnames[v.text]
                else:
                    v = _escape_attrib_html(v)
                # FIXME: handle boolean attributes
                write(" %s=\"%s\"" % (qnames[k], v))
            write(">")
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    write(text)
                else:
                    write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
|
jpayne@68
|
1002
|
jpayne@68
|
1003 def _serialize_text(write, elem):
|
jpayne@68
|
1004 for part in elem.itertext():
|
jpayne@68
|
1005 write(part)
|
jpayne@68
|
1006 if elem.tail:
|
jpayne@68
|
1007 write(elem.tail)
|
jpayne@68
|
1008
|
jpayne@68
|
# Dispatch table mapping an output method name to its serializer function.
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}
|
jpayne@68
|
1016
|
jpayne@68
|
1017
|
jpayne@68
|
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix is reserved or is invalid.

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Iterate over a snapshot so entries can be deleted during the scan.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix

_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting
register_namespace._namespace_map = _namespace_map
|
jpayne@68
|
1051
|
jpayne@68
|
1052 def _raise_serialization_error(text):
|
jpayne@68
|
1053 raise TypeError(
|
jpayne@68
|
1054 "cannot serialize %r (type %s)" % (text, type(text).__name__)
|
jpayne@68
|
1055 )
|
jpayne@68
|
1056
|
jpayne@68
|
1057 def _escape_cdata(text):
|
jpayne@68
|
1058 # escape character data
|
jpayne@68
|
1059 try:
|
jpayne@68
|
1060 # it's worth avoiding do-nothing calls for strings that are
|
jpayne@68
|
1061 # shorter than 500 characters, or so. assume that's, by far,
|
jpayne@68
|
1062 # the most common case in most applications.
|
jpayne@68
|
1063 if "&" in text:
|
jpayne@68
|
1064 text = text.replace("&", "&")
|
jpayne@68
|
1065 if "<" in text:
|
jpayne@68
|
1066 text = text.replace("<", "<")
|
jpayne@68
|
1067 if ">" in text:
|
jpayne@68
|
1068 text = text.replace(">", ">")
|
jpayne@68
|
1069 return text
|
jpayne@68
|
1070 except (TypeError, AttributeError):
|
jpayne@68
|
1071 _raise_serialization_error(text)
|
jpayne@68
|
1072
|
jpayne@68
|
1073 def _escape_attrib(text):
|
jpayne@68
|
1074 # escape attribute value
|
jpayne@68
|
1075 try:
|
jpayne@68
|
1076 if "&" in text:
|
jpayne@68
|
1077 text = text.replace("&", "&")
|
jpayne@68
|
1078 if "<" in text:
|
jpayne@68
|
1079 text = text.replace("<", "<")
|
jpayne@68
|
1080 if ">" in text:
|
jpayne@68
|
1081 text = text.replace(">", ">")
|
jpayne@68
|
1082 if "\"" in text:
|
jpayne@68
|
1083 text = text.replace("\"", """)
|
jpayne@68
|
1084 # The following business with carriage returns is to satisfy
|
jpayne@68
|
1085 # Section 2.11 of the XML specification, stating that
|
jpayne@68
|
1086 # CR or CR LN should be replaced with just LN
|
jpayne@68
|
1087 # http://www.w3.org/TR/REC-xml/#sec-line-ends
|
jpayne@68
|
1088 if "\r\n" in text:
|
jpayne@68
|
1089 text = text.replace("\r\n", "\n")
|
jpayne@68
|
1090 if "\r" in text:
|
jpayne@68
|
1091 text = text.replace("\r", "\n")
|
jpayne@68
|
1092 #The following four lines are issue 17582
|
jpayne@68
|
1093 if "\n" in text:
|
jpayne@68
|
1094 text = text.replace("\n", " ")
|
jpayne@68
|
1095 if "\t" in text:
|
jpayne@68
|
1096 text = text.replace("\t", "	")
|
jpayne@68
|
1097 return text
|
jpayne@68
|
1098 except (TypeError, AttributeError):
|
jpayne@68
|
1099 _raise_serialization_error(text)
|
jpayne@68
|
1100
|
jpayne@68
|
1101 def _escape_attrib_html(text):
|
jpayne@68
|
1102 # escape attribute value
|
jpayne@68
|
1103 try:
|
jpayne@68
|
1104 if "&" in text:
|
jpayne@68
|
1105 text = text.replace("&", "&")
|
jpayne@68
|
1106 if ">" in text:
|
jpayne@68
|
1107 text = text.replace(">", ">")
|
jpayne@68
|
1108 if "\"" in text:
|
jpayne@68
|
1109 text = text.replace("\"", """)
|
jpayne@68
|
1110 return text
|
jpayne@68
|
1111 except (TypeError, AttributeError):
|
jpayne@68
|
1112 _raise_serialization_error(text)
|
jpayne@68
|
1113
|
jpayne@68
|
1114 # --------------------------------------------------------------------
|
jpayne@68
|
1115
|
jpayne@68
|
def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned. Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n", *default_namespace*
    sets the default XML namespace (for "xmlns").

    Returns an (optionally) encoded string containing the XML data.

    """
    # The in-memory stream type has to match what write() will produce.
    stream = io.StringIO() if encoding == 'unicode' else io.BytesIO()
    ElementTree(element).write(stream, encoding,
                               xml_declaration=xml_declaration,
                               default_namespace=default_namespace,
                               method=method,
                               short_empty_elements=short_empty_elements)
    return stream.getvalue()
|
jpayne@68
|
1139
|
jpayne@68
|
1140 class _ListDataStream(io.BufferedIOBase):
|
jpayne@68
|
1141 """An auxiliary stream accumulating into a list reference."""
|
jpayne@68
|
1142 def __init__(self, lst):
|
jpayne@68
|
1143 self.lst = lst
|
jpayne@68
|
1144
|
jpayne@68
|
1145 def writable(self):
|
jpayne@68
|
1146 return True
|
jpayne@68
|
1147
|
jpayne@68
|
1148 def seekable(self):
|
jpayne@68
|
1149 return True
|
jpayne@68
|
1150
|
jpayne@68
|
1151 def write(self, b):
|
jpayne@68
|
1152 self.lst.append(b)
|
jpayne@68
|
1153
|
jpayne@68
|
1154 def tell(self):
|
jpayne@68
|
1155 return len(self.lst)
|
jpayne@68
|
1156
|
jpayne@68
|
def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Generate a list of fragments representing an XML element.

    Takes the same arguments as tostring() and forwards them to
    ElementTree.write(); the chunks written are collected in a list,
    which is returned.
    """
    lst = []
    stream = _ListDataStream(lst)
    ElementTree(element).write(stream, encoding,
                               xml_declaration=xml_declaration,
                               default_namespace=default_namespace,
                               method=method,
                               short_empty_elements=short_empty_elements)
    return lst
|
jpayne@68
|
1168
|
jpayne@68
|
1169
|
jpayne@68
|
def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file.

    """
    # debugging
    if not isinstance(elem, ElementTree):
        elem = ElementTree(elem)
    elem.write(sys.stdout, encoding="unicode")
    # Finish with a newline unless the root's tail already ends with one.
    tail = elem.getroot().tail
    if not tail or tail[-1] != "\n":
        sys.stdout.write("\n")
|
jpayne@68
|
1187
|
jpayne@68
|
1188 # --------------------------------------------------------------------
|
jpayne@68
|
1189 # parsing
|
jpayne@68
|
1190
|
jpayne@68
|
1191
|
jpayne@68
|
def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return an ElementTree instance.

    """
    tree = ElementTree()
    tree.parse(source, parser)
    return tree
|
jpayne@68
|
1204
|
jpayne@68
|
1205
|
jpayne@68
|
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This class also reports what's going on to the user based on the
    *events* it is initialized with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)

    def iterator():
        # `source`, `close_source` and `it` are bound further down; that is
        # safe because a generator body does not run until first advanced.
        try:
            while True:
                yield from pullparser.read_events()
                # load event buffer
                data = source.read(16 * 1024)
                if not data:
                    break
                pullparser.feed(data)
            root = pullparser._close_and_return_root()
            yield from pullparser.read_events()
            it.root = root
        finally:
            # Only close files that this function opened itself.
            if close_source:
                source.close()

    class IterParseIterator(collections.abc.Iterator):
        __next__ = iterator().__next__
    it = IterParseIterator()
    it.root = None
    del iterator, IterParseIterator

    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True

    return it
|
jpayne@68
|
1252
|
jpayne@68
|
1253
|
jpayne@68
|
class XMLPullParser:
    """Parser that is fed data in chunks and queues the resulting events.

    Data is pushed in with feed(); the (event, elem) pairs produced so far
    are pulled out with read_events().  *events* is the sequence of event
    names to report; when omitted only "end" events are reported.
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        if events is None:
            events = ("end",)
        self._parser._setevents(self._events_queue, events)

    def feed(self, data):
        """Feed encoded data to parser."""
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if data:
            try:
                self._parser.feed(data)
            except SyntaxError as exc:
                # Queue the error so read_events() raises it in order,
                # after the events that preceded it.
                self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.
        """
        events = self._events_queue
        while events:
            event = events.popleft()
            if isinstance(event, Exception):
                raise event
            else:
                yield event
|
jpayne@68
|
1305
|
jpayne@68
|
1306
|
jpayne@68
|
def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    return parser.close()
|
jpayne@68
|
1322
|
jpayne@68
|
1323
|
jpayne@68
|
def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    tree = parser.close()
    ids = {}
    for elem in tree.iter():
        # Elements with a missing or empty "id" attribute are left out; for
        # a repeated id the element seen last is the one kept.
        elem_id = elem.get("id")
        if elem_id:
            ids[elem_id] = elem
    return tree, ids
|
jpayne@68
|
1344
|
jpayne@68
|
# Parse XML document from string constant.  Alias for XML().
fromstring = XML
|
jpayne@68
|
1347
|
jpayne@68
|
def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence of string fragments, *parser* is
    an optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    for text in sequence:
        parser.feed(text)
    return parser.close()
|
jpayne@68
|
1362
|
jpayne@68
|
1363 # --------------------------------------------------------------------
|
jpayne@68
|
1364
|
jpayne@68
|
1365
|
jpayne@68
|
1366 class TreeBuilder:
|
jpayne@68
|
1367 """Generic element structure builder.
|
jpayne@68
|
1368
|
jpayne@68
|
1369 This builder converts a sequence of start, data, and end method
|
jpayne@68
|
1370 calls to a well-formed element structure.
|
jpayne@68
|
1371
|
jpayne@68
|
1372 You can use this class to build an element structure using a custom XML
|
jpayne@68
|
1373 parser, or a parser for some other XML-like format.
|
jpayne@68
|
1374
|
jpayne@68
|
1375 *element_factory* is an optional element factory which is called
|
jpayne@68
|
1376 to create new Element instances, as necessary.
|
jpayne@68
|
1377
|
jpayne@68
|
1378 *comment_factory* is a factory to create comments to be used instead of
|
jpayne@68
|
1379 the standard factory. If *insert_comments* is false (the default),
|
jpayne@68
|
1380 comments will not be inserted into the tree.
|
jpayne@68
|
1381
|
jpayne@68
|
1382 *pi_factory* is a factory to create processing instructions to be used
|
jpayne@68
|
1383 instead of the standard factory. If *insert_pis* is false (the default),
|
jpayne@68
|
1384 processing instructions will not be inserted into the tree.
|
jpayne@68
|
1385 """
|
jpayne@68
|
1386 def __init__(self, element_factory=None, *,
|
jpayne@68
|
1387 comment_factory=None, pi_factory=None,
|
jpayne@68
|
1388 insert_comments=False, insert_pis=False):
|
jpayne@68
|
1389 self._data = [] # data collector
|
jpayne@68
|
1390 self._elem = [] # element stack
|
jpayne@68
|
1391 self._last = None # last element
|
jpayne@68
|
1392 self._root = None # root element
|
jpayne@68
|
1393 self._tail = None # true if we're after an end tag
|
jpayne@68
|
1394 if comment_factory is None:
|
jpayne@68
|
1395 comment_factory = Comment
|
jpayne@68
|
1396 self._comment_factory = comment_factory
|
jpayne@68
|
1397 self.insert_comments = insert_comments
|
jpayne@68
|
1398 if pi_factory is None:
|
jpayne@68
|
1399 pi_factory = ProcessingInstruction
|
jpayne@68
|
1400 self._pi_factory = pi_factory
|
jpayne@68
|
1401 self.insert_pis = insert_pis
|
jpayne@68
|
1402 if element_factory is None:
|
jpayne@68
|
1403 element_factory = Element
|
jpayne@68
|
1404 self._factory = element_factory
|
jpayne@68
|
1405
|
jpayne@68
|
1406 def close(self):
|
jpayne@68
|
1407 """Flush builder buffers and return toplevel document Element."""
|
jpayne@68
|
1408 assert len(self._elem) == 0, "missing end tags"
|
jpayne@68
|
1409 assert self._root is not None, "missing toplevel element"
|
jpayne@68
|
1410 return self._root
|
jpayne@68
|
1411
|
jpayne@68
|
1412 def _flush(self):
|
jpayne@68
|
1413 if self._data:
|
jpayne@68
|
1414 if self._last is not None:
|
jpayne@68
|
1415 text = "".join(self._data)
|
jpayne@68
|
1416 if self._tail:
|
jpayne@68
|
1417 assert self._last.tail is None, "internal error (tail)"
|
jpayne@68
|
1418 self._last.tail = text
|
jpayne@68
|
1419 else:
|
jpayne@68
|
1420 assert self._last.text is None, "internal error (text)"
|
jpayne@68
|
1421 self._last.text = text
|
jpayne@68
|
1422 self._data = []
|
jpayne@68
|
1423
|
jpayne@68
|
def data(self, data):
    """Buffer a chunk of text for the current element."""
    pending = self._data
    pending.append(data)
|
jpayne@68
|
1427
|
jpayne@68
|
def start(self, tag, attrs):
    """Create an element for an opening tag, make it current, return it.

    *tag* is the element name, *attrs* is a dict containing element
    attributes.

    """
    self._flush()
    node = self._factory(tag, attrs)
    self._last = node
    open_elements = self._elem
    if open_elements:
        # Nested element: hook it into its parent.
        open_elements[-1].append(node)
    elif self._root is None:
        # Very first element becomes the document root.
        self._root = node
    open_elements.append(node)
    self._tail = 0
    return node
|
jpayne@68
|
1444
|
jpayne@68
|
def end(self, tag):
    """Pop the innermost open element and return it.

    *tag* is the element name; it has to match the element being closed.

    """
    self._flush()
    closed = self._elem.pop()
    self._last = closed
    assert closed.tag == tag,\
           "end tag mismatch (expected %s, got %s)" % (
               closed.tag, tag)
    self._tail = 1
    return closed
|
jpayne@68
|
1458
|
jpayne@68
|
def comment(self, text):
    """Build a comment node through the configured comment factory.

    *text* is the text of the comment.
    """
    factory = self._comment_factory
    wanted = self.insert_comments
    return self._handle_single(factory, wanted, text)
|
jpayne@68
|
1466
|
jpayne@68
|
def pi(self, target, text=None):
    """Build a processing instruction through the configured pi factory.

    *target* is the target name of the processing instruction.
    *text* is the data of the processing instruction, or ''.
    """
    factory = self._pi_factory
    wanted = self.insert_pis
    return self._handle_single(factory, wanted, target, text)
|
jpayne@68
|
1475
|
jpayne@68
|
1476 def _handle_single(self, factory, insert, *args):
|
jpayne@68
|
1477 elem = factory(*args)
|
jpayne@68
|
1478 if insert:
|
jpayne@68
|
1479 self._flush()
|
jpayne@68
|
1480 self._last = elem
|
jpayne@68
|
1481 if self._elem:
|
jpayne@68
|
1482 self._elem[-1].append(elem)
|
jpayne@68
|
1483 self._tail = 1
|
jpayne@68
|
1484 return elem
|
jpayne@68
|
1485
|
jpayne@68
|
1486
|
jpayne@68
|
1487 # also see ElementTree and TreeBuilder
|
jpayne@68
|
class XMLParser:
    """Expat-based parser that turns XML source data into target events.

    *target* is the object that receives the parse events; when omitted,
    an instance of the standard TreeBuilder class is used.  *encoding* is
    an optional encoding string which, if given, overrides the encoding
    specified in the XML file:
    http://www.iana.org/assignments/character-sets

    """

    def __init__(self, *, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is the namespace separator: expat reports "uri}local" names,
        # which _fixname() turns into "{uri}local".
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {}  # name memo cache

        # Everything without a dedicated handler ends up in _default().
        parser.DefaultHandlerExpand = self._default
        # Events whose arguments need fixing up go through our wrappers ...
        for hook, slot, wrapper in (
                ('start', 'StartElementHandler', self._start),
                ('end', 'EndElementHandler', self._end),
                ('start_ns', 'StartNamespaceDeclHandler', self._start_ns),
                ('end_ns', 'EndNamespaceDeclHandler', self._end_ns)):
            if hasattr(target, hook):
                setattr(parser, slot, wrapper)
        # ... the others are wired directly to the target's methods.
        for hook, slot in (
                ('data', 'CharacterDataHandler'),
                ('comment', 'CommentHandler'),
                ('pi', 'ProcessingInstructionHandler')):
            if hasattr(target, hook):
                setattr(parser, slot, getattr(target, hook))

        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        self._doctype = None
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass  # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same
        # as the *events* of XMLPullParser's constructor).
        # events_queue: a list of actual parsing events that will be
        # populated by the underlying parser.
        #
        parser = self._parser
        append = events_queue.append

        def reporter(event, callback):
            # Wrap *callback* so each call is queued as (event, result).
            def handler(*args):
                append((event, callback(*args)))
            return handler

        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                parser.StartElementHandler = reporter(
                    event_name, self._start)
            elif event_name == "end":
                parser.EndElementHandler = reporter(event_name, self._end)
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    callback = self._start_ns
                else:
                    def callback(prefix, uri):
                        return (prefix or '', uri or '')
                parser.StartNamespaceDeclHandler = reporter(
                    event_name, callback)
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    callback = self._end_ns
                else:
                    def callback(prefix):
                        return None
                parser.EndNamespaceDeclHandler = reporter(
                    event_name, callback)
            elif event_name == 'comment':
                # self.target is looked up at call time, not bound here.
                def callback(text):
                    return self.target.comment(text)
                parser.CommentHandler = reporter(event_name, callback)
            elif event_name == 'pi':
                def callback(pi_target, data):
                    return self.target.pi(pi_target, data)
                parser.ProcessingInstructionHandler = reporter(
                    event_name, callback)
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, keeping code and position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname ("uri}local" -> "{uri}local"); results are memoized
        try:
            return self._names[key]
        except KeyError:
            name = "{" + key if "}" in key else key
            self._names[key] = name
            return name

    def _start_ns(self, prefix, uri):
        return self.target.start_ns(prefix or '', uri or '')

    def _end_ns(self, prefix):
        return self.target.end_ns(prefix or '')

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler.  Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            for pos in range(0, len(attr_list), 2):
                value = attr_list[pos + 1]
                attrib[fixname(attr_list[pos])] = value
        return self.target.start(tag, attrib)

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        lead = text[:1]
        if lead == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                     self.parser.ErrorColumnNumber)
                    )
                err.code = 11  # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif lead == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = []  # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if lead == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            count = len(self._doctype)
            if count <= 2:
                return
            kind = self._doctype[1]
            if kind == "PUBLIC" and count == 4:
                name, kind, pubid, system = self._doctype
                if pubid:
                    pubid = pubid[1:-1]  # drop the surrounding quotes
            elif kind == "SYSTEM" and count == 3:
                name, kind, system = self._doctype
                pubid = None
            else:
                return
            if hasattr(self.target, "doctype"):
                self.target.doctype(name, pubid, system[1:-1])
            elif hasattr(self, "doctype"):
                warnings.warn(
                    "The doctype() method of XMLParser is ignored. "
                    "Define doctype() method on the TreeBuilder target.",
                    RuntimeWarning)

            self._doctype = None

    def feed(self, data):
        """Pass a chunk of encoded data on to the parser."""
        try:
            self.parser.Parse(data, 0)
        except self._error as exc:
            self._raiseerror(exc)

    def close(self):
        """Signal end of data and return what the target's close() gives."""
        try:
            self.parser.Parse("", 1)  # end of data
        except self._error as exc:
            self._raiseerror(exc)
        try:
            finish = self.target.close
        except AttributeError:
            pass
        else:
            return finish()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target
|
jpayne@68
|
1713
|
jpayne@68
|
1714
|
jpayne@68
|
1715 # --------------------------------------------------------------------
|
jpayne@68
|
1716 # C14N 2.0
|
jpayne@68
|
1717
|
jpayne@68
|
def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Serialise XML input in C14N 2.0 form.

    The input is either *xml_data* (an XML string) or *from_file* (a file
    path or file-like object); at least one of them must be given, and
    *xml_data* wins when both are.

    When *out* is given, it must be a file or file-like object whose
    ``.write()`` method receives the canonical output as text (not
    bytes); open real files in text mode with encoding "utf-8".  In that
    case None is returned.  Without *out*, the output is returned as a
    text string.

    All remaining keyword *options* are passed on to ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Only buffer in memory when the caller did not supply a sink.
    buffer = io.StringIO() if out is None else None
    sink = buffer if out is None else out

    parser = XMLParser(target=C14NWriterTarget(sink.write, **options))

    if xml_data is not None:
        parser.feed(xml_data)
        parser.close()
    else:
        # The check at the top guarantees from_file is set here.
        parse(from_file, parser=parser)

    if buffer is None:
        return None
    return buffer.getvalue()
|
jpayne@68
|
1746
|
jpayne@68
|
1747
|
jpayne@68
|
1748 _looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match
|
jpayne@68
|
1749
|
jpayne@68
|
1750
|
jpayne@68
|
class C14NWriterTarget:
    """
    XMLParser target that writes the document out in canonical form.

    Parse events are serialised to XML C14N 2.0.

    *write* is called with the pieces of the resulting data stream as
    text (not bytes).  To write to a file, open it in text mode with
    encoding "utf-8" and pass its ``.write`` method.

    Configuration options:

    - *with_comments*: set to true to include comments
    - *strip_text*: set to true to strip whitespace before and after text content
    - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
    - *qname_aware_tags*: a set of qname aware tag names in which prefixes
                          should be replaced in text content
    - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
                           should be replaced in text content
    - *exclude_attrs*: a set of attribute names that should not be serialised
    - *exclude_tags*: a set of tag names that should not be serialised
    """
    def __init__(self, write, *,
                 with_comments=False, strip_text=False, rewrite_prefixes=False,
                 qname_aware_tags=None, qname_aware_attrs=None,
                 exclude_attrs=None, exclude_tags=None):
        self._write = write
        self._data = []
        self._with_comments = with_comments
        self._strip_text = strip_text
        self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
        self._exclude_tags = set(exclude_tags) if exclude_tags else None

        self._rewrite_prefixes = rewrite_prefixes
        self._qname_aware_tags = (
            set(qname_aware_tags) if qname_aware_tags else None)
        # Only the bound intersection() is kept; it is applied to the
        # attribute dict of each element.
        self._find_qname_aware_attrs = (
            set(qname_aware_attrs).intersection if qname_aware_attrs else None)

        # Stack with globally and newly declared namespaces as (uri, prefix) pairs.
        xml_namespace = ("http://www.w3.org/XML/1998/namespace", "xml")
        self._declared_ns_stack = [[xml_namespace]]
        # Stack with user declared namespace prefixes as (uri, prefix) pairs.
        self._ns_stack = []
        if not rewrite_prefixes:
            self._ns_stack.append(list(_namespace_map.items()))
        self._ns_stack.append([])
        self._prefix_map = {}
        self._preserve_space = [False]
        self._pending_start = None
        self._root_seen = False
        self._root_done = False
        self._ignored_depth = 0

    def _iter_namespaces(self, ns_stack, _reversed=reversed):
        # Yield (uri, prefix) pairs, innermost declarations first.
        for level in _reversed(ns_stack):
            if level:  # almost no element declares new namespaces
                yield from level

    def _resolve_prefix_name(self, prefixed_name):
        # Turn "prefix:name" into "{uri}name" using the user declarations.
        prefix, name = prefixed_name.split(':', 1)
        for uri, candidate in self._iter_namespaces(self._ns_stack):
            if candidate == prefix:
                return f'{{{uri}}}{name}'
        raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')

    def _qname(self, qname, uri=None):
        # Return (serialised name, local name, uri) for *qname*, declaring
        # a namespace on the current element when that is required.
        if uri is not None:
            tag = qname
        elif qname[:1] == '{':
            uri, tag = qname[1:].rsplit('}', 1)
        else:
            uri, tag = '', qname

        # A prefix seen on an inner level hides the same prefix further out.
        prefixes_seen = set()
        for declared_uri, prefix in self._iter_namespaces(self._declared_ns_stack):
            if declared_uri == uri and prefix not in prefixes_seen:
                return (f'{prefix}:{tag}' if prefix else tag), tag, uri
            prefixes_seen.add(prefix)

        # Not declared yet => add new declaration.
        if self._rewrite_prefixes:
            prefix_map = self._prefix_map
            if uri not in prefix_map:
                prefix_map[uri] = f'n{len(prefix_map)}'
            prefix = prefix_map[uri]
            self._declared_ns_stack[-1].append((uri, prefix))
            return f'{prefix}:{tag}', tag, uri

        if not uri and '' not in prefixes_seen:
            # No default namespace declared => no prefix needed.
            return tag, tag, uri

        for declared_uri, prefix in self._iter_namespaces(self._ns_stack):
            if declared_uri == uri:
                self._declared_ns_stack[-1].append((uri, prefix))
                return (f'{prefix}:{tag}' if prefix else tag), tag, uri

        raise ValueError(f'Namespace "{uri}" is not declared in scope')

    def data(self, data):
        if self._ignored_depth:
            return
        self._data.append(data)

    def _flush(self, _join_text=''.join):
        text = _join_text(self._data)
        del self._data[:]
        if self._strip_text and not self._preserve_space[-1]:
            text = text.strip()
        if self._pending_start is not None:
            # A qname-aware start tag was waiting for its text content.
            args, self._pending_start = self._pending_start, None
            if text and _looks_like_prefix_name(text):
                qname_text = text
            else:
                qname_text = None
            self._start(*args, qname_text)
            if qname_text is not None:
                # _start() has already written the rewritten text.
                return
        if text and self._root_seen:
            self._write(_escape_cdata_c14n(text))

    def start_ns(self, prefix, uri):
        if self._ignored_depth:
            return
        # we may have to resolve qnames in text content
        if self._data:
            self._flush()
        self._ns_stack[-1].append((uri, prefix))

    def start(self, tag, attrs):
        excluded = self._exclude_tags
        if excluded is not None and (self._ignored_depth or tag in excluded):
            # Inside (or entering) an excluded subtree: only track depth.
            self._ignored_depth += 1
            return
        if self._data:
            self._flush()

        new_namespaces = []
        self._declared_ns_stack.append(new_namespaces)

        aware_tags = self._qname_aware_tags
        if aware_tags is not None and tag in aware_tags:
            # Need to parse text first to see if it requires a prefix declaration.
            self._pending_start = (tag, attrs, new_namespaces)
            return
        self._start(tag, attrs, new_namespaces)

    def _start(self, tag, attrs, new_namespaces, qname_text=None):
        if self._exclude_attrs is not None and attrs:
            dropped = self._exclude_attrs
            attrs = {k: v for k, v in attrs.items() if k not in dropped}

        qnames = {tag, *attrs}
        resolved_names = {}

        # Resolve prefixes in attribute and tag text.
        if qname_text is not None:
            qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)
            qnames.add(qname)
        qattrs = None
        if self._find_qname_aware_attrs is not None and attrs:
            qattrs = self._find_qname_aware_attrs(attrs) or None
            if qattrs is not None:
                for attr_name in qattrs:
                    value = attrs[attr_name]
                    if _looks_like_prefix_name(value):
                        qname = resolved_names[value] = self._resolve_prefix_name(value)
                        qnames.add(qname)

        # Assign prefixes in lexicographical order of used URIs.
        parse_qname = self._qname
        ordered_names = sorted(qnames, key=lambda n: n.split('}', 1))
        parsed_qnames = {n: parse_qname(n) for n in ordered_names}

        # Write namespace declarations in prefix order ...
        if new_namespaces:
            attr_list = [
                ('xmlns:' + prefix if prefix else 'xmlns', uri)
                for uri, prefix in new_namespaces
            ]
            attr_list.sort()
        else:
            # almost always empty
            attr_list = []

        # ... followed by attributes in URI+name order
        if attrs:
            for k, v in sorted(attrs.items()):
                if qattrs is not None and k in qattrs and v in resolved_names:
                    v = parsed_qnames[resolved_names[v]][0]
                attr_qname, attr_name, uri = parsed_qnames[k]
                # No prefix for attributes in default ('') namespace.
                attr_list.append((attr_qname if uri else attr_name, v))

        # Honour xml:space attributes.
        space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
        if space_behaviour:
            preserve = space_behaviour == 'preserve'
        else:
            preserve = self._preserve_space[-1]
        self._preserve_space.append(preserve)

        # Write the tag.
        write = self._write
        write('<' + parsed_qnames[tag][0])
        if attr_list:
            write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))
        write('>')

        # Write the resolved qname text content.
        if qname_text is not None:
            write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))

        self._root_seen = True
        self._ns_stack.append([])

    def end(self, tag):
        if self._ignored_depth:
            self._ignored_depth -= 1
            return
        if self._data:
            self._flush()
        closing_name = self._qname(tag)[0]
        self._write(f'</{closing_name}>')
        self._preserve_space.pop()
        # Only the initial entry is left once the root element is closed.
        self._root_done = len(self._preserve_space) == 1
        self._declared_ns_stack.pop()
        self._ns_stack.pop()

    def comment(self, text):
        if not self._with_comments or self._ignored_depth:
            return
        if self._root_done:
            # Comments after the root element go on their own line.
            self._write('\n')
        elif self._root_seen and self._data:
            self._flush()
        self._write(f'<!--{_escape_cdata_c14n(text)}-->')
        if not self._root_seen:
            # Comments before the root element are followed by a newline.
            self._write('\n')

    def pi(self, target, data):
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._root_seen and self._data:
            self._flush()
        if data:
            self._write(f'<?{target} {_escape_cdata_c14n(data)}?>')
        else:
            self._write(f'<?{target}?>')
        if not self._root_seen:
            self._write('\n')
|
jpayne@68
|
2002
|
jpayne@68
|
2003
|
jpayne@68
|
2004 def _escape_cdata_c14n(text):
|
jpayne@68
|
2005 # escape character data
|
jpayne@68
|
2006 try:
|
jpayne@68
|
2007 # it's worth avoiding do-nothing calls for strings that are
|
jpayne@68
|
2008 # shorter than 500 character, or so. assume that's, by far,
|
jpayne@68
|
2009 # the most common case in most applications.
|
jpayne@68
|
2010 if '&' in text:
|
jpayne@68
|
2011 text = text.replace('&', '&')
|
jpayne@68
|
2012 if '<' in text:
|
jpayne@68
|
2013 text = text.replace('<', '<')
|
jpayne@68
|
2014 if '>' in text:
|
jpayne@68
|
2015 text = text.replace('>', '>')
|
jpayne@68
|
2016 if '\r' in text:
|
jpayne@68
|
2017 text = text.replace('\r', '
')
|
jpayne@68
|
2018 return text
|
jpayne@68
|
2019 except (TypeError, AttributeError):
|
jpayne@68
|
2020 _raise_serialization_error(text)
|
jpayne@68
|
2021
|
jpayne@68
|
2022
|
jpayne@68
|
2023 def _escape_attrib_c14n(text):
|
jpayne@68
|
2024 # escape attribute value
|
jpayne@68
|
2025 try:
|
jpayne@68
|
2026 if '&' in text:
|
jpayne@68
|
2027 text = text.replace('&', '&')
|
jpayne@68
|
2028 if '<' in text:
|
jpayne@68
|
2029 text = text.replace('<', '<')
|
jpayne@68
|
2030 if '"' in text:
|
jpayne@68
|
2031 text = text.replace('"', '"')
|
jpayne@68
|
2032 if '\t' in text:
|
jpayne@68
|
2033 text = text.replace('\t', '	')
|
jpayne@68
|
2034 if '\n' in text:
|
jpayne@68
|
2035 text = text.replace('\n', '
')
|
jpayne@68
|
2036 if '\r' in text:
|
jpayne@68
|
2037 text = text.replace('\r', '
')
|
jpayne@68
|
2038 return text
|
jpayne@68
|
2039 except (TypeError, AttributeError):
|
jpayne@68
|
2040 _raise_serialization_error(text)
|
jpayne@68
|
2041
|
jpayne@68
|
2042
|
jpayne@68
|
2043 # --------------------------------------------------------------------
|
jpayne@68
|
2044
|
jpayne@68
|
2045 # Import the C accelerators
|
jpayne@68
|
2046 try:
|
jpayne@68
|
2047 # Element is going to be shadowed by the C implementation. We need to keep
|
jpayne@68
|
2048 # the Python version of it accessible for some "creative" use by external code
|
jpayne@68
|
2049 # (see tests)
|
jpayne@68
|
2050 _Element_Py = Element
|
jpayne@68
|
2051
|
jpayne@68
|
2052 # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories
|
jpayne@68
|
2053 from _elementtree import *
|
jpayne@68
|
2054 from _elementtree import _set_factories
|
jpayne@68
|
2055 except ImportError:
|
jpayne@68
|
2056 pass
|
jpayne@68
|
2057 else:
|
jpayne@68
|
2058 _set_factories(Comment, ProcessingInstruction)
|