comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/xml/dom/expatbuilder.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 """Facility to use the Expat parser to load a minidom instance
2 from a string or file.
3
4 This avoids all the overhead of SAX and pulldom to gain performance.
5 """
6
7 # Warning!
8 #
9 # This module is tightly bound to the implementation details of the
10 # minidom DOM and can't be used with other DOM implementations. This
11 # is due, in part, to a lack of appropriate methods in the DOM (there is
12 # no way to create Entity and Notation nodes via the DOM Level 2
13 # interface), and for performance. The latter is the cause of some fairly
14 # cryptic code.
15 #
16 # Performance hacks:
17 #
18 # - .character_data_handler() has an extra case in which continuing
19 # data is appended to an existing Text node; this can be a
20 # speedup since pyexpat can break up character data into multiple
21 # callbacks even though we set the buffer_text attribute on the
22 # parser. This also gives us the advantage that we don't need a
23 # separate normalization pass.
24 #
25 # - Determining that a node exists is done using an identity comparison
26 # with None rather than a truth test; this avoids searching for and
27 # calling any methods on the node object if it exists. (A rather
28 # nice speedup is achieved this way as well!)
29
30 from xml.dom import xmlbuilder, minidom, Node
31 from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
32 from xml.parsers import expat
33 from xml.dom.minidom import _append_child, _set_attribute_node
34 from xml.dom.NodeFilter import NodeFilter
35
36 TEXT_NODE = Node.TEXT_NODE
37 CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
38 DOCUMENT_NODE = Node.DOCUMENT_NODE
39
40 FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
41 FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
42 FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
43 FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT
44
45 theDOMImplementation = minidom.getDOMImplementation()
46
47 # Expat typename -> TypeInfo
48 _typeinfo_map = {
49 "CDATA": minidom.TypeInfo(None, "cdata"),
50 "ENUM": minidom.TypeInfo(None, "enumeration"),
51 "ENTITY": minidom.TypeInfo(None, "entity"),
52 "ENTITIES": minidom.TypeInfo(None, "entities"),
53 "ID": minidom.TypeInfo(None, "id"),
54 "IDREF": minidom.TypeInfo(None, "idref"),
55 "IDREFS": minidom.TypeInfo(None, "idrefs"),
56 "NMTOKEN": minidom.TypeInfo(None, "nmtoken"),
57 "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
58 }
59
class ElementInfo(object):
    """DTD-derived information about one element type.

    ``_attr_info`` holds one list per declared attribute; the attribute
    name is at index 1 and the Expat type string is the second-to-last
    item.  ``_model`` is the Expat content model, or None if the element
    has not been declared.
    """

    __slots__ = '_attr_info', '_model', 'tagName'

    def __init__(self, tagName, model=None):
        self._model = model
        self._attr_info = []
        self.tagName = tagName

    def __getstate__(self):
        # Pickled as a plain 3-tuple; __setstate__ relies on this order.
        return self._attr_info, self._model, self.tagName

    def __setstate__(self, state):
        attr_info, model, tag_name = state
        self._attr_info = attr_info
        self._model = model
        self.tagName = tag_name

    def getAttributeType(self, aname):
        """Return the TypeInfo for attribute *aname*, or the 'no type'
        marker if the attribute has not been declared."""
        for record in self._attr_info:
            if record[1] != aname:
                continue
            declared = record[-2]
            # Enumerated types are reported by Expat as "(a|b|...)".
            if declared[0] == "(":
                return _typeinfo_map["ENUM"]
            return _typeinfo_map[declared]
        return minidom._no_type

    def getAttributeTypeNS(self, namespaceURI, localName):
        """Namespace-aware lookup; always reports 'no type'."""
        return minidom._no_type

    def isElementContent(self):
        """Return True if the content model is neither ANY nor mixed."""
        model = self._model
        if not model:
            return False
        return model[0] not in (expat.model.XML_CTYPE_ANY,
                                expat.model.XML_CTYPE_MIXED)

    def isEmpty(self):
        """Return True if the element was declared EMPTY."""
        model = self._model
        if not model:
            return False
        return model[0] == expat.model.XML_CTYPE_EMPTY

    def isId(self, aname):
        """Return True if attribute *aname* was declared with type ID."""
        return next(
            (record[-2] == "ID"
             for record in self._attr_info if record[1] == aname),
            False)

    def isIdNS(self, euri, ename, auri, aname):
        # not sure this is meaningful
        return self.isId((auri, aname))
110
111 def _intern(builder, s):
112 return builder._intern_setdefault(s, s)
113
114 def _parse_ns_name(builder, name):
115 assert ' ' in name
116 parts = name.split(' ')
117 intern = builder._intern_setdefault
118 if len(parts) == 3:
119 uri, localname, prefix = parts
120 prefix = intern(prefix, prefix)
121 qname = "%s:%s" % (prefix, localname)
122 qname = intern(qname, qname)
123 localname = intern(localname, localname)
124 elif len(parts) == 2:
125 uri, localname = parts
126 prefix = EMPTY_PREFIX
127 qname = localname = intern(localname, localname)
128 else:
129 raise ValueError("Unsupported syntax: spaces in URIs not supported: %r" % name)
130 return intern(uri, uri), localname, prefix, qname
131
132
class ExpatBuilder:
    """Document builder that uses Expat to build a minidom Document.

    A builder owns one Expat parser at a time and registers its own bound
    methods as the parser callbacks.  Each parse returns the finished
    Document node and then resets the builder.
    """

    def __init__(self, options=None):
        """Create a builder; *options* defaults to xmlbuilder.Options()."""
        if options is None:
            options = xmlbuilder.Options()
        self._options = options
        if self._options.filter is not None:
            self._filter = FilterVisibilityController(self._options.filter)
        else:
            self._filter = None
            # This *really* doesn't do anything in this case, so
            # override it with something fast & minimal.
            self._finish_start_element = id
        self._parser = None
        self.reset()

    def createParser(self):
        """Create a new parser object."""
        return expat.ParserCreate()

    def getParser(self):
        """Return the parser object, creating a new one if needed."""
        if not self._parser:
            self._parser = self.createParser()
            self._intern_setdefault = self._parser.intern.setdefault
            self._parser.buffer_text = True
            self._parser.ordered_attributes = True
            self._parser.specified_attributes = True
            self.install(self._parser)
        return self._parser

    def reset(self):
        """Free all data structures used during DOM construction."""
        self.document = theDOMImplementation.createDocument(
            EMPTY_NAMESPACE, None, None)
        self.curNode = self.document
        self._elem_info = self.document._elem_info
        self._cdata = False

    def install(self, parser):
        """Install the callbacks needed to build the DOM into the parser."""
        # This creates circular references!
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.first_element_handler
        parser.EndElementHandler = self.end_element_handler
        parser.ProcessingInstructionHandler = self.pi_handler
        if self._options.entities:
            parser.EntityDeclHandler = self.entity_decl_handler
        parser.NotationDeclHandler = self.notation_decl_handler
        if self._options.comments:
            parser.CommentHandler = self.comment_handler
        if self._options.cdata_sections:
            parser.StartCdataSectionHandler = self.start_cdata_section_handler
            parser.EndCdataSectionHandler = self.end_cdata_section_handler
            parser.CharacterDataHandler = self.character_data_handler_cdata
        else:
            parser.CharacterDataHandler = self.character_data_handler
        parser.ExternalEntityRefHandler = self.external_entity_ref_handler
        parser.XmlDeclHandler = self.xml_decl_handler
        parser.ElementDeclHandler = self.element_decl_handler
        parser.AttlistDeclHandler = self.attlist_decl_handler

    def parseFile(self, file):
        """Parse a document from a file object, returning the document
        node."""
        parser = self.getParser()
        first_buffer = True
        try:
            while True:
                buffer = file.read(16 * 1024)
                if not buffer:
                    break
                parser.Parse(buffer, False)
                # Only the first chunk is searched for an internal subset.
                if first_buffer and self.document.documentElement:
                    self._setup_subset(buffer)
                first_buffer = False
            parser.Parse("", True)
        except ParseEscape:
            # Raised when a filter interrupts the parse; keep what was built.
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def parseString(self, string):
        """Parse a document from a string, returning the document node."""
        parser = self.getParser()
        try:
            parser.Parse(string, True)
            self._setup_subset(string)
        except ParseEscape:
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def _setup_subset(self, buffer):
        """Load the internal subset if there might be one."""
        if self.document.doctype:
            extractor = InternalSubsetExtractor()
            extractor.parseString(buffer)
            subset = extractor.getSubset()
            self.document.doctype.internalSubset = subset

    def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
                                   has_internal_subset):
        """Create the DocumentType node and attach it to the document."""
        doctype = self.document.implementation.createDocumentType(
            doctypeName, publicId, systemId)
        doctype.ownerDocument = self.document
        _append_child(self.document, doctype)
        self.document.doctype = doctype
        if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
            # Undo the append and stop collecting entities/notations, which
            # would otherwise need the doctype node to live on.
            self.document.doctype = None
            del self.document.childNodes[-1]
            doctype = None
            self._parser.EntityDeclHandler = None
            self._parser.NotationDeclHandler = None
        if has_internal_subset:
            if doctype is not None:
                doctype.entities._seq = []
                doctype.notations._seq = []
            # Comments and PIs inside the subset are not made into nodes;
            # end_doctype_decl_handler() re-enables these handlers.
            self._parser.CommentHandler = None
            self._parser.ProcessingInstructionHandler = None
            self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Re-enable the handlers switched off for the internal subset."""
        if self._options.comments:
            self._parser.CommentHandler = self.comment_handler
        self._parser.ProcessingInstructionHandler = self.pi_handler
        if not (self._elem_info or self._filter):
            # Nothing for _finish_end_element() to do: swap in a cheap no-op.
            self._finish_end_element = id

    def pi_handler(self, target, data):
        """Append a ProcessingInstruction node unless the filter rejects it."""
        node = self.document.createProcessingInstruction(target, data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def character_data_handler_cdata(self, data):
        """Character data handler used when CDATA sections are preserved."""
        childNodes = self.curNode.childNodes
        if self._cdata:
            if (self._cdata_continue
                    and childNodes[-1].nodeType == CDATA_SECTION_NODE):
                childNodes[-1].appendData(data)
                return
            node = self.document.createCDATASection(data)
            self._cdata_continue = True
        elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
            # Continuation of the previous text: extend it in place.
            node = childNodes[-1]
            node.data = node.data + data
            return
        else:
            node = minidom.Text()
            node.data = data
            node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def character_data_handler(self, data):
        """Character data handler used when CDATA sections become text."""
        childNodes = self.curNode.childNodes
        if childNodes and childNodes[-1].nodeType == TEXT_NODE:
            node = childNodes[-1]
            node.data = node.data + data
            return
        node = minidom.Text()
        node.data = node.data + data
        node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def entity_decl_handler(self, entityName, is_parameter_entity, value,
                            base, systemId, publicId, notationName):
        """Record a general entity declaration on the doctype."""
        if is_parameter_entity:
            # we don't care about parameter entities for the DOM
            return
        if not self._options.entities:
            return
        node = self.document._create_entity(entityName, publicId,
                                            systemId, notationName)
        if value is not None:
            # internal entity
            # node *should* be readonly, but we'll cheat
            child = self.document.createTextNode(value)
            node.childNodes.append(child)
        self.document.doctype.entities._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.entities._seq[-1]

    def notation_decl_handler(self, notationName, base, systemId, publicId):
        """Record a notation declaration on the doctype.

        The notation is dropped again only when the filter rejects it,
        matching entity_decl_handler().
        """
        node = self.document._create_notation(notationName, publicId, systemId)
        self.document.doctype.notations._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.notations._seq[-1]

    def comment_handler(self, data):
        """Append a Comment node unless the filter rejects it."""
        node = self.document.createComment(data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def start_cdata_section_handler(self):
        self._cdata = True
        self._cdata_continue = False

    def end_cdata_section_handler(self):
        self._cdata = False
        self._cdata_continue = False

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Report external entities as handled without loading them."""
        return 1

    def first_element_handler(self, name, attributes):
        """Handle the document element, then hand over to the regular
        start-element handler for all later elements."""
        if self._filter is None and not self._elem_info:
            self._finish_end_element = id
        self.getParser().StartElementHandler = self.start_element_handler
        self.start_element_handler(name, attributes)

    def start_element_handler(self, name, attributes):
        """Create an Element for *name* and make it the current node.

        *attributes* is the flat [name, value, name, value, ...] list that
        Expat supplies when ordered_attributes is enabled.
        """
        node = self.document.createElement(name)
        _append_child(self.curNode, node)
        self.curNode = node

        if attributes:
            for i in range(0, len(attributes), 2):
                a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
                                 None, EMPTY_PREFIX)
                a.value = attributes[i + 1]
                a.ownerDocument = self.document
                _set_attribute_node(node, a)

        if node is not self.document.documentElement:
            self._finish_start_element(node)

    def _finish_start_element(self, node):
        """Consult the filter about a freshly started element."""
        if self._filter:
            # To be general, we'd have to call isSameNode(), but this
            # is sufficient for minidom:
            if node is self.document.documentElement:
                return
            filt = self._filter.startContainer(node)
            if filt == FILTER_REJECT:
                # ignore this node & all descendents
                Rejecter(self)
            elif filt == FILTER_SKIP:
                # ignore this node, but make its children become
                # children of the parent node
                Skipper(self)
            else:
                return
            self.curNode = node.parentNode
            node.parentNode.removeChild(node)
            node.unlink()

    # If this ever changes, Namespaces.end_element_handler() needs to
    # be changed to match.
    #
    def end_element_handler(self, name):
        curNode = self.curNode
        self.curNode = curNode.parentNode
        self._finish_end_element(curNode)

    def _finish_end_element(self, curNode):
        """Post-process a completed element: whitespace, then filter."""
        info = self._elem_info.get(curNode.tagName)
        if info:
            self._handle_white_text_nodes(curNode, info)
        if self._filter:
            if curNode is self.document.documentElement:
                return
            if self._filter.acceptNode(curNode) == FILTER_REJECT:
                self.curNode.removeChild(curNode)
                curNode.unlink()

    def _handle_white_text_nodes(self, node, info):
        """Drop whitespace-only text children of *node* when the options
        ask for it and *info* says the element has element content."""
        if (self._options.whitespace_in_element_content
                or not info.isElementContent()):
            return

        # We have element type information and should remove ignorable
        # whitespace; collect first so the child list is not mutated
        # while it is being iterated.
        ignorable = [child for child in node.childNodes
                     if child.nodeType == TEXT_NODE and not child.data.strip()]

        # Remove ignorable whitespace from the tree.
        for child in ignorable:
            node.removeChild(child)

    def element_decl_handler(self, name, model):
        """Store the content model declared for element type *name*."""
        info = self._elem_info.get(name)
        if info is None:
            self._elem_info[name] = ElementInfo(name, model)
        else:
            assert info._model is None
            info._model = model

    def attlist_decl_handler(self, elem, name, type, default, required):
        """Store one attribute declaration for element type *elem*."""
        info = self._elem_info.get(elem)
        if info is None:
            info = ElementInfo(elem)
            self._elem_info[elem] = info
        info._attr_info.append(
            [None, name, None, None, default, 0, type, required])

    def xml_decl_handler(self, version, encoding, standalone):
        """Copy the XML declaration onto the document node."""
        self.document.version = version
        self.document.encoding = encoding
        # This is still a little ugly, thanks to the pyexpat API. ;-(
        # A negative value leaves document.standalone untouched.
        if standalone >= 0:
            self.document.standalone = bool(standalone)
450
451
# Verdicts a filter callback may legally return.  FILTER_INTERRUPT is
# deliberately absent: it is checked separately where it is allowed.
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)


class FilterVisibilityController(object):
    """Wrapper around a DOMBuilderFilter which implements the checks
    to make the whatToShow filter attribute work."""

    __slots__ = 'filter',

    def __init__(self, filter):
        self.filter = filter

    def startContainer(self, node):
        """Ask the filter about a container that has just been opened.

        Node types hidden by whatToShow are accepted without consulting
        the filter.  Raises ParseEscape on FILTER_INTERRUPT and ValueError
        for any verdict outside the allowed set.
        """
        mask = self._nodetype_mask[node.nodeType]
        if not (self.filter.whatToShow & mask):
            return FILTER_ACCEPT
        verdict = self.filter.startContainer(node)
        if verdict == FILTER_INTERRUPT:
            raise ParseEscape
        if verdict not in _ALLOWED_FILTER_RETURNS:
            raise ValueError(
                "startContainer() returned illegal value: " + repr(verdict))
        return verdict

    def acceptNode(self, node):
        """Ask the filter about a completed node.

        FILTER_SKIP is carried out here by re-parenting the node's
        children and is reported to the caller as FILTER_REJECT, so the
        caller only has to remove the node itself.
        """
        mask = self._nodetype_mask[node.nodeType]
        if not (self.filter.whatToShow & mask):
            return FILTER_ACCEPT
        verdict = self.filter.acceptNode(node)
        if verdict == FILTER_INTERRUPT:
            raise ParseEscape
        if verdict == FILTER_SKIP:
            # move all child nodes to the parent, and remove this node
            parent = node.parentNode
            for child in node.childNodes[:]:
                parent.appendChild(child)
            # node is handled by the caller
            return FILTER_REJECT
        if verdict not in _ALLOWED_FILTER_RETURNS:
            raise ValueError(
                "acceptNode() returned illegal value: " + repr(verdict))
        return verdict

    # DOM node type -> the whatToShow bit that makes that type visible.
    _nodetype_mask = {
        Node.ELEMENT_NODE:                NodeFilter.SHOW_ELEMENT,
        Node.ATTRIBUTE_NODE:              NodeFilter.SHOW_ATTRIBUTE,
        Node.TEXT_NODE:                   NodeFilter.SHOW_TEXT,
        Node.CDATA_SECTION_NODE:          NodeFilter.SHOW_CDATA_SECTION,
        Node.ENTITY_REFERENCE_NODE:       NodeFilter.SHOW_ENTITY_REFERENCE,
        Node.ENTITY_NODE:                 NodeFilter.SHOW_ENTITY,
        Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
        Node.COMMENT_NODE:                NodeFilter.SHOW_COMMENT,
        Node.DOCUMENT_NODE:               NodeFilter.SHOW_DOCUMENT,
        Node.DOCUMENT_TYPE_NODE:          NodeFilter.SHOW_DOCUMENT_TYPE,
        Node.DOCUMENT_FRAGMENT_NODE:      NodeFilter.SHOW_DOCUMENT_FRAGMENT,
        Node.NOTATION_NODE:               NodeFilter.SHOW_NOTATION,
    }
512
513
class FilterCrutch(object):
    """Base for helpers that temporarily take over a builder's element
    handlers.

    On construction the parser's current start/end element handlers are
    remembered and replaced by this object's own.  Subclasses supply
    start_element_handler() and end_element_handler().
    """

    __slots__ = '_builder', '_level', '_old_start', '_old_end'

    def __init__(self, builder):
        parser = builder._parser
        self._builder = builder
        # Nesting depth below the element that triggered the takeover.
        self._level = 0
        self._old_start, self._old_end = (
            parser.StartElementHandler, parser.EndElementHandler)
        parser.StartElementHandler = self.start_element_handler
        parser.EndElementHandler = self.end_element_handler
525
class Rejecter(FilterCrutch):
    """Discard an element and everything inside it.

    All content handlers are switched off while the rejected subtree is
    being read; only element nesting is counted.
    """

    __slots__ = ()

    def __init__(self, builder):
        FilterCrutch.__init__(self, builder)
        parser = builder._parser
        parser.ProcessingInstructionHandler = None
        parser.CommentHandler = None
        parser.CharacterDataHandler = None
        parser.StartCdataSectionHandler = None
        parser.EndCdataSectionHandler = None
        parser.ExternalEntityRefHandler = None

    def start_element_handler(self, *args):
        self._level += 1

    def end_element_handler(self, *args):
        if self._level:
            self._level -= 1
            return
        # This is the end tag of the rejected element itself: restore the
        # builder's handlers, then put back the saved element handlers.
        parser = self._builder._parser
        self._builder.install(parser)
        parser.StartElementHandler = self._old_start
        parser.EndElementHandler = self._old_end
553
class Skipper(FilterCrutch):
    """Drop one element but keep building its children.

    Events are forwarded to the handlers that were installed before the
    takeover; this object only tracks nesting so it can tell when the
    skipped element's own end tag arrives.
    """

    __slots__ = ()

    def start_element_handler(self, *args):
        before = self._builder.curNode
        self._old_start(*args)
        # Count the element only if the builder actually descended into it.
        if self._builder.curNode is not before:
            self._level += 1

    def end_element_handler(self, *args):
        if self._level:
            self._level -= 1
            self._old_end(*args)
            return
        # We're popping back out of the node we're skipping, so we
        # shouldn't need to do anything but reset the handlers.
        parser = self._builder._parser
        parser.StartElementHandler = self._old_start
        parser.EndElementHandler = self._old_end
        self._builder = None
573
574
575 # framework document used by the fragment builder.
576 # Takes a string for the doctype, subset string, and namespace attrs string.
577
578 _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
579 "http://xml.python.org/entities/fragment-builder/internal"
580
581 _FRAGMENT_BUILDER_TEMPLATE = (
582 '''\
583 <!DOCTYPE wrapper
584 %%s [
585 <!ENTITY fragment-builder-internal
586 SYSTEM "%s">
587 %%s
588 ]>
589 <wrapper %%s
590 >&fragment-builder-internal;</wrapper>'''
591 % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)
592
593
class FragmentBuilder(ExpatBuilder):
    """Builder which constructs document fragments given XML source
    text and a context node.

    The context node is expected to provide information about the
    namespace declarations which are in scope at the start of the
    fragment.

    The fragment text is parsed as the replacement of an external entity
    referenced from a small wrapper document, so that declarations from
    the original document's doctype are in effect while it is read.
    """

    def __init__(self, context, options=None):
        """*context* is either a Document or a node owned by one."""
        self.context = context
        if context.nodeType == DOCUMENT_NODE:
            self.originalDocument = context
        else:
            self.originalDocument = context.ownerDocument
        ExpatBuilder.__init__(self, options)

    def reset(self):
        ExpatBuilder.reset(self)
        self.fragment = None

    def parseFile(self, file):
        """Parse a document fragment from a file object, returning the
        fragment node."""
        return self.parseString(file.read())

    def parseString(self, string):
        """Parse a document fragment from a string, returning the
        fragment node."""
        self._source = string
        parser = self.getParser()
        doctype = self.originalDocument.doctype
        ident = ""
        if doctype:
            subset = doctype.internalSubset or self._getDeclarations()
            if doctype.publicId:
                ident = ('PUBLIC "%s" "%s"'
                         % (doctype.publicId, doctype.systemId))
            elif doctype.systemId:
                ident = 'SYSTEM "%s"' % doctype.systemId
        else:
            subset = ""
        nsattrs = self._getNSattrs()  # get ns decls from node's ancestors
        document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
        try:
            parser.Parse(document, True)
        except BaseException:
            # Leave the builder clean for the caller, then propagate.
            self.reset()
            raise
        fragment = self.fragment
        self.reset()
        # NOTE(review): unlike ExpatBuilder.parseString(), the parser is not
        # discarded here (the original has that statement commented out).
        return fragment

    def _getDeclarations(self):
        """Re-create the internal subset from the DocumentType node.

        This is only needed if we don't already have the
        internalSubset as a string.
        """
        # Use the same document parseString() consults; a Document context
        # has no ownerDocument to go through.
        doctype = self.originalDocument.doctype
        if not doctype:
            return ""
        decls = []
        for i in range(doctype.notations.length):
            notation = doctype.notations.item(i)
            decl = "<!NOTATION %s" % notation.nodeName
            if notation.publicId:
                decl = '%s PUBLIC "%s"' % (decl, notation.publicId)
                # A PUBLIC notation may legally omit the system literal.
                if notation.systemId is not None:
                    decl = '%s\n "%s"' % (decl, notation.systemId)
                decl = decl + ">"
            else:
                decl = '%s SYSTEM "%s">' % (decl, notation.systemId)
            decls.append(decl)
        for i in range(doctype.entities.length):
            entity = doctype.entities.item(i)
            decl = "<!ENTITY %s" % entity.nodeName
            if entity.publicId:
                decl = '%s PUBLIC "%s"\n "%s"' \
                       % (decl, entity.publicId, entity.systemId)
            elif entity.systemId:
                decl = '%s SYSTEM "%s"' % (decl, entity.systemId)
            else:
                decl = '%s "%s"' % (decl, entity.firstChild.data)
            if entity.notationName:
                # XML spells the unparsed-entity marker NDATA.
                decl = "%s NDATA %s" % (decl, entity.notationName)
            decls.append(decl + ">")
        return "\n ".join(decls)

    def _getNSattrs(self):
        """Return namespace attribute text for the wrapper element; the
        non-namespace builder has none."""
        return ""

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Parse the fragment source when the wrapper's entity is hit."""
        if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
            # this entref is the one that we made to put the subtree
            # in; all of our given input is parsed in here.
            old_document = self.document
            old_cur_node = self.curNode
            parser = self._parser.ExternalEntityParserCreate(context)
            # put the real document back, parse into the fragment to return
            self.document = self.originalDocument
            self.fragment = self.document.createDocumentFragment()
            self.curNode = self.fragment
            try:
                parser.Parse(self._source, True)
            finally:
                self.curNode = old_cur_node
                self.document = old_document
                self._source = None
            return -1
        else:
            return ExpatBuilder.external_entity_ref_handler(
                self, context, base, systemId, publicId)
709
710
class Namespaces:
    """Mix-in class for builders; adds support for namespaces."""

    def _initNamespaces(self):
        # Pending (prefix, uri) namespace declarations.  They are turned
        # into xmlns attributes on the next element that starts.
        self._ns_ordered_prefixes = []

    def createParser(self):
        """Create a new namespace-handling parser."""
        ns_parser = expat.ParserCreate(namespace_separator=" ")
        ns_parser.namespace_prefixes = True
        return ns_parser

    def install(self, parser):
        """Insert the namespace-handlers onto the parser."""
        ExpatBuilder.install(self, parser)
        if not self._options.namespace_declarations:
            return
        parser.StartNamespaceDeclHandler = self.start_namespace_decl_handler

    def start_namespace_decl_handler(self, prefix, uri):
        """Push this namespace declaration on our storage."""
        self._ns_ordered_prefixes.append((prefix, uri))

    def start_element_handler(self, name, attributes):
        """Create a namespace-aware Element and make it the current node.

        *name* and the attribute names arrive from Expat either plain or
        as space-separated "uri localname [prefix]" strings.
        """
        if ' ' in name:
            uri, localname, prefix, qname = _parse_ns_name(self, name)
        else:
            uri, qname = EMPTY_NAMESPACE, name
            localname, prefix = None, EMPTY_PREFIX
        node = minidom.Element(qname, uri, prefix, localname)
        node.ownerDocument = self.document
        _append_child(self.curNode, node)
        self.curNode = node

        pending = self._ns_ordered_prefixes
        if pending:
            for decl_prefix, decl_uri in pending:
                if decl_prefix:
                    a = minidom.Attr(_intern(self, 'xmlns:' + decl_prefix),
                                     XMLNS_NAMESPACE, decl_prefix, "xmlns")
                else:
                    a = minidom.Attr("xmlns", XMLNS_NAMESPACE,
                                     "xmlns", EMPTY_PREFIX)
                a.value = decl_uri
                a.ownerDocument = self.document
                _set_attribute_node(node, a)
            # Emptied in place so the same list object keeps being used.
            del pending[:]

        if attributes:
            node._ensure_attributes()
            by_qname = node._attrs
            by_ns = node._attrsNS
            # attributes is a flat [name, value, name, value, ...] list.
            for aname, value in zip(attributes[::2], attributes[1::2]):
                if ' ' in aname:
                    a_uri, a_local, a_prefix, a_qname = _parse_ns_name(
                        self, aname)
                    a = minidom.Attr(a_qname, a_uri, a_local, a_prefix)
                    by_qname[a_qname] = a
                    by_ns[(a_uri, a_local)] = a
                else:
                    a = minidom.Attr(aname, EMPTY_NAMESPACE,
                                     aname, EMPTY_PREFIX)
                    by_qname[aname] = a
                    by_ns[(EMPTY_NAMESPACE, aname)] = a
                a.ownerDocument = self.document
                a.value = value
                a.ownerElement = node

    if __debug__:
        # This only adds some asserts to the original
        # end_element_handler(), so we only define this when -O is not
        # used.  If changing one, be sure to check the other to see if
        # it needs to be changed as well.
        #
        def end_element_handler(self, name):
            curNode = self.curNode
            if ' ' in name:
                uri, localname, prefix, qname = _parse_ns_name(self, name)
                assert (curNode.namespaceURI == uri
                        and curNode.localName == localname
                        and curNode.prefix == prefix), \
                    "element stack messed up! (namespace)"
            else:
                assert curNode.nodeName == name, \
                    "element stack messed up - bad nodeName"
                assert curNode.namespaceURI == EMPTY_NAMESPACE, \
                    "element stack messed up - bad namespaceURI"
            self.curNode = curNode.parentNode
            self._finish_end_element(curNode)
794 and curNode.localName == localname
795 and curNode.prefix == prefix), \
796 "element stack messed up! (namespace)"
797 else:
798 assert curNode.nodeName == name, \
799 "element stack messed up - bad nodeName"
800 assert curNode.namespaceURI == EMPTY_NAMESPACE, \
801 "element stack messed up - bad namespaceURI"
802 self.curNode = curNode.parentNode
803 self._finish_end_element(curNode)
804
805
class ExpatBuilderNS(Namespaces, ExpatBuilder):
    """Namespace-aware variant of the document builder."""

    def reset(self):
        """Reset the builder state and clear the pending namespace
        declarations."""
        ExpatBuilder.reset(self)
        self._initNamespaces()
812
813
class FragmentBuilderNS(Namespaces, FragmentBuilder):
    """Fragment builder that supports namespaces."""

    def reset(self):
        FragmentBuilder.reset(self)
        self._initNamespaces()

    def _getNSattrs(self):
        """Return string of namespace attributes from this element and
        ancestors."""
        # XXX This needs to be re-written to walk the ancestors of the
        # context to build up the namespace information from
        # declarations, elements, and attributes found in context.
        # Otherwise we have to store a bunch more data on the DOM
        # (though that *might* be more reliable -- not clear).
        attrs = ""
        seen = []
        node = self.context
        while node:
            if hasattr(node, '_ns_prefix_uri'):
                for prefix, uri in node._ns_prefix_uri.items():
                    # The innermost declaration of a prefix wins; outer
                    # redeclarations of the same prefix are ignored.
                    if prefix in seen:
                        continue
                    seen.append(prefix)
                    declname = ("xmlns:" + prefix) if prefix else "xmlns"
                    if attrs:
                        attrs = "%s\n %s='%s'" % (attrs, declname, uri)
                    else:
                        attrs = " %s='%s'" % (declname, uri)
            node = node.parentNode
        return attrs
839 if prefix:
840 declname = "xmlns:" + prefix
841 else:
842 declname = "xmlns"
843 if attrs:
844 attrs = "%s\n %s='%s'" % (attrs, declname, uri)
845 else:
846 attrs = " %s='%s'" % (declname, uri)
847 context = context.parentNode
848 return attrs
849
850
class ParseEscape(Exception):
    """Raised to abandon a parse early.

    InternalSubsetExtractor raises it once it has what it needs, and
    FilterVisibilityController raises it when a filter interrupts."""
854
class InternalSubsetExtractor(ExpatBuilder):
    """XML processor which can rip out the internal document type subset."""

    # Stays None unless a doctype with an internal subset is seen.
    subset = None

    def getSubset(self):
        """Return the internal subset as a string."""
        return self.subset

    def parseFile(self, file):
        """Scan *file* for the subset; the parsed document is discarded."""
        try:
            ExpatBuilder.parseFile(self, file)
        except ParseEscape:
            pass

    def parseString(self, string):
        """Scan *string* for the subset; the parsed document is discarded."""
        try:
            ExpatBuilder.parseString(self, string)
        except ParseEscape:
            pass

    def install(self, parser):
        # Only the doctype and the first element matter here, so none of
        # the usual DOM-building handlers are installed.
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.start_element_handler

    def start_doctype_decl_handler(self, name, publicId, systemId,
                                   has_internal_subset):
        if not has_internal_subset:
            raise ParseEscape()
        # Collect the raw text Expat passes to the default handler until
        # the doctype declaration ends.
        parser = self.getParser()
        self.subset = []
        parser.DefaultHandler = self.subset.append
        parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        raw = ''.join(self.subset)
        # Normalise line endings to "\n".
        normalized = raw.replace('\r\n', '\n').replace('\r', '\n')
        self.subset = normalized
        raise ParseEscape()

    def start_element_handler(self, name, attrs):
        # Reaching the document element means there is nothing left to find.
        raise ParseEscape()
897
898
def parse(file, namespaces=True):
    """Parse a document, returning the resulting Document node.

    'file' may be either a file name or an open file object.  A file
    name is opened in binary mode and closed again after parsing.
    """
    builder_class = ExpatBuilderNS if namespaces else ExpatBuilder
    builder = builder_class()

    if not isinstance(file, str):
        return builder.parseFile(file)
    with open(file, 'rb') as fp:
        return builder.parseFile(fp)
915
916
def parseString(string, namespaces=True):
    """Parse a document from a string, returning the resulting
    Document node.

    A namespace-aware builder is used unless 'namespaces' is false.
    """
    builder_class = ExpatBuilderNS if namespaces else ExpatBuilder
    return builder_class().parseString(string)
926
927
def parseFragment(file, context, namespaces=True):
    """Parse a fragment of a document, given the context from which it
    was originally extracted.  context should be the parent of the
    node(s) which are in the fragment.

    'file' may be either a file name or an open file object.  A file
    name is opened in binary mode and closed again after parsing.
    """
    builder_class = FragmentBuilderNS if namespaces else FragmentBuilder
    builder = builder_class(context)

    if not isinstance(file, str):
        return builder.parseFile(file)
    with open(file, 'rb') as fp:
        return builder.parseFile(fp)
946
947
def parseFragmentString(string, context, namespaces=True):
    """Parse a fragment of a document from a string, given the context
    from which it was originally extracted.  context should be the
    parent of the node(s) which are in the fragment.

    A namespace-aware builder is used unless 'namespaces' is false.
    """
    builder_class = FragmentBuilderNS if namespaces else FragmentBuilder
    return builder_class(context).parseString(string)
958
959
def makeBuilder(options):
    """Create a builder based on an Options object.

    The namespace-aware builder is chosen when options.namespaces is true.
    """
    builder_class = ExpatBuilderNS if options.namespaces else ExpatBuilder
    return builder_class(options)