"""Facility to use the Expat parser to load a minidom instance
from a string or file.

This avoids all the overhead of SAX and pulldom to gain performance.
"""

# Warning!
#
# This module is tightly bound to the implementation details of the
# minidom DOM and can't be used with other DOM implementations.  This
# is due, in part, to a lack of appropriate methods in the DOM (there is
# no way to create Entity and Notation nodes via the DOM Level 2
# interface), and for performance.  The latter is the cause of some fairly
# cryptic code.
#
# Performance hacks:
#
#   -  .character_data_handler() has an extra case in which continuing
#      data is appended to an existing Text node; this can be a
#      speedup since pyexpat can break up character data into multiple
#      callbacks even though we set the buffer_text attribute on the
#      parser.  This also gives us the advantage that we don't need a
#      separate normalization pass.
#
#   -  Determining that a node exists is done using an identity comparison
#      with None rather than a truth test; this avoids searching for and
#      calling any methods on the node object if it exists.  (A rather
#      nice speedup is achieved this way as well!)
from xml.dom import xmlbuilder, minidom, Node
from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
from xml.parsers import expat
from xml.dom.minidom import _append_child, _set_attribute_node
from xml.dom.NodeFilter import NodeFilter

TEXT_NODE = Node.TEXT_NODE
CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
DOCUMENT_NODE = Node.DOCUMENT_NODE

FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT

theDOMImplementation = minidom.getDOMImplementation()

# Expat typename -> TypeInfo
_typeinfo_map = {
    "CDATA":    minidom.TypeInfo(None, "cdata"),
    "ENUM":     minidom.TypeInfo(None, "enumeration"),
    "ENTITY":   minidom.TypeInfo(None, "entity"),
    "ENTITIES": minidom.TypeInfo(None, "entities"),
    "ID":       minidom.TypeInfo(None, "id"),
    "IDREF":    minidom.TypeInfo(None, "idref"),
    "IDREFS":   minidom.TypeInfo(None, "idrefs"),
    "NMTOKEN":  minidom.TypeInfo(None, "nmtoken"),
    "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
    }


class ElementInfo(object):
    """Per-element-type DTD information collected while parsing.

    Instances are created by ExpatBuilder.element_decl_handler() and
    ExpatBuilder.attlist_decl_handler() and stored in the builder's
    ``_elem_info`` mapping keyed by tag name.
    """

    __slots__ = '_attr_info', '_model', 'tagName'

    def __init__(self, tagName, model=None):
        self.tagName = tagName
        # One list per declared attribute; see attlist_decl_handler() for
        # the layout (index 1 is the name, index -2 the declared type).
        self._attr_info = []
        # Content model tuple as delivered by Expat, or None if unknown.
        self._model = model

    def __getstate__(self):
        return self._attr_info, self._model, self.tagName

    def __setstate__(self, state):
        self._attr_info, self._model, self.tagName = state

    def getAttributeType(self, aname):
        """Return the TypeInfo for attribute *aname*.

        Falls back to minidom._no_type when the attribute was never
        declared for this element type.
        """
        for info in self._attr_info:
            if info[1] == aname:
                declared = info[-2]
                # Enumerated types are reported by Expat as "(a|b|c)".
                if declared[0] == "(":
                    return _typeinfo_map["ENUM"]
                return _typeinfo_map[declared]
        return minidom._no_type

    def getAttributeTypeNS(self, namespaceURI, localName):
        """Namespace-aware lookup; no type information is tracked here."""
        return minidom._no_type

    def isElementContent(self):
        """Return True if the content model is neither ANY nor MIXED."""
        if self._model:
            content_type = self._model[0]
            return content_type not in (expat.model.XML_CTYPE_ANY,
                                        expat.model.XML_CTYPE_MIXED)
        return False

    def isEmpty(self):
        """Return True if the element type was declared EMPTY."""
        if self._model:
            return self._model[0] == expat.model.XML_CTYPE_EMPTY
        return False

    def isId(self, aname):
        """Return True if attribute *aname* was declared with type ID."""
        for info in self._attr_info:
            if info[1] == aname:
                return info[-2] == "ID"
        return False

    def isIdNS(self, euri, ename, auri, aname):
        # not sure this is meaningful
        return self.isId((auri, aname))


def _intern(builder, s):
    """Return the canonical copy of *s* from the builder's intern table."""
    return builder._intern_setdefault(s, s)


def _parse_ns_name(builder, name):
    """Split an Expat namespace-qualified name into its components.

    *name* is "uri localname" or "uri localname prefix" (space
    separated).  Returns ``(uri, localname, prefix, qname)`` with every
    string passed through the builder's intern table.

    Raises ValueError when *name* has more than three parts, which
    happens if the namespace URI itself contains spaces.
    """
    assert ' ' in name
    parts = name.split(' ')
    intern_str = builder._intern_setdefault
    if len(parts) == 3:
        uri, localname, prefix = parts
        prefix = intern_str(prefix, prefix)
        qname = "%s:%s" % (prefix, localname)
        qname = intern_str(qname, qname)
        localname = intern_str(localname, localname)
    elif len(parts) == 2:
        uri, localname = parts
        prefix = EMPTY_PREFIX
        qname = localname = intern_str(localname, localname)
    else:
        raise ValueError("Unsupported syntax: spaces in URIs not supported: %r" % name)
    return intern_str(uri, uri), localname, prefix, qname


class ExpatBuilder:
    """Document builder that uses Expat to build a minidom Document
    instance."""

    def __init__(self, options=None):
        if options is None:
            options = xmlbuilder.Options()
        self._options = options
        if self._options.filter is not None:
            self._filter = FilterVisibilityController(self._options.filter)
        else:
            self._filter = None
            # This *really* doesn't do anything in this case, so
            # override it with something fast & minimal.
            self._finish_start_element = id
        self._parser = None
        self.reset()

    def createParser(self):
        """Create a new parser object."""
        return expat.ParserCreate()

    def getParser(self):
        """Return the parser object, creating a new one if needed."""
        if not self._parser:
            self._parser = self.createParser()
            self._intern_setdefault = self._parser.intern.setdefault
            self._parser.buffer_text = True
            self._parser.ordered_attributes = True
            self._parser.specified_attributes = True
            self.install(self._parser)
        return self._parser

    def reset(self):
        """Free all data structures used during DOM construction."""
        self.document = theDOMImplementation.createDocument(
            EMPTY_NAMESPACE, None, None)
        self.curNode = self.document
        self._elem_info = self.document._elem_info
        self._cdata = False

    def install(self, parser):
        """Install the callbacks needed to build the DOM into the parser."""
        # This creates circular references!
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.first_element_handler
        parser.EndElementHandler = self.end_element_handler
        parser.ProcessingInstructionHandler = self.pi_handler
        if self._options.entities:
            parser.EntityDeclHandler = self.entity_decl_handler
        parser.NotationDeclHandler = self.notation_decl_handler
        if self._options.comments:
            parser.CommentHandler = self.comment_handler
        if self._options.cdata_sections:
            parser.StartCdataSectionHandler = self.start_cdata_section_handler
            parser.EndCdataSectionHandler = self.end_cdata_section_handler
            parser.CharacterDataHandler = self.character_data_handler_cdata
        else:
            parser.CharacterDataHandler = self.character_data_handler
        parser.ExternalEntityRefHandler = self.external_entity_ref_handler
        parser.XmlDeclHandler = self.xml_decl_handler
        parser.ElementDeclHandler = self.element_decl_handler
        parser.AttlistDeclHandler = self.attlist_decl_handler

    def parseFile(self, file):
        """Parse a document from a file object, returning the document
        node.

        The builder is reset and its parser released afterwards, even
        when parsing fails, so a failed parse does not leave a
        half-built document or a dead parser behind.
        """
        parser = self.getParser()
        first_buffer = True
        try:
            while 1:
                buffer = file.read(16*1024)
                if not buffer:
                    break
                parser.Parse(buffer, 0)
                # The internal subset can only be recovered from the
                # first buffer, and only once the root element was seen.
                if first_buffer and self.document.documentElement:
                    self._setup_subset(buffer)
                first_buffer = False
            parser.Parse("", True)
        except ParseEscape:
            # A filter asked for parsing to stop; return what we have.
            pass
        finally:
            doc = self.document
            self.reset()
            self._parser = None
        return doc

    def parseString(self, string):
        """Parse a document from a string, returning the document node.

        The builder is reset and its parser released afterwards, even
        when parsing fails.
        """
        parser = self.getParser()
        try:
            parser.Parse(string, True)
            self._setup_subset(string)
        except ParseEscape:
            # A filter asked for parsing to stop; return what we have.
            pass
        finally:
            doc = self.document
            self.reset()
            self._parser = None
        return doc

    def _setup_subset(self, buffer):
        """Load the internal subset if there might be one."""
        if self.document.doctype:
            extractor = InternalSubsetExtractor()
            extractor.parseString(buffer)
            subset = extractor.getSubset()
            self.document.doctype.internalSubset = subset

    def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
                                   has_internal_subset):
        """Create the DocumentType node and prepare for its subset."""
        doctype = self.document.implementation.createDocumentType(
            doctypeName, publicId, systemId)
        doctype.ownerDocument = self.document
        _append_child(self.document, doctype)
        self.document.doctype = doctype
        if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
            # Undo the append and stop collecting declarations that
            # would have nowhere to go.
            self.document.doctype = None
            del self.document.childNodes[-1]
            doctype = None
            self._parser.EntityDeclHandler = None
            self._parser.NotationDeclHandler = None
        if has_internal_subset:
            if doctype is not None:
                doctype.entities._seq = []
                doctype.notations._seq = []
            # Comments and PIs inside the subset are not part of the DOM;
            # end_doctype_decl_handler() re-enables these handlers.
            self._parser.CommentHandler = None
            self._parser.ProcessingInstructionHandler = None
            self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Restore the handlers disabled while inside the doctype."""
        if self._options.comments:
            self._parser.CommentHandler = self.comment_handler
        self._parser.ProcessingInstructionHandler = self.pi_handler
        if not (self._elem_info or self._filter):
            # Nothing to do at element end; use a cheap no-op.
            self._finish_end_element = id

    def pi_handler(self, target, data):
        """Append a processing instruction unless the filter rejects it."""
        node = self.document.createProcessingInstruction(target, data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def character_data_handler_cdata(self, data):
        """Character data callback used when CDATA sections are kept."""
        childNodes = self.curNode.childNodes
        if self._cdata:
            if (self._cdata_continue
                    and childNodes[-1].nodeType == CDATA_SECTION_NODE):
                # Continuation of the CDATA section already started.
                childNodes[-1].appendData(data)
                return
            node = self.document.createCDATASection(data)
            self._cdata_continue = True
        elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
            # Merge with the preceding Text node instead of adding one.
            node = childNodes[-1]
            value = node.data + data
            node.data = value
            return
        else:
            node = minidom.Text()
            node.data = data
            node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def character_data_handler(self, data):
        """Character data callback used when CDATA sections are merged."""
        childNodes = self.curNode.childNodes
        if childNodes and childNodes[-1].nodeType == TEXT_NODE:
            # Merge with the preceding Text node instead of adding one.
            node = childNodes[-1]
            node.data = node.data + data
            return
        node = minidom.Text()
        node.data = node.data + data
        node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def entity_decl_handler(self, entityName, is_parameter_entity, value,
                            base, systemId, publicId, notationName):
        """Record a general entity declaration on the doctype."""
        if is_parameter_entity:
            # we don't care about parameter entities for the DOM
            return
        if not self._options.entities:
            return
        node = self.document._create_entity(entityName, publicId,
                                            systemId, notationName)
        if value is not None:
            # internal entity
            # node *should* be readonly, but we'll cheat
            child = self.document.createTextNode(value)
            node.childNodes.append(child)
        self.document.doctype.entities._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.entities._seq[-1]

    def notation_decl_handler(self, notationName, base, systemId, publicId):
        """Record a notation declaration on the doctype.

        The notation is dropped again only when the filter rejects it,
        matching how entities, comments and processing instructions are
        handled.
        """
        node = self.document._create_notation(notationName, publicId, systemId)
        self.document.doctype.notations._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.notations._seq[-1]

    def comment_handler(self, data):
        """Append a comment node unless the filter rejects it."""
        node = self.document.createComment(data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def start_cdata_section_handler(self):
        self._cdata = True
        self._cdata_continue = False

    def end_cdata_section_handler(self):
        self._cdata = False
        self._cdata_continue = False

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Ignore external entity references; 1 tells Expat to continue."""
        return 1

    def first_element_handler(self, name, attributes):
        """Handle the root element, then switch to the regular handler."""
        if self._filter is None and not self._elem_info:
            # Nothing to do at element end; use a cheap no-op.
            self._finish_end_element = id
        self.getParser().StartElementHandler = self.start_element_handler
        self.start_element_handler(name, attributes)

    def start_element_handler(self, name, attributes):
        """Create an element, attach its attributes and descend into it.

        *attributes* is the flat ``[name, value, name, value, ...]`` list
        produced because ``ordered_attributes`` is set on the parser.
        """
        node = self.document.createElement(name)
        _append_child(self.curNode, node)
        self.curNode = node

        if attributes:
            for i in range(0, len(attributes), 2):
                a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
                                 None, EMPTY_PREFIX)
                value = attributes[i+1]
                a.value = value
                a.ownerDocument = self.document
                _set_attribute_node(node, a)

        if node is not self.document.documentElement:
            self._finish_start_element(node)

    def _finish_start_element(self, node):
        """Consult the filter about a newly opened element."""
        if self._filter:
            # To be general, we'd have to call isSameNode(), but this
            # is sufficient for minidom:
            if node is self.document.documentElement:
                return
            filt = self._filter.startContainer(node)
            if filt == FILTER_REJECT:
                # ignore this node & all descendents
                Rejecter(self)
            elif filt == FILTER_SKIP:
                # ignore this node, but make its children become
                # children of the parent node
                Skipper(self)
            else:
                return
            self.curNode = node.parentNode
            node.parentNode.removeChild(node)
            node.unlink()

    # If this ever changes, Namespaces.end_element_handler() needs to
    # be changed to match.
    #
    def end_element_handler(self, name):
        curNode = self.curNode
        self.curNode = curNode.parentNode
        self._finish_end_element(curNode)

    def _finish_end_element(self, curNode):
        """Strip ignorable whitespace and apply the filter at element end."""
        info = self._elem_info.get(curNode.tagName)
        if info:
            self._handle_white_text_nodes(curNode, info)
        if self._filter:
            if curNode is self.document.documentElement:
                return
            if self._filter.acceptNode(curNode) == FILTER_REJECT:
                self.curNode.removeChild(curNode)
                curNode.unlink()

    def _handle_white_text_nodes(self, node, info):
        """Remove whitespace-only Text children of element-content nodes."""
        if (self._options.whitespace_in_element_content
                or not info.isElementContent()):
            return

        # We have element type information and should remove ignorable
        # whitespace; collect first, since removing while iterating
        # childNodes would skip entries.
        ignorable = [child for child in node.childNodes
                     if child.nodeType == TEXT_NODE
                     and not child.data.strip()]

        # Remove ignorable whitespace from the tree.
        for child in ignorable:
            node.removeChild(child)

    def element_decl_handler(self, name, model):
        """Record the content model declared for element type *name*."""
        info = self._elem_info.get(name)
        if info is None:
            self._elem_info[name] = ElementInfo(name, model)
        else:
            # Created earlier by an ATTLIST declaration without a model.
            assert info._model is None
            info._model = model

    def attlist_decl_handler(self, elem, name, type, default, required):
        """Record an attribute declaration for element type *elem*."""
        info = self._elem_info.get(elem)
        if info is None:
            info = ElementInfo(elem)
            self._elem_info[elem] = info
        info._attr_info.append(
            [None, name, None, None, default, 0, type, required])

    def xml_decl_handler(self, version, encoding, standalone):
        """Copy the XML declaration onto the document."""
        self.document.version = version
        self.document.encoding = encoding
        # This is still a little ugly, thanks to the pyexpat API. ;-(
        # A negative value means "standalone" was absent from the
        # declaration, in which case the document default is kept.
        if standalone >= 0:
            self.document.standalone = bool(standalone)


# Don't include FILTER_INTERRUPT, since that's checked separately
# where allowed.
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)


class FilterVisibilityController(object):
    """Wrapper around a DOMBuilderFilter which implements the checks
    to make the whatToShow filter attribute work."""

    __slots__ = 'filter',

    def __init__(self, filter):
        self.filter = filter

    def startContainer(self, node):
        """Ask the filter whether to descend into *node*.

        Nodes whose type is masked out by ``whatToShow`` are accepted
        without consulting the filter.  Raises ParseEscape when the
        filter returns FILTER_INTERRUPT and ValueError for any value
        outside _ALLOWED_FILTER_RETURNS.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.startContainer(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val not in _ALLOWED_FILTER_RETURNS:
                raise ValueError(
                    "startContainer() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    def acceptNode(self, node):
        """Ask the filter whether to keep the completed *node*.

        Nodes whose type is masked out by ``whatToShow`` are accepted
        without consulting the filter.  FILTER_SKIP is handled here by
        re-parenting the children and reported to the caller as
        FILTER_REJECT.  Raises ParseEscape when the filter returns
        FILTER_INTERRUPT and ValueError for any other illegal value.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.acceptNode(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val == FILTER_SKIP:
                # move all child nodes to the parent, and remove this node
                parent = node.parentNode
                # Iterate over a copy: appendChild() mutates childNodes.
                for child in node.childNodes[:]:
                    parent.appendChild(child)
                # node is handled by the caller
                return FILTER_REJECT
            if val not in _ALLOWED_FILTER_RETURNS:
                raise ValueError(
                    "acceptNode() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    # Node type -> whatToShow bit.
    _nodetype_mask = {
        Node.ELEMENT_NODE:                NodeFilter.SHOW_ELEMENT,
        Node.ATTRIBUTE_NODE:              NodeFilter.SHOW_ATTRIBUTE,
        Node.TEXT_NODE:                   NodeFilter.SHOW_TEXT,
        Node.CDATA_SECTION_NODE:          NodeFilter.SHOW_CDATA_SECTION,
        Node.ENTITY_REFERENCE_NODE:       NodeFilter.SHOW_ENTITY_REFERENCE,
        Node.ENTITY_NODE:                 NodeFilter.SHOW_ENTITY,
        Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
        Node.COMMENT_NODE:                NodeFilter.SHOW_COMMENT,
        Node.DOCUMENT_NODE:               NodeFilter.SHOW_DOCUMENT,
        Node.DOCUMENT_TYPE_NODE:          NodeFilter.SHOW_DOCUMENT_TYPE,
        Node.DOCUMENT_FRAGMENT_NODE:      NodeFilter.SHOW_DOCUMENT_FRAGMENT,
        Node.NOTATION_NODE:               NodeFilter.SHOW_NOTATION,
        }


class FilterCrutch(object):
    """Base for objects that temporarily take over element callbacks.

    On construction the parser's current start/end element handlers are
    saved and replaced by this object's ``start_element_handler`` and
    ``end_element_handler``, which subclasses must provide.
    """

    __slots__ = '_builder', '_level', '_old_start', '_old_end'

    def __init__(self, builder):
        # Nesting depth below the element that triggered the crutch.
        self._level = 0
        self._builder = builder
        parser = builder._parser
        self._old_start = parser.StartElementHandler
        self._old_end = parser.EndElementHandler
        parser.StartElementHandler = self.start_element_handler
        parser.EndElementHandler = self.end_element_handler


class Rejecter(FilterCrutch):
    """Discard an element and everything inside it."""

    __slots__ = ()

    def __init__(self, builder):
        FilterCrutch.__init__(self, builder)
        parser = builder._parser
        # Silence every content callback while inside the rejected
        # subtree; end_element_handler() re-installs them.
        for name in ("ProcessingInstructionHandler",
                     "CommentHandler",
                     "CharacterDataHandler",
                     "StartCdataSectionHandler",
                     "EndCdataSectionHandler",
                     "ExternalEntityRefHandler",
                     ):
            setattr(parser, name, None)

    def start_element_handler(self, *args):
        self._level = self._level + 1

    def end_element_handler(self, *args):
        if self._level == 0:
            # restore the old handlers
            parser = self._builder._parser
            self._builder.install(parser)
            # install() resets the element handlers to the builder's
            # defaults, so put back whatever was active before us.
            parser.StartElementHandler = self._old_start
            parser.EndElementHandler = self._old_end
        else:
            self._level = self._level - 1
jpayne@68: jpayne@68: class Skipper(FilterCrutch): jpayne@68: __slots__ = () jpayne@68: jpayne@68: def start_element_handler(self, *args): jpayne@68: node = self._builder.curNode jpayne@68: self._old_start(*args) jpayne@68: if self._builder.curNode is not node: jpayne@68: self._level = self._level + 1 jpayne@68: jpayne@68: def end_element_handler(self, *args): jpayne@68: if self._level == 0: jpayne@68: # We're popping back out of the node we're skipping, so we jpayne@68: # shouldn't need to do anything but reset the handlers. jpayne@68: self._builder._parser.StartElementHandler = self._old_start jpayne@68: self._builder._parser.EndElementHandler = self._old_end jpayne@68: self._builder = None jpayne@68: else: jpayne@68: self._level = self._level - 1 jpayne@68: self._old_end(*args) jpayne@68: jpayne@68: jpayne@68: # framework document used by the fragment builder. jpayne@68: # Takes a string for the doctype, subset string, and namespace attrs string. jpayne@68: jpayne@68: _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \ jpayne@68: "http://xml.python.org/entities/fragment-builder/internal" jpayne@68: jpayne@68: _FRAGMENT_BUILDER_TEMPLATE = ( jpayne@68: '''\ jpayne@68: jpayne@68: %%s jpayne@68: ]> jpayne@68: &fragment-builder-internal;''' jpayne@68: % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID) jpayne@68: jpayne@68: jpayne@68: class FragmentBuilder(ExpatBuilder): jpayne@68: """Builder which constructs document fragments given XML source jpayne@68: text and a context node. jpayne@68: jpayne@68: The context node is expected to provide information about the jpayne@68: namespace declarations which are in scope at the start of the jpayne@68: fragment. 
jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, context, options=None): jpayne@68: if context.nodeType == DOCUMENT_NODE: jpayne@68: self.originalDocument = context jpayne@68: self.context = context jpayne@68: else: jpayne@68: self.originalDocument = context.ownerDocument jpayne@68: self.context = context jpayne@68: ExpatBuilder.__init__(self, options) jpayne@68: jpayne@68: def reset(self): jpayne@68: ExpatBuilder.reset(self) jpayne@68: self.fragment = None jpayne@68: jpayne@68: def parseFile(self, file): jpayne@68: """Parse a document fragment from a file object, returning the jpayne@68: fragment node.""" jpayne@68: return self.parseString(file.read()) jpayne@68: jpayne@68: def parseString(self, string): jpayne@68: """Parse a document fragment from a string, returning the jpayne@68: fragment node.""" jpayne@68: self._source = string jpayne@68: parser = self.getParser() jpayne@68: doctype = self.originalDocument.doctype jpayne@68: ident = "" jpayne@68: if doctype: jpayne@68: subset = doctype.internalSubset or self._getDeclarations() jpayne@68: if doctype.publicId: jpayne@68: ident = ('PUBLIC "%s" "%s"' jpayne@68: % (doctype.publicId, doctype.systemId)) jpayne@68: elif doctype.systemId: jpayne@68: ident = 'SYSTEM "%s"' % doctype.systemId jpayne@68: else: jpayne@68: subset = "" jpayne@68: nsattrs = self._getNSattrs() # get ns decls from node's ancestors jpayne@68: document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs) jpayne@68: try: jpayne@68: parser.Parse(document, 1) jpayne@68: except: jpayne@68: self.reset() jpayne@68: raise jpayne@68: fragment = self.fragment jpayne@68: self.reset() jpayne@68: ## self._parser = None jpayne@68: return fragment jpayne@68: jpayne@68: def _getDeclarations(self): jpayne@68: """Re-create the internal subset from the DocumentType node. jpayne@68: jpayne@68: This is only needed if we don't already have the jpayne@68: internalSubset as a string. 
jpayne@68: """ jpayne@68: doctype = self.context.ownerDocument.doctype jpayne@68: s = "" jpayne@68: if doctype: jpayne@68: for i in range(doctype.notations.length): jpayne@68: notation = doctype.notations.item(i) jpayne@68: if s: jpayne@68: s = s + "\n " jpayne@68: s = "%s' \ jpayne@68: % (s, notation.publicId, notation.systemId) jpayne@68: else: jpayne@68: s = '%s SYSTEM "%s">' % (s, notation.systemId) jpayne@68: for i in range(doctype.entities.length): jpayne@68: entity = doctype.entities.item(i) jpayne@68: if s: jpayne@68: s = s + "\n " jpayne@68: s = "%s" jpayne@68: return s jpayne@68: jpayne@68: def _getNSattrs(self): jpayne@68: return "" jpayne@68: jpayne@68: def external_entity_ref_handler(self, context, base, systemId, publicId): jpayne@68: if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID: jpayne@68: # this entref is the one that we made to put the subtree jpayne@68: # in; all of our given input is parsed in here. jpayne@68: old_document = self.document jpayne@68: old_cur_node = self.curNode jpayne@68: parser = self._parser.ExternalEntityParserCreate(context) jpayne@68: # put the real document back, parse into the fragment to return jpayne@68: self.document = self.originalDocument jpayne@68: self.fragment = self.document.createDocumentFragment() jpayne@68: self.curNode = self.fragment jpayne@68: try: jpayne@68: parser.Parse(self._source, 1) jpayne@68: finally: jpayne@68: self.curNode = old_cur_node jpayne@68: self.document = old_document jpayne@68: self._source = None jpayne@68: return -1 jpayne@68: else: jpayne@68: return ExpatBuilder.external_entity_ref_handler( jpayne@68: self, context, base, systemId, publicId) jpayne@68: jpayne@68: jpayne@68: class Namespaces: jpayne@68: """Mix-in class for builders; adds support for namespaces.""" jpayne@68: jpayne@68: def _initNamespaces(self): jpayne@68: # list of (prefix, uri) ns declarations. Namespace attrs are jpayne@68: # constructed from this and added to the element's attrs. 
jpayne@68: self._ns_ordered_prefixes = [] jpayne@68: jpayne@68: def createParser(self): jpayne@68: """Create a new namespace-handling parser.""" jpayne@68: parser = expat.ParserCreate(namespace_separator=" ") jpayne@68: parser.namespace_prefixes = True jpayne@68: return parser jpayne@68: jpayne@68: def install(self, parser): jpayne@68: """Insert the namespace-handlers onto the parser.""" jpayne@68: ExpatBuilder.install(self, parser) jpayne@68: if self._options.namespace_declarations: jpayne@68: parser.StartNamespaceDeclHandler = ( jpayne@68: self.start_namespace_decl_handler) jpayne@68: jpayne@68: def start_namespace_decl_handler(self, prefix, uri): jpayne@68: """Push this namespace declaration on our storage.""" jpayne@68: self._ns_ordered_prefixes.append((prefix, uri)) jpayne@68: jpayne@68: def start_element_handler(self, name, attributes): jpayne@68: if ' ' in name: jpayne@68: uri, localname, prefix, qname = _parse_ns_name(self, name) jpayne@68: else: jpayne@68: uri = EMPTY_NAMESPACE jpayne@68: qname = name jpayne@68: localname = None jpayne@68: prefix = EMPTY_PREFIX jpayne@68: node = minidom.Element(qname, uri, prefix, localname) jpayne@68: node.ownerDocument = self.document jpayne@68: _append_child(self.curNode, node) jpayne@68: self.curNode = node jpayne@68: jpayne@68: if self._ns_ordered_prefixes: jpayne@68: for prefix, uri in self._ns_ordered_prefixes: jpayne@68: if prefix: jpayne@68: a = minidom.Attr(_intern(self, 'xmlns:' + prefix), jpayne@68: XMLNS_NAMESPACE, prefix, "xmlns") jpayne@68: else: jpayne@68: a = minidom.Attr("xmlns", XMLNS_NAMESPACE, jpayne@68: "xmlns", EMPTY_PREFIX) jpayne@68: a.value = uri jpayne@68: a.ownerDocument = self.document jpayne@68: _set_attribute_node(node, a) jpayne@68: del self._ns_ordered_prefixes[:] jpayne@68: jpayne@68: if attributes: jpayne@68: node._ensure_attributes() jpayne@68: _attrs = node._attrs jpayne@68: _attrsNS = node._attrsNS jpayne@68: for i in range(0, len(attributes), 2): jpayne@68: aname = attributes[i] 
jpayne@68: value = attributes[i+1] jpayne@68: if ' ' in aname: jpayne@68: uri, localname, prefix, qname = _parse_ns_name(self, aname) jpayne@68: a = minidom.Attr(qname, uri, localname, prefix) jpayne@68: _attrs[qname] = a jpayne@68: _attrsNS[(uri, localname)] = a jpayne@68: else: jpayne@68: a = minidom.Attr(aname, EMPTY_NAMESPACE, jpayne@68: aname, EMPTY_PREFIX) jpayne@68: _attrs[aname] = a jpayne@68: _attrsNS[(EMPTY_NAMESPACE, aname)] = a jpayne@68: a.ownerDocument = self.document jpayne@68: a.value = value jpayne@68: a.ownerElement = node jpayne@68: jpayne@68: if __debug__: jpayne@68: # This only adds some asserts to the original jpayne@68: # end_element_handler(), so we only define this when -O is not jpayne@68: # used. If changing one, be sure to check the other to see if jpayne@68: # it needs to be changed as well. jpayne@68: # jpayne@68: def end_element_handler(self, name): jpayne@68: curNode = self.curNode jpayne@68: if ' ' in name: jpayne@68: uri, localname, prefix, qname = _parse_ns_name(self, name) jpayne@68: assert (curNode.namespaceURI == uri jpayne@68: and curNode.localName == localname jpayne@68: and curNode.prefix == prefix), \ jpayne@68: "element stack messed up! 
(namespace)" jpayne@68: else: jpayne@68: assert curNode.nodeName == name, \ jpayne@68: "element stack messed up - bad nodeName" jpayne@68: assert curNode.namespaceURI == EMPTY_NAMESPACE, \ jpayne@68: "element stack messed up - bad namespaceURI" jpayne@68: self.curNode = curNode.parentNode jpayne@68: self._finish_end_element(curNode) jpayne@68: jpayne@68: jpayne@68: class ExpatBuilderNS(Namespaces, ExpatBuilder): jpayne@68: """Document builder that supports namespaces.""" jpayne@68: jpayne@68: def reset(self): jpayne@68: ExpatBuilder.reset(self) jpayne@68: self._initNamespaces() jpayne@68: jpayne@68: jpayne@68: class FragmentBuilderNS(Namespaces, FragmentBuilder): jpayne@68: """Fragment builder that supports namespaces.""" jpayne@68: jpayne@68: def reset(self): jpayne@68: FragmentBuilder.reset(self) jpayne@68: self._initNamespaces() jpayne@68: jpayne@68: def _getNSattrs(self): jpayne@68: """Return string of namespace attributes from this element and jpayne@68: ancestors.""" jpayne@68: # XXX This needs to be re-written to walk the ancestors of the jpayne@68: # context to build up the namespace information from jpayne@68: # declarations, elements, and attributes found in context. jpayne@68: # Otherwise we have to store a bunch more data on the DOM jpayne@68: # (though that *might* be more reliable -- not clear). 
jpayne@68: attrs = "" jpayne@68: context = self.context jpayne@68: L = [] jpayne@68: while context: jpayne@68: if hasattr(context, '_ns_prefix_uri'): jpayne@68: for prefix, uri in context._ns_prefix_uri.items(): jpayne@68: # add every new NS decl from context to L and attrs string jpayne@68: if prefix in L: jpayne@68: continue jpayne@68: L.append(prefix) jpayne@68: if prefix: jpayne@68: declname = "xmlns:" + prefix jpayne@68: else: jpayne@68: declname = "xmlns" jpayne@68: if attrs: jpayne@68: attrs = "%s\n %s='%s'" % (attrs, declname, uri) jpayne@68: else: jpayne@68: attrs = " %s='%s'" % (declname, uri) jpayne@68: context = context.parentNode jpayne@68: return attrs jpayne@68: jpayne@68: jpayne@68: class ParseEscape(Exception): jpayne@68: """Exception raised to short-circuit parsing in InternalSubsetExtractor.""" jpayne@68: pass jpayne@68: jpayne@68: class InternalSubsetExtractor(ExpatBuilder): jpayne@68: """XML processor which can rip out the internal document type subset.""" jpayne@68: jpayne@68: subset = None jpayne@68: jpayne@68: def getSubset(self): jpayne@68: """Return the internal subset as a string.""" jpayne@68: return self.subset jpayne@68: jpayne@68: def parseFile(self, file): jpayne@68: try: jpayne@68: ExpatBuilder.parseFile(self, file) jpayne@68: except ParseEscape: jpayne@68: pass jpayne@68: jpayne@68: def parseString(self, string): jpayne@68: try: jpayne@68: ExpatBuilder.parseString(self, string) jpayne@68: except ParseEscape: jpayne@68: pass jpayne@68: jpayne@68: def install(self, parser): jpayne@68: parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler jpayne@68: parser.StartElementHandler = self.start_element_handler jpayne@68: jpayne@68: def start_doctype_decl_handler(self, name, publicId, systemId, jpayne@68: has_internal_subset): jpayne@68: if has_internal_subset: jpayne@68: parser = self.getParser() jpayne@68: self.subset = [] jpayne@68: parser.DefaultHandler = self.subset.append jpayne@68: parser.EndDoctypeDeclHandler = 
self.end_doctype_decl_handler jpayne@68: else: jpayne@68: raise ParseEscape() jpayne@68: jpayne@68: def end_doctype_decl_handler(self): jpayne@68: s = ''.join(self.subset).replace('\r\n', '\n').replace('\r', '\n') jpayne@68: self.subset = s jpayne@68: raise ParseEscape() jpayne@68: jpayne@68: def start_element_handler(self, name, attrs): jpayne@68: raise ParseEscape() jpayne@68: jpayne@68: jpayne@68: def parse(file, namespaces=True): jpayne@68: """Parse a document, returning the resulting Document node. jpayne@68: jpayne@68: 'file' may be either a file name or an open file object. jpayne@68: """ jpayne@68: if namespaces: jpayne@68: builder = ExpatBuilderNS() jpayne@68: else: jpayne@68: builder = ExpatBuilder() jpayne@68: jpayne@68: if isinstance(file, str): jpayne@68: with open(file, 'rb') as fp: jpayne@68: result = builder.parseFile(fp) jpayne@68: else: jpayne@68: result = builder.parseFile(file) jpayne@68: return result jpayne@68: jpayne@68: jpayne@68: def parseString(string, namespaces=True): jpayne@68: """Parse a document from a string, returning the resulting jpayne@68: Document node. jpayne@68: """ jpayne@68: if namespaces: jpayne@68: builder = ExpatBuilderNS() jpayne@68: else: jpayne@68: builder = ExpatBuilder() jpayne@68: return builder.parseString(string) jpayne@68: jpayne@68: jpayne@68: def parseFragment(file, context, namespaces=True): jpayne@68: """Parse a fragment of a document, given the context from which it jpayne@68: was originally extracted. context should be the parent of the jpayne@68: node(s) which are in the fragment. jpayne@68: jpayne@68: 'file' may be either a file name or an open file object. 
jpayne@68: """ jpayne@68: if namespaces: jpayne@68: builder = FragmentBuilderNS(context) jpayne@68: else: jpayne@68: builder = FragmentBuilder(context) jpayne@68: jpayne@68: if isinstance(file, str): jpayne@68: with open(file, 'rb') as fp: jpayne@68: result = builder.parseFile(fp) jpayne@68: else: jpayne@68: result = builder.parseFile(file) jpayne@68: return result jpayne@68: jpayne@68: jpayne@68: def parseFragmentString(string, context, namespaces=True): jpayne@68: """Parse a fragment of a document from a string, given the context jpayne@68: from which it was originally extracted. context should be the jpayne@68: parent of the node(s) which are in the fragment. jpayne@68: """ jpayne@68: if namespaces: jpayne@68: builder = FragmentBuilderNS(context) jpayne@68: else: jpayne@68: builder = FragmentBuilder(context) jpayne@68: return builder.parseString(string) jpayne@68: jpayne@68: jpayne@68: def makeBuilder(options): jpayne@68: """Create a builder based on an Options object.""" jpayne@68: if options.namespaces: jpayne@68: return ExpatBuilderNS(options) jpayne@68: else: jpayne@68: return ExpatBuilder(options)