"""
SAX driver for the pyexpat C module.  This driver works with
pyexpat.__version__ == '2.22'.
"""

version = "0.20"

from xml.sax._exceptions import *
from xml.sax.handler import feature_validation, feature_namespaces
from xml.sax.handler import feature_namespace_prefixes
from xml.sax.handler import feature_external_ges, feature_external_pes
from xml.sax.handler import feature_string_interning
from xml.sax.handler import property_xml_string, property_interning_dict

# xml.parsers.expat does not raise ImportError in Jython, so the platform
# has to be checked explicitly before attempting the import below.
import sys
if sys.platform.startswith("java"):
    raise SAXReaderNotAvailable("expat not available in Java", None)
del sys

try:
    from xml.parsers import expat
except ImportError:
    raise SAXReaderNotAvailable("expat not supported", None)
else:
    # An importable module without ParserCreate is unusable for this driver.
    if not hasattr(expat, "ParserCreate"):
        raise SAXReaderNotAvailable("expat not supported", None)
from xml.sax import xmlreader, saxutils, handler

# Short aliases used by the element event handlers.
AttributesImpl = xmlreader.AttributesImpl
AttributesNSImpl = xmlreader.AttributesNSImpl

# If we're using a sufficiently recent version of Python, we can use
# weak references to avoid cycles between the parser and content
# handler, otherwise we'll just have to pretend.
try:
    import _weakref
except ImportError:
    # No weak reference support: fall back to a strong reference.
    def _mkproxy(o):
        return o
else:
    import weakref
    _mkproxy = weakref.proxy
    del weakref, _weakref


class _ClosedParser:
    """Stand-in stored in ExpatParser._parser once close() has run.

    close() copies ErrorColumnNumber and ErrorLineNumber onto an instance
    so the locator methods keep working after the real parser is dropped.
    """
    pass


# --- ExpatLocator

class ExpatLocator(xmlreader.Locator):
    """Locator for use with the ExpatParser class.

    This uses a weak reference to the parser object to avoid creating
    a circular reference between the parser and the content handler.
    """

    def __init__(self, parser):
        self._ref = _mkproxy(parser)

    def getColumnNumber(self):
        """Return the parser's ErrorColumnNumber, or None if it has no parser."""
        parser = self._ref
        if parser._parser is None:
            return None
        return parser._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return the parser's ErrorLineNumber, or 1 if it has no parser."""
        parser = self._ref
        if parser._parser is None:
            return 1
        return parser._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public identifier of the parser's current source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getPublicId()

    def getSystemId(self):
        """Return the system identifier of the parser's current source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getSystemId()


# --- ExpatParser

class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the pyexpat C module."""

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        """Create an idle parser.

        namespaceHandling is the initial value of the namespaces feature;
        bufsize is passed through to IncrementalParser.
        """
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        self._parser = None
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None
        self._parsing = 0
        # (parser, source) pairs saved while an external entity is parsed.
        self._entity_stack = []
        self._external_ges = 0
        self._interning = None

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        try:
            self.reset()
            self._cont_handler.setDocumentLocator(ExpatLocator(self))
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # bpo-30264: Close the source on error to not leak resources:
            # xml.sax.parse() doesn't give access to the underlying parser
            # to the caller
            self._close_source()
            raise

    def prepareParser(self, source):
        """Set the expat base URI from the source's system id, if it has one."""
        if source.getSystemId() is not None:
            self._parser.SetBase(source.getSystemId())

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        """Install handler and, if a parse is running, rebind expat callbacks."""
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the current state of feature *name*.

        Raises SAXNotRecognizedException for an unknown feature name.
        """
        if name == feature_namespaces:
            return self._namespaces
        elif name == feature_string_interning:
            return self._interning is not None
        elif name in (feature_validation, feature_external_pes,
                      feature_namespace_prefixes):
            return 0
        elif name == feature_external_ges:
            return self._external_ges
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set feature *name* to *state*.

        Raises SAXNotSupportedException while parsing, or when asked to
        enable a feature this driver cannot provide, and
        SAXNotRecognizedException for an unknown feature name.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        if name == feature_namespaces:
            self._namespaces = state
        elif name == feature_external_ges:
            self._external_ges = state
        elif name == feature_string_interning:
            if state:
                # Keep an existing interning dict rather than replacing it.
                if self._interning is None:
                    self._interning = {}
            else:
                self._interning = None
        elif name == feature_validation:
            if state:
                raise SAXNotSupportedException(
                    "expat does not support validation")
        elif name == feature_external_pes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not read external parameter entities")
        elif name == feature_namespace_prefixes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not report namespace prefixes")
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of property *name*.

        Raises SAXNotSupportedException when the XML string is requested
        while no parser exists, and SAXNotRecognizedException for an
        unknown property or when the parser lacks GetInputContext.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        elif name == property_interning_dict:
            return self._interning
        elif name == property_xml_string:
            if self._parser:
                if hasattr(self._parser, "GetInputContext"):
                    return self._parser.GetInputContext()
                else:
                    raise SAXNotRecognizedException(
                        "This version of expat does not support getting"
                        " the XML string")
            else:
                raise SAXNotSupportedException(
                    "XML string cannot be returned when not parsing")
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set property *name* to *value*.

        Raises SAXNotSupportedException for the read-only XML string
        property and SAXNotRecognizedException for an unknown property.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        elif name == property_interning_dict:
            self._interning = value
        elif name == property_xml_string:
            raise SAXNotSupportedException("Property '%s' cannot be set" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    # IncrementalParser methods

    def feed(self, data, isFinal=0):
        """Pass *data* to expat, starting a new document if none is active.

        expat errors are wrapped in SAXParseException and handed to the
        error handler's fatalError().
        """
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat will check validity of the entire
            # document. When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error as e:
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def _close_source(self):
        """Close the character stream and the byte stream of the source."""
        source = self._source
        try:
            stream = source.getCharacterStream()
            if stream is not None:
                stream.close()
        finally:
            # Attempt the byte stream even if closing the character
            # stream raised.
            stream = source.getByteStream()
            if stream is not None:
                stream.close()

    def close(self):
        """Finish the document, emit endDocument() and release resources."""
        if (self._entity_stack or self._parser is None or
                isinstance(self._parser, _ClosedParser)):
            # If we are completing an external entity, do nothing here
            return
        try:
            self.feed("", isFinal=1)
            self._cont_handler.endDocument()
            # break cycle created by expat handlers pointing to our methods
            self._parser = None
        finally:
            self._parsing = 0
            if self._parser is not None:
                # Keep ErrorColumnNumber and ErrorLineNumber after closing.
                parser = _ClosedParser()
                parser.ErrorColumnNumber = self._parser.ErrorColumnNumber
                parser.ErrorLineNumber = self._parser.ErrorLineNumber
                self._parser = parser
            self._close_source()

    def _reset_cont_handler(self):
        """Bind the expat callbacks that go straight to the content handler."""
        self._parser.ProcessingInstructionHandler = \
            self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        """Bind (or clear) the expat callbacks that feed the lexical handler."""
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
            parser.StartDoctypeDeclHandler = None
            parser.EndDoctypeDeclHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA
            # Routed through self because startDTD takes its arguments
            # in a different order (see start_doctype_decl).
            parser.StartDoctypeDeclHandler = self.start_doctype_decl
            parser.EndDoctypeDeclHandler = lex.endDTD

    def reset(self):
        """Create a fresh expat parser and wire all callbacks to this object."""
        if self._namespaces:
            # " " is the separator expat puts between the parts of a
            # namespace-qualified name; the *_ns handlers split on it.
            self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
                                              intern=self._interning)
            self._parser.namespace_prefixes = 1
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate(self._source.getEncoding(),
                                              intern=self._interning)
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
#         self._parser.DefaultHandler =
#         self._parser.DefaultHandlerExpand =
#         self._parser.NotStandaloneHandler =
        self._parser.ExternalEntityRefHandler = self.external_entity_ref
        try:
            self._parser.SkippedEntityHandler = self.skipped_entity_handler
        except AttributeError:
            # This pyexpat does not support SkippedEntity
            pass
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        self._parsing = 0
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        """Return the current ErrorColumnNumber, or None without a parser."""
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return the current ErrorLineNumber, or 1 without a parser."""
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public identifier of the current source."""
        return self._source.getPublicId()

    def getSystemId(self):
        """Return the system identifier of the current source."""
        return self._source.getSystemId()

    # event handlers
    def start_element(self, name, attrs):
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        """Translate a space-separated expat name into a startElementNS call."""
        pair = name.split()
        if len(pair) == 1:
            # no namespace
            pair = (None, name)
        elif len(pair) == 3:
            # third part is dropped here; for attributes it is used as
            # the qname prefix below
            pair = pair[0], pair[1]
        else:
            # default namespace
            pair = tuple(pair)

        newattrs = {}
        qnames = {}
        for (aname, value) in attrs.items():
            parts = aname.split()
            length = len(parts)
            if length == 1:
                # no namespace
                qname = aname
                apair = (None, aname)
            elif length == 3:
                qname = "%s:%s" % (parts[2], parts[1])
                apair = parts[0], parts[1]
            else:
                # default namespace
                qname = parts[1]
                apair = tuple(parts)

            newattrs[apair] = value
            qnames[apair] = qname

        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, qnames))

    def end_element_ns(self, name):
        """Translate a space-separated expat name into an endElementNS call."""
        pair = name.split()
        if len(pair) == 1:
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            pair = tuple(pair)

        self._cont_handler.endElementNS(pair, None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        self._cont_handler.endPrefixMapping(prefix)

    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
        # startDTD takes (name, pubid, sysid): the ids are swapped here.
        self._lex_handler_prop.startDTD(name, pubid, sysid)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """Parse an external general entity with a child expat parser.

        Returns 1 on success or when external general entities are
        disabled, 0 when parsing the entity raised.
        """
        if not self._external_ges:
            return 1

        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        # Save the outer parser and source; restored after the entity.
        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # NOTE(review): bare except also traps KeyboardInterrupt and
            # SystemExit, and the entity stack is not popped on this path.
            return 0  # FIXME: save error info here?

        (self._parser, self._source) = self._entity_stack[-1]
        del self._entity_stack[-1]
        return 1

    def skipped_entity_handler(self, name, is_pe):
        if is_pe:
            # The SAX spec requires to report skipped PEs with a '%'
            name = '%'+name
        self._cont_handler.skippedEntity(name)

# ---

def create_parser(*args, **kwargs):
    """Return a new ExpatParser built with the given arguments."""
    return ExpatParser(*args, **kwargs)

# ---

if __name__ == "__main__":
    import xml.sax.saxutils
    p = create_parser()
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")