jpayne@68: """A parser for HTML and XHTML."""
jpayne@68:
jpayne@68: # This file is based on sgmllib.py, but the API is slightly different.
jpayne@68:
jpayne@68: # XXX There should be a way to distinguish between PCDATA (parsed
jpayne@68: # character data -- the normal case), RCDATA (replaceable character
jpayne@68: # data -- only char and entity references and end tags are special)
jpayne@68: # and CDATA (character data -- only end tags are special).
jpayne@68:
jpayne@68:
import re
import warnings
import _markupbase

from html import unescape
jpayne@68:
jpayne@68:
jpayne@68: __all__ = ['HTMLParser']
jpayne@68:
# Regular expressions used for parsing

# First character that can start markup or a reference in ordinary text.
interesting_normal = re.compile('[&<]')
# An '&' followed by the first character of a reference name or '#'.
incomplete = re.compile('&[a-zA-Z#]')

# Named reference such as "&amp;"; group 1 is the name.  The final
# character class consumes exactly one terminating character.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Numeric reference, decimal ("&#38;") or hexadecimal ("&#x26;"), again
# followed by exactly one terminating character.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# '<' immediately followed by an ASCII letter: the start of a start tag.
starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatestarttagend too;
#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
      )?(?:\s|/(?!>))*
    )*
  )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
jpayne@68:
jpayne@68:
jpayne@68:
jpayne@68: class HTMLParser(_markupbase.ParserBase):
jpayne@68: """Find tags and other markup and call handler functions.
jpayne@68:
jpayne@68: Usage:
jpayne@68: p = HTMLParser()
jpayne@68: p.feed(data)
jpayne@68: ...
jpayne@68: p.close()
jpayne@68:
jpayne@68: Start tags are handled by calling self.handle_starttag() or
jpayne@68: self.handle_startendtag(); end tags by self.handle_endtag(). The
jpayne@68: data between tags is passed from the parser to the derived class
jpayne@68: by calling self.handle_data() with the data as argument (the data
jpayne@68: may be split up in arbitrary chunks). If convert_charrefs is
jpayne@68: True the character references are converted automatically to the
jpayne@68: corresponding Unicode character (and self.handle_data() is no
jpayne@68: longer split in chunks), otherwise they are passed by calling
jpayne@68: self.handle_entityref() or self.handle_charref() with the string
jpayne@68: containing respectively the named or numeric reference as the
jpayne@68: argument.
jpayne@68: """
jpayne@68:
jpayne@68: CDATA_CONTENT_ELEMENTS = ("script", "style")
jpayne@68:
def __init__(self, *, convert_charrefs=True):
    """Initialize and reset this instance.

    If convert_charrefs is True (the default), all character references
    are automatically converted to the corresponding Unicode characters.

    convert_charrefs is keyword-only.
    """
    self.convert_charrefs = convert_charrefs
    # reset() sets up the remaining parser state.
    self.reset()
jpayne@68:
def reset(self):
    """Reset this instance.  Loses all unprocessed data."""
    self.rawdata = ''       # input fed so far that is still buffered
    self.lasttag = '???'    # placeholder value
    # Back to ordinary scanning: look for '&' or '<', no CDATA element.
    self.interesting = interesting_normal
    self.cdata_elem = None
    # Let the base class reset its own state as well.
    _markupbase.ParserBase.reset(self)
jpayne@68:
def feed(self, data):
    r"""Feed data to the parser.

    Call this as often as you want, with as little or as much text
    as you want (may include '\n').
    """
    # Append to whatever is still buffered, then parse without forcing
    # end-of-input handling (end=0).
    self.rawdata = self.rawdata + data
    self.goahead(0)
jpayne@68:
def close(self):
    """Handle any buffered data."""
    # end=1 asks goahead() to process everything that is buffered as
    # if it were followed by an end-of-file marker.
    self.goahead(1)
jpayne@68:
# Default for the start-tag text returned by get_starttag_text().
__starttag_text = None

def get_starttag_text(self):
    """Return full source of start tag: '<...>'."""
    return self.__starttag_text
jpayne@68:
def set_cdata_mode(self, elem):
    """Switch to CDATA mode for the element named *elem*.

    The lower-cased name is stored in self.cdata_elem, and
    self.interesting is replaced by a case-insensitive pattern that
    matches only the end tag of that element ('</name>', with optional
    whitespace after '</' and before '>').
    """
    self.cdata_elem = elem.lower()
    # The leading '</' is essential: without it the pattern would also
    # match 'name>' inside an opening tag or in the element's text.
    # The name is escaped so that it is always matched literally.
    self.interesting = re.compile(
        r'</\s*%s\s*>' % re.escape(self.cdata_elem), re.I)
jpayne@68:
def clear_cdata_mode(self):
    """Leave CDATA mode and return to ordinary scanning."""
    # Undo set_cdata_mode(): scan for '&' or '<' again and forget the
    # element name.
    self.interesting = interesting_normal
    self.cdata_elem = None
jpayne@68:
jpayne@68: # Internal -- handle data as far as reasonable. May leave state
jpayne@68: # and data to be processed by a subsequent call. If 'end' is
jpayne@68: # true, force handling all data as if followed by EOF marker.
jpayne@68: def goahead(self, end):
jpayne@68: rawdata = self.rawdata
jpayne@68: i = 0
jpayne@68: n = len(rawdata)
jpayne@68: while i < n:
jpayne@68: if self.convert_charrefs and not self.cdata_elem:
jpayne@68: j = rawdata.find('<', i)
jpayne@68: if j < 0:
jpayne@68: # if we can't find the next <, either we are at the end
jpayne@68: # or there's more text incoming. If the latter is True,
jpayne@68: # we can't pass the text to handle_data in case we have
jpayne@68: # a charref cut in half at end. Try to determine if
jpayne@68: # this is the case before proceeding by looking for an
jpayne@68: # & near the end and see if it's followed by a space or ;.
jpayne@68: amppos = rawdata.rfind('&', max(i, n-34))
jpayne@68: if (amppos >= 0 and
jpayne@68: not re.compile(r'[\s;]').search(rawdata, amppos)):
jpayne@68: break # wait till we get all the text
jpayne@68: j = n
jpayne@68: else:
jpayne@68: match = self.interesting.search(rawdata, i) # < or &
jpayne@68: if match:
jpayne@68: j = match.start()
jpayne@68: else:
jpayne@68: if self.cdata_elem:
jpayne@68: break
jpayne@68: j = n
jpayne@68: if i < j:
jpayne@68: if self.convert_charrefs and not self.cdata_elem:
jpayne@68: self.handle_data(unescape(rawdata[i:j]))
jpayne@68: else:
jpayne@68: self.handle_data(rawdata[i:j])
jpayne@68: i = self.updatepos(i, j)
jpayne@68: if i == n: break
jpayne@68: startswith = rawdata.startswith
jpayne@68: if startswith('<', i):
jpayne@68: if starttagopen.match(rawdata, i): # < + letter
jpayne@68: k = self.parse_starttag(i)
jpayne@68: elif startswith("", i):
jpayne@68: k = self.parse_endtag(i)
jpayne@68: elif startswith("