jpayne@69: """A parser for HTML and XHTML."""
jpayne@69:
jpayne@69: # This file is based on sgmllib.py, but the API is slightly different.
jpayne@69:
jpayne@69: # XXX There should be a way to distinguish between PCDATA (parsed
jpayne@69: # character data -- the normal case), RCDATA (replaceable character
jpayne@69: # data -- only char and entity references and end tags are special)
jpayne@69: # and CDATA (character data -- only end tags are special).
jpayne@69:
jpayne@69:
jpayne@69: import re
jpayne@69: import warnings
jpayne@69: import _markupbase
jpayne@69:
jpayne@69: from html import unescape
jpayne@69:
jpayne@69:
jpayne@69: __all__ = ['HTMLParser']
jpayne@69:
jpayne@69: # Regular expressions used for parsing
jpayne@69:
# Matches the next character that can start markup ('<') or a reference ('&').
interesting_normal = re.compile('[&<]')
# Matches an '&' followed by a letter or '#'.
incomplete = re.compile('&[a-zA-Z#]')

# Named reference: '&' + name (captured) + the one character that ends it.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Numeric reference: '&#' + decimal digits, or 'x'/'X' + hex digits, followed
# by the one character that ends it.  The literal '&#' prefix is required;
# without it the pattern matches any run of digits in ordinary text.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# '<' immediately followed by an ASCII letter.
starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
# '--', optional whitespace, then '>'.
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatestarttagend too;
#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# Group 1: attribute name; group 2: '=' plus value (optional); group 3: the
# value itself, still including any surrounding quotes.
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# Group 1 is the tag name.  The pattern is anchored on the literal '</' so
# that it can only match an end tag, never a bare 'name>' in text.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
jpayne@69:
jpayne@69:
jpayne@69:
jpayne@69: class HTMLParser(_markupbase.ParserBase):
jpayne@69: """Find tags and other markup and call handler functions.
jpayne@69:
jpayne@69: Usage:
jpayne@69: p = HTMLParser()
jpayne@69: p.feed(data)
jpayne@69: ...
jpayne@69: p.close()
jpayne@69:
jpayne@69: Start tags are handled by calling self.handle_starttag() or
jpayne@69: self.handle_startendtag(); end tags by self.handle_endtag(). The
jpayne@69: data between tags is passed from the parser to the derived class
jpayne@69: by calling self.handle_data() with the data as argument (the data
jpayne@69: may be split up in arbitrary chunks). If convert_charrefs is
jpayne@69: True the character references are converted automatically to the
jpayne@69: corresponding Unicode character (and self.handle_data() is no
jpayne@69: longer split in chunks), otherwise they are passed by calling
jpayne@69: self.handle_entityref() or self.handle_charref() with the string
jpayne@69: containing respectively the named or numeric reference as the
jpayne@69: argument.
jpayne@69: """
jpayne@69:
jpayne@69: CDATA_CONTENT_ELEMENTS = ("script", "style")
jpayne@69:
def __init__(self, *, convert_charrefs=True):
    """Create a parser and put it into its initial state.

    convert_charrefs is keyword-only.  When it is true (the default),
    every character reference is converted automatically to the
    corresponding Unicode character.
    """
    # Record the option first, then let reset() establish the rest of
    # the instance state.
    self.convert_charrefs = convert_charrefs
    self.reset()
jpayne@69:
def reset(self):
    """Return the parser to its initial state.

    Any input that was fed but not yet processed is discarded.
    """
    # Leave CDATA mode and go back to scanning for '&' and '<'.
    self.cdata_elem = None
    self.interesting = interesting_normal
    # Placeholder used until a tag has been seen.
    self.lasttag = '???'
    # Drop whatever input was still pending.
    self.rawdata = ''
    # Finally let the base class reset the state it owns.
    _markupbase.ParserBase.reset(self)
jpayne@69:
def feed(self, data):
    r"""Append data to the pending input and process it.

    May be called any number of times, with chunks of any size
    (the text may include '\n').
    """
    self.rawdata += data
    # 0: more input may still follow, so this is not treated as the end.
    self.goahead(0)
jpayne@69:
def close(self):
    """Process whatever input is still buffered, treating it as the end."""
    # 1: force handling of all remaining data as if followed by EOF.
    self.goahead(1)
jpayne@69:
jpayne@69: __starttag_text = None
jpayne@69:
jpayne@69: def get_starttag_text(self):
jpayne@69: """Return full source of start tag: '<...>'."""
jpayne@69: return self.__starttag_text
jpayne@69:
jpayne@69: def set_cdata_mode(self, elem):
jpayne@69: self.cdata_elem = elem.lower()
jpayne@69: self.interesting = re.compile(r'\s*%s\s*>' % self.cdata_elem, re.I)
jpayne@69:
def clear_cdata_mode(self):
    """Leave CDATA mode.

    Clears self.cdata_elem and restores the normal scanning pattern in
    self.interesting.
    """
    self.cdata_elem = None
    self.interesting = interesting_normal
jpayne@69:
jpayne@69: # Internal -- handle data as far as reasonable. May leave state
jpayne@69: # and data to be processed by a subsequent call. If 'end' is
jpayne@69: # true, force handling all data as if followed by EOF marker.
jpayne@69: def goahead(self, end):
jpayne@69: rawdata = self.rawdata
jpayne@69: i = 0
jpayne@69: n = len(rawdata)
jpayne@69: while i < n:
jpayne@69: if self.convert_charrefs and not self.cdata_elem:
jpayne@69: j = rawdata.find('<', i)
jpayne@69: if j < 0:
jpayne@69: # if we can't find the next <, either we are at the end
jpayne@69: # or there's more text incoming. If the latter is True,
jpayne@69: # we can't pass the text to handle_data in case we have
jpayne@69: # a charref cut in half at end. Try to determine if
jpayne@69: # this is the case before proceeding by looking for an
jpayne@69: # & near the end and see if it's followed by a space or ;.
jpayne@69: amppos = rawdata.rfind('&', max(i, n-34))
jpayne@69: if (amppos >= 0 and
jpayne@69: not re.compile(r'[\s;]').search(rawdata, amppos)):
jpayne@69: break # wait till we get all the text
jpayne@69: j = n
jpayne@69: else:
jpayne@69: match = self.interesting.search(rawdata, i) # < or &
jpayne@69: if match:
jpayne@69: j = match.start()
jpayne@69: else:
jpayne@69: if self.cdata_elem:
jpayne@69: break
jpayne@69: j = n
jpayne@69: if i < j:
jpayne@69: if self.convert_charrefs and not self.cdata_elem:
jpayne@69: self.handle_data(unescape(rawdata[i:j]))
jpayne@69: else:
jpayne@69: self.handle_data(rawdata[i:j])
jpayne@69: i = self.updatepos(i, j)
jpayne@69: if i == n: break
jpayne@69: startswith = rawdata.startswith
jpayne@69: if startswith('<', i):
jpayne@69: if starttagopen.match(rawdata, i): # < + letter
jpayne@69: k = self.parse_starttag(i)
jpayne@69: elif startswith("", i):
jpayne@69: k = self.parse_endtag(i)
jpayne@69: elif startswith("