"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


import re
import warnings
import _markupbase

from html import unescape


__all__ = ['HTMLParser']

# Regular expressions used for parsing

interesting_normal = re.compile('[&<]')
incomplete = re.compile('&[a-zA-Z#]')

entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatestarttagend too;
#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

# Whitespace or ';' -- used by goahead() to decide whether a trailing '&'
# could still be the start of a character reference cut in half.
# Compiled once here instead of on every pass through the parsing loop.
_charref_terminator = re.compile(r'[\s;]')


class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    # Elements whose content is treated as raw text until the matching
    # end tag (see set_cdata_mode()).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        """
        self.convert_charrefs = convert_charrefs
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Enter CDATA mode: only the end tag of *elem* is interesting."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and look for '&' and '<' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Consume as much of self.rawdata as possible, calling handlers.

        Whatever could not be processed yet is kept in self.rawdata for
        the next call.  If *end* is true, the remaining data is handled
        as if it were followed by an EOF marker.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                    # or there's more text incoming.  If the latter is True,
                    # we can't pass the text to handle_data in case we have
                    # a charref cut in half at end.  Try to determine if
                    # this is the case before proceeding by looking for an
                    # & near the end and see if it's followed by a space or ;.
                    amppos = rawdata.rfind('&', max(i, n-34))
                    if (amppos >= 0 and
                            not _charref_terminator.search(rawdata, amppos)):
                        break  # wait till we get all the text
                    j = n
            else:
                match = self.interesting.search(rawdata, i)  # < or &
                if match:
                    j = match.start()
                else:
                    if self.cdata_elem:
                        break
                    j = n
            if i < j:
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # The construct is not terminated yet.
                    if not end:
                        break
                    # At EOF: emit the unterminated construct as data, up
                    # to the next '>' (inclusive) or the next '<'.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    if self.convert_charrefs and not self.cdata_elem:
                        self.handle_data(unescape(rawdata[i:k]))
                    else:
                        self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # The regex consumes one terminating character; give
                    # it back unless it was the ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        k = match.end()
                        if k <= i:
                            k = n
                        i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            if self.convert_charrefs and not self.cdata_elem:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        """Parse the '<!...' construct at *i*; return its end or -1."""
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        """Parse a '<!...>' or '</...>' bogus comment; return end or -1.

        handle_comment() is called with the text between the two-character
        opener and the closing '>' unless *report* is false.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the '<?...>' construct at *i*; return its end or -1."""
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at *i* and dispatch it; return end or -1.

        Calls handle_starttag() or, for a tag ending in '/>',
        handle_startendtag().  A tag with junk before its closing
        characters is passed to handle_data() verbatim instead.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute without '=value'.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                    attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk characters before the end of the tag: report the whole
            # thing as plain data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the end of the start tag beginning at *i*, or -1."""
        rawdata = self.rawdata
        m = locatestarttagend_tolerant.match(rawdata, i)
        if not m:
            raise AssertionError("we should not get here!")
        j = m.end()
        nextchar = rawdata[j:j+1]
        if nextchar == ">":
            return j + 1
        if nextchar == "/":
            if rawdata.startswith("/>", j):
                return j + 2
            # A lone '/' here means we are at a buffer boundary.
            return -1
        if nextchar == "":
            # end of input
            return -1
        if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
            # end of input in or before attribute value, or we have the
            # '/' from a '/>' ending
            return -1
        if j > i:
            return j
        return i + 1

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at *i* and dispatch it; return end or -1."""
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # Some other end tag inside CDATA content is just data.
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        pass

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Deprecated wrapper around html.unescape(); emits a warning."""
        warnings.warn('The unescape method is deprecated and will be removed '
                      'in 3.5, use html.unescape() instead.',
                      DeprecationWarning, stacklevel=2)
        return unescape(s)