"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


import re
import warnings
import _markupbase

from html import unescape


__all__ = ['HTMLParser']

# Regular expressions used for parsing

interesting_normal = re.compile('[&<]')
incomplete = re.compile('&[a-zA-Z#]')

entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# Whitespace or ';' -- used by goahead() to decide whether a trailing '&'
# may still be the start of a character reference that was cut in half.
_charref_terminator = re.compile(r'[\s;]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatestarttagend too;
#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')



class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    # Elements whose content is treated as raw character data: after one of
    # these start tags only the matching end tag is recognised as markup.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        """
        self.convert_charrefs = convert_charrefs
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Enter CDATA mode: only the end tag of *elem* is 'interesting'."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and resume looking for '&' and '<'."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                    # or there's more text incoming.  If the latter is True,
                    # we can't pass the text to handle_data in case we have
                    # a charref cut in half at end.  Try to determine if
                    # this is the case before proceeding by looking for an
                    # & near the end and see if it's followed by a space or ;.
                    amppos = rawdata.rfind('&', max(i, n-34))
                    if (amppos >= 0 and
                            not _charref_terminator.search(rawdata, amppos)):
                        break  # wait till we get all the text
                    j = n
            else:
                match = self.interesting.search(rawdata, i) # < or &
                if match:
                    j = match.start()
                else:
                    if self.cdata_elem:
                        break
                    j = n
            if i < j:
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # a '<' that does not open any construct is plain text
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # the construct is not terminated in the buffer
                    if not end:
                        break
                    # at EOF: flush the unterminated construct as data
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    if self.convert_charrefs and not self.cdata_elem:
                        self.handle_data(unescape(rawdata[i:k]))
                    else:
                        self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # the terminator is only consumed when it is a ';'
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # the terminator is only consumed when it is a ';'
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        k = match.end()
                        if k <= i:
                            k = n
                        i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            if self.convert_charrefs and not self.cdata_elem:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # keep whatever was not consumed for the next feed()/close() call
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute given without '=value'
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # strip the matching pair of surrounding quotes
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # junk before the closing bracket: report the whole tag as data
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            next = rawdata[j:j+1]
            if next == ">":
                return j + 1
            if next == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                if j > i:
                    return j
                else:
                    return i + 1
            if next == "":
                # end of input
                return -1
            if next in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower() # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # a different end tag inside CDATA content is just text
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    # No-op hook; never called in this file.
    # NOTE(review): presumably invoked by _markupbase.ParserBase for
    # declarations it does not recognise -- confirm against _markupbase.
    def unknown_decl(self, data):
        pass

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        warnings.warn('The unescape method is deprecated and will be removed '
                      'in 3.5, use html.unescape() instead.',
                      DeprecationWarning, stacklevel=2)
        return unescape(s)