"""Provide advanced parsing abilities for ParenMatch and other extensions.

HyperParser uses PyParser.  PyParser mostly gives information on the
proper indentation of code.  HyperParser gives additional information on
the structure of code.
"""
from keyword import iskeyword
import string

from idlelib import pyparse

# all ASCII chars that may be in an identifier
_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_")
# all ASCII chars that may be the first char of an identifier
_ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_")

# lookup table for whether 7-bit ASCII chars are valid in a Python identifier
_IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)]
# lookup table for whether 7-bit ASCII chars are valid as the first
# char in a Python identifier
_IS_ASCII_ID_FIRST_CHAR = \
    [(chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)]


class HyperParser:
    """Answer structural questions about the code around a text index.

    On construction the statement containing the index is read from the
    editor window's text widget and handed to pyparse.Parser.  The
    resulting "bracketing" (a sequence of (position, level) pairs into
    rawtext) is then used by the query methods below.
    """

    def __init__(self, editwin, index):
        """To initialize, analyze the surroundings of the given index.

        editwin must provide the attributes read here: text,
        indentwidth, tabwidth, prompt_last_line, num_context_lines and
        _build_char_in_string_func().  index is an index understood by
        editwin.text.
        """
        self.editwin = editwin
        self.text = text = editwin.text

        parser = pyparse.Parser(editwin.indentwidth, editwin.tabwidth)

        # Line number of the index ("line.column" -> line).
        lno = int(float(text.index(index)))
        # The analyzed text always stops at the end of the index's line.
        stopatindex = "%d.end" % lno

        if not editwin.prompt_last_line:
            # Try increasingly large amounts of context until the parser
            # finds a good place to start, or we reach the first line.
            for context in editwin.num_context_lines:
                startat = max(lno - context, 1)
                startatindex = repr(startat) + ".0"
                # We add the newline because PyParse requires a newline
                # at end. We add a space so that index won't be at end
                # of line, so that its status will be the same as the
                # char before it, if should.
                parser.set_code(text.get(startatindex, stopatindex)+' \n')
                bod = parser.find_good_parse_start(
                          editwin._build_char_in_string_func(startatindex))
                if bod is not None or startat == 1:
                    break
            parser.set_lo(bod or 0)
        else:
            # Start right after the most recent "console"-tagged range
            # before index, or at the top of the text if there is none.
            r = text.tag_prevrange("console", index)
            if r:
                startatindex = r[1]
            else:
                startatindex = "1.0"
            # We add the newline because PyParse requires it. We add a
            # space so that index won't be at end of line, so that its
            # status will be the same as the char before it, if should.
            parser.set_code(text.get(startatindex, stopatindex)+' \n')
            parser.set_lo(0)

        # We want what the parser has, minus the last newline and space.
        self.rawtext = parser.code[:-2]
        # Parser.code apparently preserves the statement we are in, so
        # that stopatindex can be used to synchronize the string with
        # the text box indices.
        self.stopatindex = stopatindex
        self.bracketing = parser.get_last_stmt_bracketing()
        # find which pairs of bracketing are openers. These always
        # correspond to a character of rawtext.
        self.isopener = [i>0 and self.bracketing[i][1] >
                         self.bracketing[i-1][1]
                         for i in range(len(self.bracketing))]

        self.set_index(index)

    def set_index(self, index):
        """Set the index to which the functions relate.

        The index must be in the same statement.

        Sets self.indexinrawtext (offset of index within rawtext) and
        self.indexbracket (position in self.bracketing of the entry the
        index belongs to).  Raises ValueError if index lies before the
        start of rawtext.
        """
        # rawtext ends at stopatindex, so measure the offset from the end.
        indexinrawtext = (len(self.rawtext) -
                          len(self.text.get(index, self.stopatindex)))
        if indexinrawtext < 0:
            raise ValueError("Index %s precedes the analyzed statement"
                             % index)
        self.indexinrawtext = indexinrawtext
        # find the rightmost bracket to which index belongs
        self.indexbracket = 0
        while (self.indexbracket < len(self.bracketing)-1 and
               self.bracketing[self.indexbracket+1][0] < self.indexinrawtext):
            self.indexbracket += 1
        # An entry that starts exactly at the index and is not an opener
        # also counts as containing the index.
        if (self.indexbracket < len(self.bracketing)-1 and
            self.bracketing[self.indexbracket+1][0] == self.indexinrawtext and
           not self.isopener[self.indexbracket+1]):
            self.indexbracket += 1

    def is_in_string(self):
        """Is the index given to the HyperParser in a string?"""
        # The bracket to which we belong should be an opener.
        # If it's an opener, it has to have a character.
        return (self.isopener[self.indexbracket] and
                self.rawtext[self.bracketing[self.indexbracket][0]]
                in ('"', "'"))

    def is_in_code(self):
        """Is the index given to the HyperParser in normal code?

        "Normal code" means the enclosing bracketing entry is not an
        opener whose character starts a comment ('#') or a string.
        """
        return (not self.isopener[self.indexbracket] or
                self.rawtext[self.bracketing[self.indexbracket][0]]
                not in ('#', '"', "'"))

    def get_surrounding_brackets(self, openers='([{', mustclose=False):
        """Return bracket indexes or None.

        If the index given to the HyperParser is surrounded by a
        bracket defined in openers (or at least has one before it),
        return the indices of the opening bracket and the closing
        bracket (or the end of line, whichever comes first).

        If it is not surrounded by brackets, or the end of line comes
        before the closing bracket and mustclose is True, returns None.
        """

        bracketinglevel = self.bracketing[self.indexbracket][1]
        # Walk backwards to the nearest enclosing opener listed in openers.
        before = self.indexbracket
        while (not self.isopener[before] or
              self.rawtext[self.bracketing[before][0]] not in openers or
              self.bracketing[before][1] > bracketinglevel):
            before -= 1
            if before < 0:
                return None
            bracketinglevel = min(bracketinglevel, self.bracketing[before][1])
        # Walk forwards to the first entry below that opener's level.
        after = self.indexbracket + 1
        while (after < len(self.bracketing) and
              self.bracketing[after][1] >= bracketinglevel):
            after += 1

        # Text indices are expressed relative to stopatindex, which
        # corresponds to the end of rawtext.
        beforeindex = self.text.index("%s-%dc" %
            (self.stopatindex, len(self.rawtext)-self.bracketing[before][0]))
        if (after >= len(self.bracketing) or
           self.bracketing[after][0] > len(self.rawtext)):
            if mustclose:
                return None
            afterindex = self.stopatindex
        else:
            # We are after a real char, so it is a ')' and we give the
            # index before it.
            afterindex = self.text.index(
                "%s-%dc" % (self.stopatindex,
                 len(self.rawtext)-(self.bracketing[after][0]-1)))

        return beforeindex, afterindex

    # the set of built-in identifiers which are also keywords,
    # i.e. keyword.iskeyword() returns True for them
    _ID_KEYWORDS = frozenset({"True", "False", "None"})

    @classmethod
    def _eat_identifier(cls, str, limit, pos):
        """Given a string and pos, return the number of chars in the
        identifier which ends at pos, or 0 if there is no such one.

        The search never goes back past limit.  Keywords are not
        considered identifiers here, except for True, False and None.
        """
        # NOTE: the parameter name 'str' shadows the builtin; it is kept
        # unchanged so that existing callers are not affected.
        is_ascii_id_char = _IS_ASCII_ID_CHAR

        # Start at the end (pos) and work backwards.
        i = pos

        # Go backwards as long as the characters are valid ASCII
        # identifier characters. This is an optimization, since it
        # is faster in the common case where most of the characters
        # are ASCII.
        while i > limit and (
                ord(str[i - 1]) < 128 and
                is_ascii_id_char[ord(str[i - 1])]
        ):
            i -= 1

        # If the above loop ended due to reaching a non-ASCII
        # character, continue going backwards using the most generic
        # test for whether a string contains only valid identifier
        # characters.
        if i > limit and ord(str[i - 1]) >= 128:
            # Prefixing 'a' makes the test independent of whether the
            # first character may start an identifier.
            while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier():
                i -= 4
            if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier():
                i -= 2
            if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier():
                i -= 1

            # The identifier candidate starts here. If it isn't a valid
            # identifier, don't eat anything. At this point that is only
            # possible if the first character isn't a valid first
            # character for an identifier.
            if not str[i:pos].isidentifier():
                return 0
        elif i < pos:
            # All characters in str[i:pos] are valid ASCII identifier
            # characters, so it is enough to check that the first is
            # valid as the first character of an identifier.
            if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]:
                return 0

        # All keywords are valid identifiers, but should not be
        # considered identifiers here, except for True, False and None.
        if i < pos and (
                iskeyword(str[i:pos]) and
                str[i:pos] not in cls._ID_KEYWORDS
        ):
            return 0

        return pos - i

    # This string includes all chars that may be in a white space
    _whitespace_chars = " \t\n\\"

    def get_expression(self):
        """Return a string with the Python expression which ends at the
        given index, which is empty if there is no real one.

        Raises ValueError if the index is not in normal code (see
        is_in_code()).
        """
        if not self.is_in_code():
            raise ValueError("get_expression should only be called "
                             "if index is inside a code.")

        rawtext = self.rawtext
        bracketing = self.bracketing

        brck_index = self.indexbracket
        brck_limit = bracketing[brck_index][0]
        pos = self.indexinrawtext

        # Start of the expression found so far; the scan moves backwards.
        last_identifier_pos = pos
        # True while an identifier or bracket (not a dot) is expected next.
        postdot_phase = True

        while True:
            # Eat whitespaces, comments, and if postdot_phase is False - a dot
            while True:
                if pos>brck_limit and rawtext[pos-1] in self._whitespace_chars:
                    # Eat a whitespace
                    pos -= 1
                elif (not postdot_phase and
                      pos > brck_limit and rawtext[pos-1] == '.'):
                    # Eat a dot
                    pos -= 1
                    postdot_phase = True
                # The next line will fail if we are *inside* a comment,
                # but we shouldn't be.
                elif (pos == brck_limit and brck_index > 0 and
                      rawtext[bracketing[brck_index-1][0]] == '#'):
                    # Eat a comment
                    brck_index -= 2
                    brck_limit = bracketing[brck_index][0]
                    pos = bracketing[brck_index+1][0]
                else:
                    # If we didn't eat anything, quit.
                    break

            if not postdot_phase:
                # We didn't find a dot, so the expression end at the
                # last identifier pos.
                break

            ret = self._eat_identifier(rawtext, brck_limit, pos)
            if ret:
                # There is an identifier to eat
                pos = pos - ret
                last_identifier_pos = pos
                # Now, to continue the search, we must find a dot.
                postdot_phase = False
                # (the loop continues now)

            elif pos == brck_limit:
                # We are at a bracketing limit. If it is a closing
                # bracket, eat the bracket, otherwise, stop the search.
                level = bracketing[brck_index][1]
                while brck_index > 0 and bracketing[brck_index-1][1] > level:
                    brck_index -= 1
                if bracketing[brck_index][0] == brck_limit:
                    # We were not at the end of a closing bracket
                    break
                pos = bracketing[brck_index][0]
                brck_index -= 1
                brck_limit = bracketing[brck_index][0]
                last_identifier_pos = pos
                if rawtext[pos] in "([":
                    # [] and () may be used after an identifier, so we
                    # continue. postdot_phase is True, so we don't allow a dot.
                    pass
                else:
                    # We can't continue after other types of brackets
                    if rawtext[pos] in "'\"":
                        # Scan a string prefix
                        while pos > 0 and rawtext[pos - 1] in "rRbBuU":
                            pos -= 1
                        last_identifier_pos = pos
                    break

            else:
                # We've found an operator or something.
                break

        return rawtext[last_identifier_pos:self.indexinrawtext]


if __name__ == '__main__':
    from unittest import main
    main('idlelib.idle_test.test_hyperparser', verbosity=2)