"""Provide advanced parsing abilities for ParenMatch and other extensions.

HyperParser uses PyParser.  PyParser mostly gives information on the
proper indentation of code.  HyperParser gives additional information on
the structure of code.
"""
from keyword import iskeyword
import string

from idlelib import pyparse

# all ASCII chars that may be in an identifier
_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_")
# all ASCII chars that may be the first char of an identifier
_ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_")

# lookup table for whether 7-bit ASCII chars are valid in a Python identifier
_IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)]
# lookup table for whether 7-bit ASCII chars are valid as the first
# char in a Python identifier
_IS_ASCII_ID_FIRST_CHAR = \
    [(chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)]


class HyperParser:
    """Answer structural questions about the code around a text index.

    Attributes set by __init__ / set_index and read by the other methods:

    editwin, text   -- the editor window and its text widget.
    rawtext         -- the code string handed to the parser, without the
                       ' \\n' suffix that __init__ appends for parsing.
    stopatindex     -- text-widget index of the end of rawtext; used to
                       convert offsets in rawtext back to widget indices.
    bracketing      -- result of Parser.get_last_stmt_bracketing().  It is
                       indexed here as a sequence of pairs whose item 0 is
                       an offset into rawtext and whose item 1 is a
                       nesting level.
    isopener        -- list parallel to bracketing; True where the level
                       is higher than that of the previous entry.
    indexinrawtext  -- offset in rawtext of the current index.
    indexbracket    -- position in bracketing of the entry to which the
                       current index belongs.
    """

    def __init__(self, editwin, index):
        """To initialize, analyze the surroundings of the given index.

        editwin must provide the attributes read below: text,
        indentwidth, tabwidth, prompt_last_line and, when
        prompt_last_line is false, num_context_lines and
        _build_char_in_string_func.
        index is a text-widget index; it is passed to set_index() once
        the analysis is done.
        """

        self.editwin = editwin
        self.text = text = editwin.text

        parser = pyparse.Parser(editwin.indentwidth, editwin.tabwidth)

        def index2line(index):
            # "line.column" -> line number
            return int(float(index))
        lno = index2line(text.index(index))

        if not editwin.prompt_last_line:
            # Try increasingly large amounts of preceding context until
            # the parser finds a good place to start, or until the
            # context reaches the first line of the text.
            for context in editwin.num_context_lines:
                startat = max(lno - context, 1)
                startatindex = repr(startat) + ".0"
                stopatindex = "%d.end" % lno
                # We add the newline because PyParse requires a newline
                # at end. We add a space so that index won't be at end
                # of line, so that its status will be the same as the
                # char before it, if should.
                parser.set_code(text.get(startatindex, stopatindex)+' \n')
                bod = parser.find_good_parse_start(
                          editwin._build_char_in_string_func(startatindex))
                if bod is not None or startat == 1:
                    break
            parser.set_lo(bod or 0)
        else:
            # NOTE(review): presumably the shell-window case -- confirm.
            # Parse from the end of the previous "console"-tagged range
            # (or from the start of the text if there is none).
            r = text.tag_prevrange("console", index)
            if r:
                startatindex = r[1]
            else:
                startatindex = "1.0"
            stopatindex = "%d.end" % lno
            # We add the newline because PyParse requires it. We add a
            # space so that index won't be at end of line, so that its
            # status will be the same as the char before it, if should.
            parser.set_code(text.get(startatindex, stopatindex)+' \n')
            parser.set_lo(0)

        # We want what the parser has, minus the last newline and space.
        self.rawtext = parser.code[:-2]
        # Parser.code apparently preserves the statement we are in, so
        # that stopatindex can be used to synchronize the string with
        # the text box indices.
        self.stopatindex = stopatindex
        self.bracketing = parser.get_last_stmt_bracketing()
        # find which pairs of bracketing are openers. These always
        # correspond to a character of rawtext.
        self.isopener = [i > 0 and self.bracketing[i][1] >
                         self.bracketing[i-1][1]
                         for i in range(len(self.bracketing))]

        self.set_index(index)

    def set_index(self, index):
        """Set the index to which the functions relate.

        The index must be in the same statement.

        Raises ValueError if the text between index and the end of the
        analyzed code is longer than the analyzed code itself, i.e. the
        index lies before it.
        """
        indexinrawtext = (len(self.rawtext) -
                          len(self.text.get(index, self.stopatindex)))
        if indexinrawtext < 0:
            raise ValueError("Index %s precedes the analyzed statement"
                             % index)
        self.indexinrawtext = indexinrawtext
        # find the rightmost bracket to which index belongs
        self.indexbracket = 0
        while (self.indexbracket < len(self.bracketing)-1 and
               self.bracketing[self.indexbracket+1][0] < self.indexinrawtext):
            self.indexbracket += 1
        # An entry that starts exactly at the index and is not an opener
        # also counts as the one the index belongs to.
        if (self.indexbracket < len(self.bracketing)-1 and
                self.bracketing[self.indexbracket+1][0] ==
                self.indexinrawtext and
                not self.isopener[self.indexbracket+1]):
            self.indexbracket += 1

    def is_in_string(self):
        """Is the index given to the HyperParser in a string?"""
        # The bracket to which we belong should be an opener.
        # If it's an opener, it has to have a character.
        return (self.isopener[self.indexbracket] and
                self.rawtext[self.bracketing[self.indexbracket][0]]
                in ('"', "'"))

    def is_in_code(self):
        """Is the index given to the HyperParser in normal code?

        "Normal code" means: not inside a region opened by '#', '"'
        or "'".
        """
        return (not self.isopener[self.indexbracket] or
                self.rawtext[self.bracketing[self.indexbracket][0]]
                not in ('#', '"', "'"))

    def get_surrounding_brackets(self, openers='([{', mustclose=False):
        """Return bracket indexes or None.

        If the index given to the HyperParser is surrounded by a
        bracket defined in openers (or at least has one before it),
        return the indices of the opening bracket and the closing
        bracket (or the end of line, whichever comes first).

        If it is not surrounded by brackets, or the end of line comes
        before the closing bracket and mustclose is True, returns None.

        The returned indices are text-widget indices.
        """

        bracketinglevel = self.bracketing[self.indexbracket][1]
        before = self.indexbracket
        # Walk backwards to the nearest enclosing opener that is one of
        # the requested characters.
        while (not self.isopener[before] or
               self.rawtext[self.bracketing[before][0]] not in openers or
               self.bracketing[before][1] > bracketinglevel):
            before -= 1
            if before < 0:
                return None
            bracketinglevel = min(bracketinglevel, self.bracketing[before][1])
        # Walk forwards to the first entry below that opener's level.
        after = self.indexbracket + 1
        while (after < len(self.bracketing) and
               self.bracketing[after][1] >= bracketinglevel):
            after += 1

        # Offsets in rawtext are converted to widget indices by counting
        # characters back from stopatindex.
        beforeindex = self.text.index("%s-%dc" %
            (self.stopatindex, len(self.rawtext)-self.bracketing[before][0]))
        if (after >= len(self.bracketing) or
                self.bracketing[after][0] > len(self.rawtext)):
            if mustclose:
                return None
            afterindex = self.stopatindex
        else:
            # We are after a real char, so it is a ')' and we give the
            # index before it.
            afterindex = self.text.index(
                "%s-%dc" % (self.stopatindex,
                 len(self.rawtext)-(self.bracketing[after][0]-1)))

        return beforeindex, afterindex

    # the set of built-in identifiers which are also keywords,
    # i.e. keyword.iskeyword() returns True for them
    _ID_KEYWORDS = frozenset({"True", "False", "None"})

    @classmethod
    def _eat_identifier(cls, str, limit, pos):
        """Given a string and pos, return the number of chars in the
        identifier which ends at pos, or 0 if there is no such one.

        The scan never goes back past limit.  Keywords are not
        considered identifiers here, except for True, False and None.
        """
        is_ascii_id_char = _IS_ASCII_ID_CHAR

        # Start at the end (pos) and work backwards.
        i = pos

        # Go backwards as long as the characters are valid ASCII
        # identifier characters. This is an optimization, since it
        # is faster in the common case where most of the characters
        # are ASCII.
        while i > limit and (
                ord(str[i - 1]) < 128 and
                is_ascii_id_char[ord(str[i - 1])]
        ):
            i -= 1

        # If the above loop ended due to reaching a non-ASCII
        # character, continue going backwards using the most generic
        # test for whether a string contains only valid identifier
        # characters.
        if i > limit and ord(str[i - 1]) >= 128:
            # The 'a' prefix makes the test accept characters that are
            # valid inside an identifier but not as its first character.
            while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier():
                i -= 4
            if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier():
                i -= 2
            if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier():
                i -= 1

            # The identifier candidate starts here. If it isn't a valid
            # identifier, don't eat anything. At this point that is only
            # possible if the first character isn't a valid first
            # character for an identifier.
            if not str[i:pos].isidentifier():
                return 0
        elif i < pos:
            # All characters in str[i:pos] are valid ASCII identifier
            # characters, so it is enough to check that the first is
            # valid as the first character of an identifier.
            if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]:
                return 0

        # All keywords are valid identifiers, but should not be
        # considered identifiers here, except for True, False and None.
        if i < pos and (
                iskeyword(str[i:pos]) and
                str[i:pos] not in cls._ID_KEYWORDS
        ):
            return 0

        return pos - i

    # This string includes all chars that may be in a white space
    _whitespace_chars = " \t\n\\"

    # Characters that may appear in a string literal prefix (r, b, u, f
    # and combinations thereof, in either case).
    _STRING_PREFIX_CHARS = "rRbBuUfF"

    def get_expression(self):
        """Return a string with the Python expression which ends at the
        given index, which is empty if there is no real one.

        Raises ValueError if the index is not in normal code (see
        is_in_code()).
        """
        if not self.is_in_code():
            raise ValueError("get_expression should only be called "
                             "if index is inside a code.")

        rawtext = self.rawtext
        bracketing = self.bracketing

        brck_index = self.indexbracket
        brck_limit = bracketing[brck_index][0]
        pos = self.indexinrawtext

        last_identifier_pos = pos
        # True while an identifier (or bracketed part) is expected next,
        # False while a dot is required to continue the expression.
        postdot_phase = True

        while True:
            # Eat whitespaces, comments, and if postdot_phase is False - a dot
            while True:
                if pos > brck_limit and rawtext[pos-1] in self._whitespace_chars:
                    # Eat a whitespace
                    pos -= 1
                elif (not postdot_phase and
                      pos > brck_limit and rawtext[pos-1] == '.'):
                    # Eat a dot
                    pos -= 1
                    postdot_phase = True
                # The next line will fail if we are *inside* a comment,
                # but we shouldn't be.
                elif (pos == brck_limit and brck_index > 0 and
                      rawtext[bracketing[brck_index-1][0]] == '#'):
                    # Eat a comment
                    brck_index -= 2
                    brck_limit = bracketing[brck_index][0]
                    pos = bracketing[brck_index+1][0]
                else:
                    # If we didn't eat anything, quit.
                    break

            if not postdot_phase:
                # We didn't find a dot, so the expression end at the
                # last identifier pos.
                break

            ret = self._eat_identifier(rawtext, brck_limit, pos)
            if ret:
                # There is an identifier to eat
                pos = pos - ret
                last_identifier_pos = pos
                # Now, to continue the search, we must find a dot.
                postdot_phase = False
                # (the loop continues now)

            elif pos == brck_limit:
                # We are at a bracketing limit. If it is a closing
                # bracket, eat the bracket, otherwise, stop the search.
                level = bracketing[brck_index][1]
                while brck_index > 0 and bracketing[brck_index-1][1] > level:
                    brck_index -= 1
                if bracketing[brck_index][0] == brck_limit:
                    # We were not at the end of a closing bracket
                    break
                pos = bracketing[brck_index][0]
                brck_index -= 1
                brck_limit = bracketing[brck_index][0]
                last_identifier_pos = pos
                if rawtext[pos] in "([":
                    # [] and () may be used after an identifier, so we
                    # continue. postdot_phase is True, so we don't allow a dot.
                    pass
                else:
                    # We can't continue after other types of brackets
                    if rawtext[pos] in "'\"":
                        # Scan a string prefix
                        while (pos > 0 and
                               rawtext[pos - 1] in self._STRING_PREFIX_CHARS):
                            pos -= 1
                        last_identifier_pos = pos
                    break

            else:
                # We've found an operator or something.
                break

        return rawtext[last_identifier_pos:self.indexinrawtext]


if __name__ == '__main__':
    from unittest import main
    main('idlelib.idle_test.test_hyperparser', verbosity=2)