"""Provide advanced parsing abilities for ParenMatch and other extensions.

HyperParser uses PyParser.  PyParser mostly gives information on the
proper indentation of code.  HyperParser gives additional information on
the structure of code.
"""
from keyword import iskeyword
import string

from idlelib import pyparse

# All ASCII characters that may appear in an identifier.
_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_")
# All ASCII characters that may be the first character of an identifier.
_ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_")

# Lookup table, indexed by code point (0-127), telling whether that
# 7-bit ASCII character is valid inside a Python identifier.
_IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)]
# Lookup table, indexed by code point (0-127), telling whether that
# 7-bit ASCII character is valid as the first character of a Python
# identifier.
_IS_ASCII_ID_FIRST_CHAR = [
    (chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)
]


class HyperParser:
    """Give structural information about the code around a text index.

    An instance parses the text leading up to the end of the line that
    contains a given index (using pyparse.Parser) and records:

    rawtext        -- the parsed code, without the trailing space and
                      newline that are appended for the parser
    stopatindex    -- text index of the end of rawtext ("<line>.end")
    bracketing     -- result of Parser.get_last_stmt_bracketing(); as
                      used here, item 0 of each entry is an offset into
                      rawtext and item 1 is a bracketing level
    isopener       -- for each bracketing entry, whether its level is
                      higher than that of the previous entry
    indexinrawtext -- offset in rawtext of the current index
    indexbracket   -- position in bracketing of the entry to which the
                      current index belongs

    The last two are (re)computed by set_index().
    """

    def __init__(self, editwin, index):
        """To initialize, analyze the surroundings of the given index.

        editwin must provide the attributes read here: text (the text
        widget), indentwidth and tabwidth (passed to pyparse.Parser),
        prompt_last_line, num_context_lines and
        _build_char_in_string_func().  index is an index into
        editwin.text; it becomes the current index via set_index().
        """
        self.editwin = editwin
        self.text = text = editwin.text

        parser = pyparse.Parser(editwin.indentwidth, editwin.tabwidth)

        # Line number of index: the integer part of "line.column".
        lno = int(float(text.index(index)))
        # Parsing always stops at the end of the line containing index.
        stopatindex = "%d.end" % lno

        if not editwin.prompt_last_line:
            # Try increasingly large amounts of preceding context until
            # the parser reports a good place to start parsing, or until
            # the context already begins at the first line.
            for context in editwin.num_context_lines:
                startat = max(lno - context, 1)
                startatindex = repr(startat) + ".0"
                # We add the newline because PyParse requires a newline
                # at end. We add a space so that index won't be at end
                # of line, so that its status will be the same as the
                # char before it, if it should be.
                parser.set_code(text.get(startatindex, stopatindex)+' \n')
                bod = parser.find_good_parse_start(
                          editwin._build_char_in_string_func(startatindex))
                if bod is not None or startat == 1:
                    break
            parser.set_lo(bod or 0)
        else:
            # Start right after the last "console"-tagged range before
            # index, or at the beginning of the text if there is none.
            r = text.tag_prevrange("console", index)
            if r:
                startatindex = r[1]
            else:
                startatindex = "1.0"
            # We add the newline because PyParse requires it. We add a
            # space so that index won't be at end of line, so that its
            # status will be the same as the char before it, if it
            # should be.
            parser.set_code(text.get(startatindex, stopatindex)+' \n')
            parser.set_lo(0)

        # We want what the parser has, minus the last newline and space.
        self.rawtext = parser.code[:-2]
        # Parser.code apparently preserves the statement we are in, so
        # that stopatindex can be used to synchronize the string with
        # the text box indices.
        self.stopatindex = stopatindex
        self.bracketing = parser.get_last_stmt_bracketing()
        # Find which entries of bracketing are openers, i.e. have a
        # higher level than the entry before them. These always
        # correspond to a character of rawtext. The first entry has no
        # predecessor and is never an opener.
        self.isopener = [i>0 and self.bracketing[i][1] >
                         self.bracketing[i-1][1]
                         for i in range(len(self.bracketing))]

        self.set_index(index)

    def set_index(self, index):
        """Set the index to which the functions relate.

        The index must be in the same statement.

        Raise ValueError if the text between index and the end of the
        analyzed text is longer than the analyzed text itself, i.e. if
        index lies before the start of rawtext.
        """
        # rawtext ends at stopatindex, so the offset of index in rawtext
        # is counted back from the end.
        indexinrawtext = (len(self.rawtext) -
                          len(self.text.get(index, self.stopatindex)))
        if indexinrawtext < 0:
            raise ValueError("Index %s precedes the analyzed statement"
                             % index)
        self.indexinrawtext = indexinrawtext
        # Find the rightmost bracket to which index belongs: advance
        # while the next entry starts strictly before the index.
        self.indexbracket = 0
        while (self.indexbracket < len(self.bracketing)-1 and
               self.bracketing[self.indexbracket+1][0] < self.indexinrawtext):
            self.indexbracket += 1
        # An entry starting exactly at the index is also taken, but
        # only if it is not an opener.
        if (self.indexbracket < len(self.bracketing)-1 and
            self.bracketing[self.indexbracket+1][0] == self.indexinrawtext and
           not self.isopener[self.indexbracket+1]):
            self.indexbracket += 1

    def is_in_string(self):
        """Is the index given to the HyperParser in a string?"""
        # The bracket to which we belong should be an opener.
        # If it's an opener, it has to have a character.
        return (self.isopener[self.indexbracket] and
                self.rawtext[self.bracketing[self.indexbracket][0]]
                in ('"', "'"))

    def is_in_code(self):
        """Is the index given to the HyperParser in normal code?

        Normal code means: not inside a comment ('#') and not inside a
        string (opened by a single or double quote).
        """
        return (not self.isopener[self.indexbracket] or
                self.rawtext[self.bracketing[self.indexbracket][0]]
                not in ('#', '"', "'"))

    def get_surrounding_brackets(self, openers='([{', mustclose=False):
        """Return bracket indexes or None.

        If the index given to the HyperParser is surrounded by a
        bracket defined in openers (or at least has one before it),
        return the indices of the opening bracket and the closing
        bracket (or the end of line, whichever comes first).

        If it is not surrounded by brackets, or the end of line comes
        before the closing bracket and mustclose is True, returns None.

        The returned indices are text-widget indices obtained from
        self.text.index().
        """

        bracketinglevel = self.bracketing[self.indexbracket][1]
        before = self.indexbracket
        # Walk backwards to the nearest opener that is one of the
        # requested characters and is not nested deeper than the lowest
        # level seen so far.
        while (not self.isopener[before] or
              self.rawtext[self.bracketing[before][0]] not in openers or
              self.bracketing[before][1] > bracketinglevel):
            before -= 1
            if before < 0:
                return None
            bracketinglevel = min(bracketinglevel, self.bracketing[before][1])
        # Walk forwards to the first entry whose level drops below the
        # level of the opener found above.
        after = self.indexbracket + 1
        while (after < len(self.bracketing) and
              self.bracketing[after][1] >= bracketinglevel):
            after += 1

        # Offsets in rawtext are converted to text indices by counting
        # characters back from stopatindex.
        beforeindex = self.text.index("%s-%dc" %
            (self.stopatindex, len(self.rawtext)-self.bracketing[before][0]))
        if (after >= len(self.bracketing) or
           self.bracketing[after][0] > len(self.rawtext)):
            # No closing bracket inside the analyzed text.
            if mustclose:
                return None
            afterindex = self.stopatindex
        else:
            # We are after a real char, so it is a ')' and we give the
            # index before it.
            afterindex = self.text.index(
                "%s-%dc" % (self.stopatindex,
                 len(self.rawtext)-(self.bracketing[after][0]-1)))

        return beforeindex, afterindex

    # the set of built-in identifiers which are also keywords,
    # i.e. keyword.iskeyword() returns True for them
    _ID_KEYWORDS = frozenset({"True", "False", "None"})

    @classmethod
    def _eat_identifier(cls, str, limit, pos):
        """Given a string and pos, return the number of chars in the
        identifier which ends at pos, or 0 if there is no such one.

        The scan goes backwards from pos and never moves before limit.

        Keywords are not considered identifiers, except for True,
        False and None.
        """
        is_ascii_id_char = _IS_ASCII_ID_CHAR

        # Start at the end (pos) and work backwards.
        i = pos

        # Go backwards as long as the characters are valid ASCII
        # identifier characters. This is an optimization, since it
        # is faster in the common case where most of the characters
        # are ASCII.
        while i > limit and (
                ord(str[i - 1]) < 128 and
                is_ascii_id_char[ord(str[i - 1])]
        ):
            i -= 1

        # If the above loop ended due to reaching a non-ASCII
        # character, continue going backwards using the most generic
        # test for whether a string contains only valid identifier
        # characters.
        if i > limit and ord(str[i - 1]) >= 128:
            # 'a' is prepended so that only "may appear inside an
            # identifier" is tested here, not "may start one".
            # Steps of 4, then 2, then 1 extend the candidate without
            # testing every single character separately.
            while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier():
                i -= 4
            if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier():
                i -= 2
            if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier():
                i -= 1

            # The identifier candidate starts here. If it isn't a valid
            # identifier, don't eat anything. At this point that is only
            # possible if the first character isn't a valid first
            # character for an identifier.
            if not str[i:pos].isidentifier():
                return 0
        elif i < pos:
            # All characters in str[i:pos] are valid ASCII identifier
            # characters, so it is enough to check that the first is
            # valid as the first character of an identifier.
            if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]:
                return 0

        # All keywords are valid identifiers, but should not be
        # considered identifiers here, except for True, False and None.
        if i < pos and (
                iskeyword(str[i:pos]) and
                str[i:pos] not in cls._ID_KEYWORDS
        ):
            return 0

        return pos - i

    # The characters skipped as whitespace by get_expression(). Besides
    # space, tab and newline this includes the backslash (presumably so
    # that line continuations are skipped as well).
    _whitespace_chars = " \t\n\\"

    def get_expression(self):
        """Return a string with the Python expression which ends at the
        given index, which is empty if there is no real one.

        Raise ValueError if the index is not in normal code (see
        is_in_code()).
        """
        if not self.is_in_code():
            raise ValueError("get_expression should only be called "
                             "if index is inside a code.")

        rawtext = self.rawtext
        bracketing = self.bracketing

        # brck_index is the bracketing entry currently being scanned and
        # brck_limit the offset in rawtext where that entry starts; pos
        # moves backwards from the index and last_identifier_pos tracks
        # the start of the expression found so far.
        brck_index = self.indexbracket
        brck_limit = bracketing[brck_index][0]
        pos = self.indexinrawtext

        last_identifier_pos = pos
        # True while an identifier (or bracket) is expected next; False
        # right after an identifier, when only a dot can continue the
        # expression.
        postdot_phase = True

        while True:
            # Eat whitespaces, comments, and if postdot_phase is False - a dot
            while True:
                if pos>brck_limit and rawtext[pos-1] in self._whitespace_chars:
                    # Eat a whitespace
                    pos -= 1
                elif (not postdot_phase and
                      pos > brck_limit and rawtext[pos-1] == '.'):
                    # Eat a dot
                    pos -= 1
                    postdot_phase = True
                # The next line will fail if we are *inside* a comment,
                # but we shouldn't be.
                elif (pos == brck_limit and brck_index > 0 and
                      rawtext[bracketing[brck_index-1][0]] == '#'):
                    # Eat a comment
                    brck_index -= 2
                    brck_limit = bracketing[brck_index][0]
                    pos = bracketing[brck_index+1][0]
                else:
                    # If we didn't eat anything, quit.
                    break

            if not postdot_phase:
                # We didn't find a dot, so the expression ends at the
                # last identifier pos.
                break

            ret = self._eat_identifier(rawtext, brck_limit, pos)
            if ret:
                # There is an identifier to eat
                pos = pos - ret
                last_identifier_pos = pos
                # Now, to continue the search, we must find a dot.
                postdot_phase = False
                # (the loop continues now)

            elif pos == brck_limit:
                # We are at a bracketing limit. If it is a closing
                # bracket, eat the bracket, otherwise, stop the search.
                level = bracketing[brck_index][1]
                while brck_index > 0 and bracketing[brck_index-1][1] > level:
                    brck_index -= 1
                if bracketing[brck_index][0] == brck_limit:
                    # We were not at the end of a closing bracket
                    break
                pos = bracketing[brck_index][0]
                brck_index -= 1
                brck_limit = bracketing[brck_index][0]
                last_identifier_pos = pos
                if rawtext[pos] in "([":
                    # [] and () may be used after an identifier, so we
                    # continue. postdot_phase is True, so we don't allow a dot.
                    pass
                else:
                    # We can't continue after other types of brackets
                    if rawtext[pos] in "'\"":
                        # Scan a string prefix
                        while pos > 0 and rawtext[pos - 1] in "rRbBuU":
                            pos -= 1
                        last_identifier_pos = pos
                    break

            else:
                # We've found an operator or something.
                break

        return rawtext[last_identifier_pos:self.indexinrawtext]


# When run as a script, run this module's unit tests.
if __name__ == '__main__':
    from unittest import main
    main('idlelib.idle_test.test_hyperparser', verbosity=2)