comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/idlelib/pyparse.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 """Define partial Python code Parser used by editor and hyperparser.
2
3 Instances of ParseMap are used with str.translate.
4
5 The following bound search and match functions are defined:
6 _synchre - start of popular statement;
7 _junkre - whitespace or comment line;
8 _match_stringre: string, possibly without closer;
9 _itemre - line that may have bracket structure start;
10 _closere - line that must be followed by dedent.
11 _chew_ordinaryre - non-special characters.
12 """
13 import re
14
15 # Reason last statement is continued (or C_NONE if it's not).
16 (C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
17 C_STRING_NEXT_LINES, C_BRACKET) = range(5)
18
19 # Find what looks like the start of a popular statement.
20
21 _synchre = re.compile(r"""
22 ^
23 [ \t]*
24 (?: while
25 | else
26 | def
27 | return
28 | assert
29 | break
30 | class
31 | continue
32 | elif
33 | try
34 | except
35 | raise
36 | import
37 | yield
38 )
39 \b
40 """, re.VERBOSE | re.MULTILINE).search
41
42 # Match blank line or non-indenting comment line.
43
44 _junkre = re.compile(r"""
45 [ \t]*
46 (?: \# \S .* )?
47 \n
48 """, re.VERBOSE).match
49
50 # Match any flavor of string; the terminating quote is optional
51 # so that we're robust in the face of incomplete program text.
52
53 _match_stringre = re.compile(r"""
54 \""" [^"\\]* (?:
55 (?: \\. | "(?!"") )
56 [^"\\]*
57 )*
58 (?: \""" )?
59
60 | " [^"\\\n]* (?: \\. [^"\\\n]* )* "?
61
62 | ''' [^'\\]* (?:
63 (?: \\. | '(?!'') )
64 [^'\\]*
65 )*
66 (?: ''' )?
67
68 | ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
69 """, re.VERBOSE | re.DOTALL).match
70
71 # Match a line that starts with something interesting;
72 # used to find the first item of a bracket structure.
73
74 _itemre = re.compile(r"""
75 [ \t]*
76 [^\s#\\] # if we match, m.end()-1 is the interesting char
77 """, re.VERBOSE).match
78
79 # Match start of statements that should be followed by a dedent.
80
81 _closere = re.compile(r"""
82 \s*
83 (?: return
84 | break
85 | continue
86 | raise
87 | pass
88 )
89 \b
90 """, re.VERBOSE).match
91
92 # Chew up non-special chars as quickly as possible. If match is
93 # successful, m.end() less 1 is the index of the last boring char
94 # matched. If match is unsuccessful, the string starts with an
95 # interesting char.
96
97 _chew_ordinaryre = re.compile(r"""
98 [^[\](){}#'"\\]+
99 """, re.VERBOSE).match
100
101
102 class ParseMap(dict):
103 r"""Dict subclass that maps anything not in dict to 'x'.
104
105 This is designed to be used with str.translate in study1.
106 Anything not specifically mapped otherwise becomes 'x'.
107 Example: replace everything except whitespace with 'x'.
108
109 >>> keepwhite = ParseMap((ord(c), ord(c)) for c in ' \t\n\r')
110 >>> "a + b\tc\nd".translate(keepwhite)
111 'x x x\tx\nx'
112 """
113 # Calling this triples access time; see bpo-32940
114 def __missing__(self, key):
115 return 120 # ord('x')
116
117
118 # Map all ascii to 120 to avoid __missing__ call, then replace some.
119 trans = ParseMap.fromkeys(range(128), 120)
120 trans.update((ord(c), ord('(')) for c in "({[") # open brackets => '(';
121 trans.update((ord(c), ord(')')) for c in ")}]") # close brackets => ')'.
122 trans.update((ord(c), ord(c)) for c in "\"'\\\n#") # Keep these.
123
124
125 class Parser:
126
127 def __init__(self, indentwidth, tabwidth):
128 self.indentwidth = indentwidth
129 self.tabwidth = tabwidth
130
131 def set_code(self, s):
132 assert len(s) == 0 or s[-1] == '\n'
133 self.code = s
134 self.study_level = 0
135
136 def find_good_parse_start(self, is_char_in_string=None,
137 _synchre=_synchre):
138 """
139 Return index of a good place to begin parsing, as close to the
140 end of the string as possible. This will be the start of some
141 popular stmt like "if" or "def". Return None if none found:
142 the caller should pass more prior context then, if possible, or
143 if not (the entire program text up until the point of interest
144 has already been tried) pass 0 to set_lo().
145
146 This will be reliable iff given a reliable is_char_in_string()
147 function, meaning that when it says "no", it's absolutely
148 guaranteed that the char is not in a string.
149 """
150 code, pos = self.code, None
151
152 if not is_char_in_string:
153 # no clue -- make the caller pass everything
154 return None
155
156 # Peek back from the end for a good place to start,
157 # but don't try too often; pos will be left None, or
158 # bumped to a legitimate synch point.
159 limit = len(code)
160 for tries in range(5):
161 i = code.rfind(":\n", 0, limit)
162 if i < 0:
163 break
164 i = code.rfind('\n', 0, i) + 1 # start of colon line (-1+1=0)
165 m = _synchre(code, i, limit)
166 if m and not is_char_in_string(m.start()):
167 pos = m.start()
168 break
169 limit = i
170 if pos is None:
171 # Nothing looks like a block-opener, or stuff does
172 # but is_char_in_string keeps returning true; most likely
173 # we're in or near a giant string, the colorizer hasn't
174 # caught up enough to be helpful, or there simply *aren't*
175 # any interesting stmts. In any of these cases we're
176 # going to have to parse the whole thing to be sure, so
177 # give it one last try from the start, but stop wasting
178 # time here regardless of the outcome.
179 m = _synchre(code)
180 if m and not is_char_in_string(m.start()):
181 pos = m.start()
182 return pos
183
184 # Peeking back worked; look forward until _synchre no longer
185 # matches.
186 i = pos + 1
187 while 1:
188 m = _synchre(code, i)
189 if m:
190 s, i = m.span()
191 if not is_char_in_string(s):
192 pos = s
193 else:
194 break
195 return pos
196
197 def set_lo(self, lo):
198 """ Throw away the start of the string.
199
200 Intended to be called with the result of find_good_parse_start().
201 """
202 assert lo == 0 or self.code[lo-1] == '\n'
203 if lo > 0:
204 self.code = self.code[lo:]
205
206 def _study1(self):
207 """Find the line numbers of non-continuation lines.
208
209 As quickly as humanly possible <wink>, find the line numbers (0-
210 based) of the non-continuation lines.
211 Creates self.{goodlines, continuation}.
212 """
213 if self.study_level >= 1:
214 return
215 self.study_level = 1
216
217 # Map all uninteresting characters to "x", all open brackets
218 # to "(", all close brackets to ")", then collapse runs of
219 # uninteresting characters. This can cut the number of chars
220 # by a factor of 10-40, and so greatly speed the following loop.
221 code = self.code
222 code = code.translate(trans)
223 code = code.replace('xxxxxxxx', 'x')
224 code = code.replace('xxxx', 'x')
225 code = code.replace('xx', 'x')
226 code = code.replace('xx', 'x')
227 code = code.replace('\nx', '\n')
228 # Replacing x\n with \n would be incorrect because
229 # x may be preceded by a backslash.
230
231 # March over the squashed version of the program, accumulating
232 # the line numbers of non-continued stmts, and determining
233 # whether & why the last stmt is a continuation.
234 continuation = C_NONE
235 level = lno = 0 # level is nesting level; lno is line number
236 self.goodlines = goodlines = [0]
237 push_good = goodlines.append
238 i, n = 0, len(code)
239 while i < n:
240 ch = code[i]
241 i = i+1
242
243 # cases are checked in decreasing order of frequency
244 if ch == 'x':
245 continue
246
247 if ch == '\n':
248 lno = lno + 1
249 if level == 0:
250 push_good(lno)
251 # else we're in an unclosed bracket structure
252 continue
253
254 if ch == '(':
255 level = level + 1
256 continue
257
258 if ch == ')':
259 if level:
260 level = level - 1
261 # else the program is invalid, but we can't complain
262 continue
263
264 if ch == '"' or ch == "'":
265 # consume the string
266 quote = ch
267 if code[i-1:i+2] == quote * 3:
268 quote = quote * 3
269 firstlno = lno
270 w = len(quote) - 1
271 i = i+w
272 while i < n:
273 ch = code[i]
274 i = i+1
275
276 if ch == 'x':
277 continue
278
279 if code[i-1:i+w] == quote:
280 i = i+w
281 break
282
283 if ch == '\n':
284 lno = lno + 1
285 if w == 0:
286 # unterminated single-quoted string
287 if level == 0:
288 push_good(lno)
289 break
290 continue
291
292 if ch == '\\':
293 assert i < n
294 if code[i] == '\n':
295 lno = lno + 1
296 i = i+1
297 continue
298
299 # else comment char or paren inside string
300
301 else:
302 # didn't break out of the loop, so we're still
303 # inside a string
304 if (lno - 1) == firstlno:
305 # before the previous \n in code, we were in the first
306 # line of the string
307 continuation = C_STRING_FIRST_LINE
308 else:
309 continuation = C_STRING_NEXT_LINES
310 continue # with outer loop
311
312 if ch == '#':
313 # consume the comment
314 i = code.find('\n', i)
315 assert i >= 0
316 continue
317
318 assert ch == '\\'
319 assert i < n
320 if code[i] == '\n':
321 lno = lno + 1
322 if i+1 == n:
323 continuation = C_BACKSLASH
324 i = i+1
325
326 # The last stmt may be continued for all 3 reasons.
327 # String continuation takes precedence over bracket
328 # continuation, which beats backslash continuation.
329 if (continuation != C_STRING_FIRST_LINE
330 and continuation != C_STRING_NEXT_LINES and level > 0):
331 continuation = C_BRACKET
332 self.continuation = continuation
333
334 # Push the final line number as a sentinel value, regardless of
335 # whether it's continued.
336 assert (continuation == C_NONE) == (goodlines[-1] == lno)
337 if goodlines[-1] != lno:
338 push_good(lno)
339
340 def get_continuation_type(self):
341 self._study1()
342 return self.continuation
343
344 def _study2(self):
345 """
346 study1 was sufficient to determine the continuation status,
347 but doing more requires looking at every character. study2
348 does this for the last interesting statement in the block.
349 Creates:
350 self.stmt_start, stmt_end
351 slice indices of last interesting stmt
352 self.stmt_bracketing
353 the bracketing structure of the last interesting stmt; for
354 example, for the statement "say(boo) or die",
355 stmt_bracketing will be ((0, 0), (0, 1), (2, 0), (2, 1),
356 (4, 0)). Strings and comments are treated as brackets, for
357 the matter.
358 self.lastch
359 last interesting character before optional trailing comment
360 self.lastopenbracketpos
361 if continuation is C_BRACKET, index of last open bracket
362 """
363 if self.study_level >= 2:
364 return
365 self._study1()
366 self.study_level = 2
367
368 # Set p and q to slice indices of last interesting stmt.
369 code, goodlines = self.code, self.goodlines
370 i = len(goodlines) - 1 # Index of newest line.
371 p = len(code) # End of goodlines[i]
372 while i:
373 assert p
374 # Make p be the index of the stmt at line number goodlines[i].
375 # Move p back to the stmt at line number goodlines[i-1].
376 q = p
377 for nothing in range(goodlines[i-1], goodlines[i]):
378 # tricky: sets p to 0 if no preceding newline
379 p = code.rfind('\n', 0, p-1) + 1
380 # The stmt code[p:q] isn't a continuation, but may be blank
381 # or a non-indenting comment line.
382 if _junkre(code, p):
383 i = i-1
384 else:
385 break
386 if i == 0:
387 # nothing but junk!
388 assert p == 0
389 q = p
390 self.stmt_start, self.stmt_end = p, q
391
392 # Analyze this stmt, to find the last open bracket (if any)
393 # and last interesting character (if any).
394 lastch = ""
395 stack = [] # stack of open bracket indices
396 push_stack = stack.append
397 bracketing = [(p, 0)]
398 while p < q:
399 # suck up all except ()[]{}'"#\\
400 m = _chew_ordinaryre(code, p, q)
401 if m:
402 # we skipped at least one boring char
403 newp = m.end()
404 # back up over totally boring whitespace
405 i = newp - 1 # index of last boring char
406 while i >= p and code[i] in " \t\n":
407 i = i-1
408 if i >= p:
409 lastch = code[i]
410 p = newp
411 if p >= q:
412 break
413
414 ch = code[p]
415
416 if ch in "([{":
417 push_stack(p)
418 bracketing.append((p, len(stack)))
419 lastch = ch
420 p = p+1
421 continue
422
423 if ch in ")]}":
424 if stack:
425 del stack[-1]
426 lastch = ch
427 p = p+1
428 bracketing.append((p, len(stack)))
429 continue
430
431 if ch == '"' or ch == "'":
432 # consume string
433 # Note that study1 did this with a Python loop, but
434 # we use a regexp here; the reason is speed in both
435 # cases; the string may be huge, but study1 pre-squashed
436 # strings to a couple of characters per line. study1
437 # also needed to keep track of newlines, and we don't
438 # have to.
439 bracketing.append((p, len(stack)+1))
440 lastch = ch
441 p = _match_stringre(code, p, q).end()
442 bracketing.append((p, len(stack)))
443 continue
444
445 if ch == '#':
446 # consume comment and trailing newline
447 bracketing.append((p, len(stack)+1))
448 p = code.find('\n', p, q) + 1
449 assert p > 0
450 bracketing.append((p, len(stack)))
451 continue
452
453 assert ch == '\\'
454 p = p+1 # beyond backslash
455 assert p < q
456 if code[p] != '\n':
457 # the program is invalid, but can't complain
458 lastch = ch + code[p]
459 p = p+1 # beyond escaped char
460
461 # end while p < q:
462
463 self.lastch = lastch
464 self.lastopenbracketpos = stack[-1] if stack else None
465 self.stmt_bracketing = tuple(bracketing)
466
467 def compute_bracket_indent(self):
468 """Return number of spaces the next line should be indented.
469
470 Line continuation must be C_BRACKET.
471 """
472 self._study2()
473 assert self.continuation == C_BRACKET
474 j = self.lastopenbracketpos
475 code = self.code
476 n = len(code)
477 origi = i = code.rfind('\n', 0, j) + 1
478 j = j+1 # one beyond open bracket
479 # find first list item; set i to start of its line
480 while j < n:
481 m = _itemre(code, j)
482 if m:
483 j = m.end() - 1 # index of first interesting char
484 extra = 0
485 break
486 else:
487 # this line is junk; advance to next line
488 i = j = code.find('\n', j) + 1
489 else:
490 # nothing interesting follows the bracket;
491 # reproduce the bracket line's indentation + a level
492 j = i = origi
493 while code[j] in " \t":
494 j = j+1
495 extra = self.indentwidth
496 return len(code[i:j].expandtabs(self.tabwidth)) + extra
497
498 def get_num_lines_in_stmt(self):
499 """Return number of physical lines in last stmt.
500
501 The statement doesn't have to be an interesting statement. This is
502 intended to be called when continuation is C_BACKSLASH.
503 """
504 self._study1()
505 goodlines = self.goodlines
506 return goodlines[-1] - goodlines[-2]
507
508 def compute_backslash_indent(self):
509 """Return number of spaces the next line should be indented.
510
511 Line continuation must be C_BACKSLASH. Also assume that the new
512 line is the first one following the initial line of the stmt.
513 """
514 self._study2()
515 assert self.continuation == C_BACKSLASH
516 code = self.code
517 i = self.stmt_start
518 while code[i] in " \t":
519 i = i+1
520 startpos = i
521
522 # See whether the initial line starts an assignment stmt; i.e.,
523 # look for an = operator
524 endpos = code.find('\n', startpos) + 1
525 found = level = 0
526 while i < endpos:
527 ch = code[i]
528 if ch in "([{":
529 level = level + 1
530 i = i+1
531 elif ch in ")]}":
532 if level:
533 level = level - 1
534 i = i+1
535 elif ch == '"' or ch == "'":
536 i = _match_stringre(code, i, endpos).end()
537 elif ch == '#':
538 # This line is unreachable because the # makes a comment of
539 # everything after it.
540 break
541 elif level == 0 and ch == '=' and \
542 (i == 0 or code[i-1] not in "=<>!") and \
543 code[i+1] != '=':
544 found = 1
545 break
546 else:
547 i = i+1
548
549 if found:
550 # found a legit =, but it may be the last interesting
551 # thing on the line
552 i = i+1 # move beyond the =
553 found = re.match(r"\s*\\", code[i:endpos]) is None
554
555 if not found:
556 # oh well ... settle for moving beyond the first chunk
557 # of non-whitespace chars
558 i = startpos
559 while code[i] not in " \t\n":
560 i = i+1
561
562 return len(code[self.stmt_start:i].expandtabs(\
563 self.tabwidth)) + 1
564
565 def get_base_indent_string(self):
566 """Return the leading whitespace on the initial line of the last
567 interesting stmt.
568 """
569 self._study2()
570 i, n = self.stmt_start, self.stmt_end
571 j = i
572 code = self.code
573 while j < n and code[j] in " \t":
574 j = j + 1
575 return code[i:j]
576
577 def is_block_opener(self):
578 "Return True if the last interesting statement opens a block."
579 self._study2()
580 return self.lastch == ':'
581
582 def is_block_closer(self):
583 "Return True if the last interesting statement closes a block."
584 self._study2()
585 return _closere(self.code, self.stmt_start) is not None
586
587 def get_last_stmt_bracketing(self):
588 """Return bracketing structure of the last interesting statement.
589
590 The returned tuple is in the format defined in _study2().
591 """
592 self._study2()
593 return self.stmt_bracketing
594
595
596 if __name__ == '__main__':
597 from unittest import main
598 main('idlelib.idle_test.test_pyparse', verbosity=2)