Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/idlelib/pyparse.py @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 """Define partial Python code Parser used by editor and hyperparser. | |
2 | |
3 Instances of ParseMap are used with str.translate. | |
4 | |
The following bound search and match functions are defined:
_synchre - start of popular statement;
_junkre - whitespace or comment line;
_match_stringre - string, possibly without closer;
_itemre - line that may have bracket structure start;
_closere - line that must be followed by dedent;
_chew_ordinaryre - non-special characters.
12 """ | |
13 import re | |
14 | |
# Why the last statement is continued onto the next line
# (C_NONE when it is not continued at all).
C_NONE = 0                # statement is complete
C_BACKSLASH = 1           # line ends with a backslash-newline
C_STRING_FIRST_LINE = 2   # inside an open string, still on its first line
C_STRING_NEXT_LINES = 3   # inside an open string, past its first line
C_BRACKET = 4             # inside an unclosed (, [ or {
18 | |
# Find what looks like the start of a popular statement.
#
# Bound ``search`` of a MULTILINE pattern: optional leading blanks/tabs at
# the start of any line, followed by one of the listed keywords as a whole
# word.  Note that the list does not contain ``if``, ``for`` or ``with``.
# Called as _synchre(text[, pos[, endpos]]); returns a match object or None.

_synchre = re.compile(r"""
    ^
    [ \t]*
    (?: while
    |   else
    |   def
    |   return
    |   assert
    |   break
    |   class
    |   continue
    |   elif
    |   try
    |   except
    |   raise
    |   import
    |   yield
    )
    \b
""", re.VERBOSE | re.MULTILINE).search
41 | |
# Match blank line or non-indenting comment line.
#
# Bound ``match``: optional blanks/tabs, then optionally a comment whose
# '#' is immediately followed by a non-whitespace character, then the
# newline.  A comment written as "# text" (space after '#') therefore does
# NOT match.

_junkre = re.compile(r"""
    [ \t]*
    (?: \# \S .* )?
    \n
""", re.VERBOSE).match
49 | |
# Match any flavor of string; the terminating quote is optional
# so that we're robust in the face of incomplete program text.
#
# The four alternatives are, in order: triple-double-quoted,
# double-quoted, triple-single-quoted and single-quoted.  The
# single-line forms stop before an unescaped newline; the triple-quoted
# forms may span lines.  DOTALL lets a backslash escape a newline too.
# Bound ``match``: must be called with pos at the opening quote.

_match_stringre = re.compile(r"""
    \""" [^"\\]* (?:
                     (?: \\. | "(?!"") )
                     [^"\\]*
                 )*
    (?: \""" )?

|   " [^"\\\n]* (?: \\. [^"\\\n]* )* "?

|   ''' [^'\\]* (?:
                   (?: \\. | '(?!'') )
                   [^'\\]*
                )*
    (?: ''' )?

|   ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
""", re.VERBOSE | re.DOTALL).match
70 | |
# Match a line that starts with something interesting;
# used to find the first item of a bracket structure.
#
# Bound ``match``: skips blanks/tabs and then requires one character that
# is not whitespace, '#' or a backslash.  On success m.end()-1 is the index
# of that character; on a blank, comment-only or backslash-only remainder
# the result is None.

_itemre = re.compile(r"""
    [ \t]*
    [^\s#\\]    # if we match, m.end()-1 is the interesting char
""", re.VERBOSE).match
78 | |
# Match start of statements that should be followed by a dedent.
#
# Bound ``match``: optional whitespace (``\s*``, so newlines are skipped
# as well) followed by one of return/break/continue/raise/pass as a
# whole word.

_closere = re.compile(r"""
    \s*
    (?: return
    |   break
    |   continue
    |   raise
    |   pass
    )
    \b
""", re.VERBOSE).match
91 | |
# Chew up non-special chars as quickly as possible.  If match is
# successful, m.end() less 1 is the index of the last boring char
# matched.  If match is unsuccessful, the string starts with an
# interesting char.
#
# "Special" here means any of: [ ] ( ) { } # ' " and backslash.

_chew_ordinaryre = re.compile(r"""
    [^[\](){}#'"\\]+
""", re.VERBOSE).match
100 | |
101 | |
class ParseMap(dict):
    r"""Translation table whose missing entries all resolve to 'x'.

    Intended as the argument of str.translate (see Parser._study1):
    code points that were given an explicit mapping are translated as
    requested, every other code point is turned into 'x'.  Looking up a
    missing key does not add it to the dict.

    Example: keep whitespace, replace everything else with 'x'.

    >>> keepwhite = ParseMap((ord(c), ord(c)) for c in ' \t\n\r')
    >>> "a + b\tc\nd".translate(keepwhite)
    'x x x\tx\nx'
    """

    # Going through __missing__ triples access time (see bpo-32940), so
    # frequently used keys are better stored explicitly.
    def __missing__(self, key):
        """Return the code point of 'x' for any key not in the dict."""
        return 120  # ord('x')
116 | |
117 | |
# Translation table used by Parser._study1.  Every ASCII code point gets an
# explicit entry (120 == ord('x')) so that ParseMap.__missing__ is only hit
# for non-ASCII text; the interesting characters are then overridden.
trans = ParseMap.fromkeys(range(128), 120)
trans.update(dict.fromkeys(map(ord, "({["), ord('(')))  # open brackets => '('
trans.update(dict.fromkeys(map(ord, ")}]"), ord(')')))  # close brackets => ')'
trans.update({ord(c): ord(c) for c in "\"'\\\n#"})      # keep these as-is
123 | |
124 | |
class Parser:
    """Analyze the end of a chunk of Python source text.

    Usage: call set_code(), optionally narrow the text with
    find_good_parse_start() + set_lo(), then call the query methods
    (get_continuation_type, compute_bracket_indent, is_block_opener, ...).
    The queries run the two analysis passes (_study1, _study2) on demand;
    results are cached in study_level and recomputed after the code
    changes.
    """

    def __init__(self, indentwidth, tabwidth):
        """Store the indentation settings.

        indentwidth: extra columns added by compute_bracket_indent when
            nothing follows the open bracket.
        tabwidth: tab size passed to str.expandtabs when measuring columns.
        """
        self.indentwidth = indentwidth
        self.tabwidth = tabwidth

    def set_code(self, s):
        """Set the text to analyze and discard any previous analysis.

        s must be empty or end with a newline (checked with assert).
        """
        assert len(s) == 0 or s[-1] == '\n'
        self.code = s
        self.study_level = 0

    def find_good_parse_start(self, is_char_in_string=None,
                              _synchre=_synchre):
        """
        Return index of a good place to begin parsing, as close to the
        end of the string as possible.  This will be the start of some
        popular stmt like "while" or "def" (see _synchre for the list).
        Return None if none found:  the caller should pass more prior
        context then, if possible, or if not (the entire program text up
        until the point of interest has already been tried) pass 0 to
        set_lo().

        This will be reliable iff given a reliable is_char_in_string()
        function, meaning that when it says "no", it's absolutely
        guaranteed that the char is not in a string.

        is_char_in_string: callable taking an index into the code and
            returning a true value if that character is inside a string.
            When omitted (or falsy) None is returned immediately.
        _synchre: search function; bound as a default for fast local access.
        """
        code, pos = self.code, None

        if not is_char_in_string:
            # no clue -- make the caller pass everything
            return None

        # Peek back from the end for a good place to start,
        # but don't try too often; pos will be left None, or
        # bumped to a legitimate synch point.
        limit = len(code)
        for _ in range(5):
            i = code.rfind(":\n", 0, limit)
            if i < 0:
                break
            i = code.rfind('\n', 0, i) + 1  # start of colon line (-1+1=0)
            m = _synchre(code, i, limit)
            if m and not is_char_in_string(m.start()):
                pos = m.start()
                break
            limit = i
        if pos is None:
            # Nothing looks like a block-opener, or stuff does
            # but is_char_in_string keeps returning true; most likely
            # we're in or near a giant string, the colorizer hasn't
            # caught up enough to be helpful, or there simply *aren't*
            # any interesting stmts.  In any of these cases we're
            # going to have to parse the whole thing to be sure, so
            # give it one last try from the start, but stop wasting
            # time here regardless of the outcome.
            m = _synchre(code)
            if m and not is_char_in_string(m.start()):
                pos = m.start()
            return pos

        # Peeking back worked; look forward until _synchre no longer
        # matches, keeping the last match that is not inside a string.
        i = pos + 1
        while True:
            m = _synchre(code, i)
            if not m:
                break
            start, i = m.span()
            if not is_char_in_string(start):
                pos = start
        return pos

    def set_lo(self, lo):
        """ Throw away the start of the string.

        Intended to be called with the result of find_good_parse_start().
        lo must be 0 or the index just past a newline (checked with
        assert).  When text is actually dropped, cached analysis results
        are invalidated because their offsets refer to the old text.
        """
        assert lo == 0 or self.code[lo-1] == '\n'
        if lo > 0:
            self.code = self.code[lo:]
            self.study_level = 0

    def _study1(self):
        """Find the line numbers of non-continuation lines.

        As quickly as humanly possible <wink>, find the line numbers (0-
        based) of the non-continuation lines.
        Creates self.{goodlines, continuation}.

        goodlines always ends with the total line count as a sentinel.
        continuation is one of the C_* constants describing why (or
        whether) the last statement is unfinished.
        """
        if self.study_level >= 1:
            return
        self.study_level = 1

        # Map all uninteresting characters to "x", all open brackets
        # to "(", all close brackets to ")", then collapse runs of
        # uninteresting characters.  This can cut the number of chars
        # by a factor of 10-40, and so greatly speed the following loop.
        code = self.code
        code = code.translate(trans)
        code = code.replace('xxxxxxxx', 'x')
        code = code.replace('xxxx', 'x')
        code = code.replace('xx', 'x')
        code = code.replace('xx', 'x')
        code = code.replace('\nx', '\n')
        # Replacing x\n with \n would be incorrect because
        # x may be preceded by a backslash.

        # March over the squashed version of the program, accumulating
        # the line numbers of non-continued stmts, and determining
        # whether & why the last stmt is a continuation.
        continuation = C_NONE
        level = lno = 0     # level is nesting level; lno is line number
        self.goodlines = goodlines = [0]
        push_good = goodlines.append
        i, n = 0, len(code)
        while i < n:
            ch = code[i]
            i += 1

            # cases are checked in decreasing order of frequency
            if ch == 'x':
                continue

            if ch == '\n':
                lno += 1
                if level == 0:
                    push_good(lno)
                    # else we're in an unclosed bracket structure
                continue

            if ch == '(':
                level += 1
                continue

            if ch == ')':
                if level:
                    level -= 1
                    # else the program is invalid, but we can't complain
                continue

            if ch == '"' or ch == "'":
                # consume the string
                quote = ch
                if code[i-1:i+2] == quote * 3:
                    quote = quote * 3
                firstlno = lno
                w = len(quote) - 1  # 0 for single-quoted, 2 for triple
                i += w
                while i < n:
                    ch = code[i]
                    i += 1

                    if ch == 'x':
                        continue

                    if code[i-1:i+w] == quote:
                        i += w
                        break

                    if ch == '\n':
                        lno += 1
                        if w == 0:
                            # unterminated single-quoted string
                            if level == 0:
                                push_good(lno)
                            break
                        continue

                    if ch == '\\':
                        assert i < n
                        if code[i] == '\n':
                            lno += 1
                        i += 1
                        continue

                    # else comment char or paren inside string

                else:
                    # didn't break out of the loop, so we're still
                    # inside a string
                    if (lno - 1) == firstlno:
                        # before the previous \n in code, we were in the first
                        # line of the string
                        continuation = C_STRING_FIRST_LINE
                    else:
                        continuation = C_STRING_NEXT_LINES
                continue    # with outer loop

            if ch == '#':
                # consume the comment; the newline itself is handled by
                # the next iteration of the outer loop
                i = code.find('\n', i)
                assert i >= 0
                continue

            assert ch == '\\'
            assert i < n
            if code[i] == '\n':
                lno += 1
                if i+1 == n:
                    continuation = C_BACKSLASH
            i += 1

        # The last stmt may be continued for all 3 reasons.
        # String continuation takes precedence over bracket
        # continuation, which beats backslash continuation.
        if (continuation != C_STRING_FIRST_LINE
                and continuation != C_STRING_NEXT_LINES and level > 0):
            continuation = C_BRACKET
        self.continuation = continuation

        # Push the final line number as a sentinel value, regardless of
        # whether it's continued.
        assert (continuation == C_NONE) == (goodlines[-1] == lno)
        if goodlines[-1] != lno:
            push_good(lno)

    def get_continuation_type(self):
        """Return the C_* constant describing how the last stmt continues."""
        self._study1()
        return self.continuation

    def _study2(self):
        """
        study1 was sufficient to determine the continuation status,
        but doing more requires looking at every character.  study2
        does this for the last interesting statement in the block.
        Creates:
            self.stmt_start, stmt_end
                slice indices of last interesting stmt
            self.stmt_bracketing
                the bracketing structure of the last interesting stmt, as
                a tuple of (index, nesting level) pairs; for example, for
                the statement "say(boo) or die", stmt_bracketing will be
                ((0, 0), (3, 1), (8, 0)), and for "()()" it will be
                ((0, 0), (0, 1), (2, 0), (2, 1), (4, 0)).  Strings and
                comments are treated as brackets, for the matter.
            self.lastch
                last interesting character before optional trailing comment
            self.lastopenbracketpos
                if continuation is C_BRACKET, index of last open bracket
                (None when no bracket is open)
        """
        if self.study_level >= 2:
            return
        self._study1()
        self.study_level = 2

        # Set p and q to slice indices of last interesting stmt.
        code, goodlines = self.code, self.goodlines
        i = len(goodlines) - 1  # Index of newest line.
        p = len(code)  # End of goodlines[i]
        while i:
            assert p
            # Make p be the index of the stmt at line number goodlines[i].
            # Move p back to the stmt at line number goodlines[i-1].
            q = p
            for _ in range(goodlines[i-1], goodlines[i]):
                # tricky: sets p to 0 if no preceding newline
                p = code.rfind('\n', 0, p-1) + 1
            # The stmt code[p:q] isn't a continuation, but may be blank
            # or a non-indenting comment line.
            if _junkre(code, p):
                i -= 1
            else:
                break
        if i == 0:
            # nothing but junk!
            assert p == 0
            q = p
        self.stmt_start, self.stmt_end = p, q

        # Analyze this stmt, to find the last open bracket (if any)
        # and last interesting character (if any).
        lastch = ""
        stack = []  # stack of open bracket indices
        push_stack = stack.append
        bracketing = [(p, 0)]
        while p < q:
            # suck up all except ()[]{}'"#\\
            m = _chew_ordinaryre(code, p, q)
            if m:
                # we skipped at least one boring char
                newp = m.end()
                # back up over totally boring whitespace
                i = newp - 1    # index of last boring char
                while i >= p and code[i] in " \t\n":
                    i -= 1
                if i >= p:
                    lastch = code[i]
                p = newp
                if p >= q:
                    break

            ch = code[p]

            if ch in "([{":
                push_stack(p)
                bracketing.append((p, len(stack)))
                lastch = ch
                p += 1
                continue

            if ch in ")]}":
                if stack:
                    del stack[-1]
                lastch = ch
                p += 1
                bracketing.append((p, len(stack)))
                continue

            if ch == '"' or ch == "'":
                # consume string
                # Note that study1 did this with a Python loop, but
                # we use a regexp here; the reason is speed in both
                # cases; the string may be huge, but study1 pre-squashed
                # strings to a couple of characters per line.  study1
                # also needed to keep track of newlines, and we don't
                # have to.
                bracketing.append((p, len(stack)+1))
                lastch = ch
                p = _match_stringre(code, p, q).end()
                bracketing.append((p, len(stack)))
                continue

            if ch == '#':
                # consume comment and trailing newline
                bracketing.append((p, len(stack)+1))
                p = code.find('\n', p, q) + 1
                assert p > 0
                bracketing.append((p, len(stack)))
                continue

            assert ch == '\\'
            p += 1      # beyond backslash
            assert p < q
            if code[p] != '\n':
                # the program is invalid, but can't complain
                lastch = ch + code[p]
            p += 1      # beyond escaped char

        # end while p < q:

        self.lastch = lastch
        self.lastopenbracketpos = stack[-1] if stack else None
        self.stmt_bracketing = tuple(bracketing)

    def compute_bracket_indent(self):
        """Return number of spaces the next line should be indented.

        Line continuation must be C_BRACKET.

        If something follows the last open bracket, the result is the
        column of that first item; otherwise it is the indentation of the
        bracket's line plus indentwidth.
        """
        self._study2()
        assert self.continuation == C_BRACKET
        j = self.lastopenbracketpos
        code = self.code
        n = len(code)
        origi = i = code.rfind('\n', 0, j) + 1
        j += 1      # one beyond open bracket
        # find first list item; set i to start of its line
        while j < n:
            m = _itemre(code, j)
            if m:
                j = m.end() - 1     # index of first interesting char
                extra = 0
                break
            else:
                # this line is junk; advance to next line
                i = j = code.find('\n', j) + 1
        else:
            # nothing interesting follows the bracket;
            # reproduce the bracket line's indentation + a level
            j = i = origi
            while code[j] in " \t":
                j += 1
            extra = self.indentwidth
        return len(code[i:j].expandtabs(self.tabwidth)) + extra

    def get_num_lines_in_stmt(self):
        """Return number of physical lines in last stmt.

        The statement doesn't have to be an interesting statement.  This is
        intended to be called when continuation is C_BACKSLASH.
        Returns 0 for empty code.
        """
        self._study1()
        goodlines = self.goodlines
        if len(goodlines) < 2:
            # Only the initial [0] entry: the code is empty.
            return 0
        return goodlines[-1] - goodlines[-2]

    def compute_backslash_indent(self):
        """Return number of spaces the next line should be indented.

        Line continuation must be C_BACKSLASH.  Also assume that the new
        line is the first one following the initial line of the stmt.

        The result lines the continuation up one column past an
        assignment '=' on the first line if one is found (and something
        other than the backslash follows it), else one column past the
        first run of non-whitespace characters.
        """
        self._study2()
        assert self.continuation == C_BACKSLASH
        code = self.code
        i = self.stmt_start
        while code[i] in " \t":
            i += 1
        startpos = i

        # See whether the initial line starts an assignment stmt; i.e.,
        # look for an = operator
        endpos = code.find('\n', startpos) + 1
        found = level = 0
        while i < endpos:
            ch = code[i]
            if ch in "([{":
                level += 1
                i += 1
            elif ch in ")]}":
                if level:
                    level -= 1
                i += 1
            elif ch == '"' or ch == "'":
                i = _match_stringre(code, i, endpos).end()
            elif ch == '#':
                # This line is unreachable because the # makes a comment of
                # everything after it.
                break
            elif (level == 0 and ch == '='
                  and (i == 0 or code[i-1] not in "=<>!")
                  and code[i+1] != '='):
                found = 1
                break
            else:
                i += 1

        if found:
            # found a legit =, but it may be the last interesting
            # thing on the line
            i += 1      # move beyond the =
            found = re.match(r"\s*\\", code[i:endpos]) is None

        if not found:
            # oh well ... settle for moving beyond the first chunk
            # of non-whitespace chars
            i = startpos
            while code[i] not in " \t\n":
                i += 1

        return len(code[self.stmt_start:i].expandtabs(self.tabwidth)) + 1

    def get_base_indent_string(self):
        """Return the leading whitespace on the initial line of the last
        interesting stmt.
        """
        self._study2()
        i, n = self.stmt_start, self.stmt_end
        j = i
        code = self.code
        while j < n and code[j] in " \t":
            j += 1
        return code[i:j]

    def is_block_opener(self):
        "Return True if the last interesting statement opens a block."
        self._study2()
        return self.lastch == ':'

    def is_block_closer(self):
        "Return True if the last interesting statement closes a block."
        self._study2()
        return _closere(self.code, self.stmt_start) is not None

    def get_last_stmt_bracketing(self):
        """Return bracketing structure of the last interesting statement.

        The returned tuple is in the format defined in _study2().
        """
        self._study2()
        return self.stmt_bracketing
594 | |
595 | |
# Running this file directly executes the module's unit tests.
if __name__ == '__main__':
    import unittest
    unittest.main('idlelib.idle_test.test_pyparse', verbosity=2)