Mercurial > repos > jpayne > bioproject_to_srr_2
comparison idna/core.py @ 7:5eb2d5e3bf22
planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author | jpayne |
---|---|
date | Sun, 05 May 2024 23:32:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:b2745907b1eb | 7:5eb2d5e3bf22 |
---|---|
1 from . import idnadata | |
2 import bisect | |
3 import unicodedata | |
4 import re | |
5 from typing import Union, Optional | |
6 from .intranges import intranges_contain | |
7 | |
8 _virama_combining_class = 9 | |
9 _alabel_prefix = b'xn--' | |
10 _unicode_dots_re = re.compile('[\u002e\u3002\uff0e\uff61]') | |
11 | |
class IDNAError(UnicodeError):
    """Root of the exception hierarchy for every IDNA encoding/decoding problem."""
15 | |
16 | |
class IDNABidiError(IDNAError):
    """Raised when a label does not satisfy the bidirectional requirements."""
20 | |
21 | |
class InvalidCodepoint(IDNAError):
    """Raised when a disallowed or unallocated codepoint is encountered."""
25 | |
26 | |
class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not valid in the context where it appears."""
30 | |
31 | |
32 def _combining_class(cp: int) -> int: | |
33 v = unicodedata.combining(chr(cp)) | |
34 if v == 0: | |
35 if not unicodedata.name(chr(cp)): | |
36 raise ValueError('Unknown character in unicodedata') | |
37 return v | |
38 | |
def _is_script(cp: str, script: str) -> bool:
    """Tell whether the character ``cp`` lies in the ranges stored under
    ``script`` in ``idnadata.scripts``.
    """
    code_point = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(code_point, script_ranges)
41 | |
42 def _punycode(s: str) -> bytes: | |
43 return s.encode('punycode') | |
44 | |
45 def _unot(s: int) -> str: | |
46 return 'U+{:04X}'.format(s) | |
47 | |
48 | |
def valid_label_length(label: Union[bytes, str]) -> bool:
    """Return True when ``label`` is at most 63 items long, False otherwise."""
    return len(label) <= 63
53 | |
54 | |
def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
    """Return True when ``label`` fits the overall length limit.

    The limit is 254 when ``trailing_dot`` is true (the extra item accounts
    for the final dot) and 253 otherwise.
    """
    length = len(label)
    limit = 254 if trailing_dot else 253
    return length <= limit
59 | |
60 | |
def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate ``label`` against the six Bidi rules named in the comments.

    The rules are only enforced when the label contains at least one
    character of directionality R, AL or AN, or when ``check_ltr`` is true.

    Returns:
        True when the label passes (or when the rules do not apply).

    Raises:
        IDNABidiError: if a character has no known directionality or any of
            the rules is violated.
    """
    # Bidi rules should only be applied if string contains RTL characters
    has_rtl = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == '':
            # String likely comes from a newer version of Unicode
            raise IDNABidiError('Unknown directionality in label {} at position {}'.format(repr(label), position))
        if bidi_class in ('R', 'AL', 'AN'):
            has_rtl = True
    if not (has_rtl or check_ltr):
        return True

    # Bidi rule 1: the first character decides the direction of the label
    first_class = unicodedata.bidirectional(label[0])
    if first_class in ('R', 'AL'):
        rtl = True
    elif first_class == 'L':
        rtl = False
    else:
        raise IDNABidiError('First codepoint in label {} must be directionality L, R or AL'.format(repr(label)))

    if rtl:
        # Bidi rule 2 (allowed classes) and rule 3 (classes allowed to end the label)
        permitted = ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
        closing = ('R', 'AL', 'EN', 'AN')
        bad_class_message = 'Invalid direction for codepoint at position {} in a right-to-left label'
    else:
        # Bidi rule 5 (allowed classes) and rule 6 (classes allowed to end the label)
        permitted = ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
        closing = ('L', 'EN')
        bad_class_message = 'Invalid direction for codepoint at position {} in a left-to-right label'

    ends_properly = False
    number_type = None  # type: Optional[str]
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)

        if bidi_class not in permitted:
            raise IDNABidiError(bad_class_message.format(position))

        # Trailing NSM characters do not change whether the ending is valid
        if bidi_class in closing:
            ends_properly = True
        elif bidi_class != 'NSM':
            ends_properly = False

        # Bidi rule 4: a right-to-left label may not mix AN and EN numerals
        if rtl and bidi_class in ('AN', 'EN'):
            if not number_type:
                number_type = bidi_class
            elif number_type != bidi_class:
                raise IDNABidiError('Can not mix numeral types in a right-to-left label')

    if not ends_properly:
        raise IDNABidiError('Label ends with illegal codepoint directionality')

    return True
118 | |
119 | |
def check_initial_combiner(label: str) -> bool:
    """Reject labels whose first character is a combining mark.

    Returns:
        True when the first character's general category is not in the
        ``M`` (mark) group.

    Raises:
        IDNAError: if the label starts with a mark character.
    """
    first_category = unicodedata.category(label[0])
    if first_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
124 | |
125 | |
def check_hyphen_ok(label: str) -> bool:
    """Check the hyphen placement restrictions on ``label``.

    Uses ``startswith``/``endswith`` instead of indexing, so an empty label
    no longer triggers a bare ``IndexError``. An empty label contains no
    hyphen and therefore passes this particular check.

    Returns:
        True when the label passes.

    Raises:
        IDNAError: if the 3rd and 4th characters are both hyphens, or the
            label starts or ends with a hyphen.
    """
    if label[2:4] == '--':
        raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
    if label.startswith('-') or label.endswith('-'):
        raise IDNAError('Label must not start or end with a hyphen')
    return True
132 | |
133 | |
def check_nfc(label: str) -> None:
    """Ensure ``label`` is already in Unicode Normalization Form C.

    Raises:
        IDNAError: if NFC-normalizing the label would change it.
    """
    normalized = unicodedata.normalize('NFC', label)
    if normalized == label:
        return
    raise IDNAError('Label must be in Normalization Form C')
137 | |
138 | |
def valid_contextj(label: str, pos: int) -> bool:
    """Decide whether the joiner at ``label[pos]`` is allowed in its context.

    Only U+200C and U+200D are handled; any other character yields False.

    * Either joiner is accepted when the preceding character has the virama
      combining class.
    * U+200C is additionally accepted when, skipping characters of joining
      type T, the nearest character before it has joining type L or D and
      the nearest character after it has joining type R or D (joining types
      are looked up in ``idnadata.joining_types``).
    """
    cp_value = ord(label[pos])
    if cp_value not in (0x200c, 0x200d):
        return False

    # Shared rule: a joiner directly after a virama is always fine
    if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
        return True
    if cp_value == 0x200d:
        return False

    transparent = ord('T')

    # Walk backwards over transparent characters to the nearest joining one
    joined_before = False
    for index in range(pos - 1, -1, -1):
        joining_type = idnadata.joining_types.get(ord(label[index]))
        if joining_type == transparent:
            continue
        joined_before = joining_type in (ord('L'), ord('D'))
        break
    if not joined_before:
        return False

    # Same walk forwards; the first non-transparent character decides
    for index in range(pos + 1, len(label)):
        joining_type = idnadata.joining_types.get(ord(label[index]))
        if joining_type == transparent:
            continue
        return joining_type in (ord('R'), ord('D'))
    return False
184 | |
185 | |
def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Decide whether the character at ``label[pos]`` is allowed in its context.

    Handled characters and their conditions:

    * U+00B7: must sit between two ``l`` (U+006C) characters.
    * U+0375: the following character must be in the Greek script.
    * U+05F3 / U+05F4: the preceding character must be in the Hebrew script.
    * U+30FB: some other character of the label must be Hiragana, Katakana
      or Han.
    * U+0660..U+0669: the label must contain no U+06F0..U+06F9 character.
    * U+06F0..U+06F9: the label must contain no U+0660..U+0669 character.

    Any other character yields False. ``exception`` is accepted but not used.
    """
    cp_value = ord(label[pos])

    if cp_value == 0x00b7:
        if not 0 < pos < len(label) - 1:
            return False
        return label[pos - 1] == 'l' and label[pos + 1] == 'l'

    if cp_value == 0x0375:
        if pos < len(label) - 1 and len(label) > 1:
            return _is_script(label[pos + 1], 'Greek')
        return False

    if cp_value in (0x05f3, 0x05f4):
        if pos > 0:
            return _is_script(label[pos - 1], 'Hebrew')
        return False

    if cp_value == 0x30fb:
        return any(
            char != '\u30fb' and (
                _is_script(char, 'Hiragana')
                or _is_script(char, 'Katakana')
                or _is_script(char, 'Han')
            )
            for char in label
        )

    if 0x660 <= cp_value <= 0x669:
        return not any(0x6f0 <= ord(char) <= 0x06f9 for char in label)

    if 0x6f0 <= cp_value <= 0x6f9:
        return not any(0x660 <= ord(char) <= 0x0669 for char in label)

    return False
226 | |
227 | |
def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Validate a single label; return None on success, raise on failure.

    Byte input is decoded as UTF-8 first. The label must be non-empty, pass
    the NFC, hyphen and initial-combiner checks, consist solely of PVALID
    codepoints or of CONTEXTJ/CONTEXTO codepoints whose context rule holds,
    and finally pass the Bidi check.

    Raises:
        IDNAError: if the label is empty (or from one of the helper checks).
        InvalidCodepointContext: if a CONTEXTJ/CONTEXTO codepoint appears in
            a context where it is not allowed.
        InvalidCodepoint: if a codepoint is in none of the accepted classes.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode('utf-8')
    if not label:
        raise IDNAError('Empty Label')

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    classes = idnadata.codepoint_classes
    for index, char in enumerate(label):
        code_point = ord(char)

        if intranges_contain(code_point, classes['PVALID']):
            continue

        if intranges_contain(code_point, classes['CONTEXTJ']):
            if valid_contextj(label, index):
                continue
            raise InvalidCodepointContext('Joiner {} not allowed at position {} in {}'.format(
                _unot(code_point), index + 1, repr(label)))

        if intranges_contain(code_point, classes['CONTEXTO']):
            if valid_contexto(label, index):
                continue
            raise InvalidCodepointContext('Codepoint {} not allowed at position {} in {}'.format(_unot(code_point), index + 1, repr(label)))

        raise InvalidCodepoint('Codepoint {} at position {} of {} not allowed'.format(_unot(code_point), index + 1, repr(label)))

    check_bidi(label)
253 | |
254 | |
def alabel(label: str) -> bytes:
    """Convert one label to its A-label (ASCII) byte form.

    A label that is already pure ASCII is validated through ``ulabel`` and
    returned as ASCII bytes. Any other label is validated with
    ``check_label`` and returned as the ``xn--`` prefix followed by its
    Punycode encoding.

    Raises:
        IDNAError: if the resulting label fails the label length check, or
            from the validation helpers.
    """
    try:
        ascii_label = label.encode('ascii')
        ulabel(ascii_label)
        if valid_label_length(ascii_label):
            return ascii_label
        raise IDNAError('Label too long')
    except UnicodeEncodeError:
        # Not representable in ASCII: fall through to the Punycode path.
        pass

    check_label(label)
    encoded = _alabel_prefix + _punycode(label)

    if valid_label_length(encoded):
        return encoded
    raise IDNAError('Label too long')
272 | |
273 | |
def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Convert one label to its U-label (Unicode) string form.

    * A ``str`` that is not pure ASCII is validated and returned unchanged.
    * ASCII input (``str`` or bytes) is lower-cased. Without the ``xn--``
      prefix it is validated and returned as text; with the prefix, the
      remainder is Punycode-decoded, validated and returned.

    Raises:
        IDNAError: if nothing follows the prefix, the A-label ends with a
            hyphen, the Punycode payload cannot be decoded, or validation
            of the label fails.
    """
    if isinstance(label, (bytes, bytearray)):
        raw = label
    else:
        try:
            raw = label.encode('ascii')
        except UnicodeEncodeError:
            check_label(label)
            return label

    raw = raw.lower()
    if not raw.startswith(_alabel_prefix):
        check_label(raw)
        return raw.decode('ascii')

    payload = raw[len(_alabel_prefix):]
    if not payload:
        raise IDNAError('Malformed A-label, no Punycode eligible content found')
    if payload.decode('ascii')[-1] == '-':
        raise IDNAError('A-label must not end with a hyphen')

    try:
        decoded = payload.decode('punycode')
    except UnicodeError:
        raise IDNAError('Invalid A-label')
    check_label(decoded)
    return decoded
301 | |
302 | |
def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Re-map the characters in the string according to UTS46 processing.

    Each character's row is looked up in ``uts46data``: by direct index for
    code points below 256, by bisection otherwise. A row holds a status at
    index 1 and, when it has three items, a replacement string at index 2.
    The character is then kept, replaced, dropped (status ``'I'``) or
    rejected depending on the status and the two flags.
    NOTE(review): the status letters presumably follow the UTS #46 mapping
    table (V valid, M mapped, D deviation, 3 disallowed-STD3, I ignored);
    confirm against ``uts46data``.

    Args:
        domain: the text to re-map.
        std3_rules: when false, status ``'3'`` characters are kept or
            replaced instead of rejected.
        transitional: when true, status ``'D'`` characters are replaced
            instead of kept.

    Returns:
        The re-mapped text, normalized to NFC.

    Raises:
        InvalidCodepoint: if a character is not allowed.
    """
    from .uts46data import uts46data
    # Collect the pieces and join once, rather than growing a string with
    # repeated concatenation inside the loop.
    pieces = []

    for pos, char in enumerate(domain):
        code_point = ord(char)
        try:
            uts46row = uts46data[code_point if code_point < 256 else
                bisect.bisect_left(uts46data, (code_point, 'Z')) - 1]
            status = uts46row[1]
            replacement = None  # type: Optional[str]
            if len(uts46row) == 3:
                replacement = uts46row[2]
            if (status == 'V' or
                    (status == 'D' and not transitional) or
                    (status == '3' and not std3_rules and replacement is None)):
                pieces.append(char)
            elif replacement is not None and (status == 'M' or
                    (status == '3' and not std3_rules) or
                    (status == 'D' and transitional)):
                pieces.append(replacement)
            elif status != 'I':
                # Routed through the handler below so every rejection
                # produces the same InvalidCodepoint error.
                raise IndexError()
        except IndexError:
            raise InvalidCodepoint(
                'Codepoint {} not allowed at position {} in {}'.format(
                _unot(code_point), pos + 1, repr(domain)))

    return unicodedata.normalize('NFC', ''.join(pieces))
333 | |
334 | |
def encode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False, transitional: bool = False) -> bytes:
    """Encode a domain name into its A-label (ASCII) byte form.

    Args:
        s: the domain; non-``str`` input is decoded as ASCII first.
        strict: when true only ``.`` separates labels; otherwise every
            character matched by ``_unicode_dots_re`` does.
        uts46: when true the domain is first passed through ``uts46_remap``.
        std3_rules: forwarded to ``uts46_remap``.
        transitional: forwarded to ``uts46_remap``.

    Returns:
        The encoded labels joined with ``b'.'``; a trailing dot in the input
        is preserved.

    Raises:
        IDNAError: if byte input is not ASCII, the domain or a label is
            empty, the result is too long, or a label fails to encode.
    """
    if not isinstance(s, str):
        try:
            s = str(s, 'ascii')
        except UnicodeDecodeError:
            raise IDNAError('should pass a unicode string to the function rather than a byte string.')
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    labels = s.split('.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # An empty final label means the domain ended with a separator
    trailing_dot = labels[-1] == ''
    if trailing_dot:
        del labels[-1]

    encoded_labels = []
    for label in labels:
        encoded = alabel(label)
        if not encoded:
            raise IDNAError('Empty label')
        encoded_labels.append(encoded)
    if trailing_dot:
        encoded_labels.append(b'')

    domain = b'.'.join(encoded_labels)
    if not valid_string_length(domain, trailing_dot):
        raise IDNAError('Domain too long')
    return domain
366 | |
367 | |
def decode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False) -> str:
    """Decode a domain name into its U-label (Unicode) string form.

    Args:
        s: the domain; non-``str`` input is decoded as ASCII first.
        strict: when true only ``.`` separates labels; otherwise every
            character matched by ``_unicode_dots_re`` does.
        uts46: when true the domain is first passed through ``uts46_remap``
            (always non-transitional).
        std3_rules: forwarded to ``uts46_remap``.

    Returns:
        The decoded labels joined with ``.``; a trailing dot in the input is
        preserved.

    Raises:
        IDNAError: if byte input is not ASCII, the domain or a label is
            empty, or a label fails to decode.
    """
    try:
        if not isinstance(s, str):
            s = str(s, 'ascii')
    except UnicodeDecodeError:
        raise IDNAError('Invalid ASCII in A-label')
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split('.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # An empty final label means the domain ended with a separator
    trailing_dot = not labels[-1]
    if trailing_dot:
        del labels[-1]

    decoded_labels = []
    for label in labels:
        decoded = ulabel(label)
        if not decoded:
            raise IDNAError('Empty label')
        decoded_labels.append(decoded)
    if trailing_dot:
        decoded_labels.append('')
    return '.'.join(decoded_labels)