annotate idna/core.py @ 14:18e1cb6018fd

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Mon, 20 May 2024 02:25:23 -0400
parents 5eb2d5e3bf22
children
rev   line source
jpayne@7 1 from . import idnadata
jpayne@7 2 import bisect
jpayne@7 3 import unicodedata
jpayne@7 4 import re
jpayne@7 5 from typing import Union, Optional
jpayne@7 6 from .intranges import intranges_contain
jpayne@7 7
# Canonical combining class value that the joiner checks in this module
# compare against to detect a preceding virama.
_virama_combining_class = 9
# Byte prefix that identifies a Punycode-encoded label (an A-label).
_alabel_prefix = b'xn--'
# Label separators recognised in non-strict mode: U+002E FULL STOP,
# U+3002 IDEOGRAPHIC FULL STOP, U+FF0E FULLWIDTH FULL STOP and
# U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP.
_unicode_dots_re = re.compile('[\u002e\u3002\uff0e\uff61]')
jpayne@7 11
class IDNAError(UnicodeError):
    """Root of the exception hierarchy raised for IDNA encoding problems."""
jpayne@7 15
jpayne@7 16
class IDNABidiError(IDNAError):
    """Raised when a label violates the bidirectional text requirements."""
jpayne@7 20
jpayne@7 21
class InvalidCodepoint(IDNAError):
    """Raised when a disallowed or unallocated codepoint is encountered."""
jpayne@7 25
jpayne@7 26
class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not valid in the context where it appears."""
jpayne@7 30
jpayne@7 31
jpayne@7 32 def _combining_class(cp: int) -> int:
jpayne@7 33 v = unicodedata.combining(chr(cp))
jpayne@7 34 if v == 0:
jpayne@7 35 if not unicodedata.name(chr(cp)):
jpayne@7 36 raise ValueError('Unknown character in unicodedata')
jpayne@7 37 return v
jpayne@7 38
def _is_script(cp: str, script: str) -> bool:
    """Tell whether the single character *cp* falls in the ranges of *script*.

    *script* is used as a key into ``idnadata.scripts``.
    """
    codepoint = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(codepoint, script_ranges)
jpayne@7 41
jpayne@7 42 def _punycode(s: str) -> bytes:
jpayne@7 43 return s.encode('punycode')
jpayne@7 44
jpayne@7 45 def _unot(s: int) -> str:
jpayne@7 46 return 'U+{:04X}'.format(s)
jpayne@7 47
jpayne@7 48
def valid_label_length(label: Union[bytes, str]) -> bool:
    """Return True when *label* is at most 63 units long, False otherwise."""
    return not len(label) > 63
jpayne@7 53
jpayne@7 54
def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
    """Return True when the whole name fits within the length limit.

    The limit is 254 when *trailing_dot* is true and 253 otherwise.
    """
    if trailing_dot:
        limit = 254
    else:
        limit = 253
    return not len(label) > limit
jpayne@7 59
jpayne@7 60
def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate *label* against the numbered Bidi rules 1 to 6.

    The rules are only enforced when the label contains a character of
    bidirectional class R, AL or AN, or when *check_ltr* is true.

    :param label: The label to validate.
    :param check_ltr: Apply the rules even if the label has no RTL character.
    :returns: True when the label passes (or the rules do not apply).
    :raises IDNABidiError: On unknown directionality or any rule violation.
    """
    # Decide whether the Bidi rules apply at all.
    has_rtl = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if not bidi_class:
            # String likely comes from a newer version of Unicode
            raise IDNABidiError('Unknown directionality in label {} at position {}'.format(repr(label), position))
        if bidi_class in ('R', 'AL', 'AN'):
            has_rtl = True
    if not (has_rtl or check_ltr):
        return True

    # Bidi rule 1: the first character fixes the direction of the label.
    first_class = unicodedata.bidirectional(label[0])
    if first_class == 'L':
        rtl = False
    elif first_class in ('R', 'AL'):
        rtl = True
    else:
        raise IDNABidiError('First codepoint in label {} must be directionality L, R or AL'.format(repr(label)))

    if rtl:
        # Bidi rules 2 (allowed classes) and 3 (allowed endings)
        permitted = ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
        endings = ('R', 'AL', 'EN', 'AN')
        bad_class_message = 'Invalid direction for codepoint at position {} in a right-to-left label'
    else:
        # Bidi rules 5 (allowed classes) and 6 (allowed endings)
        permitted = ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
        endings = ('L', 'EN')
        bad_class_message = 'Invalid direction for codepoint at position {} in a left-to-right label'

    ends_ok = False
    seen_number = None  # type: Optional[str]
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)

        if bidi_class not in permitted:
            raise IDNABidiError(bad_class_message.format(position))

        # Trailing NSM characters do not alter whether the ending is valid.
        if bidi_class in endings:
            ends_ok = True
        elif bidi_class != 'NSM':
            ends_ok = False

        # Bidi rule 4: an RTL label may use EN or AN digits, but not both.
        if rtl and bidi_class in ('AN', 'EN'):
            if not seen_number:
                seen_number = bidi_class
            elif seen_number != bidi_class:
                raise IDNABidiError('Can not mix numeral types in a right-to-left label')

    if not ends_ok:
        raise IDNABidiError('Label ends with illegal codepoint directionality')

    return True
jpayne@7 118
jpayne@7 119
def check_initial_combiner(label: str) -> bool:
    """Reject labels whose first character is in a Mark ('M*') category.

    :returns: True when the first character is not a combining mark.
    :raises IDNAError: When the label begins with a combining character.
    """
    general_category = unicodedata.category(label[0])
    if general_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
jpayne@7 124
jpayne@7 125
def check_hyphen_ok(label: str) -> bool:
    """Check the hyphen placement restrictions on *label*.

    :returns: True when the hyphen placement is acceptable.
    :raises IDNAError: When positions 3 and 4 are both hyphens, or when the
        label starts or ends with a hyphen.
    """
    third_and_fourth = label[2:4]
    if third_and_fourth == '--':
        raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
    if '-' in (label[0], label[-1]):
        raise IDNAError('Label must not start or end with a hyphen')
    return True
jpayne@7 132
jpayne@7 133
def check_nfc(label: str) -> None:
    """Raise IDNAError unless *label* is already in Normalization Form C."""
    normalized = unicodedata.normalize('NFC', label)
    if normalized == label:
        return
    raise IDNAError('Label must be in Normalization Form C')
jpayne@7 137
jpayne@7 138
def valid_contextj(label: str, pos: int) -> bool:
    """Check whether the joiner at ``label[pos]`` is allowed in its context.

    Only U+200C and U+200D are handled; any other codepoint yields False.

    :param label: The label being validated.
    :param pos: Index of the joiner within *label*.
    :returns: True when the contextual rule is satisfied, False otherwise.
    """
    codepoint = ord(label[pos])
    if codepoint != 0x200c and codepoint != 0x200d:
        return False

    # Either joiner is accepted directly after a virama.
    if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
        return True

    if codepoint == 0x200d:
        return False

    # U+200C: skipping characters of joining type T, the nearest character
    # before it must have joining type L or D ...
    transparent = ord('T')
    joins_before = False
    for idx in reversed(range(pos)):
        joining = idnadata.joining_types.get(ord(label[idx]))
        if joining == transparent:
            continue
        joins_before = joining in (ord('L'), ord('D'))
        break
    if not joins_before:
        return False

    # ... and the nearest one after it must have joining type R or D.
    for idx in range(pos + 1, len(label)):
        joining = idnadata.joining_types.get(ord(label[idx]))
        if joining == transparent:
            continue
        return joining in (ord('R'), ord('D'))
    return False
jpayne@7 184
jpayne@7 185
def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Check whether the CONTEXTO codepoint at ``label[pos]`` is allowed.

    :param label: The label being validated.
    :param pos: Index of the codepoint within *label*.
    :param exception: Accepted but not used by this function.
    :returns: True when the rule for that codepoint is satisfied; False when
        it is not, or when the codepoint has no rule here.
    """
    codepoint = ord(label[pos])
    last_index = len(label) - 1

    # U+00B7: must sit between two 'l' (U+006C) characters.
    if codepoint == 0x00b7:
        if 0 < pos < last_index:
            neighbours = (ord(label[pos - 1]), ord(label[pos + 1]))
            if neighbours == (0x006c, 0x006c):
                return True
        return False

    # U+0375: the following character must be Greek.
    if codepoint == 0x0375:
        if len(label) > 1 and pos < last_index:
            return _is_script(label[pos + 1], 'Greek')
        return False

    # U+05F3 / U+05F4: the preceding character must be Hebrew.
    if codepoint in (0x05f3, 0x05f4):
        if pos > 0:
            return _is_script(label[pos - 1], 'Hebrew')
        return False

    # U+30FB: some other character must be Hiragana, Katakana or Han.
    if codepoint == 0x30fb:
        return any(
            char != '\u30fb' and (
                _is_script(char, 'Hiragana')
                or _is_script(char, 'Katakana')
                or _is_script(char, 'Han')
            )
            for char in label
        )

    # U+0660..U+0669 may not be mixed with U+06F0..U+06F9.
    if 0x660 <= codepoint <= 0x669:
        return not any(0x6f0 <= ord(char) <= 0x06f9 for char in label)

    # U+06F0..U+06F9 may not be mixed with U+0660..U+0669.
    if 0x6f0 <= codepoint <= 0x6f9:
        return not any(0x660 <= ord(char) <= 0x0669 for char in label)

    return False
jpayne@7 226
jpayne@7 227
def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Run all label-level validity checks on *label*.

    Bytes input is decoded as UTF-8 first. The checks are run in order:
    NFC form, hyphen placement, initial combiner, per-codepoint class
    membership (with contextual rules), and finally the Bidi rules.

    :raises IDNAError: For an empty label or a failed structural check.
    :raises InvalidCodepointContext: When a CONTEXTJ/CONTEXTO codepoint is
        used where its rule does not allow it.
    :raises InvalidCodepoint: When a codepoint is in none of the PVALID,
        CONTEXTJ or CONTEXTO classes.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode('utf-8')
    if not len(label):
        raise IDNAError('Empty Label')

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for index, char in enumerate(label):
        value = ord(char)
        position = index + 1  # positions in messages are 1-based

        if intranges_contain(value, idnadata.codepoint_classes['PVALID']):
            continue

        if intranges_contain(value, idnadata.codepoint_classes['CONTEXTJ']):
            if valid_contextj(label, index):
                continue
            raise InvalidCodepointContext('Joiner {} not allowed at position {} in {}'.format(
                _unot(value), position, repr(label)))

        if intranges_contain(value, idnadata.codepoint_classes['CONTEXTO']):
            if valid_contexto(label, index):
                continue
            raise InvalidCodepointContext('Codepoint {} not allowed at position {} in {}'.format(_unot(value), position, repr(label)))

        raise InvalidCodepoint('Codepoint {} at position {} of {} not allowed'.format(_unot(value), position, repr(label)))

    check_bidi(label)
jpayne@7 253
jpayne@7 254
def alabel(label: str) -> bytes:
    """Convert a single label to its ASCII (A-label) byte form.

    A label that is already ASCII is validated through ``ulabel`` and
    returned as bytes. Otherwise it is checked with ``check_label``,
    Punycode-encoded, and given the A-label prefix.

    :raises IDNAError: When validation fails or the result exceeds the
        label length limit.
    """
    try:
        ascii_form = label.encode('ascii')
        ulabel(ascii_form)
        if valid_label_length(ascii_form):
            return ascii_form
        raise IDNAError('Label too long')
    except UnicodeEncodeError:
        # Not pure ASCII: fall through to Punycode encoding.
        pass

    check_label(label)
    encoded = _alabel_prefix + _punycode(label)

    if valid_label_length(encoded):
        return encoded
    raise IDNAError('Label too long')
jpayne@7 272
jpayne@7 273
def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Convert a single label to its Unicode (U-label) string form.

    A non-ASCII string is validated and returned unchanged. ASCII input is
    lower-cased; if it carries the A-label prefix the remainder is
    Punycode-decoded and validated, otherwise it is validated as is.

    :raises IDNAError: For a malformed A-label or a failed validation.
    """
    if isinstance(label, (bytes, bytearray)):
        raw = label
    else:
        try:
            raw = label.encode('ascii')
        except UnicodeEncodeError:
            check_label(label)
            return label

    raw = raw.lower()
    if not raw.startswith(_alabel_prefix):
        check_label(raw)
        return raw.decode('ascii')

    payload = raw[len(_alabel_prefix):]
    if not payload:
        raise IDNAError('Malformed A-label, no Punycode eligible content found')
    if payload.decode('ascii')[-1] == '-':
        raise IDNAError('A-label must not end with a hyphen')

    try:
        decoded = payload.decode('punycode')
    except UnicodeError:
        raise IDNAError('Invalid A-label')
    check_label(decoded)
    return decoded
jpayne@7 301
jpayne@7 302
def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Re-map the characters in the string according to UTS46 processing.

    Each character is looked up in the ``uts46data`` table and is kept,
    replaced, or dropped depending on its status code and the two flags.
    The joined result is returned in NFC form.

    :raises InvalidCodepoint: When a character is not allowed, or its table
        lookup fails with IndexError.
    """
    from .uts46data import uts46data
    pieces = []

    for index, char in enumerate(domain):
        code_point = ord(char)
        try:
            # The first 256 rows are indexed directly by codepoint; beyond
            # that the table is searched by bisection.
            if code_point < 256:
                row_index = code_point
            else:
                row_index = bisect.bisect_left(uts46data, (code_point, 'Z')) - 1
            row = uts46data[row_index]
            status = row[1]
            replacement = row[2] if len(row) == 3 else None  # type: Optional[str]

            keep_unchanged = (
                status == 'V'
                or (status == 'D' and not transitional)
                or (status == '3' and not std3_rules and replacement is None)
            )
            use_replacement = replacement is not None and (
                status == 'M'
                or (status == '3' and not std3_rules)
                or (status == 'D' and transitional)
            )

            if keep_unchanged:
                pieces.append(char)
            elif use_replacement:
                pieces.append(replacement)
            elif status != 'I':
                # Status 'I' is dropped silently; anything else is rejected
                # through the handler below.
                raise IndexError()
        except IndexError:
            raise InvalidCodepoint(
                'Codepoint {} not allowed at position {} in {}'.format(
                _unot(code_point), index + 1, repr(domain)))

    return unicodedata.normalize('NFC', ''.join(pieces))
jpayne@7 333
jpayne@7 334
def encode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False, transitional: bool = False) -> bytes:
    """Encode a domain name to its ASCII byte form, label by label.

    :param s: The domain; bytes input must be ASCII-decodable.
    :param strict: Split labels on '.' only, not on the other Unicode dots.
    :param uts46: Run ``uts46_remap`` on the input first.
    :param std3_rules: Passed through to ``uts46_remap``.
    :param transitional: Passed through to ``uts46_remap``.
    :returns: The encoded labels joined with ``b'.'``; a trailing dot in the
        input is preserved.
    :raises IDNAError: For non-ASCII bytes input, an empty domain or label,
        an invalid label, or a result that is too long.
    """
    if not isinstance(s, str):
        try:
            s = str(s, 'ascii')
        except UnicodeDecodeError:
            raise IDNAError('should pass a unicode string to the function rather than a byte string.')
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    labels = s.split('.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # A final empty label means the input ended with a dot.
    trailing_dot = labels[-1] == ''
    if trailing_dot:
        labels.pop()

    encoded_labels = []
    for label in labels:
        encoded = alabel(label)
        if not encoded:
            raise IDNAError('Empty label')
        encoded_labels.append(encoded)
    if trailing_dot:
        encoded_labels.append(b'')

    domain = b'.'.join(encoded_labels)
    if not valid_string_length(domain, trailing_dot):
        raise IDNAError('Domain too long')
    return domain
jpayne@7 366
jpayne@7 367
def decode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False) -> str:
    """Decode a domain name to its Unicode form, label by label.

    :param s: The domain; bytes input must be ASCII-decodable.
    :param strict: Split labels on '.' only, not on the other Unicode dots.
    :param uts46: Run ``uts46_remap`` (non-transitional) on the input first.
    :param std3_rules: Passed through to ``uts46_remap``.
    :returns: The decoded labels joined with '.'; a trailing dot in the
        input is preserved.
    :raises IDNAError: For non-ASCII bytes input, an empty domain or label,
        or an invalid label.
    """
    try:
        if not isinstance(s, str):
            s = str(s, 'ascii')
    except UnicodeDecodeError:
        raise IDNAError('Invalid ASCII in A-label')
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split('.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # A final empty label means the input ended with a dot.
    trailing_dot = not labels[-1]
    if trailing_dot:
        labels.pop()

    decoded_labels = []
    for label in labels:
        decoded = ulabel(label)
        if not decoded:
            raise IDNAError('Empty label')
        decoded_labels.append(decoded)
    if trailing_dot:
        decoded_labels.append('')
    return '.'.join(decoded_labels)