"""IDNA (RFC 3490) and Nameprep (RFC 3491) implemented as a codec.

The module provides the label-level operations ``nameprep``, ``ToASCII`` and
``ToUnicode`` plus the codec classes and ``getregentry()`` entry point.
"""

import codecs
import re
import stringprep
from unicodedata import ucd_3_2_0 as unicodedata

# IDNA section 3.1: the characters that are recognized as label separators.
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, as bytes and as str.
ace_prefix = b"xn--"
sace_prefix = "xn--"


# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep steps (map, normalize, prohibit, bidi) to *label*.

    Returns the prepared string.  Raises UnicodeError if the prepared label
    contains a prohibited character or violates the bidi requirements.
    """
    # Map: drop table B.1 characters ("map to nothing") and apply the
    # table B.2 mapping to every remaining character.
    mapped = []
    for c in label:
        if stringprep.in_table_b1(c):
            continue
        mapped.append(stringprep.map_table_b2(c))
    label = "".join(mapped)

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c)
                or stringprep.in_table_c22(c)
                or stringprep.in_table_c3(c)
                or stringprep.in_table_c4(c)
                or stringprep.in_table_c5(c)
                or stringprep.in_table_c6(c)
                or stringprep.in_table_c7(c)
                or stringprep.in_table_c8(c)
                or stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi.  The tests below look at the whole label, so they are run
    # once if any RandALCat character is present rather than once per such
    # character (which made this step quadratic in the label length).
    RandAL = [stringprep.in_table_d1(x) for x in label]
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(x) for x in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label


def ToASCII(label):
    """Convert a single str *label* to its ASCII (ACE) form as bytes.

    Raises UnicodeError if the result is empty or 64 or more bytes long, if
    nameprep rejects the label, or if a non-ASCII label already starts with
    the ACE prefix.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 5: Check ACE prefix
    if label.startswith(sace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    label = ace_prefix + label

    # Step 8: Check size
    if 0 < len(label) < 64:
        return label
    raise UnicodeError("label empty or too long")


def ToUnicode(label):
    """Convert a single *label* (bytes or str) to its Unicode form.

    Labels that do not start with the ACE prefix are returned as their ASCII
    text.  Raises UnicodeError if the label is not ASCII after nameprep or if
    the decoded label does not convert back to the input with ToASCII.
    """
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix.  The prefix is matched without regard to
    # case, so b"XN--..." is treated like b"xn--...".
    if label[:len(ace_prefix)].lower() != ace_prefix:
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result


### Codec APIs

class Codec(codecs.Codec):
    """Stateless IDNA encoder/decoder for whole dotted names."""

    def encode(self, input, errors='strict'):
        """Encode the str *input* and return ``(bytes, len(input))``.

        Only ``errors='strict'`` is accepted.  Raises UnicodeError for any
        other error handler or for a label that cannot be encoded.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b'', 0

        try:
            result = input.encode('ascii')
        except UnicodeEncodeError:
            pass
        else:
            # ASCII name: fast path
            labels = result.split(b'.')
            for label in labels[:-1]:
                if not (0 < len(label) < 64):
                    raise UnicodeError("label empty or too long")
            # The last label may be empty (a trailing dot), so only its
            # upper bound is checked.
            if len(labels[-1]) >= 64:
                raise UnicodeError("label too long")
            return result, len(input)

        result = bytearray()
        labels = dots.split(input)
        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        for label in labels:
            if result:
                # Join with U+002E
                result.extend(b'.')
            result.extend(ToASCII(label))
        return bytes(result+trailing_dot), len(input)

    def decode(self, input, errors='strict'):
        """Decode *input* and return ``(str, len(input))``.

        *input* may be bytes, another object accepted by ``bytes()``, or a
        str.  Only ``errors='strict'`` is accepted.  Raises UnicodeError for
        any other error handler or for a label that cannot be decoded.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            # bytes(str) without an encoding raises TypeError, so str input
            # is split on the IDNA dots and handed to ToUnicode directly.
            labels = dots.split(input)
        else:
            if not isinstance(input, bytes):
                input = bytes(input)

            # The ACE prefix is matched without regard to case.
            if ace_prefix not in input.lower():
                # Fast path
                try:
                    return input.decode('ascii'), len(input)
                except UnicodeDecodeError:
                    pass

            labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = []
        for label in labels:
            result.append(ToUnicode(label))

        return ".".join(result)+trailing_dot, len(input)


class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels of *input*.

        Returns ``(bytes, consumed)`` where *consumed* is the number of
        characters of *input* that were encoded.  Unless *final* is true,
        the last label is left unconsumed because it may be unfinished.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b'', 0)

        labels = dots.split(input)
        trailing_dot = b''
        if labels:
            if not labels[-1]:
                trailing_dot = b'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = b'.'

        # Join with U+002E
        result = b'.'.join(ToASCII(label) for label in labels) + trailing_dot
        # Consumed input: every label, one separator between each pair of
        # labels, and the trailing dot if there is one.
        size = sum(len(label) for label in labels)
        size += max(len(labels) - 1, 0)
        size += len(trailing_dot)
        return (result, size)


class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels of *input*.

        Returns ``(str, consumed)`` where *consumed* is the number of items
        of *input* that were decoded.  Unless *final* is true, the last
        label is left unconsumed because it may be unfinished.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = ".".join(ToUnicode(label) for label in labels) + trailing_dot
        # Count one separator between each pair of labels.  Counting a
        # separator only once some input had been consumed under-reported
        # the size when the leading labels were empty (e.g. ".foo").
        size = sum(len(label) for label in labels)
        size += max(len(labels) - 1, 0)
        size += len(trailing_dot)
        return (result, size)


class StreamWriter(Codec, codecs.StreamWriter):
    pass


class StreamReader(Codec, codecs.StreamReader):
    pass


### encodings module API

def getregentry():
    """Return the codecs.CodecInfo describing the 'idna' codec."""
    return codecs.CodecInfo(
        name='idna',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )