jpayne@68: # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
jpayne@68: 
jpayne@68: import stringprep, re, codecs
jpayne@68: from unicodedata import ucd_3_2_0 as unicodedata
jpayne@68: 
# IDNA section 5: prefix that marks an ACE (ASCII-compatible encoded) label.
# It is kept twice: as bytes for already-encoded labels and as str for
# labels that are still text.
sace_prefix = "xn--"
ace_prefix = b"xn--"

# IDNA section 3.1: label separators.  Besides U+002E (full stop) the
# pattern matches U+3002, U+FF0E and U+FF61, which must be treated as dots.
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")
jpayne@68: 
# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Prepare *label* according to Nameprep (RFC 3491).

    The label is mapped (characters of stringprep table B.1 are dropped,
    the rest go through the table B.2 mapping), normalized with NFKC,
    checked for prohibited characters and finally checked against the
    bidirectional-text requirements.

    Parameters:
        label -- the text label (str) to prepare.

    Returns:
        The prepared label as a str.

    Raises:
        UnicodeError -- if the label contains a prohibited character or
        violates one of the bidi requirements.
    """
    # Map: drop "mapped to nothing" characters, map everything else
    label = "".join(
        stringprep.map_table_b2(c)
        for c in label
        if not stringprep.in_table_b1(c)
    )

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if stringprep.in_table_c12(c) or \
           stringprep.in_table_c22(c) or \
           stringprep.in_table_c3(c) or \
           stringprep.in_table_c4(c) or \
           stringprep.in_table_c5(c) or \
           stringprep.in_table_c6(c) or \
           stringprep.in_table_c7(c) or \
           stringprep.in_table_c8(c) or \
           stringprep.in_table_c9(c):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    RandAL = [stringprep.in_table_d1(x) for x in label]
    # The requirements below concern the label as a whole, so they are
    # evaluated once if any RandALCat character is present (not once per
    # RandALCat character).
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        # This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        # MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(x) for x in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        # RandALCat character MUST be the first character of the
        # string, and a RandALCat character MUST be the last
        # character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
jpayne@68: 
def ToASCII(label):
    """Convert one label to its ASCII form (the ToASCII operation of IDNA).

    Parameters:
        label -- a single text label (str), without separators.

    Returns:
        The label as bytes: the plain ASCII encoding when the label (before
        or after nameprep) is ASCII, otherwise the ACE prefix followed by
        the punycode encoding of the prepared label.

    Raises:
        UnicodeError -- if the resulting label is empty or 64 or more bytes
        long, if a non-ASCII prepared label already starts with the ACE
        prefix, or if nameprep rejects the label.
    """
    def size_checked(encoded):
        # Step 8: Check size
        if not 0 < len(encoded) < 64:
            raise UnicodeError("label empty or too long")
        return encoded

    # Step 1: try ASCII
    try:
        encoded = label.encode("ascii")
    except UnicodeError:
        # Not plain ASCII; continue with step 2.
        pass
    else:
        # Step 3 does nothing (UseSTD3ASCIIRules is false), so go
        # straight to step 8.
        return size_checked(encoded)

    # Step 2: nameprep
    prepared = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        encoded = prepared.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        return size_checked(encoded)

    # Step 5: Check ACE prefix
    if prepared.startswith(sace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    # Step 7: Prepend ACE prefix
    return size_checked(ace_prefix + prepared.encode("punycode"))
jpayne@68: 
def ToUnicode(label):
    """Convert one label to its Unicode form (the ToUnicode operation of IDNA).

    Parameters:
        label -- a single label, either bytes or str.

    Returns:
        A str.  Labels that do not carry the ACE prefix are returned as
        their ASCII text; ACE labels are returned punycode-decoded.

    Raises:
        UnicodeError -- if the label is not ASCII even after nameprep, if
        a bytes label cannot be decoded as ASCII, if punycode decoding or
        ToASCII fails, or if re-encoding the decoded label does not give
        back the (lower-cased) input.
    """
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")
    # Step 3: Check for ACE prefix
    # The prefix is "xn--" or any capitalization thereof (RFC 3490,
    # section 5), so the comparison must ignore case.
    if not label.lower().startswith(ace_prefix):
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
jpayne@68: 
jpayne@68: ### Codec APIs
jpayne@68: 
class Codec(codecs.Codec):
    """Stateless encoder/decoder for whole domain names."""

    def encode(self, input, errors='strict'):
        """Encode the domain name *input* (str) label by label.

        Returns a tuple (encoded bytes, number of input characters
        consumed).  Raises UnicodeError for any *errors* value other than
        'strict', for empty or over-long labels, and for whatever ToASCII
        rejects.
        """

        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b'', 0

        try:
            result = input.encode('ascii')
        except UnicodeEncodeError:
            pass
        else:
            # ASCII name: fast path
            labels = result.split(b'.')
            for label in labels[:-1]:
                if not (0 < len(label) < 64):
                    raise UnicodeError("label empty or too long")
            # The last label may be empty: that is a trailing dot.
            if len(labels[-1]) >= 64:
                raise UnicodeError("label too long")
            return result, len(input)

        result = bytearray()
        labels = dots.split(input)
        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        for label in labels:
            if result:
                # Join with U+002E
                result.extend(b'.')
            result.extend(ToASCII(label))
        return bytes(result+trailing_dot), len(input)

    def decode(self, input, errors='strict'):
        """Decode the domain name *input* label by label.

        *input* may be bytes, another bytes-like object, or a str.
        Returns a tuple (decoded str, length of input consumed).  Raises
        UnicodeError for any *errors* value other than 'strict' and for
        whatever ToUnicode rejects.
        """

        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        if isinstance(input, str):
            # IDNA allows decoding to operate on Unicode strings, too.
            # bytes(str) would raise TypeError, so split the text on the
            # IDNA separators and let ToUnicode handle the str labels.
            labels = dots.split(input)
        else:
            if not isinstance(input, bytes):
                # Other bytes-like objects (bytearray, memoryview, ...)
                input = bytes(input)

            # The ACE prefix is "xn--" in any capitalization, so look for
            # it without regard to case.
            if ace_prefix not in input.lower():
                # Fast path
                try:
                    return input.decode('ascii'), len(input)
                except UnicodeDecodeError:
                    pass

            labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = [ToUnicode(label) for label in labels]

        return ".".join(result)+trailing_dot, len(input)
jpayne@68: 
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental encoder that emits only labels known to be complete."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels found in *input*.

        Returns a tuple (encoded bytes, number of input characters
        consumed).  Unless *final* is true, a last label that is not
        followed by a separator is left unconsumed.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b'', 0)

        pieces = dots.split(input)
        tail = b''
        if pieces:
            if not pieces[-1]:
                # Input ends with a separator: every label is complete.
                tail = b'.'
                pieces.pop()
            elif not final:
                # Keep potentially unfinished label until the next call
                pieces.pop()
                if pieces:
                    tail = b'.'

        encoded = bytearray()
        consumed = 0
        for piece in pieces:
            if consumed:
                # Join with U+002E
                encoded.extend(b'.')
                consumed += 1
            encoded.extend(ToASCII(piece))
            consumed += len(piece)

        encoded += tail
        consumed += len(tail)
        return (bytes(encoded), consumed)
jpayne@68: 
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental decoder that emits only labels known to be complete."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels found in *input*.

        *input* may be a str or an ASCII bytes-like object.  Returns a
        tuple (decoded str, length of input consumed).  Unless *final* is
        true, a last label that is not followed by a separator is left
        unconsumed.  Raises UnicodeError for any *errors* value other than
        'strict' and for whatever ToUnicode rejects.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = [ToUnicode(label) for label in labels]

        # Consumed input: every label, one separator between consecutive
        # labels, and the trailing separator.  Counting separators by
        # position (rather than "size so far is non-zero") keeps the count
        # right when the first label is empty, e.g. for ".a.".
        size = sum(len(label) for label in labels)
        size += max(len(labels) - 1, 0)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
jpayne@68: 
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer that takes its encode/decode methods from Codec."""
jpayne@68: 
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader that takes its encode/decode methods from Codec."""
jpayne@68: 
jpayne@68: ### encodings module API
jpayne@68: 
def getregentry():
    """Return the codecs.CodecInfo record that describes the 'idna' codec."""
    # The stateless entry points are bound methods of two separate Codec
    # instances.
    stateless_encode = Codec().encode
    stateless_decode = Codec().decode
    return codecs.CodecInfo(
        encode=stateless_encode,
        decode=stateless_decode,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        name='idna',
    )