""" Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Löwis.
"""
jpayne@69: 
jpayne@69: import codecs
jpayne@69: 
jpayne@69: ##################### Encoding #####################################
jpayne@69: 
def segregate(str):
    """3.1 Basic code point segregation

    Split the text into its basic and extended code points.  Returns a
    pair (base, extended): base is a bytes object holding every code
    point below 128 in its original order, and extended is a sorted
    list of the distinct characters whose ordinal is 128 or above.
    """
    base = bytes(ord(c) for c in str if ord(c) < 128)
    # A set removes duplicates; sorting gives ascending code point order.
    extended = sorted({c for c in str if ord(c) >= 128})
    return base, extended
jpayne@69: 
def selective_len(str, max):
    """Count the characters of str whose ordinal is strictly below max."""
    return sum(1 for c in str if ord(c) < max)
jpayne@69: 
def selective_find(str, char, index, pos):
    """Locate the next occurrence of char in str after position pos.

    Returns a pair (index, pos).  pos is the position of the match in
    the full string; index is the position of the match when counting
    only characters that compare less than or equal to char.  The
    incoming index/pos describe the previous match (or -1/-1 to start
    from the beginning).  Returns (-1, -1) when the end of the string
    is reached without finding char.
    """
    end = len(str)
    pos += 1
    while pos != end:
        current = str[pos]
        if current == char:
            return index+1, pos
        if current < char:
            # Smaller characters are already present when char is
            # inserted, so they count towards its index.
            index += 1
        pos += 1
    return (-1, -1)
jpayne@69: 
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    For every occurrence of each character in extended (taken in the
    given order), compute the delta that leads from the previous
    insertion state to this one.  Returns the list of deltas.
    """
    deltas = []
    prev_code = 0x80
    prev_index = -1
    for ch in extended:
        code = ord(ch)
        # Moving on to a higher code point costs one full pass over the
        # string (as it stands at that time) per code point skipped.
        pending = (selective_len(str, code) + 1) * (code - prev_code)
        index, pos = selective_find(str, ch, -1, -1)
        while index != -1:
            pending += index - prev_index
            deltas.append(pending - 1)
            prev_index = index
            pending = 0
            index, pos = selective_find(str, ch, index, pos)
        prev_code = code

    return deltas
jpayne@69: 
def T(j, bias):
    """Return the threshold for digit position j under the given bias."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    # The raw threshold is clamped into the range [tmin, tmax].
    return min(26, max(1, 36 * (j + 1) - bias))
jpayne@69: 
# Digit alphabet: values 0-25 map to a-z, values 26-35 map to 0-9.
digits = b"abcdefghijklmnopqrstuvwxyz0123456789"
def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Encode the integer N as a bytes object of base-36 digits, using the
    position-dependent thresholds given by T() for the supplied bias.
    """
    encoded = bytearray()
    position = 0
    threshold = T(position, bias)
    while N >= threshold:
        # Emit a non-final digit and carry the remainder to the next
        # position.
        span = 36 - threshold
        N -= threshold
        encoded.append(digits[threshold + N % span])
        N //= span
        position += 1
        threshold = T(position, bias)
    # A digit below the threshold terminates the integer.
    encoded.append(digits[N])
    return bytes(encoded)
jpayne@69: 
def adapt(delta, first, numchars):
    """Compute the bias to use after encoding or decoding delta.

    first is true for the very first delta, which is scaled down by
    700 instead of 2.  numchars is the divisor applied to the scaled
    delta before it is added back in.
    """
    delta //= 700 if first else 2
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    divisions = 0
    while delta > 455:
        delta //= 35 # base - tmin
        divisions += 36
    return divisions + (36 * delta // (delta + 38))
jpayne@69: 
jpayne@69: 
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode each delta as a generalized variable-length integer,
    re-adapting the bias after every one, and return the concatenated
    digits as a bytes object.  baselen is the number of basic code
    points that precede the first insertion.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    chunks = []
    bias = 72
    handled = 0
    for delta in deltas:
        chunks.append(generate_generalized_integer(delta, bias))
        handled += 1
        bias = adapt(delta, handled == 1, baselen + handled)
    return b"".join(chunks)
jpayne@69: 
def punycode_encode(text):
    """Encode text as Punycode and return the result as bytes.

    The basic code points come first, followed by a "-" delimiter and
    the encoded insertions.  When there are no basic code points the
    delimiter is omitted.
    """
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    encoded = generate_integers(len(base), deltas)
    return base + b"-" + encoded if base else encoded
jpayne@69: 
jpayne@69: ##################### Decoding #####################################
jpayne@69: 
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one generalized variable-length integer from extended,
    starting at index extpos.  extended is expected to hold upper-case
    letters and decimal digits only; bias selects the thresholds via
    T().

    Returns a pair (newpos, value), where newpos is the index just past
    the last digit consumed.  When the input is malformed and errors is
    not "strict", value is None.

    Raises UnicodeError when errors is "strict" and the string ends in
    the middle of an integer or contains a character that is not a
    valid digit.
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punicode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos has already been advanced past the offending
            # character, so report the one just before it.  Indexing
            # with extpos would name the wrong character, or raise
            # IndexError when the bad character is the last one.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos-1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            # A digit below the threshold terminates the integer.
            return extpos, result
        w = w * (36 - t)
        j += 1
jpayne@69: 
jpayne@69: 
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding

    Decode the deltas held in extended one after another and insert the
    character each one describes into base.  Returns the resulting
    string.

    When a delta cannot be decoded and errors is not "strict", the
    string built so far is returned.  A code point above U+10FFFF
    raises UnicodeError in "strict" mode and is replaced by "?"
    otherwise.
    """
    code_point = 0x80
    insert_at = -1
    bias = 72
    cursor = 0
    total = len(extended)
    while cursor < total:
        next_cursor, delta = decode_generalized_number(extended, cursor,
                                                       bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        # Each complete pass over the insertion slots moves on to the
        # next code point; the remainder is the insertion position.
        wraps, insert_at = divmod(insert_at + delta + 1, len(base) + 1)
        code_point += wraps
        if code_point > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % code_point)
            code_point = ord('?')
        base = base[:insert_at] + chr(code_point) + base[insert_at:]
        bias = adapt(delta, cursor == 0, len(base))
        cursor = next_cursor
    return base
jpayne@69: 
def punycode_decode(text, errors):
    """Decode Punycode text and return the resulting string.

    text may be a str (which must be pure ASCII) or a bytes-like
    object.  Everything before the last "-" is the literal basic part;
    the remainder holds the encoded insertions, which are upper-cased
    before decoding.  errors is applied when decoding the basic part
    and is passed on to the insertion decoder.
    """
    if isinstance(text, str):
        text = text.encode("ascii")
    if isinstance(text, memoryview):
        text = bytes(text)
    dash = text.rfind(b"-")
    # Without a delimiter dash is -1, so the slice below covers the
    # whole input and there is no basic part.
    base = "" if dash == -1 else str(text[:dash], "ascii", errors)
    extended = str(text[dash+1:], "ascii").upper()
    return insertion_sort(base, extended, errors)
jpayne@69: 
jpayne@69: ### Codec APIs
jpayne@69: 
class Codec(codecs.Codec):
    """Stateless Punycode encoder/decoder."""

    def encode(self, input, errors='strict'):
        """Return (encoded bytes, number of characters consumed)."""
        return punycode_encode(input), len(input)

    def decode(self, input, errors='strict'):
        """Return (decoded string, length of the input consumed).

        Raises UnicodeError for an error handler other than 'strict',
        'replace' or 'ignore'.
        """
        if errors in ('strict', 'replace', 'ignore'):
            return punycode_decode(input, errors), len(input)
        raise UnicodeError("Unsupported error handling "+errors)
jpayne@69: 
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that encodes each input independently."""

    def encode(self, input, final=False):
        """Return the Punycode encoding of input; final is not used."""
        encoded = punycode_encode(input)
        return encoded
jpayne@69: 
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that decodes each input independently."""

    def decode(self, input, final=False):
        """Return the decoded string for input; final is not used.

        Raises UnicodeError when self.errors is not 'strict', 'replace'
        or 'ignore'.
        """
        handler = self.errors
        if handler in ('strict', 'replace', 'ignore'):
            return punycode_decode(input, handler)
        raise UnicodeError("Unsupported error handling "+handler)
jpayne@69: 
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer that encodes with Codec.encode."""
jpayne@69: 
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader that decodes with Codec.decode."""
jpayne@69: 
jpayne@69: ### encodings module API
jpayne@69: 
def getregentry():
    """Return the codecs.CodecInfo entry describing the punycode codec."""
    # Each stateless entry point is bound to its own Codec instance.
    encoder = Codec().encode
    decoder = Codec().decode
    return codecs.CodecInfo(
        encode=encoder,
        decode=decoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        name='punycode',
    )