""" Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Löwis.
"""

import codecs

##################### Encoding #####################################

def segregate(str):
    """3.1 Basic code point segregation

    Return a pair (base, extended): base is a bytes object holding the
    characters of str whose ordinal is below 128, in their original
    order; extended is a sorted list of the distinct remaining
    characters.
    """
    base = bytearray()
    extended = set()
    for c in str:
        if ord(c) < 128:
            base.append(ord(c))
        else:
            extended.add(c)
    extended = sorted(extended)
    return bytes(base), extended

def selective_len(str, max):
    """Return the length of str, considering only characters below max."""
    res = 0
    for c in str:
        if ord(c) < max:
            res += 1
    return res

def selective_find(str, char, index, pos):
    """Return a pair (index, pos), indicating the next occurrence of
    char in str. index is the position of the character considering
    only ordinals up to and including char, and pos is the position in
    the full string. index/pos is the starting position in the full
    string.

    Return (-1, -1) when the end of str is reached without finding
    another occurrence of char."""

    l = len(str)
    while 1:
        pos += 1
        if pos == l:
            return (-1, -1)
        c = str[pos]
        if c == char:
            return index+1, pos
        elif c < char:
            # Only characters smaller than char advance the selective index.
            index += 1

def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    Return the list of integer deltas for str, given the sorted list
    of its distinct non-basic characters (as produced by segregate).
    One delta is emitted per occurrence of each extended character.
    """
    oldchar = 0x80
    result = []
    oldindex = -1
    for c in extended:
        index = pos = -1
        char = ord(c)
        curlen = selective_len(str, char)
        # Moving on to a larger code point costs one full pass over the
        # (curlen+1) insertion slots per skipped code point value.
        delta = (curlen+1) * (char - oldchar)
        while 1:
            index,pos = selective_find(str,c,index,pos)
            if index == -1:
                break
            delta += index - oldindex
            result.append(delta-1)
            oldindex = index
            delta = 0
        oldchar = char

    return result

def T(j, bias):
    """Return the threshold for digit position j under the given bias,
    clamped to the range 1..26."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    res = 36 * (j + 1) - bias
    if res < 1: return 1
    if res > 26: return 26
    return res

digits = b"abcdefghijklmnopqrstuvwxyz0123456789"
def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Return N encoded as a bytes object of base-36 digits, using the
    thresholds computed by T for the given bias.
    """
    result = bytearray()
    j = 0
    while 1:
        t = T(j, bias)
        if N < t:
            # A digit below the threshold terminates the number.
            result.append(digits[N])
            return bytes(result)
        result.append(digits[t + ((N - t) % (36 - t))])
        N = (N - t) // (36 - t)
        j += 1

def adapt(delta, first, numchars):
    """Return the new bias after a delta has been processed.

    first is true for the first delta of a string (which is divided by
    700 instead of 2); numchars is the number of code points handled
    so far, as passed in by the caller.
    """
    if first:
        delta //= 700
    else:
        delta //= 2
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    divisions = 0
    while delta > 455:
        delta = delta // 35 # base - tmin
        divisions += 36
    bias = divisions + (36 * delta // (delta + 38))
    return bias


def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Return the bytes encoding of all deltas, adapting the bias after
    each one. baselen is the number of basic code points.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    result = bytearray()
    bias = 72
    for points, delta in enumerate(deltas):
        s = generate_generalized_integer(delta, bias)
        result.extend(s)
        bias = adapt(delta, points==0, baselen+points+1)
    return bytes(result)

def punycode_encode(text):
    """Return text encoded as a Punycode bytes object.

    When text contains basic (ordinal < 128) characters they are
    emitted first, followed by a b"-" delimiter and the encoded
    deltas; otherwise only the encoded deltas are returned.
    """
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + b"-" + extended
    return extended

##################### Decoding #####################################

def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one integer from extended, starting at index extpos.

    Return a pair (newpos, value). If the input ends prematurely or
    contains a character that is not A-Z or 0-9, raise UnicodeError
    when errors is "strict"; otherwise return (position, None).
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punicode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos was already advanced past the offending character,
            # so report extended[extpos-1]; indexing extended[extpos]
            # names the wrong character and raises IndexError when the
            # invalid character is the last one.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos-1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            return extpos, result
        w = w * (36 - t)
        j += 1


def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding

    Insert the characters encoded in extended into the string base and
    return the resulting string. If a number cannot be decoded in a
    non-strict error mode, the string built so far is returned.
    """
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta+1
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % char)
            char = ord('?')
        pos = pos % (len(base) + 1)
        base = base[:pos] + chr(char) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base

def punycode_decode(text, errors):
    """Return the string decoded from the Punycode input text.

    text may be a str, a bytes-like object supporting rfind, or a
    memoryview. Everything before the last b"-" is taken as the basic
    code points; the remainder is upper-cased and decoded as deltas.
    """
    if isinstance(text, str):
        text = text.encode("ascii")
    if isinstance(text, memoryview):
        text = bytes(text)
    pos = text.rfind(b"-")
    if pos == -1:
        base = ""
        extended = str(text, "ascii").upper()
    else:
        base = str(text[:pos], "ascii", errors)
        extended = str(text[pos+1:], "ascii").upper()
    return insertion_sort(base, extended, errors)

### Codec APIs

class Codec(codecs.Codec):
    """Stateless Punycode encoder/decoder."""

    def encode(self, input, errors='strict'):
        """Return (encoded bytes, number of characters consumed).

        The errors argument is accepted but not used by the encoder.
        """
        res = punycode_encode(input)
        return res, len(input)

    def decode(self, input, errors='strict'):
        """Return (decoded string, length of input).

        Raise UnicodeError if errors is not 'strict', 'replace' or
        'ignore'.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+errors)
        res = punycode_decode(input, errors)
        return res, len(input)

class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        """Encode input in one step; the final flag is not used."""
        return punycode_encode(input)

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        """Decode input in one step; the final flag is not used.

        Raise UnicodeError if self.errors is not 'strict', 'replace'
        or 'ignore'.
        """
        if self.errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+self.errors)
        return punycode_decode(input, self.errors)

class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    """Return the codecs.CodecInfo entry describing this codec."""
    return codecs.CodecInfo(
        name='punycode',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )