jpayne@69
|
1 """ Codec for the Punycode encoding, as specified in RFC 3492
|
jpayne@69
|
2
|
jpayne@69
|
3 Written by Martin v. Löwis.
|
jpayne@69
|
4 """
|
jpayne@69
|
5
|
jpayne@69
|
6 import codecs
|
jpayne@69
|
7
|
jpayne@69
|
8 ##################### Encoding #####################################
|
jpayne@69
|
9
|
jpayne@69
|
def segregate(str):
    """3.1 Basic code point segregation.

    Split *str* into its ASCII and non-ASCII characters.

    Returns a pair ``(base, extended)``: ``base`` is a bytes object holding
    the characters of *str* with ordinal below 128, in their original order;
    ``extended`` is a sorted list of the distinct remaining characters.
    """
    ascii_part = bytearray(ord(ch) for ch in str if ord(ch) < 128)
    # A set collapses repeated non-ASCII characters before sorting.
    non_ascii = {ch for ch in str if ord(ch) >= 128}
    return bytes(ascii_part), sorted(non_ascii)
|
jpayne@69
|
21
|
jpayne@69
|
def selective_len(str, max):
    """Count the characters of *str* whose ordinal is strictly below *max*."""
    return sum(1 for ch in str if ord(ch) < max)
|
jpayne@69
|
29
|
jpayne@69
|
def selective_find(str, char, index, pos):
    """Locate the next occurrence of *char* in *str* after position *pos*.

    *pos* is a position in the full string; *index* is the matching
    position when only characters that compare less than or equal to
    *char* are counted. Scanning starts at ``pos + 1``.

    Returns the pair ``(index, pos)`` of the occurrence found, or
    ``(-1, -1)`` when the end of the string is reached first.
    """
    length = len(str)
    pos += 1
    while pos != length:
        current = str[pos]
        if current == char:
            return index + 1, pos
        if current < char:
            # Smaller characters occupy a slot in the restricted view.
            index += 1
        pos += 1
    return (-1, -1)
|
jpayne@69
|
47
|
jpayne@69
|
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding.

    *extended* is iterated in the order given (the encoder passes the
    sorted non-ASCII characters of *str*). For every occurrence of each of
    those characters in *str*, one integer delta is appended to the result.

    Returns the list of deltas.
    """
    deltas = []
    prev_code = 0x80
    prev_index = -1
    for ch in extended:
        code = ord(ch)
        # Initial delta for this character: the number of characters below
        # it (plus one) times the distance from the previous code point.
        pending = (selective_len(str, code) + 1) * (code - prev_code)
        index, pos = selective_find(str, ch, -1, -1)
        while index != -1:
            pending += index - prev_index
            deltas.append(pending - 1)
            prev_index = index
            # Later occurrences of the same character start from zero.
            pending = 0
            index, pos = selective_find(str, ch, index, pos)
        prev_code = code

    return deltas
|
jpayne@69
|
69
|
jpayne@69
|
def T(j, bias):
    """Return ``36 * (j + 1) - bias`` clamped to the inclusive range 1..26."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    return min(26, max(1, 36 * (j + 1) - bias))
|
jpayne@69
|
76
|
jpayne@69
|
# Digit alphabet: values 0-25 map to a-z, values 26-35 map to 0-9.
digits = b"abcdefghijklmnopqrstuvwxyz0123456789"


def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers.

    Encode the integer *N* as a bytes string of characters from ``digits``,
    using the per-position thresholds given by ``T(position, bias)``.
    """
    encoded = bytearray()
    position = 0
    threshold = T(position, bias)
    while N >= threshold:
        # Emit one digit at or above the threshold and carry the quotient on.
        N, remainder = divmod(N - threshold, 36 - threshold)
        encoded.append(digits[threshold + remainder])
        position += 1
        threshold = T(position, bias)
    # The final digit is below its threshold, which terminates the number.
    encoded.append(digits[N])
    return bytes(encoded)
|
jpayne@69
|
90
|
jpayne@69
|
def adapt(delta, first, numchars):
    """Compute a bias value from *delta*.

    *delta* is divided by 700 when *first* is true and by 2 otherwise, then
    increased by ``delta // numchars``. It is then reduced by repeated
    division by 35 while it exceeds 455, adding 36 to the bias per division.
    """
    delta //= 700 if first else 2
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    bias = 0
    while delta > 455:
        delta //= 35  # base - tmin
        bias += 36
    return bias + (36 * delta // (delta + 38))
|
jpayne@69
|
104
|
jpayne@69
|
105
|
jpayne@69
|
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation.

    Encode every delta in *deltas* as a generalized variable-length integer
    and return the concatenated bytes. The bias is recomputed after each
    delta from that delta and the running character count, which starts at
    ``baselen + 1``.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    chunks = []
    bias = 72
    for count, delta in enumerate(deltas):
        chunks.append(generate_generalized_integer(delta, bias))
        bias = adapt(delta, count == 0, baselen + count + 1)
    return b"".join(chunks)
|
jpayne@69
|
116
|
jpayne@69
|
def punycode_encode(text):
    """Encode the string *text* and return the result as bytes.

    The ASCII characters of *text* come first, followed by ``b"-"`` and the
    encoded deltas. When *text* has no ASCII characters, only the encoded
    deltas are returned, with no delimiter.
    """
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    encoded = generate_integers(len(base), deltas)
    return base + b"-" + encoded if base else encoded
|
jpayne@69
|
124
|
jpayne@69
|
125 ##################### Decoding #####################################
|
jpayne@69
|
126
|
jpayne@69
|
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers.

    Decode one integer from the string *extended*, starting at index
    *extpos*. Only ``A``-``Z`` and ``0``-``9`` are accepted as digits.

    Returns a pair ``(newpos, value)`` where *newpos* is the index just
    past the consumed characters. When the input ends early or contains an
    invalid character and *errors* is not ``"strict"``, *value* is ``None``.

    Raises UnicodeError in ``"strict"`` mode for truncated input or an
    invalid digit character.
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punycode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A:  # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:  # 0-9
            digit = char - 22  # 0x30-26
        elif errors == "strict":
            # extpos has already been advanced, so the offending character
            # sits at extpos - 1. Indexing extpos would name the wrong
            # character, or raise IndexError if it was the last one.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos - 1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            # A digit below the threshold terminates the number.
            return extpos, result
        w = w * (36 - t)
        j += 1
|
jpayne@69
|
155
|
jpayne@69
|
156
|
jpayne@69
|
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding.

    Decode the deltas in the string *extended* one at a time and insert the
    character each one designates into the string *base*.

    Returns the resulting string. If a delta cannot be decoded (only
    possible when *errors* is not ``"strict"``), the string built so far is
    returned. Raises UnicodeError in ``"strict"`` mode when a code point
    above U+10FFFF is produced; otherwise ``'?'`` is substituted.
    """
    code = 0x80
    pos = -1
    bias = 72
    extpos = 0
    total = len(extended)
    while extpos < total:
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta + 1
        # Each full pass over the len(base) + 1 insertion slots bumps the
        # code point by one; the remainder is the slot to insert at.
        wraps, pos = divmod(pos, len(base) + 1)
        code += wraps
        if code > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % code)
            code = ord('?')
        base = base[:pos] + chr(code) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base
|
jpayne@69
|
181
|
jpayne@69
|
def punycode_decode(text, errors):
    """Decode *text* and return the resulting string.

    *text* may be a str (encoded to ASCII first), a memoryview (copied to
    bytes) or a bytes-like object. Everything before the last ``b"-"`` is
    the ASCII part, decoded with the *errors* handler; everything after it
    is upper-cased and decoded as deltas. Without a ``b"-"``, the whole
    input is treated as deltas.
    """
    if isinstance(text, str):
        text = text.encode("ascii")
    if isinstance(text, memoryview):
        text = bytes(text)
    delimiter = text.rfind(b"-")
    if delimiter == -1:
        base, tail = "", text
    else:
        base = str(text[:delimiter], "ascii", errors)
        tail = text[delimiter + 1:]
    extended = str(tail, "ascii").upper()
    return insertion_sort(base, extended, errors)
|
jpayne@69
|
195
|
jpayne@69
|
196 ### Codec APIs
|
jpayne@69
|
197
|
jpayne@69
|
class Codec(codecs.Codec):
    """Codec whose encode/decode delegate to the module-level functions."""

    def encode(self, input, errors='strict'):
        """Return ``(encoded bytes, len(input))``; *errors* is not used."""
        return punycode_encode(input), len(input)

    def decode(self, input, errors='strict'):
        """Return ``(decoded str, len(input))``.

        Raises UnicodeError when *errors* is not one of ``'strict'``,
        ``'replace'`` or ``'ignore'``.
        """
        if errors in ('strict', 'replace', 'ignore'):
            return punycode_decode(input, errors), len(input)
        raise UnicodeError("Unsupported error handling "+errors)
|
jpayne@69
|
209
|
jpayne@69
|
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that encodes each chunk independently."""

    def encode(self, input, final=False):
        """Return *input* encoded as bytes; *final* is not used."""
        encoded = punycode_encode(input)
        return encoded
|
jpayne@69
|
213
|
jpayne@69
|
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that decodes each chunk independently."""

    def decode(self, input, final=False):
        """Return *input* decoded to a string; *final* is not used.

        Raises UnicodeError when ``self.errors`` is not one of
        ``'strict'``, ``'replace'`` or ``'ignore'``.
        """
        if self.errors in ('strict', 'replace', 'ignore'):
            return punycode_decode(input, self.errors)
        raise UnicodeError("Unsupported error handling "+self.errors)
|
jpayne@69
|
219
|
jpayne@69
|
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer combining Codec with codecs.StreamWriter."""
|
jpayne@69
|
222
|
jpayne@69
|
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader combining Codec with codecs.StreamReader."""
|
jpayne@69
|
225
|
jpayne@69
|
226 ### encodings module API
|
jpayne@69
|
227
|
jpayne@69
|
def getregentry():
    """Return the codecs.CodecInfo describing this module's codec classes."""
    encoder = Codec().encode
    decoder = Codec().decode
    return codecs.CodecInfo(
        name='punycode',
        encode=encoder,
        decode=decoder,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
|