jpayne@68
|
1 # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
|
jpayne@68
|
2
|
jpayne@68
|
3 import stringprep, re, codecs
|
jpayne@68
|
4 from unicodedata import ucd_3_2_0 as unicodedata
|
jpayne@68
|
5
|
jpayne@68
|
# IDNA section 3.1: every one of these code points is treated as a label
# separator (U+002E full stop, U+3002 ideographic full stop, U+FF0E
# fullwidth full stop, U+FF61 halfwidth ideographic full stop).
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, as bytes (for already-encoded labels)
# and as str (for labels that are still text).
ace_prefix = b"xn--"
sace_prefix = ace_prefix.decode("ascii")
|
jpayne@68
|
12
|
jpayne@68
|
13 # This assumes query strings, so AllowUnassigned is true
|
jpayne@68
|
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) to a single label.

    The label is processed as a query string, so unassigned code points
    are allowed (AllowUnassigned is true).  The steps are:

    1. Map: drop characters from table B.1 and map the rest through
       table B.2.
    2. Normalize: apply NFKC using the Unicode 3.2.0 database.
    3. Prohibit: reject characters from tables C.1.2, C.2.2 and C.3-C.9.
    4. Check bidi: enforce the RandALCat/LCat requirements.

    Returns the prepared label as a ``str``.

    Raises ``UnicodeError`` if the label contains a prohibited character
    or violates one of the bidi requirements.
    """
    # Map: B.1 characters map to nothing, everything else goes through B.2.
    label = "".join(
        stringprep.map_table_b2(c)
        for c in label
        if not stringprep.in_table_b1(c)
    )

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c)
                or stringprep.in_table_c22(c)
                or stringprep.in_table_c3(c)
                or stringprep.in_table_c4(c)
                or stringprep.in_table_c5(c)
                or stringprep.in_table_c6(c)
                or stringprep.in_table_c7(c)
                or stringprep.in_table_c8(c)
                or stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    RandAL = [stringprep.in_table_d1(x) for x in label]
    # Run the whole-label checks once, not once per RandALCat character:
    # repeating them made this step quadratic in the label length.
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(x) for x in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
|
jpayne@68
|
61
|
jpayne@68
|
def ToASCII(label):
    """Convert a single label to its ASCII form (RFC 3490, ToASCII).

    *label* is a ``str``.  The result is ``bytes``: the label itself if
    it is pure ASCII either as given or after nameprep, otherwise the
    ACE prefix followed by the Punycode encoding of the prepared label.
    UseSTD3ASCIIRules is treated as false.

    Raises ``UnicodeError`` if the result is empty or 64 or more bytes
    long, if nameprep rejects the label, or if a non-ASCII label already
    starts with the ACE prefix.
    """
    # Step 1: try ASCII
    try:
        ascii_form = label.encode("ascii")
    except UnicodeError:
        ascii_form = None
    if ascii_form is not None:
        # Step 3 does not apply (UseSTD3ASCIIRules is false), so go
        # straight to the size check of step 8.
        if not 0 < len(ascii_form) < 64:
            raise UnicodeError("label empty or too long")
        return ascii_form

    # Step 2: nameprep
    prepared = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII again, now on the prepared label
    try:
        ascii_form = prepared.encode("ascii")
    except UnicodeError:
        ascii_form = None
    if ascii_form is not None:
        # Skip to step 8.
        if not 0 < len(ascii_form) < 64:
            raise UnicodeError("label empty or too long")
        return ascii_form

    # Step 5: Check ACE prefix
    if prepared.startswith(sace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    # Step 7: Prepend ACE prefix
    ace_form = ace_prefix + prepared.encode("punycode")

    # Step 8: Check size
    if not 0 < len(ace_form) < 64:
        raise UnicodeError("label empty or too long")
    return ace_form
|
jpayne@68
|
104
|
jpayne@68
|
def ToUnicode(label):
    """Convert a single label to Unicode (RFC 3490, ToUnicode).

    *label* may be ``bytes`` (taken to be ASCII) or ``str``.  A label
    that does not carry the ACE prefix is returned as a ``str``
    unchanged (after nameprep, if it was not ASCII).  A label with the
    ACE prefix is Punycode-decoded and the decoded text is returned,
    provided that re-encoding it with ``ToASCII`` gives back the
    (lower-cased) input.

    Raises ``UnicodeError`` if a non-ASCII label is still not ASCII
    after nameprep, if Punycode decoding or ``ToASCII`` fails, or if the
    label does not round-trip.
    """
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")
    # Step 3: Check for ACE prefix.  The prefix must be recognized
    # case-insensitively (RFC 3490 section 5), so "XN--" counts too;
    # step 7 below already compares in lower case.
    if not label.lower().startswith(ace_prefix):
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
|
jpayne@68
|
143
|
jpayne@68
|
144 ### Codec APIs
|
jpayne@68
|
145
|
jpayne@68
|
class Codec(codecs.Codec):
    """Stateless IDNA codec working on whole, dot-separated names."""

    def encode(self, input, errors='strict'):
        """Encode the domain name *input* (``str``) to ASCII bytes.

        Returns ``(encoded_bytes, len(input))``.  A single trailing dot
        is preserved.  Only ``errors='strict'`` is supported.

        Raises ``UnicodeError`` for any other error handler, for an
        empty or over-long label, or when ``ToASCII`` rejects a label.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b'', 0

        try:
            result = input.encode('ascii')
        except UnicodeEncodeError:
            pass
        else:
            # ASCII name: fast path
            labels = result.split(b'.')
            for label in labels[:-1]:
                if not (0 < len(label) < 64):
                    raise UnicodeError("label empty or too long")
            # The last label may be empty: that is the trailing dot.
            if len(labels[-1]) >= 64:
                raise UnicodeError("label too long")
            return result, len(input)

        labels = dots.split(input)
        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        # Join with U+002E
        encoded = b'.'.join(ToASCII(label) for label in labels)
        return encoded + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode the IDNA-encoded domain name *input* to ``str``.

        *input* is normally ``bytes`` (or another object accepted by
        ``bytes()``); a ``str`` is accepted as well and is split on any
        of the IDNA label separators.  Returns
        ``(decoded_str, len(input))``.  A single trailing dot is
        preserved.  Only ``errors='strict'`` is supported.

        Raises ``UnicodeError`` for any other error handler or when
        ``ToUnicode`` rejects a label.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            # bytes(str) without an encoding raises TypeError, so text
            # input is split on the IDNA separators directly; ToUnicode
            # accepts str labels.
            labels = dots.split(input)
        else:
            if not isinstance(input, bytes):
                input = bytes(input)

            # The ACE prefix is case-insensitive, so look for it in the
            # lower-cased name before taking the plain-ASCII shortcut.
            if ace_prefix not in input.lower():
                # Fast path
                try:
                    return input.decode('ascii'), len(input)
                except UnicodeDecodeError:
                    pass

            labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = [ToUnicode(label) for label in labels]

        return ".".join(result)+trailing_dot, len(input)
|
jpayne@68
|
217
|
jpayne@68
|
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of *input* as possible.

        Returns ``(encoded_bytes, consumed)`` where *consumed* is the
        number of characters of *input* that were encoded.  Unless
        *final* is true, a last label not yet followed by a separator is
        left unconsumed, since more of it may still arrive.

        Raises ``UnicodeError`` if *errors* is not ``'strict'`` or if
        ``ToASCII`` rejects a label.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b'', 0)

        pieces = dots.split(input)
        separator = b''
        if pieces:
            if not pieces[-1]:
                # Input ended with a separator: keep it in the output.
                separator = b'.'
                pieces.pop()
            elif not final:
                # Keep potentially unfinished label until the next call
                pieces.pop()
                if pieces:
                    separator = b'.'

        # Join with U+002E.  ToASCII rejects empty labels, so every
        # encoded piece is non-empty.
        encoded = b'.'.join([ToASCII(piece) for piece in pieces])

        # Consumed input: the labels, the separators between them, and
        # the trailing separator if there is one.
        consumed = sum(len(piece) for piece in pieces)
        if pieces:
            consumed += len(pieces) - 1
        consumed += len(separator)

        return (bytes(encoded + separator), consumed)
|
jpayne@68
|
252
|
jpayne@68
|
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of *input* as possible.

        *input* may be ASCII ``bytes`` or a ``str``.  Returns
        ``(decoded_str, consumed)`` where *consumed* is the number of
        input items that were decoded.  Unless *final* is true, a last
        label not yet followed by a separator is left unconsumed, since
        more of it may still arrive.

        Raises ``UnicodeError`` if *errors* is not ``'strict'``, if
        bytes input is not ASCII, or if ``ToUnicode`` rejects a label.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for index, label in enumerate(labels):
            result.append(ToUnicode(label))
            # Count the separator in front of every label but the first.
            # Keying this on the label index rather than on the running
            # size keeps the count right when leading labels are empty
            # (ToUnicode accepts ""); otherwise too little input is
            # reported as consumed and it is decoded again later.
            if index:
                size += 1
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
|
jpayne@68
|
291
|
jpayne@68
|
class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer that encodes with ``Codec.encode``."""
|
jpayne@68
|
294
|
jpayne@68
|
class StreamReader(Codec,codecs.StreamReader):
    """Stream reader that decodes with ``Codec.decode``."""
|
jpayne@68
|
297
|
jpayne@68
|
298 ### encodings module API
|
jpayne@68
|
299
|
jpayne@68
|
def getregentry():
    """Return the ``codecs.CodecInfo`` describing this module's 'idna' codec."""
    # The stateless entry points are bound methods of two separate
    # Codec instances.
    stateless_encode = Codec().encode
    stateless_decode = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=stateless_encode,
        decode=stateless_decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
|