Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/encodings/idna.py @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) | |
2 | |
3 import stringprep, re, codecs | |
4 from unicodedata import ucd_3_2_0 as unicodedata | |
5 | |
6 # IDNA section 3.1 | |
7 dots = re.compile("[\u002E\u3002\uFF0E\uFF61]") | |
8 | |
9 # IDNA section 5 | |
10 ace_prefix = b"xn--" | |
11 sace_prefix = "xn--" | |
12 | |
13 # This assumes query strings, so AllowUnassigned is true | |
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) to a single label.

    The label is processed in four stages: mapping (characters in
    stringprep table B.1 are dropped, the rest go through the table B.2
    mapping), NFKC normalization with the Unicode 3.2.0 database, a check
    against the prohibited-character tables, and the bidirectional checks.
    Unassigned code points are not rejected, because this assumes query
    strings (AllowUnassigned is true).

    Args:
        label: the label to prepare, as a str.

    Returns:
        The prepared label as a str.

    Raises:
        UnicodeError: if the label contains a prohibited character or
            violates bidi requirement 2 or 3.
    """
    # Map: drop "map to nothing" characters, apply the B.2 mapping to the rest.
    label = "".join(
        stringprep.map_table_b2(c)
        for c in label
        if not stringprep.in_table_b1(c)
    )

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c)
                or stringprep.in_table_c22(c)
                or stringprep.in_table_c3(c)
                or stringprep.in_table_c4(c)
                or stringprep.in_table_c5(c)
                or stringprep.in_table_c6(c)
                or stringprep.in_table_c7(c)
                or stringprep.in_table_c8(c)
                or stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    RandAL = [stringprep.in_table_d1(x) for x in label]
    # The checks below do not depend on which RandALCat character was found,
    # so run them once instead of once per RandALCat character; repeating
    # them made this step quadratic in the label length.
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(x) for x in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
61 | |
def ToASCII(label):
    """Convert one label to its ASCII form and return it as bytes.

    A label that is already ASCII is returned as-is (after the length
    check).  Otherwise it is run through nameprep(); if the prepared label
    is still not ASCII it is punycode-encoded and given the ACE prefix.

    Raises:
        UnicodeError: if the resulting label is empty or longer than 63
            bytes, if the prepared non-ASCII label already starts with the
            ACE prefix, or if nameprep() rejects the label.
    """
    def size_checked(candidate):
        # Step 8: Check size
        if not 0 < len(candidate) < 64:
            raise UnicodeError("label empty or too long")
        return candidate

    # Step 1: try ASCII
    try:
        as_ascii = label.encode("ascii")
    except UnicodeError:
        as_ascii = None
    if as_ascii is not None:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        return size_checked(as_ascii)

    # Step 2: nameprep
    prepared = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        as_ascii = prepared.encode("ascii")
    except UnicodeError:
        as_ascii = None
    if as_ascii is not None:
        # Skip to step 8.
        return size_checked(as_ascii)

    # Step 5: Check ACE prefix
    if prepared.startswith(sace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    # Step 7: Prepend ACE prefix
    return size_checked(ace_prefix + prepared.encode("punycode"))
104 | |
def ToUnicode(label):
    """Convert one label (bytes or str) to its Unicode form, as a str.

    A label without the ACE prefix is returned as its ASCII text.  A label
    with the prefix is punycode-decoded, and the decoded text is accepted
    only if ToASCII() maps it back to the same (lower-cased) label.

    Raises:
        UnicodeError: if a non-ASCII str label is still non-ASCII after
            nameprep(), or if the decoded label does not round-trip.
    """
    # Step 1: Check for ASCII (bytes input is taken to be ASCII already)
    if isinstance(label, bytes):
        needs_nameprep = False
    else:
        try:
            label = label.encode("ascii")
        except UnicodeError:
            needs_nameprep = True
        else:
            needs_nameprep = False

    if needs_nameprep:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    # Step 5: Decode using PUNYCODE
    decoded = label[len(ace_prefix):].decode("punycode")

    # Step 6: Apply ToASCII
    reencoded = ToASCII(decoded)

    # Step 7: Compare the result of step 6 with the one of step 3
    # reencoded will already be in lower case.
    if str(label, "ascii").lower() != str(reencoded, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, reencoded)

    # Step 8: return the result of step 5
    return decoded
143 | |
144 ### Codec APIs | |
145 | |
class Codec(codecs.Codec):
    """Stateless IDNA codec working on whole, dot-separated names."""

    def encode(self, input, errors='strict'):
        """Encode the str *input* and return ``(bytes, len(input))``.

        Only ``errors='strict'`` is accepted.  A pure-ASCII name is only
        length-checked label by label; any other name is split on the
        separators matched by ``dots`` and each label goes through
        ToASCII().  A single trailing dot is preserved.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b'', 0

        try:
            ascii_name = input.encode('ascii')
        except UnicodeEncodeError:
            ascii_name = None

        if ascii_name is not None:
            # ASCII name: fast path.  Only the final label may be empty
            # (that is the trailing-dot case).
            *inner, last = ascii_name.split(b'.')
            if any(not 0 < len(part) < 64 for part in inner):
                raise UnicodeError("label empty or too long")
            if len(last) >= 64:
                raise UnicodeError("label too long")
            return ascii_name, len(input)

        parts = dots.split(input)
        if parts and not parts[-1]:
            suffix = b'.'
            parts.pop()
        else:
            suffix = b''
        # Join with U+002E
        encoded = b'.'.join([ToASCII(part) for part in parts])
        return encoded + suffix, len(input)

    def decode(self, input, errors='strict'):
        """Decode the bytes-like *input* and return ``(str, len(input))``.

        Only ``errors='strict'`` is accepted.  Input that does not contain
        the ACE prefix and is plain ASCII is decoded directly; otherwise
        each label is converted with ToUnicode().  A single trailing dot
        is preserved.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if not isinstance(input, bytes):
            # XXX obviously wrong, see #3232
            input = bytes(input)

        consumed = len(input)

        if ace_prefix not in input:
            # Fast path
            try:
                return input.decode('ascii'), consumed
            except UnicodeDecodeError:
                pass

        parts = input.split(b".")
        if parts and not parts[-1]:
            suffix = '.'
            parts.pop()
        else:
            suffix = ''

        text = ".".join([ToUnicode(part) for part in parts])
        return text + suffix, consumed
217 | |
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels of *input*.

        Returns ``(encoded_bytes, consumed)`` where *consumed* is the
        number of characters of *input* that were encoded.  Unless *final*
        is true, the last label is held back because more of it may still
        arrive.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b'', 0)

        parts = dots.split(input)
        suffix = b''
        if parts:
            if not parts[-1]:
                # Input ended on a separator: every label is complete.
                suffix = b'.'
                parts.pop()
            elif not final:
                # Keep potentially unfinished label until the next call
                parts.pop()
                if parts:
                    suffix = b'.'

        # Join with U+002E
        encoded = b'.'.join([ToASCII(part) for part in parts]) + suffix
        # Consumed input: every label, one separator between neighbouring
        # labels, and the trailing separator if there is one.  (ToASCII
        # rejects empty labels, so every label counted here is non-empty.)
        consumed = (sum(len(part) for part in parts)
                    + max(len(parts) - 1, 0)
                    + len(suffix))
        return (encoded, consumed)
252 | |
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels of *input*.

        Returns ``(decoded_text, consumed)``.  Unless *final* is true, the
        last label is held back because more of it may still arrive.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            parts = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input, "ascii")
            parts = input.split(".")

        suffix = ''
        if parts:
            if not parts[-1]:
                # Input ended on a separator: every label is complete.
                suffix = '.'
                parts.pop()
            elif not final:
                # Keep potentially unfinished label until the next call
                parts.pop()
                if parts:
                    suffix = '.'

        decoded = []
        consumed = 0
        for part in parts:
            decoded.append(ToUnicode(part))
            # A separator is counted only once something has been consumed.
            # NOTE(review): leading empty labels therefore add nothing to
            # the count - confirm whether that is intended.
            consumed += len(part) + (1 if consumed else 0)

        text = ".".join(decoded) + suffix
        consumed += len(suffix)
        return (text, consumed)
291 | |
# Stream writer for this codec: combines Codec (listed first, so its
# encode()/decode() take precedence) with codecs.StreamWriter and adds
# nothing of its own.
class StreamWriter(Codec,codecs.StreamWriter):
    pass
294 | |
# Stream reader for this codec: combines Codec (listed first, so its
# encode()/decode() take precedence) with codecs.StreamReader and adds
# nothing of its own.
class StreamReader(Codec,codecs.StreamReader):
    pass
297 | |
298 ### encodings module API | |
299 | |
def getregentry():
    """Return the codecs.CodecInfo record describing the 'idna' codec.

    The stateless encode and decode entries are bound methods of two
    separate Codec instances; the incremental and stream entries are the
    classes themselves.
    """
    stateless_encode = Codec().encode
    stateless_decode = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=stateless_encode,
        decode=stateless_decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
309 ) |