Mercurial > repos > jpayne > bioproject_to_srr_2
comparison urllib3/util/url.py @ 7:5eb2d5e3bf22
planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author | jpayne |
---|---|
date | Sun, 05 May 2024 23:32:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:b2745907b1eb | 7:5eb2d5e3bf22 |
---|---|
1 from __future__ import annotations | |
2 | |
3 import re | |
4 import typing | |
5 | |
6 from ..exceptions import LocationParseError | |
7 from .util import to_str | |
8 | |
9 # We only want to normalize urls with an HTTP(S) scheme. | |
10 # urllib3 infers URLs without a scheme (None) to be http. | |
11 _NORMALIZABLE_SCHEMES = ("http", "https", None) | |
12 | |
13 # Almost all of these patterns were derived from the | |
14 # 'rfc3986' module: https://github.com/python-hyper/rfc3986 | |
15 _PERCENT_RE = re.compile(r"%[a-fA-F0-9]{2}") | |
16 _SCHEME_RE = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+-]*:|/)") | |
17 _URI_RE = re.compile( | |
18 r"^(?:([a-zA-Z][a-zA-Z0-9+.-]*):)?" | |
19 r"(?://([^\\/?#]*))?" | |
20 r"([^?#]*)" | |
21 r"(?:\?([^#]*))?" | |
22 r"(?:#(.*))?$", | |
23 re.UNICODE | re.DOTALL, | |
24 ) | |
25 | |
26 _IPV4_PAT = r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}" | |
27 _HEX_PAT = "[0-9A-Fa-f]{1,4}" | |
28 _LS32_PAT = "(?:{hex}:{hex}|{ipv4})".format(hex=_HEX_PAT, ipv4=_IPV4_PAT) | |
29 _subs = {"hex": _HEX_PAT, "ls32": _LS32_PAT} | |
30 _variations = [ | |
31 # 6( h16 ":" ) ls32 | |
32 "(?:%(hex)s:){6}%(ls32)s", | |
33 # "::" 5( h16 ":" ) ls32 | |
34 "::(?:%(hex)s:){5}%(ls32)s", | |
35 # [ h16 ] "::" 4( h16 ":" ) ls32 | |
36 "(?:%(hex)s)?::(?:%(hex)s:){4}%(ls32)s", | |
37 # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 | |
38 "(?:(?:%(hex)s:)?%(hex)s)?::(?:%(hex)s:){3}%(ls32)s", | |
39 # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 | |
40 "(?:(?:%(hex)s:){0,2}%(hex)s)?::(?:%(hex)s:){2}%(ls32)s", | |
41 # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 | |
42 "(?:(?:%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s", | |
43 # [ *4( h16 ":" ) h16 ] "::" ls32 | |
44 "(?:(?:%(hex)s:){0,4}%(hex)s)?::%(ls32)s", | |
45 # [ *5( h16 ":" ) h16 ] "::" h16 | |
46 "(?:(?:%(hex)s:){0,5}%(hex)s)?::%(hex)s", | |
47 # [ *6( h16 ":" ) h16 ] "::" | |
48 "(?:(?:%(hex)s:){0,6}%(hex)s)?::", | |
49 ] | |
50 | |
51 _UNRESERVED_PAT = r"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._\-~" | |
52 _IPV6_PAT = "(?:" + "|".join([x % _subs for x in _variations]) + ")" | |
53 _ZONE_ID_PAT = "(?:%25|%)(?:[" + _UNRESERVED_PAT + "]|%[a-fA-F0-9]{2})+" | |
54 _IPV6_ADDRZ_PAT = r"\[" + _IPV6_PAT + r"(?:" + _ZONE_ID_PAT + r")?\]" | |
55 _REG_NAME_PAT = r"(?:[^\[\]%:/?#]|%[a-fA-F0-9]{2})*" | |
56 _TARGET_RE = re.compile(r"^(/[^?#]*)(?:\?([^#]*))?(?:#.*)?$") | |
57 | |
58 _IPV4_RE = re.compile("^" + _IPV4_PAT + "$") | |
59 _IPV6_RE = re.compile("^" + _IPV6_PAT + "$") | |
60 _IPV6_ADDRZ_RE = re.compile("^" + _IPV6_ADDRZ_PAT + "$") | |
61 _BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + _IPV6_ADDRZ_PAT[2:-2] + "$") | |
62 _ZONE_ID_RE = re.compile("(" + _ZONE_ID_PAT + r")\]$") | |
63 | |
64 _HOST_PORT_PAT = ("^(%s|%s|%s)(?::0*?(|0|[1-9][0-9]{0,4}))?$") % ( | |
65 _REG_NAME_PAT, | |
66 _IPV4_PAT, | |
67 _IPV6_ADDRZ_PAT, | |
68 ) | |
69 _HOST_PORT_RE = re.compile(_HOST_PORT_PAT, re.UNICODE | re.DOTALL) | |
70 | |
71 _UNRESERVED_CHARS = set( | |
72 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-~" | |
73 ) | |
74 _SUB_DELIM_CHARS = set("!$&'()*+,;=") | |
75 _USERINFO_CHARS = _UNRESERVED_CHARS | _SUB_DELIM_CHARS | {":"} | |
76 _PATH_CHARS = _USERINFO_CHARS | {"@", "/"} | |
77 _QUERY_CHARS = _FRAGMENT_CHARS = _PATH_CHARS | {"?"} | |
78 | |
79 | |
class Url(
    typing.NamedTuple(
        "Url",
        [
            ("scheme", typing.Optional[str]),
            ("auth", typing.Optional[str]),
            ("host", typing.Optional[str]),
            ("port", typing.Optional[int]),
            ("path", typing.Optional[str]),
            ("query", typing.Optional[str]),
            ("fragment", typing.Optional[str]),
        ],
    )
):
    """
    Data structure for representing an HTTP URL. Used as a return value for
    :func:`parse_url`.

    On construction the scheme is lower-cased and a non-empty path that does
    not start with ``/`` gets one prepended. All other fields are stored as
    given; every field defaults to ``None``.
    """

    def __new__(  # type: ignore[no-untyped-def]
        cls,
        scheme: str | None = None,
        auth: str | None = None,
        host: str | None = None,
        port: int | None = None,
        path: str | None = None,
        query: str | None = None,
        fragment: str | None = None,
    ):
        # An empty or missing path is left untouched; only a non-empty
        # relative path is made absolute.
        if path and not path.startswith("/"):
            path = "/" + path
        if scheme is not None:
            scheme = scheme.lower()
        return super().__new__(cls, scheme, auth, host, port, path, query, fragment)

    @property
    def hostname(self) -> str | None:
        """For backwards-compatibility with urlparse. We're nice like that."""
        return self.host

    @property
    def request_uri(self) -> str:
        """Absolute path including the query string.

        Falls back to ``/`` when the path is empty or ``None``. The query is
        appended whenever it is not ``None``, so an empty query yields a
        trailing ``?``.
        """
        uri = self.path or "/"

        if self.query is not None:
            uri += "?" + self.query

        return uri

    @property
    def authority(self) -> str | None:
        """
        Authority component as defined in RFC 3986 3.2.
        This includes userinfo (auth), host and port.

        i.e.
            userinfo@host:port

        Returns ``None`` when there is no host.
        """
        userinfo = self.auth
        netloc = self.netloc
        if netloc is None or userinfo is None:
            return netloc
        return f"{userinfo}@{netloc}"

    @property
    def netloc(self) -> str | None:
        """
        Network location including host and port.

        If you need the equivalent of urllib.parse's ``netloc``,
        use the ``authority`` property instead.

        Returns ``None`` when there is no host.
        """
        if self.host is None:
            return None
        # Compare against None rather than truthiness so that an explicit
        # port of 0 is kept, consistent with the ``url`` property.
        if self.port is not None:
            return f"{self.host}:{self.port}"
        return self.host

    @property
    def url(self) -> str:
        """
        Convert self into a url

        This function should more or less round-trip with :func:`.parse_url`. The
        returned url may not be exactly the same as the url inputted to
        :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
        with a blank port will have : removed).

        Example:

        .. code-block:: python

            import urllib3

            U = urllib3.util.parse_url("https://google.com/mail/")

            print(U.url)
            # "https://google.com/mail/"

            print( urllib3.util.Url("https", "username:password",
                                    "host.com", 80, "/path", "query", "fragment"
                                    ).url
                )
            # "https://username:password@host.com:80/path?query#fragment"
        """
        scheme, auth, host, port, path, query, fragment = self
        parts: list[str] = []

        # "is not None" is used on purpose: empty strings (and a 0 port)
        # must still emit their delimiter.
        if scheme is not None:
            parts.append(scheme + "://")
        if auth is not None:
            parts.append(auth + "@")
        if host is not None:
            parts.append(host)
        if port is not None:
            parts.append(":" + str(port))
        if path is not None:
            parts.append(path)
        if query is not None:
            parts.append("?" + query)
        if fragment is not None:
            parts.append("#" + fragment)

        return "".join(parts)

    def __str__(self) -> str:
        return self.url
211 | |
212 | |
213 @typing.overload | |
214 def _encode_invalid_chars( | |
215 component: str, allowed_chars: typing.Container[str] | |
216 ) -> str: # Abstract | |
217 ... | |
218 | |
219 | |
220 @typing.overload | |
221 def _encode_invalid_chars( | |
222 component: None, allowed_chars: typing.Container[str] | |
223 ) -> None: # Abstract | |
224 ... | |
225 | |
226 | |
227 def _encode_invalid_chars( | |
228 component: str | None, allowed_chars: typing.Container[str] | |
229 ) -> str | None: | |
230 """Percent-encodes a URI component without reapplying | |
231 onto an already percent-encoded component. | |
232 """ | |
233 if component is None: | |
234 return component | |
235 | |
236 component = to_str(component) | |
237 | |
238 # Normalize existing percent-encoded bytes. | |
239 # Try to see if the component we're encoding is already percent-encoded | |
240 # so we can skip all '%' characters but still encode all others. | |
241 component, percent_encodings = _PERCENT_RE.subn( | |
242 lambda match: match.group(0).upper(), component | |
243 ) | |
244 | |
245 uri_bytes = component.encode("utf-8", "surrogatepass") | |
246 is_percent_encoded = percent_encodings == uri_bytes.count(b"%") | |
247 encoded_component = bytearray() | |
248 | |
249 for i in range(0, len(uri_bytes)): | |
250 # Will return a single character bytestring | |
251 byte = uri_bytes[i : i + 1] | |
252 byte_ord = ord(byte) | |
253 if (is_percent_encoded and byte == b"%") or ( | |
254 byte_ord < 128 and byte.decode() in allowed_chars | |
255 ): | |
256 encoded_component += byte | |
257 continue | |
258 encoded_component.extend(b"%" + (hex(byte_ord)[2:].encode().zfill(2).upper())) | |
259 | |
260 return encoded_component.decode() | |
261 | |
262 | |
263 def _remove_path_dot_segments(path: str) -> str: | |
264 # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code | |
265 segments = path.split("/") # Turn the path into a list of segments | |
266 output = [] # Initialize the variable to use to store output | |
267 | |
268 for segment in segments: | |
269 # '.' is the current directory, so ignore it, it is superfluous | |
270 if segment == ".": | |
271 continue | |
272 # Anything other than '..', should be appended to the output | |
273 if segment != "..": | |
274 output.append(segment) | |
275 # In this case segment == '..', if we can, we should pop the last | |
276 # element | |
277 elif output: | |
278 output.pop() | |
279 | |
280 # If the path starts with '/' and the output is empty or the first string | |
281 # is non-empty | |
282 if path.startswith("/") and (not output or output[0]): | |
283 output.insert(0, "") | |
284 | |
285 # If the path starts with '/.' or '/..' ensure we add one more empty | |
286 # string to add a trailing '/' | |
287 if path.endswith(("/.", "/..")): | |
288 output.append("") | |
289 | |
290 return "/".join(output) | |
291 | |
292 | |
293 @typing.overload | |
294 def _normalize_host(host: None, scheme: str | None) -> None: | |
295 ... | |
296 | |
297 | |
298 @typing.overload | |
299 def _normalize_host(host: str, scheme: str | None) -> str: | |
300 ... | |
301 | |
302 | |
303 def _normalize_host(host: str | None, scheme: str | None) -> str | None: | |
304 if host: | |
305 if scheme in _NORMALIZABLE_SCHEMES: | |
306 is_ipv6 = _IPV6_ADDRZ_RE.match(host) | |
307 if is_ipv6: | |
308 # IPv6 hosts of the form 'a::b%zone' are encoded in a URL as | |
309 # such per RFC 6874: 'a::b%25zone'. Unquote the ZoneID | |
310 # separator as necessary to return a valid RFC 4007 scoped IP. | |
311 match = _ZONE_ID_RE.search(host) | |
312 if match: | |
313 start, end = match.span(1) | |
314 zone_id = host[start:end] | |
315 | |
316 if zone_id.startswith("%25") and zone_id != "%25": | |
317 zone_id = zone_id[3:] | |
318 else: | |
319 zone_id = zone_id[1:] | |
320 zone_id = _encode_invalid_chars(zone_id, _UNRESERVED_CHARS) | |
321 return f"{host[:start].lower()}%{zone_id}{host[end:]}" | |
322 else: | |
323 return host.lower() | |
324 elif not _IPV4_RE.match(host): | |
325 return to_str( | |
326 b".".join([_idna_encode(label) for label in host.split(".")]), | |
327 "ascii", | |
328 ) | |
329 return host | |
330 | |
331 | |
332 def _idna_encode(name: str) -> bytes: | |
333 if not name.isascii(): | |
334 try: | |
335 import idna | |
336 except ImportError: | |
337 raise LocationParseError( | |
338 "Unable to parse URL without the 'idna' module" | |
339 ) from None | |
340 | |
341 try: | |
342 return idna.encode(name.lower(), strict=True, std3_rules=True) | |
343 except idna.IDNAError: | |
344 raise LocationParseError( | |
345 f"Name '{name}' is not a valid IDNA label" | |
346 ) from None | |
347 | |
348 return name.lower().encode("ascii") | |
349 | |
350 | |
def _encode_target(target: str) -> str:
    """Percent-encode a request target so it contains no invalid characters.

    Pre-condition for this function is that 'target' must start with '/'.
    If that is the case then _TARGET_RE will always produce a match.

    Only the path and query are returned; a fragment is matched by
    _TARGET_RE but not captured.

    :param target: Request target beginning with ``/``.
    :return: The encoded path, followed by ``?`` and the encoded query when
        a query is present.
    :raises LocationParseError: If ``target`` does not match _TARGET_RE.
    """
    parsed = _TARGET_RE.match(target)
    if parsed is None:  # Defensive:
        raise LocationParseError(f"{target!r} is not a valid request URI")

    raw_path, raw_query = parsed.groups()
    encoded_path = _encode_invalid_chars(raw_path, _PATH_CHARS)
    if raw_query is None:
        return encoded_path
    return encoded_path + "?" + _encode_invalid_chars(raw_query, _QUERY_CHARS)
367 | |
368 | |
def parse_url(url: str) -> Url:
    """
    Parse ``url`` into a :class:`.Url` namedtuple, making a best effort on
    incomplete urls. Fields that are not present are None.
    This parser is RFC 3986 and RFC 6874 compliant.

    The parser logic and helper functions are based heavily on
    work done in the ``rfc3986`` module.

    :param str url: URL to parse into a :class:`.Url` namedtuple.
    :raises LocationParseError: If the url cannot be parsed (for example an
        invalid host/port section).

    Partly backwards-compatible with :mod:`urllib.parse`.

    Example:

    .. code-block:: python

        import urllib3

        print( urllib3.util.parse_url('http://google.com/mail/'))
        # Url(scheme='http', host='google.com', port=None, path='/mail/', ...)

        print( urllib3.util.parse_url('google.com:80'))
        # Url(scheme=None, host='google.com', port=80, path=None, ...)

        print( urllib3.util.parse_url('/foo?bar'))
        # Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
    """
    if not url:
        # Empty
        return Url()

    # Keep the caller's input for error messages; the working copy may get
    # a '//' prefix so that a scheme-less url parses as an authority.
    source_url = url
    if _SCHEME_RE.search(url) is None:
        url = "//" + url

    scheme: str | None
    authority: str | None
    auth: str | None
    host: str | None
    port: str | None
    port_int: int | None
    path: str | None
    query: str | None
    fragment: str | None

    try:
        scheme, authority, path, query, fragment = _URI_RE.match(url).groups()  # type: ignore[union-attr]
        normalize_uri = scheme is None or scheme.lower() in _NORMALIZABLE_SCHEMES

        if scheme:
            scheme = scheme.lower()

        auth = host = port = None
        if authority:
            # Everything before the last '@' is userinfo.
            userinfo, _, host_port = authority.rpartition("@")
            host, port = _HOST_PORT_RE.match(host_port).groups()  # type: ignore[union-attr]
            auth = userinfo or None
            if auth and normalize_uri:
                auth = _encode_invalid_chars(auth, _USERINFO_CHARS)
            # A blank port ("host:") is treated as no port at all.
            if port == "":
                port = None

        port_int = None if port is None else int(port)
        if port_int is not None and not (0 <= port_int <= 65535):
            raise LocationParseError(url)

        host = _normalize_host(host, scheme)

        if normalize_uri:
            if path:
                path = _encode_invalid_chars(
                    _remove_path_dot_segments(path), _PATH_CHARS
                )
            if query:
                query = _encode_invalid_chars(query, _QUERY_CHARS)
            if fragment:
                fragment = _encode_invalid_chars(fragment, _FRAGMENT_CHARS)

    except (ValueError, AttributeError) as e:
        raise LocationParseError(source_url) from e

    # For the sake of backwards compatibility we put empty
    # string values for path if there are any defined values
    # beyond the path in the URL.
    # TODO: Remove this when we break backwards compatibility.
    if not path:
        has_trailing_parts = query is not None or fragment is not None
        path = "" if has_trailing_parts else None

    return Url(
        scheme=scheme,
        auth=auth,
        host=host,
        port=port_int,
        path=path,
        query=query,
        fragment=fragment,
    )