comparison urllib3/util/url.py @ 7:5eb2d5e3bf22

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Sun, 05 May 2024 23:32:17 -0400
parents
children
comparison
equal deleted inserted replaced
6:b2745907b1eb 7:5eb2d5e3bf22
1 from __future__ import annotations
2
3 import re
4 import typing
5
6 from ..exceptions import LocationParseError
7 from .util import to_str
8
# We only want to normalize urls with an HTTP(S) scheme.
# urllib3 infers URLs without a scheme (None) to be http.
_NORMALIZABLE_SCHEMES = ("http", "https", None)

# Almost all of these patterns were derived from the
# 'rfc3986' module: https://github.com/python-hyper/rfc3986

# A percent-encoded octet: '%' followed by exactly two hex digits.
_PERCENT_RE = re.compile(r"%[a-fA-F0-9]{2}")
# Matches input that begins with either a "scheme:" prefix or a "/".
_SCHEME_RE = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+-]*:|/)")
# Splits a URI into five capture groups, in order:
# scheme, authority, path, query, fragment. All but the path are optional.
_URI_RE = re.compile(
    r"^(?:([a-zA-Z][a-zA-Z0-9+.-]*):)?"
    r"(?://([^\\/?#]*))?"
    r"([^?#]*)"
    r"(?:\?([^#]*))?"
    r"(?:#(.*))?$",
    re.UNICODE | re.DOTALL,
)

# Dotted quad of four 1-3 digit groups (the value of each group is not
# range-checked by the pattern itself).
_IPV4_PAT = r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"
# One IPv6 group: 1-4 hex digits.
_HEX_PAT = "[0-9A-Fa-f]{1,4}"
# The trailing 32 bits of an IPv6 address: two hex groups or an IPv4 address.
_LS32_PAT = "(?:{hex}:{hex}|{ipv4})".format(hex=_HEX_PAT, ipv4=_IPV4_PAT)
# Substitution mapping applied to each "%(...)s" template below.
_subs = {"hex": _HEX_PAT, "ls32": _LS32_PAT}
# One template per permitted IPv6 textual form; the comment above each
# entry gives the corresponding grammar production.
_variations = [
    # 6( h16 ":" ) ls32
    "(?:%(hex)s:){6}%(ls32)s",
    # "::" 5( h16 ":" ) ls32
    "::(?:%(hex)s:){5}%(ls32)s",
    # [ h16 ] "::" 4( h16 ":" ) ls32
    "(?:%(hex)s)?::(?:%(hex)s:){4}%(ls32)s",
    # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
    "(?:(?:%(hex)s:)?%(hex)s)?::(?:%(hex)s:){3}%(ls32)s",
    # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
    "(?:(?:%(hex)s:){0,2}%(hex)s)?::(?:%(hex)s:){2}%(ls32)s",
    # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
    "(?:(?:%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s",
    # [ *4( h16 ":" ) h16 ] "::" ls32
    "(?:(?:%(hex)s:){0,4}%(hex)s)?::%(ls32)s",
    # [ *5( h16 ":" ) h16 ] "::" h16
    "(?:(?:%(hex)s:){0,5}%(hex)s)?::%(hex)s",
    # [ *6( h16 ":" ) h16 ] "::"
    "(?:(?:%(hex)s:){0,6}%(hex)s)?::",
]

# Unreserved characters, written so they can be dropped inside a regex
# character class (note the escaped '-').
_UNRESERVED_PAT = r"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._\-~"
# Alternation of every IPv6 form listed in _variations.
_IPV6_PAT = "(?:" + "|".join([x % _subs for x in _variations]) + ")"
# Zone identifier: a "%25" or bare "%" separator followed by one or more
# unreserved characters or percent-encoded octets.
_ZONE_ID_PAT = "(?:%25|%)(?:[" + _UNRESERVED_PAT + "]|%[a-fA-F0-9]{2})+"
# Bracketed IPv6 address with an optional zone identifier.
_IPV6_ADDRZ_PAT = r"\[" + _IPV6_PAT + r"(?:" + _ZONE_ID_PAT + r")?\]"
# Registered name: any run (possibly empty) of characters other than
# '[', ']', '%', ':', '/', '?', '#', or percent-encoded octets.
_REG_NAME_PAT = r"(?:[^\[\]%:/?#]|%[a-fA-F0-9]{2})*"
# Request target starting with '/': captures the path and the optional
# query; a fragment is matched but not captured.
_TARGET_RE = re.compile(r"^(/[^?#]*)(?:\?([^#]*))?(?:#.*)?$")

_IPV4_RE = re.compile("^" + _IPV4_PAT + "$")
_IPV6_RE = re.compile("^" + _IPV6_PAT + "$")
_IPV6_ADDRZ_RE = re.compile("^" + _IPV6_ADDRZ_PAT + "$")
# The [2:-2] slice strips the two-character escaped brackets ("\[" and "\]")
# from both ends of _IPV6_ADDRZ_PAT, giving the bracket-less form.
_BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + _IPV6_ADDRZ_PAT[2:-2] + "$")
# Captures a zone identifier that sits immediately before the closing ']'.
_ZONE_ID_RE = re.compile("(" + _ZONE_ID_PAT + r")\]$")

# Group 1 is the host (registered name, IPv4, or bracketed IPv6).
# Group 2 is the optional port after ':': leading zeros are skipped, and
# the captured value is empty, "0", or 1-5 digits with no leading zero.
_HOST_PORT_PAT = ("^(%s|%s|%s)(?::0*?(|0|[1-9][0-9]{0,4}))?$") % (
    _REG_NAME_PAT,
    _IPV4_PAT,
    _IPV6_ADDRZ_PAT,
)
_HOST_PORT_RE = re.compile(_HOST_PORT_PAT, re.UNICODE | re.DOTALL)

# Character sets passed to _encode_invalid_chars as the "allowed" set for
# each URL component; each later set extends the previous one.
_UNRESERVED_CHARS = set(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-~"
)
_SUB_DELIM_CHARS = set("!$&'()*+,;=")
_USERINFO_CHARS = _UNRESERVED_CHARS | _SUB_DELIM_CHARS | {":"}
_PATH_CHARS = _USERINFO_CHARS | {"@", "/"}
_QUERY_CHARS = _FRAGMENT_CHARS = _PATH_CHARS | {"?"}
78
79
class Url(
    typing.NamedTuple(
        "Url",
        [
            ("scheme", typing.Optional[str]),
            ("auth", typing.Optional[str]),
            ("host", typing.Optional[str]),
            ("port", typing.Optional[int]),
            ("path", typing.Optional[str]),
            ("query", typing.Optional[str]),
            ("fragment", typing.Optional[str]),
        ],
    )
):
    """
    Data structure for representing an HTTP URL. Used as a return value for
    :func:`parse_url`.

    The constructor lower-cases the scheme and makes sure a non-empty path
    starts with ``/``; every other field is stored exactly as given.
    """

    def __new__(  # type: ignore[no-untyped-def]
        cls,
        scheme: str | None = None,
        auth: str | None = None,
        host: str | None = None,
        port: int | None = None,
        path: str | None = None,
        query: str | None = None,
        fragment: str | None = None,
    ):
        # A non-empty path is always stored with a leading slash.
        if path and not path.startswith("/"):
            path = "/" + path
        if scheme is not None:
            scheme = scheme.lower()
        return super().__new__(cls, scheme, auth, host, port, path, query, fragment)

    @property
    def hostname(self) -> str | None:
        """For backwards-compatibility with urlparse. We're nice like that."""
        return self.host

    @property
    def request_uri(self) -> str:
        """Absolute path including the query string."""
        uri = self.path or "/"

        # An empty query is still emitted ("/?"); only None suppresses it.
        if self.query is not None:
            uri += "?" + self.query

        return uri

    @property
    def authority(self) -> str | None:
        """
        Authority component as defined in RFC 3986 3.2.
        This includes userinfo (auth), host and port.

        i.e.
            userinfo@host:port

        Returns ``None`` when there is no host.
        """
        netloc = self.netloc
        if netloc is None or self.auth is None:
            return netloc
        return f"{self.auth}@{netloc}"

    @property
    def netloc(self) -> str | None:
        """
        Network location including host and port.

        Returns ``None`` when there is no host. The port is appended
        whenever it is set, including an explicit port ``0``, which keeps
        this property consistent with :attr:`url`.

        If you need the equivalent of urllib.parse's ``netloc``,
        use the ``authority`` property instead.
        """
        if self.host is None:
            return None
        # Compare against None rather than truthiness so that port 0 is kept.
        if self.port is not None:
            return f"{self.host}:{self.port}"
        return self.host

    @property
    def url(self) -> str:
        """
        Convert self into a url

        This function should more or less round-trip with :func:`.parse_url`. The
        returned url may not be exactly the same as the url inputted to
        :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
        with a blank port will have : removed).

        Example:

        .. code-block:: python

            import urllib3

            U = urllib3.util.parse_url("https://google.com/mail/")

            print(U.url)
            # "https://google.com/mail/"

            print( urllib3.util.Url("https", "username:password",
                                    "host.com", 80, "/path", "query", "fragment"
                                    ).url
                )
            # "https://username:password@host.com:80/path?query#fragment"
        """
        scheme, auth, host, port, path, query, fragment = self
        pieces: list[str] = []

        # We use "is not None" because we want things to happen with empty
        # strings (or 0 port).
        if scheme is not None:
            pieces.append(scheme + "://")
        if auth is not None:
            pieces.append(auth + "@")
        if host is not None:
            pieces.append(host)
        if port is not None:
            pieces.append(":" + str(port))
        if path is not None:
            pieces.append(path)
        if query is not None:
            pieces.append("?" + query)
        if fragment is not None:
            pieces.append("#" + fragment)

        return "".join(pieces)

    def __str__(self) -> str:
        return self.url
211
212
@typing.overload
def _encode_invalid_chars(
    component: str, allowed_chars: typing.Container[str]
) -> str:  # Abstract
    ...


@typing.overload
def _encode_invalid_chars(
    component: None, allowed_chars: typing.Container[str]
) -> None:  # Abstract
    ...


def _encode_invalid_chars(
    component: str | None, allowed_chars: typing.Container[str]
) -> str | None:
    """Percent-encode every byte of ``component`` that is not allowed.

    ``None`` is passed through unchanged. Existing ``%XX`` escapes have
    their hex digits upper-cased. When every ``%`` in the component belongs
    to such an escape, the component is treated as already percent-encoded
    and its ``%`` bytes are left alone; otherwise ``%`` is encoded like any
    other byte that is not in ``allowed_chars``.
    """
    if component is None:
        return component

    component = to_str(component)

    # Upper-case the hex digits of existing escapes and count how many
    # escapes there are.
    component, escape_count = _PERCENT_RE.subn(
        lambda found: found.group(0).upper(), component
    )

    raw = component.encode("utf-8", "surrogatepass")
    # Only when each '%' is part of a valid escape can they all be kept.
    keep_percent_signs = escape_count == raw.count(b"%")
    percent_code = ord("%")
    encoded = bytearray()

    for code in raw:
        if keep_percent_signs and code == percent_code:
            encoded.append(code)
        elif code < 128 and chr(code) in allowed_chars:
            encoded.append(code)
        else:
            # '%' followed by two upper-case hex digits.
            encoded.extend(b"%%%02X" % code)

    return encoded.decode()
261
262
263 def _remove_path_dot_segments(path: str) -> str:
264 # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code
265 segments = path.split("/") # Turn the path into a list of segments
266 output = [] # Initialize the variable to use to store output
267
268 for segment in segments:
269 # '.' is the current directory, so ignore it, it is superfluous
270 if segment == ".":
271 continue
272 # Anything other than '..', should be appended to the output
273 if segment != "..":
274 output.append(segment)
275 # In this case segment == '..', if we can, we should pop the last
276 # element
277 elif output:
278 output.pop()
279
280 # If the path starts with '/' and the output is empty or the first string
281 # is non-empty
282 if path.startswith("/") and (not output or output[0]):
283 output.insert(0, "")
284
285 # If the path starts with '/.' or '/..' ensure we add one more empty
286 # string to add a trailing '/'
287 if path.endswith(("/.", "/..")):
288 output.append("")
289
290 return "/".join(output)
291
292
@typing.overload
def _normalize_host(host: None, scheme: str | None) -> None:
    ...


@typing.overload
def _normalize_host(host: str, scheme: str | None) -> str:
    ...


def _normalize_host(host: str | None, scheme: str | None) -> str | None:
    """Normalize ``host`` when ``scheme`` is one of the normalizable schemes.

    An empty or ``None`` host, a host under any other scheme, and an IPv4
    literal are returned unchanged. A bracketed IPv6 literal is lower-cased
    and its zone identifier, if any, is re-encoded. Anything else is
    lower-cased and IDNA-encoded label by label.
    """
    if not host or scheme not in _NORMALIZABLE_SCHEMES:
        return host

    if _IPV6_ADDRZ_RE.match(host):
        # IPv6 hosts of the form 'a::b%zone' are encoded in a URL as
        # such per RFC 6874: 'a::b%25zone'. Unquote the ZoneID
        # separator as necessary to return a valid RFC 4007 scoped IP.
        zone_match = _ZONE_ID_RE.search(host)
        if zone_match is None:
            return host.lower()

        zone_start, zone_end = zone_match.span(1)
        raw_zone = host[zone_start:zone_end]

        # Drop the separator: "%25" when something follows it, else "%".
        if raw_zone.startswith("%25") and raw_zone != "%25":
            separator_len = 3
        else:
            separator_len = 1
        zone = _encode_invalid_chars(raw_zone[separator_len:], _UNRESERVED_CHARS)
        return f"{host[:zone_start].lower()}%{zone}{host[zone_end:]}"

    if _IPV4_RE.match(host):
        return host

    encoded_labels = [_idna_encode(label) for label in host.split(".")]
    return to_str(b".".join(encoded_labels), "ascii")
330
331
332 def _idna_encode(name: str) -> bytes:
333 if not name.isascii():
334 try:
335 import idna
336 except ImportError:
337 raise LocationParseError(
338 "Unable to parse URL without the 'idna' module"
339 ) from None
340
341 try:
342 return idna.encode(name.lower(), strict=True, std3_rules=True)
343 except idna.IDNAError:
344 raise LocationParseError(
345 f"Name '{name}' is not a valid IDNA label"
346 ) from None
347
348 return name.lower().encode("ascii")
349
350
def _encode_target(target: str) -> str:
    """Percent-encode the path and query of a request target.

    ``target`` is expected to start with '/', in which case _TARGET_RE
    always matches. Any fragment is dropped from the result.
    """
    parsed = _TARGET_RE.match(target)
    if parsed is None:  # Defensive:
        raise LocationParseError(f"{target!r} is not a valid request URI")

    raw_path, raw_query = parsed.groups()
    encoded = _encode_invalid_chars(raw_path, _PATH_CHARS)
    if raw_query is None:
        return encoded
    return encoded + "?" + _encode_invalid_chars(raw_query, _QUERY_CHARS)
367
368
def parse_url(url: str) -> Url:
    """
    Parse ``url`` into a :class:`.Url` namedtuple. Incomplete urls are
    parsed on a best-effort basis, and any field that is not present is
    None. This parser is RFC 3986 and RFC 6874 compliant.

    The parser logic and helper functions are based heavily on
    work done in the ``rfc3986`` module.

    :param str url: URL to parse into a :class:`.Url` namedtuple.

    Partly backwards-compatible with :mod:`urllib.parse`.

    Example:

    .. code-block:: python

        import urllib3

        print( urllib3.util.parse_url('http://google.com/mail/'))
        # Url(scheme='http', host='google.com', port=None, path='/mail/', ...)

        print( urllib3.util.parse_url('google.com:80'))
        # Url(scheme=None, host='google.com', port=80, path=None, ...)

        print( urllib3.util.parse_url('/foo?bar'))
        # Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
    """
    if not url:
        # Empty
        return Url()

    source_url = url
    # Input with neither a scheme nor a leading '/' is treated as starting
    # with an authority.
    if _SCHEME_RE.search(url) is None:
        url = "//" + url

    scheme: str | None
    authority: str | None
    auth: str | None
    host: str | None
    port: str | None
    port_int: int | None
    path: str | None
    query: str | None
    fragment: str | None

    try:
        uri_match = _URI_RE.match(url)
        scheme, authority, path, query, fragment = uri_match.groups()  # type: ignore[union-attr]
        normalize_uri = scheme is None or scheme.lower() in _NORMALIZABLE_SCHEMES

        if scheme:
            scheme = scheme.lower()

        auth = host = port = None
        if authority:
            # Everything before the last '@' is userinfo.
            userinfo, _, host_port = authority.rpartition("@")
            auth = userinfo or None
            host, port = _HOST_PORT_RE.match(host_port).groups()  # type: ignore[union-attr]
            if auth and normalize_uri:
                auth = _encode_invalid_chars(auth, _USERINFO_CHARS)
            # A blank port ("host:") counts as no port at all.
            if port == "":
                port = None

        port_int = None
        if port is not None:
            port_int = int(port)
            if port_int < 0 or port_int > 65535:
                raise LocationParseError(url)

        host = _normalize_host(host, scheme)

        if normalize_uri:
            if path:
                path = _remove_path_dot_segments(path)
                path = _encode_invalid_chars(path, _PATH_CHARS)
            if query:
                query = _encode_invalid_chars(query, _QUERY_CHARS)
            if fragment:
                fragment = _encode_invalid_chars(fragment, _FRAGMENT_CHARS)

    except (ValueError, AttributeError) as e:
        # A failed regex match surfaces as AttributeError on None.
        raise LocationParseError(source_url) from e

    # For the sake of backwards compatibility we put empty
    # string values for path if there are any defined values
    # beyond the path in the URL.
    # TODO: Remove this when we break backwards compatibility.
    if not path:
        has_trailing_parts = query is not None or fragment is not None
        path = "" if has_trailing_parts else None

    return Url(
        scheme=scheme,
        auth=auth,
        host=host,
        port=port_int,
        path=path,
        query=query,
        fragment=fragment,
    )