jpayne@7
|
1 from __future__ import annotations
|
jpayne@7
|
2
|
jpayne@7
|
3 import re
|
jpayne@7
|
4 import typing
|
jpayne@7
|
5
|
jpayne@7
|
6 from ..exceptions import LocationParseError
|
jpayne@7
|
7 from .util import to_str
|
jpayne@7
|
8
|
jpayne@7
|
9 # We only want to normalize urls with an HTTP(S) scheme.
|
jpayne@7
|
10 # urllib3 infers URLs without a scheme (None) to be http.
|
jpayne@7
|
11 _NORMALIZABLE_SCHEMES = ("http", "https", None)
|
jpayne@7
|
12
|
jpayne@7
|
13 # Almost all of these patterns were derived from the
|
jpayne@7
|
14 # 'rfc3986' module: https://github.com/python-hyper/rfc3986
|
jpayne@7
|
15 _PERCENT_RE = re.compile(r"%[a-fA-F0-9]{2}")
|
jpayne@7
|
16 _SCHEME_RE = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+-]*:|/)")
|
jpayne@7
|
17 _URI_RE = re.compile(
|
jpayne@7
|
18 r"^(?:([a-zA-Z][a-zA-Z0-9+.-]*):)?"
|
jpayne@7
|
19 r"(?://([^\\/?#]*))?"
|
jpayne@7
|
20 r"([^?#]*)"
|
jpayne@7
|
21 r"(?:\?([^#]*))?"
|
jpayne@7
|
22 r"(?:#(.*))?$",
|
jpayne@7
|
23 re.UNICODE | re.DOTALL,
|
jpayne@7
|
24 )
|
jpayne@7
|
25
|
jpayne@7
|
26 _IPV4_PAT = r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"
|
jpayne@7
|
27 _HEX_PAT = "[0-9A-Fa-f]{1,4}"
|
jpayne@7
|
28 _LS32_PAT = "(?:{hex}:{hex}|{ipv4})".format(hex=_HEX_PAT, ipv4=_IPV4_PAT)
|
jpayne@7
|
29 _subs = {"hex": _HEX_PAT, "ls32": _LS32_PAT}
|
jpayne@7
|
30 _variations = [
|
jpayne@7
|
31 # 6( h16 ":" ) ls32
|
jpayne@7
|
32 "(?:%(hex)s:){6}%(ls32)s",
|
jpayne@7
|
33 # "::" 5( h16 ":" ) ls32
|
jpayne@7
|
34 "::(?:%(hex)s:){5}%(ls32)s",
|
jpayne@7
|
35 # [ h16 ] "::" 4( h16 ":" ) ls32
|
jpayne@7
|
36 "(?:%(hex)s)?::(?:%(hex)s:){4}%(ls32)s",
|
jpayne@7
|
37 # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
|
jpayne@7
|
38 "(?:(?:%(hex)s:)?%(hex)s)?::(?:%(hex)s:){3}%(ls32)s",
|
jpayne@7
|
39 # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
|
jpayne@7
|
40 "(?:(?:%(hex)s:){0,2}%(hex)s)?::(?:%(hex)s:){2}%(ls32)s",
|
jpayne@7
|
41 # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
|
jpayne@7
|
42 "(?:(?:%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s",
|
jpayne@7
|
43 # [ *4( h16 ":" ) h16 ] "::" ls32
|
jpayne@7
|
44 "(?:(?:%(hex)s:){0,4}%(hex)s)?::%(ls32)s",
|
jpayne@7
|
45 # [ *5( h16 ":" ) h16 ] "::" h16
|
jpayne@7
|
46 "(?:(?:%(hex)s:){0,5}%(hex)s)?::%(hex)s",
|
jpayne@7
|
47 # [ *6( h16 ":" ) h16 ] "::"
|
jpayne@7
|
48 "(?:(?:%(hex)s:){0,6}%(hex)s)?::",
|
jpayne@7
|
49 ]
|
jpayne@7
|
50
|
jpayne@7
|
51 _UNRESERVED_PAT = r"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._\-~"
|
jpayne@7
|
52 _IPV6_PAT = "(?:" + "|".join([x % _subs for x in _variations]) + ")"
|
jpayne@7
|
53 _ZONE_ID_PAT = "(?:%25|%)(?:[" + _UNRESERVED_PAT + "]|%[a-fA-F0-9]{2})+"
|
jpayne@7
|
54 _IPV6_ADDRZ_PAT = r"\[" + _IPV6_PAT + r"(?:" + _ZONE_ID_PAT + r")?\]"
|
jpayne@7
|
55 _REG_NAME_PAT = r"(?:[^\[\]%:/?#]|%[a-fA-F0-9]{2})*"
|
jpayne@7
|
56 _TARGET_RE = re.compile(r"^(/[^?#]*)(?:\?([^#]*))?(?:#.*)?$")
|
jpayne@7
|
57
|
jpayne@7
|
58 _IPV4_RE = re.compile("^" + _IPV4_PAT + "$")
|
jpayne@7
|
59 _IPV6_RE = re.compile("^" + _IPV6_PAT + "$")
|
jpayne@7
|
60 _IPV6_ADDRZ_RE = re.compile("^" + _IPV6_ADDRZ_PAT + "$")
|
jpayne@7
|
61 _BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + _IPV6_ADDRZ_PAT[2:-2] + "$")
|
jpayne@7
|
62 _ZONE_ID_RE = re.compile("(" + _ZONE_ID_PAT + r")\]$")
|
jpayne@7
|
63
|
jpayne@7
|
64 _HOST_PORT_PAT = ("^(%s|%s|%s)(?::0*?(|0|[1-9][0-9]{0,4}))?$") % (
|
jpayne@7
|
65 _REG_NAME_PAT,
|
jpayne@7
|
66 _IPV4_PAT,
|
jpayne@7
|
67 _IPV6_ADDRZ_PAT,
|
jpayne@7
|
68 )
|
jpayne@7
|
69 _HOST_PORT_RE = re.compile(_HOST_PORT_PAT, re.UNICODE | re.DOTALL)
|
jpayne@7
|
70
|
jpayne@7
|
71 _UNRESERVED_CHARS = set(
|
jpayne@7
|
72 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-~"
|
jpayne@7
|
73 )
|
jpayne@7
|
74 _SUB_DELIM_CHARS = set("!$&'()*+,;=")
|
jpayne@7
|
75 _USERINFO_CHARS = _UNRESERVED_CHARS | _SUB_DELIM_CHARS | {":"}
|
jpayne@7
|
76 _PATH_CHARS = _USERINFO_CHARS | {"@", "/"}
|
jpayne@7
|
77 _QUERY_CHARS = _FRAGMENT_CHARS = _PATH_CHARS | {"?"}
|
jpayne@7
|
78
|
jpayne@7
|
79
|
jpayne@7
|
class Url(
    typing.NamedTuple(
        "Url",
        [
            ("scheme", typing.Optional[str]),
            ("auth", typing.Optional[str]),
            ("host", typing.Optional[str]),
            ("port", typing.Optional[int]),
            ("path", typing.Optional[str]),
            ("query", typing.Optional[str]),
            ("fragment", typing.Optional[str]),
        ],
    )
):
    """
    Data structure for representing an HTTP URL. Used as a return value for
    :func:`parse_url`. Both the scheme and host are normalized as they are
    both case-insensitive according to RFC 3986.
    """

    def __new__(  # type: ignore[no-untyped-def]
        cls,
        scheme: str | None = None,
        auth: str | None = None,
        host: str | None = None,
        port: int | None = None,
        path: str | None = None,
        query: str | None = None,
        fragment: str | None = None,
    ):
        """Build a ``Url``; every field is optional and defaults to ``None``.

        A non-empty ``path`` that lacks a leading ``/`` gets one prepended, and
        a ``scheme`` that is not ``None`` is lower-cased. All other fields are
        stored as given.
        """
        if path and not path.startswith("/"):
            path = "/" + path
        if scheme is not None:
            scheme = scheme.lower()
        return super().__new__(cls, scheme, auth, host, port, path, query, fragment)

    @property
    def hostname(self) -> str | None:
        """For backwards-compatibility with urlparse. We're nice like that."""
        return self.host

    @property
    def request_uri(self) -> str:
        """Absolute path including the query string.

        Falls back to ``/`` when the path is ``None`` or empty. The query is
        appended (after ``?``) whenever it is not ``None``, even if empty.
        """
        uri = self.path or "/"

        if self.query is not None:
            uri += "?" + self.query

        return uri

    @property
    def authority(self) -> str | None:
        """
        Authority component as defined in RFC 3986 3.2.
        This includes userinfo (auth), host and port.

        i.e.
            userinfo@host:port

        Returns ``None`` when there is no host, and just ``netloc`` when there
        is no userinfo.
        """
        userinfo = self.auth
        netloc = self.netloc
        if netloc is None or userinfo is None:
            return netloc
        else:
            return f"{userinfo}@{netloc}"

    @property
    def netloc(self) -> str | None:
        """
        Network location including host and port.

        If you need the equivalent of urllib.parse's ``netloc``,
        use the ``authority`` property instead.

        Returns ``None`` when there is no host. The port is appended only when
        it is truthy, so a port of ``None`` or ``0`` is left out.
        """
        if self.host is None:
            return None
        if self.port:
            return f"{self.host}:{self.port}"
        return self.host

    @property
    def url(self) -> str:
        """
        Convert self into a url

        This function should more or less round-trip with :func:`.parse_url`. The
        returned url may not be exactly the same as the url inputted to
        :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
        with a blank port will have : removed).

        Example:

        .. code-block:: python

            import urllib3

            U = urllib3.util.parse_url("https://google.com/mail/")

            print(U.url)
            # "https://google.com/mail/"

            print(
                urllib3.util.Url(
                    "https", "username:password", "host.com", 80, "/path", "query", "fragment"
                ).url
            )
            # "https://username:password@host.com:80/path?query#fragment"
        """
        scheme, auth, host, port, path, query, fragment = self
        url = ""

        # We test with "is not None" (rather than truthiness) because empty
        # strings and a port of 0 must still be written out.
        if scheme is not None:
            url += scheme + "://"
        if auth is not None:
            url += auth + "@"
        if host is not None:
            url += host
        if port is not None:
            url += ":" + str(port)
        if path is not None:
            url += path
        if query is not None:
            url += "?" + query
        if fragment is not None:
            url += "#" + fragment

        return url

    def __str__(self) -> str:
        """Return the same string as the :attr:`url` property."""
        return self.url
|
jpayne@7
|
211
|
jpayne@7
|
212
|
jpayne@7
|
@typing.overload
def _encode_invalid_chars(
    component: str, allowed_chars: typing.Container[str]
) -> str:  # Abstract
    ...


@typing.overload
def _encode_invalid_chars(
    component: None, allowed_chars: typing.Container[str]
) -> None:  # Abstract
    ...


def _encode_invalid_chars(
    component: str | None, allowed_chars: typing.Container[str]
) -> str | None:
    """Percent-encodes a URI component without reapplying
    onto an already percent-encoded component.

    :param component: The component to encode; ``None`` is returned unchanged.
    :param allowed_chars: Characters that may be emitted without encoding.
    :return: The component with existing ``%xx`` escapes upper-cased and every
        byte that is neither allowed nor part of such an escape replaced by
        its ``%XX`` form.
    """
    if component is None:
        return component

    component = to_str(component)

    # Normalize existing percent-encoded bytes.
    # Try to see if the component we're encoding is already percent-encoded
    # so we can skip all '%' characters but still encode all others.
    component, percent_encodings = _PERCENT_RE.subn(
        lambda match: match.group(0).upper(), component
    )

    uri_bytes = component.encode("utf-8", "surrogatepass")
    # Only when *every* '%' starts a valid escape is the component treated as
    # already encoded; otherwise each '%' is itself encoded as '%25'.
    is_percent_encoded = percent_encodings == uri_bytes.count(b"%")
    encoded_component = bytearray()

    # Iterating over bytes yields the integer value of each byte.
    for byte_ord in uri_bytes:
        if (is_percent_encoded and byte_ord == 0x25) or (  # 0x25 == ord("%")
            byte_ord < 128 and chr(byte_ord) in allowed_chars
        ):
            encoded_component.append(byte_ord)
            continue
        encoded_component.extend(b"%%%02X" % byte_ord)

    return encoded_component.decode()
|
jpayne@7
|
261
|
jpayne@7
|
262
|
jpayne@7
|
263 def _remove_path_dot_segments(path: str) -> str:
|
jpayne@7
|
264 # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code
|
jpayne@7
|
265 segments = path.split("/") # Turn the path into a list of segments
|
jpayne@7
|
266 output = [] # Initialize the variable to use to store output
|
jpayne@7
|
267
|
jpayne@7
|
268 for segment in segments:
|
jpayne@7
|
269 # '.' is the current directory, so ignore it, it is superfluous
|
jpayne@7
|
270 if segment == ".":
|
jpayne@7
|
271 continue
|
jpayne@7
|
272 # Anything other than '..', should be appended to the output
|
jpayne@7
|
273 if segment != "..":
|
jpayne@7
|
274 output.append(segment)
|
jpayne@7
|
275 # In this case segment == '..', if we can, we should pop the last
|
jpayne@7
|
276 # element
|
jpayne@7
|
277 elif output:
|
jpayne@7
|
278 output.pop()
|
jpayne@7
|
279
|
jpayne@7
|
280 # If the path starts with '/' and the output is empty or the first string
|
jpayne@7
|
281 # is non-empty
|
jpayne@7
|
282 if path.startswith("/") and (not output or output[0]):
|
jpayne@7
|
283 output.insert(0, "")
|
jpayne@7
|
284
|
jpayne@7
|
285 # If the path starts with '/.' or '/..' ensure we add one more empty
|
jpayne@7
|
286 # string to add a trailing '/'
|
jpayne@7
|
287 if path.endswith(("/.", "/..")):
|
jpayne@7
|
288 output.append("")
|
jpayne@7
|
289
|
jpayne@7
|
290 return "/".join(output)
|
jpayne@7
|
291
|
jpayne@7
|
292
|
jpayne@7
|
@typing.overload
def _normalize_host(host: None, scheme: str | None) -> None:
    ...


@typing.overload
def _normalize_host(host: str, scheme: str | None) -> str:
    ...


def _normalize_host(host: str | None, scheme: str | None) -> str | None:
    """Normalize ``host`` when ``scheme`` is one of ``_NORMALIZABLE_SCHEMES``.

    * A falsy host, or a host under any other scheme, is returned unchanged.
    * A bracketed IPv6 literal is lower-cased; if it carries a zone
      identifier, the separator is rewritten to a bare ``%`` and the zone
      itself is percent-encoded rather than lower-cased.
    * An IPv4 literal is returned unchanged.
    * Anything else is treated as a dotted name and each label is passed
      through :func:`_idna_encode`.
    """
    if not host or scheme not in _NORMALIZABLE_SCHEMES:
        return host

    if _IPV6_ADDRZ_RE.match(host):
        # IPv6 hosts of the form 'a::b%zone' are encoded in a URL as
        # such per RFC 6874: 'a::b%25zone'. Unquote the ZoneID
        # separator as necessary to return a valid RFC 4007 scoped IP.
        match = _ZONE_ID_RE.search(host)
        if not match:
            return host.lower()

        start, end = match.span(1)
        zone_id = host[start:end]

        # Drop the separator: "%25" when more text follows it, otherwise the
        # single leading "%".
        if zone_id.startswith("%25") and zone_id != "%25":
            zone_id = zone_id[3:]
        else:
            zone_id = zone_id[1:]
        zone_id = _encode_invalid_chars(zone_id, _UNRESERVED_CHARS)
        return f"{host[:start].lower()}%{zone_id}{host[end:]}"

    if not _IPV4_RE.match(host):
        return to_str(
            b".".join([_idna_encode(label) for label in host.split(".")]),
            "ascii",
        )

    return host
|
jpayne@7
|
330
|
jpayne@7
|
331
|
jpayne@7
|
332 def _idna_encode(name: str) -> bytes:
|
jpayne@7
|
333 if not name.isascii():
|
jpayne@7
|
334 try:
|
jpayne@7
|
335 import idna
|
jpayne@7
|
336 except ImportError:
|
jpayne@7
|
337 raise LocationParseError(
|
jpayne@7
|
338 "Unable to parse URL without the 'idna' module"
|
jpayne@7
|
339 ) from None
|
jpayne@7
|
340
|
jpayne@7
|
341 try:
|
jpayne@7
|
342 return idna.encode(name.lower(), strict=True, std3_rules=True)
|
jpayne@7
|
343 except idna.IDNAError:
|
jpayne@7
|
344 raise LocationParseError(
|
jpayne@7
|
345 f"Name '{name}' is not a valid IDNA label"
|
jpayne@7
|
346 ) from None
|
jpayne@7
|
347
|
jpayne@7
|
348 return name.lower().encode("ascii")
|
jpayne@7
|
349
|
jpayne@7
|
350
|
jpayne@7
|
def _encode_target(target: str) -> str:
    """Percent-encodes a request target so that there are no invalid characters

    Pre-condition for this function is that 'target' must start with '/'.
    If that is the case then _TARGET_RE will always produce a match.

    The path and the query (when present) are encoded separately and rejoined
    with ``?``; a fragment, if any, is not included in the result.

    :raises LocationParseError: If ``target`` does not match ``_TARGET_RE``.
    """
    match = _TARGET_RE.match(target)
    if not match:  # Defensive:
        raise LocationParseError(f"{target!r} is not a valid request URI")

    path, query = match.groups()
    encoded_target = _encode_invalid_chars(path, _PATH_CHARS)
    if query is not None:
        query = _encode_invalid_chars(query, _QUERY_CHARS)
        encoded_target += "?" + query
    return encoded_target
|
jpayne@7
|
367
|
jpayne@7
|
368
|
jpayne@7
|
def parse_url(url: str) -> Url:
    """
    Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
    performed to parse incomplete urls. Fields not provided will be None.
    This parser is RFC 3986 and RFC 6874 compliant.

    The parser logic and helper functions are based heavily on
    work done in the ``rfc3986`` module.

    :param str url: URL to parse into a :class:`.Url` namedtuple.
    :raises LocationParseError: If the URL cannot be split into components,
        the host/port section is malformed, or the port is outside 0-65535.

    Partly backwards-compatible with :mod:`urllib.parse`.

    Example:

    .. code-block:: python

        import urllib3

        print(urllib3.util.parse_url('http://google.com/mail/'))
        # Url(scheme='http', host='google.com', port=None, path='/mail/', ...)

        print(urllib3.util.parse_url('google.com:80'))
        # Url(scheme=None, host='google.com', port=80, path=None, ...)

        print(urllib3.util.parse_url('/foo?bar'))
        # Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
    """
    if not url:
        # Empty
        return Url()

    # Keep the caller's input for error reporting; ``url`` may be rewritten.
    source_url = url
    if not _SCHEME_RE.search(url):
        # No scheme and no leading "/": prefix "//" so that _URI_RE reads the
        # start of the input as the authority rather than as a path.
        url = "//" + url

    scheme: str | None
    authority: str | None
    auth: str | None
    host: str | None
    port: str | None
    port_int: int | None
    path: str | None
    query: str | None
    fragment: str | None

    try:
        scheme, authority, path, query, fragment = _URI_RE.match(url).groups()  # type: ignore[union-attr]
        normalize_uri = scheme is None or scheme.lower() in _NORMALIZABLE_SCHEMES

        if scheme:
            scheme = scheme.lower()

        if authority:
            # Split on the *last* "@" so userinfo may itself contain "@".
            auth, _, host_port = authority.rpartition("@")
            auth = auth or None
            # A failed match yields None here; the resulting AttributeError is
            # converted to LocationParseError by the handler below.
            host, port = _HOST_PORT_RE.match(host_port).groups()  # type: ignore[union-attr]
            if auth and normalize_uri:
                auth = _encode_invalid_chars(auth, _USERINFO_CHARS)
            if port == "":
                port = None
        else:
            auth, host, port = None, None, None

        if port is not None:
            port_int = int(port)
            if not (0 <= port_int <= 65535):
                # Report the caller's original input, not the "//"-prefixed
                # working copy.
                raise LocationParseError(source_url)
        else:
            port_int = None

        host = _normalize_host(host, scheme)

        if normalize_uri and path:
            path = _remove_path_dot_segments(path)
            path = _encode_invalid_chars(path, _PATH_CHARS)
        if normalize_uri and query:
            query = _encode_invalid_chars(query, _QUERY_CHARS)
        if normalize_uri and fragment:
            fragment = _encode_invalid_chars(fragment, _FRAGMENT_CHARS)

    except (ValueError, AttributeError) as e:
        raise LocationParseError(source_url) from e

    # For the sake of backwards compatibility we put empty
    # string values for path if there are any defined values
    # beyond the path in the URL.
    # TODO: Remove this when we break backwards compatibility.
    if not path:
        if query is not None or fragment is not None:
            path = ""
        else:
            path = None

    return Url(
        scheme=scheme,
        auth=auth,
        host=host,
        port=port_int,
        path=path,
        query=query,
        fragment=fragment,
    )
|