2016-10-17 02:15:22 +00:00
|
|
|
import urllib
|
2016-10-17 03:56:46 +00:00
|
|
|
from typing import Sequence
|
|
|
|
from typing import Tuple
|
2016-05-31 06:42:56 +00:00
|
|
|
|
2016-10-19 21:46:47 +00:00
|
|
|
from netlib import check
|
2016-05-31 06:42:56 +00:00
|
|
|
|
2016-05-31 07:58:28 +00:00
|
|
|
|
2016-05-31 06:42:56 +00:00
|
|
|
# PY2 workaround
|
|
|
|
def decode_parse_result(result, enc):
    """
    Decode a bytes-based URL parse result into its text counterpart.

    Args:
        result: Either an object offering its own ``decode`` method (it is
            then delegated to), or an iterable of six byte strings.
        enc: Name of the codec used for decoding.

    Returns:
        Whatever ``result.decode(enc)`` returns when that method exists,
        otherwise a ``urllib.parse.ParseResult`` built from the individually
        decoded components.
    """
    if not hasattr(result, "decode"):
        # No bulk decode available: convert component by component.
        components = [component.decode(enc) for component in result]
        return urllib.parse.ParseResult(*components)
    return result.decode(enc)
|
|
|
|
|
|
|
|
|
|
|
|
# PY2 workaround
|
|
|
|
def encode_parse_result(result, enc):
    """
    Encode a text-based URL parse result into its bytes counterpart.

    Args:
        result: Either an object offering its own ``encode`` method (it is
            then delegated to), or an iterable of six text strings.
        enc: Name of the codec used for encoding.

    Returns:
        Whatever ``result.encode(enc)`` returns when that method exists,
        otherwise a ``urllib.parse.ParseResult`` built from the individually
        encoded components.
    """
    if not hasattr(result, "encode"):
        # No bulk encode available: convert component by component.
        components = [component.encode(enc) for component in result]
        return urllib.parse.ParseResult(*components)
    return result.encode(enc)
|
|
|
|
|
|
|
|
|
2016-05-31 06:46:19 +00:00
|
|
|
def parse(url):
    """
    Split a URL into its components and validate them.

    The following is checked:
        - the port is accepted by ``check.is_valid_port``
          (an integer 0-65535)
        - the host is accepted by ``check.is_valid_host``
          (a valid IDNA-encoded hostname with no null-bytes)
        - the path is valid ASCII

    Args:
        url: A URL (as bytes or as unicode)

    Returns:
        A (scheme, host, port, path) tuple. The port is an integer; when the
        URL carries no port (or a falsy one) it defaults to 443 for the
        ``https`` scheme and to 80 otherwise. The path is rebuilt from the
        URL's path, params, query and fragment and always starts with "/".

    Raises:
        ValueError, if the URL is not properly formatted.
    """
    parsed = urllib.parse.urlparse(url)
    if not parsed.hostname:
        raise ValueError("No hostname given")

    if not isinstance(url, bytes):
        # Text input: IDNA-encode the hostname, then switch the whole parse
        # result over to bytes so the remaining code handles a single type.
        host = parsed.hostname.encode("idna")
        parsed = encode_parse_result(parsed, "ascii")
    else:
        # Bytes input is taken as-is. We deliberately do not verify that it
        # decodes as ASCII: we try to be very forgiving here and accept
        # just everything.
        host = parsed.hostname

    # A missing (or zero) port falls back to the scheme's default.
    port = parsed.port or (443 if parsed.scheme == b"https" else 80)

    # Leave scheme and netloc empty so that only the path-and-after portion
    # of the URL is reassembled.
    path_components = (
        b"", b"", parsed.path, parsed.params, parsed.query, parsed.fragment
    )
    full_path = urllib.parse.urlunparse(path_components)
    if not full_path.startswith(b"/"):
        full_path = b"/" + full_path

    if not check.is_valid_host(host):
        raise ValueError("Invalid Host")
    if not check.is_valid_port(port):
        raise ValueError("Invalid Port")

    return parsed.scheme, host, port, full_path
|
|
|
|
|
|
|
|
|
2016-05-31 06:46:19 +00:00
|
|
|
def unparse(scheme, host, port, path=""):
    """
    Returns a URL string, constructed from the specified components.

    Args:
        scheme: The URL scheme, as str.
        host: The host name, as str.
        port: The port number, as an integer. It is omitted from the result
            when ``hostport`` considers it the default for the scheme.
        path: The path, as str. A path of "*" is treated as empty.
    """
    effective_path = "" if path == "*" else path
    netloc = hostport(scheme, host, port)
    return "%s://%s%s" % (scheme, netloc, effective_path)
|
2016-05-31 06:42:56 +00:00
|
|
|
|
|
|
|
|
2016-10-17 03:56:46 +00:00
|
|
|
def encode(s: Sequence[Tuple[str, str]]) -> str:
    """
    Build a urlencoded string from a list of (key, value) tuples.

    Keys and values are encoded with the "surrogateescape" error handler.
    """
    urlencoded = urllib.parse.urlencode(
        s,
        doseq=False,
        errors="surrogateescape",
    )
    return urlencoded
|
2016-05-31 06:42:56 +00:00
|
|
|
|
|
|
|
|
2016-05-31 06:46:19 +00:00
|
|
|
def decode(s):
    """
    Split a urlencoded string into a list of surrogate-escaped
    (key, value) tuples.

    Keys with blank values are kept rather than dropped.
    """
    pairs = urllib.parse.parse_qsl(
        s,
        keep_blank_values=True,
        errors='surrogateescape',
    )
    return pairs
|
2016-07-25 02:06:49 +00:00
|
|
|
|
|
|
|
|
2016-10-17 03:56:46 +00:00
|
|
|
def quote(b: str, safe: str="/") -> str:
    """
    Percent-encode a string, using the "surrogateescape" error handler.

    Args:
        b: The str to quote.
        safe: Characters that must be left unquoted ("/" by default).

    Returns:
        An ascii-encodable str.
    """
    quoted = urllib.parse.quote(b, safe=safe, errors="surrogateescape")
    return quoted
|
2016-07-25 02:06:49 +00:00
|
|
|
|
|
|
|
|
2016-10-17 03:56:46 +00:00
|
|
|
def unquote(s: str) -> str:
    """
    Reverse percent-encoding, using the "surrogateescape" error handler.

    Args:
        s: A surrogate-escaped str

    Returns:
        A surrogate-escaped str
    """
    unquoted = urllib.parse.unquote(s, errors="surrogateescape")
    return unquoted
|
2016-06-07 05:12:52 +00:00
|
|
|
|
|
|
|
|
|
|
|
def hostport(scheme, host, port):
    """
    Returns the host component, with a port specification if needed.

    The port is left out when it is the default for the scheme (80 for
    http, 443 for https; the scheme may be given as str or as bytes).
    Otherwise "host:port" is returned, as bytes if host is bytes and as
    str if not.
    """
    default_pairs = (
        (80, "http"),
        (443, "https"),
        (80, b"http"),
        (443, b"https"),
    )
    if (port, scheme) in default_pairs:
        return host

    # Pick a template of the same string type as the host.
    template = b"%s:%d" if isinstance(host, bytes) else "%s:%d"
    return template % (host, port)
|