Merge pull request #3464 from rjt-gupta/url-fix

Non ascii characters in url
This commit is contained in:
Thomas Kriechbaumer 2019-09-28 11:46:58 +02:00 committed by GitHub
commit 7d60dde76c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 6 deletions

View File

@ -21,16 +21,25 @@ def parse(url):
Raises:
ValueError, if the URL is not properly formatted.
"""
parsed = urllib.parse.urlparse(url)
# An ASCII character encodes to exactly one byte, so an all-ASCII string has
# as many bytes after encoding as it has characters. Any non-ASCII character
# encodes to more than one byte, which makes the encoded form longer.
def ascii_check(l):
    """Report whether *l* is as long in characters as in encoded bytes.

    The text form of *l* is encoded with ``str.encode()`` and its byte
    count is compared against ``len(l)``.

    Returns:
        True when both lengths match, False otherwise.
    """
    char_count = len(l)
    byte_count = len(str(l).encode())
    return char_count == byte_count
if isinstance(url, bytes):
url = url.decode()
if not ascii_check(url):
url = urllib.parse.urlsplit(url)
url = list(url)
url[3] = urllib.parse.quote(url[3])
url = urllib.parse.urlunsplit(url)
parsed = urllib.parse.urlparse(url)
if not parsed.hostname:
raise ValueError("No hostname given")
if isinstance(url, bytes):
host = parsed.hostname
# this should not raise a ValueError,
# but we try to be very forgiving here and accept just everything.
else:
host = parsed.hostname.encode("idna")
if isinstance(parsed, urllib.parse.ParseResult):

View File

@ -49,6 +49,17 @@ def test_parse():
url.parse('http://lo[calhost')
def test_ascii_check():
    """Parse a bytes URL with non-ASCII query text and check each returned part."""
    raw_url = "https://xyz.tax-edu.net?flag=selectCourse&lc_id=42825&lc_name=茅莽莽猫氓猫氓".encode()
    # The expected path carries the query percent-encoded, including '=' and '&'.
    expected_path = (
        b'/?flag%3DselectCourse%26lc_id%3D42825%26lc_name%3D%E8%8C%85%E8%8E%BD%E8%8E'
        b'%BD%E7%8C%AB%E6%B0%93%E7%8C%AB%E6%B0%93'
    )

    scheme, host, port, full_path = url.parse(raw_url)

    assert scheme == b'https'
    assert host == b'xyz.tax-edu.net'
    assert port == 443
    assert full_path == expected_path
@pytest.mark.skipif(sys.version_info < (3, 6), reason='requires Python 3.6 or higher')
def test_parse_port_range():
# Port out of range
@ -61,6 +72,7 @@ def test_unparse():
assert url.unparse("http", "foo.com", 80, "/bar") == "http://foo.com/bar"
assert url.unparse("https", "foo.com", 80, "") == "https://foo.com:80"
assert url.unparse("https", "foo.com", 443, "") == "https://foo.com"
assert url.unparse("https", "foo.com", 443, "*") == "https://foo.com"
# We ignore the byte 126: '~' because of an incompatibility in Python 3.6 and 3.7
@ -131,3 +143,7 @@ def test_unquote():
assert url.unquote("foo") == "foo"
assert url.unquote("foo%20bar") == "foo bar"
assert url.unquote(surrogates_quoted) == surrogates
def test_hostport():
    """Check that hostport renders https, foo.com and port 8080 as b"foo.com:8080"."""
    rendered = url.hostport(b"https", b"foo.com", 8080)
    assert rendered == b"foo.com:8080"