From 6f96da08c9d838e4ead43425440d6120d4a02d0f Mon Sep 17 00:00:00 2001 From: Shadab Zafar Date: Wed, 17 Feb 2016 08:48:59 +0530 Subject: [PATCH 1/4] Handle port numbers in host header from: https://github.com/mitmproxy/netlib/pull/121 --- netlib/netlib/http/request.py | 26 +++++++++++++++++++++++--- test/netlib/http/test_request.py | 4 ++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/netlib/netlib/http/request.py b/netlib/netlib/http/request.py index b9076c0f2..2be3f2371 100644 --- a/netlib/netlib/http/request.py +++ b/netlib/netlib/http/request.py @@ -1,5 +1,6 @@ from __future__ import absolute_import, print_function, division +import re import warnings import six @@ -12,6 +13,10 @@ from .. import encoding from .headers import Headers from .message import Message, _native, _always_bytes, MessageData +# This regex extracts & splits the host header into host and port. +# Handles the edge case of IPv6 addresses containing colons. +# https://bugzilla.mozilla.org/show_bug.cgi?id=45891 +host_header_re = re.compile(r"^(?P<host>[^:]+|\[.+\])(?::(?P<port>\d+))?$") class RequestData(MessageData): def __init__(self, first_line_format, method, scheme, host, port, path, http_version, headers=None, content=None, @@ -159,6 +164,18 @@ class Request(Message): def url(self, url): self.scheme, self.host, self.port, self.path = utils.parse_url(url) + def _parse_host_header(self): + """Extract the host and port from Host header""" + if "host" not in self.headers: + return None, None + host, port = self.headers["host"], None + m = host_header_re.match(host) + if m: + host = m.group("host").strip("[]") + if m.group("port"): + port = int(m.group("port")) + return host, port + @property def pretty_host(self): """ @@ -166,16 +183,19 @@ class Request(Message): This is useful in transparent mode where :py:attr:`host` is only an IP address, but may not reflect the actual destination as the Host header could be spoofed.
""" - return self.headers.get("host", self.host) + return self._parse_host_header()[0] or self.host @property def pretty_url(self): """ Like :py:attr:`url`, but using :py:attr:`pretty_host` instead of :py:attr:`host`. """ + host, port = self._parse_host_header() + host = host or self.host + port = port or self.port if self.first_line_format == "authority": - return "%s:%d" % (self.pretty_host, self.port) - return utils.unparse_url(self.scheme, self.pretty_host, self.port, self.path) + return "%s:%d" % (host, port) + return utils.unparse_url(self.scheme, host, port, self.path) @property def query(self): diff --git a/test/netlib/http/test_request.py b/test/netlib/http/test_request.py index b4ecfd4ee..4f8a34c22 100644 --- a/test/netlib/http/test_request.py +++ b/test/netlib/http/test_request.py @@ -106,6 +106,8 @@ class TestRequestUtils(object): request = treq() assert request.pretty_host == "address" assert request.host == "address" + request.headers["host"] = "other:22" + assert request.pretty_host == "other" request.headers["host"] = "other" assert request.pretty_host == "other" assert request.host == "address" @@ -123,6 +125,8 @@ class TestRequestUtils(object): assert request.pretty_url == "http://address:22/path" request.headers["host"] = "other" assert request.pretty_url == "http://other:22/path" + request.headers["host"] = "other:33" + assert request.pretty_url == "http://other:33/path" def test_pretty_url_authority(self): request = treq(first_line_format="authority") From 175109e44e419dcc1792d8f7171782448c5c3ea4 Mon Sep 17 00:00:00 2001 From: Shadab Zafar Date: Thu, 18 Feb 2016 07:01:52 +0530 Subject: [PATCH 2/4] Use host header values only when the ports match --- netlib/netlib/http/request.py | 7 ++++++- test/netlib/http/test_request.py | 8 ++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/netlib/netlib/http/request.py b/netlib/netlib/http/request.py index 2be3f2371..f53678b7a 100644 --- a/netlib/netlib/http/request.py +++ 
b/netlib/netlib/http/request.py @@ -183,7 +183,12 @@ class Request(Message): This is useful in transparent mode where :py:attr:`host` is only an IP address, but may not reflect the actual destination as the Host header could be spoofed. """ - return self._parse_host_header()[0] or self.host + host, port = self._parse_host_header() + if not host: + return self.host + if not port: + port = 443 if self.scheme == 'https' else 80 + return host if port == self.port else self.host @property def pretty_url(self): diff --git a/test/netlib/http/test_request.py b/test/netlib/http/test_request.py index 4f8a34c22..350b54056 100644 --- a/test/netlib/http/test_request.py +++ b/test/netlib/http/test_request.py @@ -104,19 +104,23 @@ class TestRequestUtils(object): def test_pretty_host(self): request = treq() + # Without host header assert request.pretty_host == "address" assert request.host == "address" + # Same port as self.port (22) request.headers["host"] = "other:22" assert request.pretty_host == "other" + # Different Ports request.headers["host"] = "other" - assert request.pretty_host == "other" + assert request.pretty_host == "address" assert request.host == "address" + # Empty host request.host = None assert request.pretty_host is None assert request.host is None # Invalid IDNA - request.headers["host"] = ".disqus.com" + request.headers["host"] = ".disqus.com:22" assert request.pretty_host == ".disqus.com" def test_pretty_url(self): From 9dc12242822d0361cdf25ba2a2bb6e17dbd35eb6 Mon Sep 17 00:00:00 2001 From: Shadab Zafar Date: Thu, 18 Feb 2016 20:09:28 +0530 Subject: [PATCH 3/4] Incorporate comments made during review --- netlib/netlib/http/request.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/netlib/netlib/http/request.py b/netlib/netlib/http/request.py index f53678b7a..99662732e 100644 --- a/netlib/netlib/http/request.py +++ b/netlib/netlib/http/request.py @@ -188,6 +188,7 @@ class Request(Message): return self.host if not port: port = 443 if 
self.scheme == 'https' else 80 + # Prefer the original address if host header has an unexpected form return host if port == self.port else self.host @property @@ -195,12 +196,9 @@ class Request(Message): """ Like :py:attr:`url`, but using :py:attr:`pretty_host` instead of :py:attr:`host`. """ - host, port = self._parse_host_header() - host = host or self.host - port = port or self.port if self.first_line_format == "authority": - return "%s:%d" % (host, port) - return utils.unparse_url(self.scheme, host, port, self.path) + return "%s:%d" % (self.pretty_host, self.port) + return utils.unparse_url(self.scheme, self.pretty_host, self.port, self.path) @property def query(self): From d6ab9901d1f4d330a624b1a41d86d8d03c910b7a Mon Sep 17 00:00:00 2001 From: Shadab Zafar Date: Thu, 18 Feb 2016 21:58:32 +0530 Subject: [PATCH 4/4] Fixup more tests --- test/mitmproxy/test_flow.py | 2 +- test/netlib/http/test_request.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test/mitmproxy/test_flow.py b/test/mitmproxy/test_flow.py index a593c8c3a..13ced2651 100644 --- a/test/mitmproxy/test_flow.py +++ b/test/mitmproxy/test_flow.py @@ -1035,7 +1035,7 @@ class TestRequest: assert r.url == "https://address:22/path" assert r.pretty_url == "https://address:22/path" - r.headers["Host"] = "foo.com" + r.headers["Host"] = "foo.com:22" assert r.url == "https://address:22/path" assert r.pretty_url == "https://foo.com:22/path" diff --git a/test/netlib/http/test_request.py b/test/netlib/http/test_request.py index 350b54056..5672259ed 100644 --- a/test/netlib/http/test_request.py +++ b/test/netlib/http/test_request.py @@ -110,7 +110,7 @@ class TestRequestUtils(object): # Same port as self.port (22) request.headers["host"] = "other:22" assert request.pretty_host == "other" - # Different Ports + # Different ports request.headers["host"] = "other" assert request.pretty_host == "address" assert request.host == "address" @@ -125,12 +125,15 @@ class TestRequestUtils(object): 
def test_pretty_url(self): request = treq() + # Without host header assert request.url == "http://address:22/path" assert request.pretty_url == "http://address:22/path" - request.headers["host"] = "other" + # Same port as self.port (22) + request.headers["host"] = "other:22" assert request.pretty_url == "http://other:22/path" - request.headers["host"] = "other:33" - assert request.pretty_url == "http://other:33/path" + # Different ports + request.headers["host"] = "other" + assert request.pretty_url == "http://address:22/path" def test_pretty_url_authority(self): request = treq(first_line_format="authority")