from __future__ import (absolute_import, print_function, division)
import urlparse
import binascii
import sys

from . import odict, utils


class HttpError(Exception):
    def __init__(self, code, message):
        super(HttpError, self).__init__(message)
        self.code = code


class HttpErrorConnClosed(HttpError):
    pass


def _is_valid_port(port):
    return 0 <= port <= 65535


def _is_valid_host(host):
    try:
        host.decode("idna")
    except ValueError:
        return False
    if "\0" in host:
        return False
    return True


def parse_url(url):
    """
        Returns a (scheme, host, port, path) tuple, or None on error.

        Checks that:
            port is an integer 0-65535
            host is a valid IDNA-encoded hostname with no null-bytes
            path is valid ASCII
    """
    try:
        scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    except ValueError:
        return None
    if not scheme:
        return None
    if '@' in netloc:
        # FIXME: Consider what to do with the discarded credentials here. Most
        # probably we should extend the signature to return these as a
        # separate value.
        _, netloc = netloc.rsplit('@', 1)
    if ':' in netloc:
        host, port = netloc.rsplit(':', 1)
        try:
            port = int(port)
        except ValueError:
            return None
    else:
        host = netloc
        if scheme == "https":
            port = 443
        else:
            port = 80
    path = urlparse.urlunparse(('', '', path, params, query, fragment))
    if not path.startswith("/"):
        path = "/" + path
    if not _is_valid_host(host):
        return None
    if not utils.isascii(path):
        return None
    if not _is_valid_port(port):
        return None
    return scheme, host, port, path


def read_headers(fp):
    """
        Read a set of headers from a file pointer.
        Stop once a blank line is reached.

        Return an ODictCaseless object, or None if headers are invalid.
    """
    ret = []
    name = ''
    while True:
        line = fp.readline()
        if not line or line == '\r\n' or line == '\n':
            break
        if line[0] in ' \t':
            if not ret:
                return None
            # continued header
            ret[-1][1] = ret[-1][1] + '\r\n ' + line.strip()
        else:
            i = line.find(':')
            # We're being liberal in what we accept, here.
            if i > 0:
                name = line[:i]
                value = line[i + 1:].strip()
                ret.append([name, value])
            else:
                return None
    return odict.ODictCaseless(ret)


def read_chunked(fp, limit, is_request):
    """
        Read a chunked HTTP body.

        May raise HttpError.
    """
    # FIXME: Should check if chunked is the final encoding in the headers
    # http://tools.ietf.org/html/draft-ietf-httpbis-p1-messaging-16#section-3.3
    # 3.3 2.
    total = 0
    code = 400 if is_request else 502
    while True:
        line = fp.readline(128)
        if line == "":
            raise HttpErrorConnClosed(code, "Connection closed prematurely")
        if line != '\r\n' and line != '\n':
            try:
                length = int(line, 16)
            except ValueError:
                raise HttpError(
                    code,
                    "Invalid chunked encoding length: %s" % line
                )
            total += length
            if limit is not None and total > limit:
                msg = "HTTP Body too large. Limit is %s," \
                      " chunked content longer than %s" % (limit, total)
                raise HttpError(code, msg)
            chunk = fp.read(length)
            suffix = fp.readline(5)
            if suffix != '\r\n':
                raise HttpError(code, "Malformed chunked body")
            yield line, chunk, '\r\n'
            if length == 0:
                return


def get_header_tokens(headers, key):
    """
        Retrieve all tokens for a header key. A number of different headers
        follow a pattern where each header line can contain comma-separated
        tokens, and headers can be set multiple times.
    """
    toks = []
    for i in headers[key]:
        for j in i.split(","):
            toks.append(j.strip())
    return toks


def has_chunked_encoding(headers):
    return "chunked" in [
        i.lower() for i in get_header_tokens(headers, "transfer-encoding")
    ]
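# A minimal usage sketch of parse_url above; the URLs and result tuples are
# illustrative values, not taken from a test suite:
#
#   >>> parse_url("http://foo.example:8080/bar?x=1")
#   ('http', 'foo.example', 8080, '/bar?x=1')
#   >>> parse_url("https://foo.example/baz")  # port inferred from the scheme
#   ('https', 'foo.example', 443, '/baz')
#   >>> parse_url("http://foo.example:99999/")  # invalid port, returns None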
def parse_http_protocol(s):
    """
        Parse an HTTP protocol declaration.

        Returns a (major, minor) tuple, or None.
    """
    if not s.startswith("HTTP/"):
        return None
    _, version = s.split('/', 1)
    if "." not in version:
        return None
    major, minor = version.split('.', 1)
    try:
        major = int(major)
        minor = int(minor)
    except ValueError:
        return None
    return major, minor


def parse_http_basic_auth(s):
    words = s.split()
    if len(words) != 2:
        return None
    scheme = words[0]
    try:
        user = binascii.a2b_base64(words[1])
    except binascii.Error:
        return None
    parts = user.split(':')
    if len(parts) != 2:
        return None
    return scheme, parts[0], parts[1]


def assemble_http_basic_auth(scheme, username, password):
    v = binascii.b2a_base64(username + ":" + password)
    return scheme + " " + v


def parse_init(line):
    try:
        method, url, protocol = line.split()
    except ValueError:
        return None
    httpversion = parse_http_protocol(protocol)
    if not httpversion:
        return None
    if not utils.isascii(method):
        return None
    return method, url, httpversion


def parse_init_connect(line):
    """
        Returns (host, port, httpversion) if line is a valid CONNECT line.

        http://tools.ietf.org/html/draft-luotonen-web-proxy-tunneling-01 section 3.1
    """
    v = parse_init(line)
    if not v:
        return None
    method, url, httpversion = v

    if method.upper() != 'CONNECT':
        return None
    try:
        host, port = url.split(":")
    except ValueError:
        return None
    try:
        port = int(port)
    except ValueError:
        return None
    if not _is_valid_port(port):
        return None
    if not _is_valid_host(host):
        return None
    return host, port, httpversion


def parse_init_proxy(line):
    v = parse_init(line)
    if not v:
        return None
    method, url, httpversion = v

    parts = parse_url(url)
    if not parts:
        return None
    scheme, host, port, path = parts
    return method, scheme, host, port, path, httpversion


def parse_init_http(line):
    """
        Returns (method, url, httpversion)
    """
    v = parse_init(line)
    if not v:
        return None
    method, url, httpversion = v
    if not utils.isascii(url):
        return None
    if not (url.startswith("/") or url == "*"):
        return None
    return method, url, httpversion


def connection_close(httpversion, headers):
    """
        Checks the message to see if the client connection should be closed
        according to RFC 2616 Section 8.1.

        Note that a connection should also be closed if the response has been
        read until the end of the stream.
    """
    # At first, check if we have an explicit Connection header.
    if "connection" in headers:
        toks = get_header_tokens(headers, "connection")
        if "close" in toks:
            return True
        elif "keep-alive" in toks:
            return False

    # If we don't have a Connection header, HTTP 1.1 connections are assumed
    # to be persistent.
    if httpversion == (1, 1):
        return False
    return True


def parse_response_line(line):
    parts = line.strip().split(" ", 2)
    if len(parts) == 2:  # handle missing message gracefully
        parts.append("")
    if len(parts) != 3:
        return None
    proto, code, msg = parts
    try:
        code = int(code)
    except ValueError:
        return None
    return (proto, code, msg)
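# A minimal sketch of the request- and response-line parsers above; the lines
# shown are illustrative, not taken from a test suite:
#
#   >>> parse_init_connect("CONNECT example.com:443 HTTP/1.1")
#   ('example.com', 443, (1, 1))
#   >>> parse_response_line("HTTP/1.1 200 OK")
#   ('HTTP/1.1', 200, 'OK')
#   >>> parse_response_line("HTTP/1.1 204")  # missing message is tolerated
#   ('HTTP/1.1', 204, '')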
def read_response(rfile, request_method, body_size_limit, include_body=True):
    """
        Return an (httpversion, code, msg, headers, content) tuple.

        By default, both the response header and body are read.
        If include_body=False is specified, content may be one of the
        following:
        - None, if the response is technically allowed to have a response body
        - "", if the response must not have a response body
          (e.g. it's a response to a HEAD request)
    """
    line = rfile.readline()
    if line == "\r\n" or line == "\n":
        # Possible leftover from previous message
        line = rfile.readline()
    if not line:
        raise HttpErrorConnClosed(502, "Server disconnect.")
    parts = parse_response_line(line)
    if not parts:
        raise HttpError(502, "Invalid server response: %s" % repr(line))
    proto, code, msg = parts
    httpversion = parse_http_protocol(proto)
    if httpversion is None:
        raise HttpError(502, "Invalid HTTP version in line: %s" % repr(proto))
    headers = read_headers(rfile)
    if headers is None:
        raise HttpError(502, "Invalid headers.")

    if include_body:
        content = read_http_body(
            rfile, headers, body_size_limit, request_method, code, False
        )
    else:
        # if include_body==False then a None content means the body should be
        # read separately
        content = None

    return httpversion, code, msg, headers, content


def read_http_body(*args, **kwargs):
    return "".join(
        content for _, content, _ in read_http_body_chunked(*args, **kwargs)
    )


def read_http_body_chunked(
        rfile,
        headers,
        limit,
        request_method,
        response_code,
        is_request,
        max_chunk_size=None):
    """
        Read an HTTP message body:

            rfile: A file descriptor to read from
            headers: An ODictCaseless object
            limit: Size limit
            is_request: True if the body to read belongs to a request,
            False otherwise
    """
    if max_chunk_size is None:
        max_chunk_size = limit or sys.maxint

    expected_size = expected_http_body_size(
        headers, is_request, request_method, response_code
    )

    if expected_size is None:
        if has_chunked_encoding(headers):
            # Python 3: yield from
            for x in read_chunked(rfile, limit, is_request):
                yield x
        else:  # pragma: nocover
            raise HttpError(
                400 if is_request else 502,
                "Content-Length unknown but no chunked encoding"
            )
    elif expected_size >= 0:
        if limit is not None and expected_size > limit:
            raise HttpError(
                400 if is_request else 509,
                "HTTP Body too large. Limit is %s, content-length was %s" % (
                    limit, expected_size
                )
            )
        bytes_left = expected_size
        while bytes_left:
            chunk_size = min(bytes_left, max_chunk_size)
            yield "", rfile.read(chunk_size), ""
            bytes_left -= chunk_size
    else:
        bytes_left = limit or -1
        while bytes_left:
            chunk_size = min(bytes_left, max_chunk_size)
            content = rfile.read(chunk_size)
            if not content:
                return
            yield "", content, ""
            bytes_left -= chunk_size
        not_done = rfile.read(1)
        if not_done:
            raise HttpError(
                400 if is_request else 509,
                "HTTP Body too large. Limit is %s," % limit
            )


def expected_http_body_size(headers, is_request, request_method, response_code):
    """
        Returns the expected body length:
        - a positive integer, if the size is known in advance
        - None, if the size is unknown in advance (chunked encoding or
          invalid data)
        - -1, if all data should be read until the end of the stream

        May raise HttpError.
    """
    # Determine response size according to
    # http://tools.ietf.org/html/rfc7230#section-3.3
    if request_method:
        request_method = request_method.upper()

    if (not is_request and (
            request_method == "HEAD" or
            (request_method == "CONNECT" and response_code == 200) or
            response_code in [204, 304] or
            100 <= response_code <= 199)):
        return 0
    if has_chunked_encoding(headers):
        return None
    if "content-length" in headers:
        try:
            size = int(headers["content-length"][0])
            if size < 0:
                raise ValueError()
            return size
        except ValueError:
            return None
    if is_request:
        return 0
    return -1
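# A minimal sketch of how the body helpers above fit together. It assumes
# ODictCaseless accepts a list of [name, value] pairs (as read_headers builds
# above); the header and payload are illustrative values:
#
#   >>> from cStringIO import StringIO
#   >>> hdrs = odict.ODictCaseless([["Content-Length", "5"]])
#   >>> expected_http_body_size(hdrs, False, "GET", 200)
#   5
#   >>> read_http_body(StringIO("hello"), hdrs, None, "GET", 200, False)
#   'hello'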