Merge pull request #1464 from dufferzafar/har

HAR
Thomas Kriechbaumer 2016-08-15 12:49:28 +02:00 committed by GitHub
commit 2419ab153d
6 changed files with 409 additions and 306 deletions

examples/har_dump.py (new file, 216 lines)

@@ -0,0 +1,216 @@
"""
This inline script can be used to dump flows as HAR files.
"""
import pprint
import json
import sys
import base64
import zlib
from datetime import datetime
import pytz
import mitmproxy
from netlib import version
from netlib import strutils
from netlib.http import cookies
HAR = {}
# A set of the servers seen so far is maintained so we can avoid
# counting 'connect' time for entries that reuse an existing connection.
SERVERS_SEEN = set()
def start():
"""
Called once on script startup before any other events.
"""
if len(sys.argv) != 2:
raise ValueError(
'Usage: -s "har_dump.py filename" '
'(- will output to stdout, filenames ending with .zhar '
'will result in compressed har)'
)
HAR.update({
"log": {
"version": "1.2",
"creator": {
"name": "mitmproxy har_dump",
"version": "0.1",
"comment": "mitmproxy version %s" % version.MITMPROXY
},
"entries": []
}
})
def response(flow):
"""
Called when a server response has been received.
"""
    # -1 indicates that these values do not apply to the current request
ssl_time = -1
connect_time = -1
if flow.server_conn and flow.server_conn not in SERVERS_SEEN:
connect_time = (flow.server_conn.timestamp_tcp_setup -
flow.server_conn.timestamp_start)
if flow.server_conn.timestamp_ssl_setup is not None:
ssl_time = (flow.server_conn.timestamp_ssl_setup -
flow.server_conn.timestamp_tcp_setup)
SERVERS_SEEN.add(flow.server_conn)
    # Calculate raw timings from timestamps. DNS timings cannot be calculated
    # for lack of a way to measure them. The same goes for HAR 'blocked':
    # mitmproxy opens a server connection as soon as it receives the host and
    # port from the client connection, so the time spent waiting is actually
    # the interval between request.timestamp_end and response.timestamp_start,
    # which correlates to HAR 'wait' instead.
timings_raw = {
'send': flow.request.timestamp_end - flow.request.timestamp_start,
'receive': flow.response.timestamp_end - flow.response.timestamp_start,
'wait': flow.response.timestamp_start - flow.request.timestamp_end,
'connect': connect_time,
'ssl': ssl_time,
}
# HAR timings are integers in ms, so we re-encode the raw timings to that format.
timings = dict([(k, int(1000 * v)) for k, v in timings_raw.items()])
# full_time is the sum of all timings.
# Timings set to -1 will be ignored as per spec.
full_time = sum(v for v in timings.values() if v > -1)
started_date_time = format_datetime(datetime.utcfromtimestamp(flow.request.timestamp_start))
# Response body size and encoding
response_body_size = len(flow.response.raw_content)
response_body_decoded_size = len(flow.response.content)
response_body_compression = response_body_decoded_size - response_body_size
entry = {
"startedDateTime": started_date_time,
"time": full_time,
"request": {
"method": flow.request.method,
"url": flow.request.url,
"httpVersion": flow.request.http_version,
"cookies": format_request_cookies(flow.request.cookies.fields),
"headers": name_value(flow.request.headers),
"queryString": name_value(flow.request.query or {}),
"headersSize": len(str(flow.request.headers)),
"bodySize": len(flow.request.content),
},
"response": {
"status": flow.response.status_code,
"statusText": flow.response.reason,
"httpVersion": flow.response.http_version,
"cookies": format_response_cookies(flow.response.cookies.fields),
"headers": name_value(flow.response.headers),
"content": {
"size": response_body_size,
"compression": response_body_compression,
"mimeType": flow.response.headers.get('Content-Type', '')
},
"redirectURL": flow.response.headers.get('Location', ''),
"headersSize": len(str(flow.response.headers)),
"bodySize": response_body_size,
},
"cache": {},
"timings": timings,
}
    # Store binary data as base64
if strutils.is_mostly_bin(flow.response.content):
b64 = base64.b64encode(flow.response.content)
entry["response"]["content"]["text"] = b64.decode('ascii')
entry["response"]["content"]["encoding"] = "base64"
else:
entry["response"]["content"]["text"] = flow.response.text
if flow.request.method in ["POST", "PUT", "PATCH"]:
entry["request"]["postData"] = {
"mimeType": flow.request.headers.get("Content-Type", "").split(";")[0],
"text": flow.request.content,
"params": name_value(flow.request.urlencoded_form)
}
if flow.server_conn:
entry["serverIPAddress"] = str(flow.server_conn.ip_address.address[0])
HAR["log"]["entries"].append(entry)
def done():
"""
Called once on script shutdown, after any other events.
"""
dump_file = sys.argv[1]
if dump_file == '-':
mitmproxy.ctx.log(pprint.pformat(HAR))
else:
json_dump = json.dumps(HAR, indent=2)
if dump_file.endswith('.zhar'):
json_dump = zlib.compress(json_dump, 9)
with open(dump_file, "w") as f:
f.write(json_dump)
mitmproxy.ctx.log("HAR dump finished (wrote %s bytes to file)" % len(json_dump))
def format_datetime(dt):
return dt.replace(tzinfo=pytz.timezone("UTC")).isoformat()
def format_cookies(cookie_list):
rv = []
for name, value, attrs in cookie_list:
cookie_har = {
"name": name,
"value": value,
}
# HAR only needs some attributes
for key in ["path", "domain", "comment"]:
if key in attrs:
cookie_har[key] = attrs[key]
# These keys need to be boolean!
for key in ["httpOnly", "secure"]:
cookie_har[key] = bool(key in attrs)
# Expiration time needs to be formatted
expire_ts = cookies.get_expiration_ts(attrs)
if expire_ts is not None:
cookie_har["expires"] = format_datetime(datetime.fromtimestamp(expire_ts))
rv.append(cookie_har)
return rv
def format_request_cookies(fields):
return format_cookies(cookies.group_cookies(fields))
def format_response_cookies(fields):
return format_cookies((c[0], c[1].value, c[1].attrs) for c in fields)
def name_value(obj):
"""
Convert (key, value) pairs to HAR format.
"""
return [{"name": k, "value": v} for k, v in obj.items()]
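For orientation, the script above is loaded through mitmproxy's -s flag, as its own usage message in start() indicates. The following is a minimal, illustrative sketch of how the generated file might be consumed afterwards; the mitmdump invocation and the output path are assumptions made for this example, not part of the commit.

# Assumed invocation, mirroring the usage message in start():
#   mitmdump -s "har_dump.py /tmp/dump.har"
import json

with open("/tmp/dump.har") as f:  # example path, not fixed by the script
    har = json.load(f)

# Each entry carries the request/response pair plus timings in milliseconds.
for entry in har["log"]["entries"]:
    req, resp = entry["request"], entry["response"]
    print(req["method"], req["url"], resp["status"], "%dms" % entry["time"])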

examples/har_extractor.py (deleted, 264 lines)

@@ -1,264 +0,0 @@
"""
This inline script utilizes harparser.HAR from
https://github.com/JustusW/harparser to generate a HAR log object.
"""
import mitmproxy.ctx
import six
import sys
import pytz
from harparser import HAR
from datetime import datetime
class _HARLog(HAR.log):
# The attributes need to be registered here for them to actually be
# available later via self. This is due to HAREncodable linking __getattr__
# to __getitem__. Anything that is set only in __init__ will just be added
# as key/value pair to self.__classes__.
__page_list__ = []
__page_count__ = 0
__page_ref__ = {}
def __init__(self, page_list=[]):
self.__page_list__ = page_list
self.__page_count__ = 0
self.__page_ref__ = {}
HAR.log.__init__(self, {"version": "1.2",
"creator": {"name": "MITMPROXY HARExtractor",
"version": "0.1",
"comment": ""},
"pages": [],
"entries": []})
def reset(self):
self.__init__(self.__page_list__)
def add(self, obj):
if isinstance(obj, HAR.pages):
self['pages'].append(obj)
if isinstance(obj, HAR.entries):
self['entries'].append(obj)
def create_page_id(self):
self.__page_count__ += 1
return "autopage_%s" % str(self.__page_count__)
def set_page_ref(self, page, ref):
self.__page_ref__[page] = ref
def get_page_ref(self, page):
return self.__page_ref__.get(page, None)
def get_page_list(self):
return self.__page_list__
class Context(object):
pass
context = Context()
def start():
"""
    On start we create a HARLog instance. You will have to adapt this to
    suit your actual needs of HAR generation, as it will probably be
    necessary to cluster logs by IP or reset them from time to time.
"""
if sys.version_info >= (3, 0):
raise RuntimeError(
"har_extractor.py does not work on Python 3. "
"Please check out https://github.com/mitmproxy/mitmproxy/issues/1320 "
"if you want to help making this work again."
)
context.dump_file = None
if len(sys.argv) > 1:
context.dump_file = sys.argv[1]
else:
raise ValueError(
'Usage: -s "har_extractor.py filename" '
'(- will output to stdout, filenames ending with .zhar '
'will result in compressed har)'
)
context.HARLog = _HARLog()
context.seen_server = set()
def response(flow):
"""
Called when a server response has been received. At the time of this
message both a request and a response are present and completely done.
"""
# Values are converted from float seconds to int milliseconds later.
ssl_time = -.001
connect_time = -.001
if flow.server_conn not in context.seen_server:
# Calculate the connect_time for this server_conn. Afterwards add it to
# seen list, in order to avoid the connect_time being present in entries
# that use an existing connection.
connect_time = (flow.server_conn.timestamp_tcp_setup -
flow.server_conn.timestamp_start)
context.seen_server.add(flow.server_conn)
if flow.server_conn.timestamp_ssl_setup is not None:
# Get the ssl_time for this server_conn as the difference between
# the start of the successful tcp setup and the successful ssl
# setup. If no ssl setup has been made it is left as -1 since it
# doesn't apply to this connection.
ssl_time = (flow.server_conn.timestamp_ssl_setup -
flow.server_conn.timestamp_tcp_setup)
# Calculate the raw timings from the different timestamps present in the
# request and response object. For lack of a way to measure it dns timings
# can not be calculated. The same goes for HAR blocked: MITMProxy will open
# a server connection as soon as it receives the host and port from the
# client connection. So the time spent waiting is actually spent waiting
# between request.timestamp_end and response.timestamp_start thus it
# correlates to HAR wait instead.
timings_raw = {
'send': flow.request.timestamp_end - flow.request.timestamp_start,
'wait': flow.response.timestamp_start - flow.request.timestamp_end,
'receive': flow.response.timestamp_end - flow.response.timestamp_start,
'connect': connect_time,
'ssl': ssl_time
}
# HAR timings are integers in ms, so we have to re-encode the raw timings to
# that format.
timings = dict([(k, int(1000 * v)) for k, v in six.iteritems(timings_raw)])
# The full_time is the sum of all timings.
# Timings set to -1 will be ignored as per spec.
full_time = sum(v for v in timings.values() if v > -1)
started_date_time = datetime.utcfromtimestamp(
flow.request.timestamp_start).replace(tzinfo=pytz.timezone("UTC")).isoformat()
request_query_string = [{"name": k, "value": v}
for k, v in flow.request.query or {}]
response_body_size = len(flow.response.content)
response_body_decoded_size = len(flow.response.content)
response_body_compression = response_body_decoded_size - response_body_size
entry = HAR.entries({
"startedDateTime": started_date_time,
"time": full_time,
"request": {
"method": flow.request.method,
"url": flow.request.url,
"httpVersion": flow.request.http_version,
"cookies": format_cookies(flow.request.cookies),
"headers": format_headers(flow.request.headers),
"queryString": request_query_string,
"headersSize": len(str(flow.request.headers)),
"bodySize": len(flow.request.content),
},
"response": {
"status": flow.response.status_code,
"statusText": flow.response.reason,
"httpVersion": flow.response.http_version,
"cookies": format_cookies(flow.response.cookies),
"headers": format_headers(flow.response.headers),
"content": {
"size": response_body_size,
"compression": response_body_compression,
"mimeType": flow.response.headers.get('Content-Type', '')
},
"redirectURL": flow.response.headers.get('Location', ''),
"headersSize": len(str(flow.response.headers)),
"bodySize": response_body_size,
},
"cache": {},
"timings": timings,
})
# If the current url is in the page list of context.HARLog or
# does not have a referrer, we add it as a new pages object.
is_new_page = (
flow.request.url in context.HARLog.get_page_list() or
flow.request.headers.get('Referer') is None
)
if is_new_page:
page_id = context.HARLog.create_page_id()
context.HARLog.add(
HAR.pages({
"startedDateTime": entry['startedDateTime'],
"id": page_id,
"title": flow.request.url,
"pageTimings": {}
})
)
context.HARLog.set_page_ref(flow.request.url, page_id)
entry['pageref'] = page_id
    # Look up the referer in the page_ref of context.HARLog to point this
    # entry's pageref attribute to the right pages object, then set it as a
    # new reference to build a reference tree.
elif context.HARLog.get_page_ref(flow.request.headers.get('Referer')) is not None:
entry['pageref'] = context.HARLog.get_page_ref(
flow.request.headers['Referer']
)
context.HARLog.set_page_ref(
flow.request.headers['Referer'], entry['pageref']
)
context.HARLog.add(entry)
def done():
"""
Called once on script shutdown, after any other events.
"""
import pprint
import json
json_dump = context.HARLog.json()
compressed_json_dump = context.HARLog.compress()
if context.dump_file == '-':
mitmproxy.ctx.log(pprint.pformat(json.loads(json_dump)))
elif context.dump_file.endswith('.zhar'):
with open(context.dump_file, "wb") as f:
f.write(compressed_json_dump)
else:
with open(context.dump_file, "wb") as f:
f.write(json_dump)
mitmproxy.ctx.log(
"HAR log finished with %s bytes (%s bytes compressed)" % (
len(json_dump), len(compressed_json_dump)
)
)
mitmproxy.ctx.log(
"Compression rate is %s%%" % str(
100. * len(compressed_json_dump) / len(json_dump)
)
)
def format_cookies(obj):
if obj:
return [{"name": k.strip(), "value": v[0]} for k, v in obj.items()]
return ""
def format_headers(obj):
if obj:
return [{"name": k, "value": v} for k, v in obj.fields]
return ""
def print_attributes(obj, filter_string=None, hide_privates=False):
"""
Useful helper method to quickly get all attributes of an object and its
values.
"""
for attr in dir(obj):
if hide_privates and "__" in attr:
continue
if filter_string is not None and filter_string not in attr:
continue
value = getattr(obj, attr)
print("%s.%s" % ('obj', attr), value, type(value))

netlib/http/cookies.py

@@ -26,6 +26,12 @@ variants. Serialization follows RFC6265.
     http://tools.ietf.org/html/rfc2965
 """

+_cookie_params = set((
+    'expires', 'path', 'comment', 'max-age',
+    'secure', 'httponly', 'version',
+))
+
 # TODO: Disallow LHS-only Cookie values
@@ -263,6 +269,32 @@ def refresh_set_cookie_header(c, delta):
     return ret


+def get_expiration_ts(cookie_attrs):
+    """
+    Determines the time at which the cookie will expire,
+    considering both the 'expires' and 'max-age' attributes.
+
+    Returns: the timestamp at which the cookie expires,
+             or None if no expiration time is set.
+    """
+    if 'expires' in cookie_attrs:
+        e = email.utils.parsedate_tz(cookie_attrs["expires"])
+        if e:
+            return email.utils.mktime_tz(e)
+
+    elif 'max-age' in cookie_attrs:
+        try:
+            max_age = int(cookie_attrs['Max-Age'])
+        except ValueError:
+            pass
+        else:
+            now_ts = time.time()
+            return now_ts + max_age
+
+    return None
+
+
 def is_expired(cookie_attrs):
     """
     Determines whether a cookie has expired.
@@ -270,20 +302,36 @@ def is_expired(cookie_attrs):
     Returns: boolean
     """
-    # See if 'expires' time is in the past
-    expires = False
-    if 'expires' in cookie_attrs:
-        e = email.utils.parsedate_tz(cookie_attrs["expires"])
-        if e:
-            exp_ts = email.utils.mktime_tz(e)
-            now_ts = time.time()
-            expires = exp_ts < now_ts
-
-    # or if Max-Age is 0
-    max_age = False
-    try:
-        max_age = int(cookie_attrs.get('Max-Age', 1)) == 0
-    except ValueError:
-        pass
-
-    return expires or max_age
+    exp_ts = get_expiration_ts(cookie_attrs)
+    now_ts = time.time()
+
+    # If no expiration information was provided with the cookie
+    if exp_ts is None:
+        return False
+    else:
+        return exp_ts <= now_ts
def group_cookies(pairs):
"""
Converts a list of pairs to a (name, value, attrs) for each cookie.
"""
if not pairs:
return []
cookie_list = []
# First pair is always a new cookie
name, value = pairs[0]
attrs = []
for k, v in pairs[1:]:
if k.lower() in _cookie_params:
attrs.append((k, v))
else:
cookie_list.append((name, value, CookieAttrs(attrs)))
name, value, attrs = k, v, []
cookie_list.append((name, value, CookieAttrs(attrs)))
return cookie_list
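Taken together, the new helpers turn a raw Cookie header into per-cookie attribute groups and an absolute expiry timestamp, which is what format_request_cookies() in examples/har_dump.py builds on. A small usage sketch, with a made-up header value:

from netlib.http import cookies

pairs = cookies.parse_cookie_header("one=uno; Path=/; foo=bar; Max-Age=3600")
for name, value, attrs in cookies.group_cookies(pairs):
    # get_expiration_ts() returns None when neither Expires nor Max-Age is set
    print(name, value, cookies.get_expiration_ts(attrs))
# one uno None
# foo bar <current timestamp + 3600>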

setup.py

@@ -119,7 +119,6 @@ setup(
         ],
         'examples': [
             "beautifulsoup4>=4.4.1, <4.6",
-            "harparser>=0.2, <0.3",
             "pytz>=2015.07.0, <=2016.6.1",
         ]
     }

test/mitmproxy/test_examples.py

@@ -1,16 +1,20 @@
 import json
+import os
 import six
-import sys
-import os.path

-from mitmproxy.flow import master
-from mitmproxy.flow import state
 from mitmproxy import options
 from mitmproxy import contentviews
 from mitmproxy.builtins import script
+from mitmproxy.flow import master
+from mitmproxy.flow import state
 import netlib.utils
 from netlib import tutils as netutils
 from netlib.http import Headers
+from netlib.http import cookies

 from . import tutils, mastertest

 example_dir = netlib.utils.Data(__name__).push("../../examples")
@@ -98,30 +102,66 @@ class TestScripts(mastertest.MasterTest):
         m.request(f)
         assert f.request.host == "mitmproxy.org"

-    def test_har_extractor(self):
-        if sys.version_info >= (3, 0):
-            with tutils.raises("does not work on Python 3"):
-                tscript("har_extractor.py")
-            return
+
+class TestHARDump():
+
+    def flow(self, resp_content=b'message'):
+        times = dict(
+            timestamp_start=746203272,
+            timestamp_end=746203272,
+        )
+
+        # Create a dummy flow for testing
+        return tutils.tflow(
+            req=netutils.treq(method=b'GET', **times),
+            resp=netutils.tresp(content=resp_content, **times)
+        )
+
+    def test_no_file_arg(self):
         with tutils.raises(ScriptError):
-            tscript("har_extractor.py")
+            tscript("har_dump.py")

+    def test_simple(self):
         with tutils.tmpdir() as tdir:
-            times = dict(
-                timestamp_start=746203272,
-                timestamp_end=746203272,
-            )
-            path = os.path.join(tdir, "file")
-            m, sc = tscript("har_extractor.py", six.moves.shlex_quote(path))
-            f = tutils.tflow(
-                req=netutils.treq(**times),
-                resp=netutils.tresp(**times)
-            )
-            m.response(f)
+            path = os.path.join(tdir, "somefile")
+
+            m, sc = tscript("har_dump.py", six.moves.shlex_quote(path))
+            m.addons.invoke(m, "response", self.flow())
             m.addons.remove(sc)
-            with open(path, "rb") as f:
-                test_data = json.load(f)
-                assert len(test_data["log"]["pages"]) == 1
+
+            with open(path, "r") as inp:
+                har = json.load(inp)
+
+            assert len(har["log"]["entries"]) == 1
+
+    def test_base64(self):
+        with tutils.tmpdir() as tdir:
+            path = os.path.join(tdir, "somefile")
+
+            m, sc = tscript("har_dump.py", six.moves.shlex_quote(path))
+            m.addons.invoke(m, "response", self.flow(resp_content=b"foo" + b"\xFF" * 10))
+            m.addons.remove(sc)
+
+            with open(path, "r") as inp:
+                har = json.load(inp)
+
+            assert har["log"]["entries"][0]["response"]["content"]["encoding"] == "base64"
+
+    def test_format_cookies(self):
+        m, sc = tscript("har_dump.py", "-")
+        format_cookies = sc.ns.ns["format_cookies"]
+
+        CA = cookies.CookieAttrs
+
+        f = format_cookies([("n", "v", CA([("k", "v")]))])[0]
+        assert f['name'] == "n"
+        assert f['value'] == "v"
+        assert not f['httpOnly']
+        assert not f['secure']
+
+        f = format_cookies([("n", "v", CA([("httponly", None), ("secure", None)]))])[0]
+        assert f['httpOnly']
+        assert f['secure']
+
+        f = format_cookies([("n", "v", CA([("expires", "Mon, 24-Aug-2037 00:00:00 GMT")]))])[0]
+        assert f['expires']

test/netlib/http/test_cookies.py

@@ -1,6 +1,10 @@
+import time
+
 from netlib.http import cookies
 from netlib.tutils import raises

+import mock
+
 def test_read_token():
     tokens = [
@@ -247,6 +251,22 @@ def test_refresh_cookie():
     assert cookies.refresh_set_cookie_header(c, 0)


+@mock.patch('time.time')
+def test_get_expiration_ts(*args):
+    # Freeze time
+    now_ts = 17
+    time.time.return_value = now_ts
+
+    CA = cookies.CookieAttrs
+    F = cookies.get_expiration_ts
+
+    assert F(CA([("Expires", "Thu, 01-Jan-1970 00:00:00 GMT")])) == 0
+    assert F(CA([("Expires", "Mon, 24-Aug-2037 00:00:00 GMT")])) == 2134684800
+
+    assert F(CA([("Max-Age", "0")])) == now_ts
+    assert F(CA([("Max-Age", "31")])) == now_ts + 31
+
+
 def test_is_expired():
     CA = cookies.CookieAttrs
@@ -260,9 +280,53 @@ def test_is_expired():
     # or both
     assert cookies.is_expired(CA([("Expires", "Thu, 01-Jan-1970 00:00:00 GMT"), ("Max-Age", "0")]))

-    assert not cookies.is_expired(CA([("Expires", "Thu, 24-Aug-2063 00:00:00 GMT")]))
+    assert not cookies.is_expired(CA([("Expires", "Mon, 24-Aug-2037 00:00:00 GMT")]))
     assert not cookies.is_expired(CA([("Max-Age", "1")]))
-    assert not cookies.is_expired(CA([("Expires", "Thu, 15-Jul-2068 00:00:00 GMT"), ("Max-Age", "1")]))
+    assert not cookies.is_expired(CA([("Expires", "Wed, 15-Jul-2037 00:00:00 GMT"), ("Max-Age", "1")]))

     assert not cookies.is_expired(CA([("Max-Age", "nan")]))
     assert not cookies.is_expired(CA([("Expires", "false")]))
def test_group_cookies():
CA = cookies.CookieAttrs
groups = [
[
"one=uno; foo=bar; foo=baz",
[
('one', 'uno', CA([])),
('foo', 'bar', CA([])),
('foo', 'baz', CA([]))
]
],
[
"one=uno; Path=/; foo=bar; Max-Age=0; foo=baz; expires=24-08-1993",
[
('one', 'uno', CA([('Path', '/')])),
('foo', 'bar', CA([('Max-Age', '0')])),
('foo', 'baz', CA([('expires', '24-08-1993')]))
]
],
[
"one=uno;",
[
('one', 'uno', CA([]))
]
],
[
"one=uno; Path=/; Max-Age=0; Expires=24-08-1993",
[
('one', 'uno', CA([('Path', '/'), ('Max-Age', '0'), ('Expires', '24-08-1993')]))
]
],
[
"path=val; Path=/",
[
('path', 'val', CA([('Path', '/')]))
]
]
]
for c, expected in groups:
observed = cookies.group_cookies(cookies.parse_cookie_header(c))
assert observed == expected