From acce67e1fd468bf1ac4a536d007ac56d9ab652e3 Mon Sep 17 00:00:00 2001 From: Justus Wingert Date: Sat, 15 Nov 2014 03:34:39 +0100 Subject: [PATCH 01/11] Initial checkin with har_extractor script. --- examples/har_extractor.py | 207 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 examples/har_extractor.py diff --git a/examples/har_extractor.py b/examples/har_extractor.py new file mode 100644 index 000000000..504e98df1 --- /dev/null +++ b/examples/har_extractor.py @@ -0,0 +1,207 @@ +""" + This inline script utilizes harparser.HAR from https://github.com/JustusW/harparser + to generate a HAR log object. +""" +from harparser import HAR +from datetime import datetime, timedelta, tzinfo + + +class UTC(tzinfo): + def utcoffset(self, dt): + return timedelta(0) + + def dst(self, dt): + return timedelta(0) + + def tzname(self, dt): + return "Z" + + +class _HARLog(HAR.log): + def __init__(self): + HAR.log.__init__(self, {"version": "1.2", + "creator": {"name": "MITMPROXY HARExtractor", + "version": "0.1", + "comment": ""}, + "pages": [], + "entries": []}) + + def reset(self): + self.__init__() + + def add(self, obj): + if isinstance(obj, HAR.pages): + self['pages'].append(obj) + if isinstance(obj, HAR.entries): + self['entries'].append(obj) + + +def start(context, argv): + HARLog.reset() + + +def clientconnect(context, conn_handler): + """ + Called when a client initiates a connection to the proxy. Note that a + connection can correspond to multiple HTTP requests + """ + import time + context.log("clientconnect" + str(time.time())) + + +def serverconnect(context, conn_handler): + """ + Called when the proxy initiates a connection to the target server. 
Note that a + connection can correspond to multiple HTTP requests + """ + CONNECT_TIMES.pop(conn_handler.server_conn.address.address, None) + SSL_TIMES.pop(conn_handler.server_conn.address.address, None) + import time + context.log("serverconnect " + str(time.time())) + + +def request(context, flow): + """ + Called when a client request has been received. + """ + # print_attributes(flow) + # print_attributes(context) + import time + context.log("request " + str(time.time()) + " " + str(flow.request.timestamp_start)) + + +def responseheaders(context, flow): + """ + Called when the response headers for a server response have been received, + but the response body has not been processed yet. Can be used to tell mitmproxy + to stream the response. + """ + context.log("responseheaders") + + +def response(context, flow): + """ + Called when a server response has been received. + """ + import time + context.log("response " + str(time.time()) + " " + str(flow.request.timestamp_start)) + context.log("response " + str(time.time()) + " " + str(flow.response.timestamp_end)) + connect_time = CONNECT_TIMES.get(flow.server_conn.address.address, + int((flow.server_conn.timestamp_tcp_setup + - flow.server_conn.timestamp_start) + * 1000)) + CONNECT_TIMES[flow.server_conn.address.address] = -1 + + ssl_time = -1 + if flow.server_conn.timestamp_ssl_setup is not None: + ssl_time = SSL_TIMES.get(flow.server_conn.address.address, + int((flow.server_conn.timestamp_ssl_setup + - flow.server_conn.timestamp_tcp_setup) + * 1000)) + SSL_TIMES[flow.server_conn.address.address] = -1 + + timings = {'send': int((flow.request.timestamp_end - flow.request.timestamp_start) * 1000), + 'wait': int((flow.response.timestamp_start - flow.request.timestamp_end) * 1000), + 'receive': int((flow.response.timestamp_end - flow.response.timestamp_start) * 1000), + 'connect': connect_time, + 'ssl': ssl_time} + + full_time = 0 + for item in timings.values(): + if item > -1: + full_time += item + + entry = 
HAR.entries({"startedDateTime": datetime.fromtimestamp(flow.request.timestamp_start, tz=UTC()).isoformat(), + "time": full_time, + "request": {"method": flow.request.method, + "url": flow.request.url, + "httpVersion": ".".join([str(v) for v in flow.request.httpversion]), + "cookies": [{"name": k.strip(), "value": v[0]} + for k, v in (flow.request.get_cookies() or {}).iteritems()], + "headers": [{"name": k, "value": v} + for k, v in flow.request.headers], + "queryString": [{"name": k, "value": v} + for k, v in flow.request.get_query()], + "headersSize": len(str(flow.request.headers).split("\r\n\r\n")[0]), + "bodySize": len(flow.request.content), }, + "response": {"status": flow.response.code, + "statusText": flow.response.msg, + "httpVersion": ".".join([str(v) for v in flow.response.httpversion]), + "cookies": [{"name": k.strip(), "value": v[0]} + for k, v in (flow.response.get_cookies() or {}).iteritems()], + "headers": [{"name": k, "value": v} + for k, v in flow.response.headers], + "content": {"size": len(flow.response.content), + "compression": len(flow.response.get_decoded_content()) - len( + flow.response.content), + "mimeType": flow.response.headers.get('Content-Type', ('', ))[0]}, + "redirectURL": flow.response.headers.get('Location', ''), + "headersSize": len(str(flow.response.headers).split("\r\n\r\n")[0]), + "bodySize": len(flow.response.content), }, + "cache": {}, + "timings": timings, }) + + if flow.request.url in HARPAGE_LIST or flow.request.headers.get('Referer', None) is None: + PAGE_COUNT[1] += 1 + page_id = "_".join([str(v) for v in PAGE_COUNT]) + HARLog.add(HAR.pages({"startedDateTime": entry['startedDateTime'], + "id": page_id, + "title": flow.request.url, })) + PAGE_REF[flow.request.url] = page_id + entry['pageref'] = page_id + + if flow.request.headers.get('Referer', (None, ))[0] in PAGE_REF.keys(): + entry['pageref'] = PAGE_REF[flow.request.headers['Referer'][0]] + PAGE_REF[flow.request.url] = entry['pageref'] + + HARLog.add(entry) + + +def 
error(context, flow): + """ + Called when a flow error has occured, e.g. invalid server responses, or + interrupted connections. This is distinct from a valid server HTTP error + response, which is simply a response with an HTTP error code. + """ + # context.log("error") + + +def clientdisconnect(context, conn_handler): + """ + Called when a client disconnects from the proxy. + """ + # print "clientdisconnect" + # print_attributes(context._master) + # print_attributes(conn_handler) + + +def done(context): + """ + Called once on script shutdown, after any other events. + """ + from pprint import pprint + import json + + pprint(json.loads(HARLog.json())) + print HARLog.json() + print HARLog.compress() + print "%s%%" % str(100. * len(HARLog.compress()) / len(HARLog.json())) + + +def print_attributes(obj, filter=None): + for attr in dir(obj): + # if "__" in attr: + # continue + if filter is not None and filter not in attr: + continue + value = getattr(obj, attr) + print "%s.%s" % ('obj', attr), value, type(value) + + +HARPAGE_LIST = ['https://github.com/'] +HARLog = _HARLog() + +CONNECT_TIMES = {} +SSL_TIMES = {} +PAGE_REF = {} +PAGE_COUNT = ['autopage', 0] From fd48a70128581c508420901910c285f247c930c7 Mon Sep 17 00:00:00 2001 From: Justus Wingert Date: Sat, 15 Nov 2014 18:38:59 +0100 Subject: [PATCH 02/11] Updated documentation and cleaned up the code. --- examples/har_extractor.py | 194 ++++++++++++++++++-------------------- 1 file changed, 90 insertions(+), 104 deletions(-) diff --git a/examples/har_extractor.py b/examples/har_extractor.py index 504e98df1..bc67d2993 100644 --- a/examples/har_extractor.py +++ b/examples/har_extractor.py @@ -37,110 +37,104 @@ class _HARLog(HAR.log): def start(context, argv): + """ + On start we reset the HAR, it's not really necessary since it will have been + instantiated earlier during initial parsing of this file. You will have to + adapt this to suit your actual needs of HAR generation. 
+ """ HARLog.reset() - - -def clientconnect(context, conn_handler): - """ - Called when a client initiates a connection to the proxy. Note that a - connection can correspond to multiple HTTP requests - """ - import time - context.log("clientconnect" + str(time.time())) - - -def serverconnect(context, conn_handler): - """ - Called when the proxy initiates a connection to the target server. Note that a - connection can correspond to multiple HTTP requests - """ - CONNECT_TIMES.pop(conn_handler.server_conn.address.address, None) - SSL_TIMES.pop(conn_handler.server_conn.address.address, None) - import time - context.log("serverconnect " + str(time.time())) - - -def request(context, flow): - """ - Called when a client request has been received. - """ - # print_attributes(flow) - # print_attributes(context) - import time - context.log("request " + str(time.time()) + " " + str(flow.request.timestamp_start)) - - -def responseheaders(context, flow): - """ - Called when the response headers for a server response have been received, - but the response body has not been processed yet. Can be used to tell mitmproxy - to stream the response. - """ - context.log("responseheaders") + context.seen_server_connect = set() + context.seen_server_ssl = set() def response(context, flow): """ - Called when a server response has been received. + Called when a server response has been received. At the time of this message both + a request and a response are present and completely done. 
""" - import time - context.log("response " + str(time.time()) + " " + str(flow.request.timestamp_start)) - context.log("response " + str(time.time()) + " " + str(flow.response.timestamp_end)) - connect_time = CONNECT_TIMES.get(flow.server_conn.address.address, - int((flow.server_conn.timestamp_tcp_setup - - flow.server_conn.timestamp_start) - * 1000)) - CONNECT_TIMES[flow.server_conn.address.address] = -1 + connect_time = -1 + if flow.server_conn not in context.seen_server_connect: + # Calculate the connect_time for this server_conn. Afterwards add it to seen list, in + # order to avoid the connect_time being present in entries that use an existing connection. + connect_time = flow.server_conn.timestamp_tcp_setup - flow.server_conn.timestamp_start + context.seen_server_connect.add(flow.server_conn) ssl_time = -1 - if flow.server_conn.timestamp_ssl_setup is not None: - ssl_time = SSL_TIMES.get(flow.server_conn.address.address, - int((flow.server_conn.timestamp_ssl_setup - - flow.server_conn.timestamp_tcp_setup) - * 1000)) - SSL_TIMES[flow.server_conn.address.address] = -1 + if flow.server_conn not in context.seen_server_connect \ + and flow.server_conn.timestamp_ssl_setup is not None: + # Get the ssl_time for this server_conn as the difference between the start of the successful + # tcp setup and the successful ssl setup. Afterwards add it to seen list, in order to avoid + # the ssl_time being present in entries that use an existing connection. If no ssl setup has + # been made initiate it is also left as -1 since it doesn't apply to this connection. 
+ ssl_time = flow.server_conn.timestamp_ssl_setup - flow.server_conn.timestamp_tcp_setup + context.seen_server_ssl.add(flow.server_conn) - timings = {'send': int((flow.request.timestamp_end - flow.request.timestamp_start) * 1000), - 'wait': int((flow.response.timestamp_start - flow.request.timestamp_end) * 1000), - 'receive': int((flow.response.timestamp_end - flow.response.timestamp_start) * 1000), - 'connect': connect_time, - 'ssl': ssl_time} + # Calculate the raw timings from the different timestamps present in the request and response object. + # For lack of a way to measure it dns timings can not be calculated. The same goes for HAR blocked: + # MITMProxy will open a server connection as soon as it receives the host and port from the client + # connection. So the time spent waiting is actually spent waiting between request.timestamp_end and + # response.timestamp_start thus it correlates to HAR wait instead. + timings_raw = {'send': flow.request.timestamp_end - flow.request.timestamp_start, + 'wait': flow.response.timestamp_start - flow.request.timestamp_end, + 'receive': flow.response.timestamp_end - flow.response.timestamp_start, + 'connect': connect_time, + 'ssl': ssl_time} + # HAR timings are integers in ms, so we have to re-encode the raw timings to that format. + timings = dict([(key, int(1000 * value)) for key, value in timings_raw.iteritems()]) + + # The full_time is the sum of all timings. Timings set to -1 will be ignored as per spec. full_time = 0 for item in timings.values(): if item > -1: full_time += item - entry = HAR.entries({"startedDateTime": datetime.fromtimestamp(flow.request.timestamp_start, tz=UTC()).isoformat(), + started_date_time = datetime.fromtimestamp(flow.request.timestamp_start, tz=UTC()).isoformat() + + request_query_string = [{"name": k, "value": v} for k, v in flow.request.get_query()] + request_http_version = ".".join([str(v) for v in flow.request.httpversion]) + # Cookies are shaped as tuples by MITMProxy. 
+ request_cookies = [{"name": k.strip(), "value": v[0]} for k, v in (flow.request.get_cookies() or {}).iteritems()] + request_headers = [{"name": k, "value": v} for k, v in flow.request.headers] + request_headers_size = len(str(flow.request.headers)) + request_body_size = len(flow.request.content) + + response_http_version = ".".join([str(v) for v in flow.response.httpversion]) + # Cookies are shaped as tuples by MITMProxy. + response_cookies = [{"name": k.strip(), "value": v[0]} for k, v in (flow.response.get_cookies() or {}).iteritems()] + response_headers = [{"name": k, "value": v} for k, v in flow.response.headers] + response_headers_size = len(str(flow.response.headers)) + response_body_size = len(flow.response.content) + response_body_decoded_size = len(flow.response.content) + response_body_compression = response_body_decoded_size - response_body_size + response_mime_type = flow.response.headers.get('Content-Type', [''])[0] + response_redirect_url = flow.response.headers.get('Location', [''])[0] + + entry = HAR.entries({"startedDateTime": started_date_time, "time": full_time, "request": {"method": flow.request.method, "url": flow.request.url, - "httpVersion": ".".join([str(v) for v in flow.request.httpversion]), - "cookies": [{"name": k.strip(), "value": v[0]} - for k, v in (flow.request.get_cookies() or {}).iteritems()], - "headers": [{"name": k, "value": v} - for k, v in flow.request.headers], - "queryString": [{"name": k, "value": v} - for k, v in flow.request.get_query()], - "headersSize": len(str(flow.request.headers).split("\r\n\r\n")[0]), - "bodySize": len(flow.request.content), }, + "httpVersion": request_http_version, + "cookies": request_cookies, + "headers": request_headers, + "queryString": request_query_string, + "headersSize": request_headers_size, + "bodySize": request_body_size, }, "response": {"status": flow.response.code, "statusText": flow.response.msg, - "httpVersion": ".".join([str(v) for v in flow.response.httpversion]), - "cookies": 
[{"name": k.strip(), "value": v[0]} - for k, v in (flow.response.get_cookies() or {}).iteritems()], - "headers": [{"name": k, "value": v} - for k, v in flow.response.headers], - "content": {"size": len(flow.response.content), - "compression": len(flow.response.get_decoded_content()) - len( - flow.response.content), - "mimeType": flow.response.headers.get('Content-Type', ('', ))[0]}, - "redirectURL": flow.response.headers.get('Location', ''), - "headersSize": len(str(flow.response.headers).split("\r\n\r\n")[0]), - "bodySize": len(flow.response.content), }, + "httpVersion": response_http_version, + "cookies": response_cookies, + "headers": response_headers, + "content": {"size": response_body_size, + "compression": response_body_compression, + "mimeType": response_mime_type}, + "redirectURL": response_redirect_url, + "headersSize": response_headers_size, + "bodySize": response_body_size, }, "cache": {}, "timings": timings, }) + # If the current url is in HARPAGE_LIST or does not have a referer we add it as a new pages object. if flow.request.url in HARPAGE_LIST or flow.request.headers.get('Referer', None) is None: PAGE_COUNT[1] += 1 page_id = "_".join([str(v) for v in PAGE_COUNT]) @@ -150,31 +144,14 @@ def response(context, flow): PAGE_REF[flow.request.url] = page_id entry['pageref'] = page_id - if flow.request.headers.get('Referer', (None, ))[0] in PAGE_REF.keys(): + # Lookup the referer in our PAGE_REF dict to point this entries pageref attribute to the right pages object. + elif flow.request.headers.get('Referer', (None, ))[0] in PAGE_REF.keys(): entry['pageref'] = PAGE_REF[flow.request.headers['Referer'][0]] PAGE_REF[flow.request.url] = entry['pageref'] HARLog.add(entry) -def error(context, flow): - """ - Called when a flow error has occured, e.g. invalid server responses, or - interrupted connections. This is distinct from a valid server HTTP error - response, which is simply a response with an HTTP error code. 
- """ - # context.log("error") - - -def clientdisconnect(context, conn_handler): - """ - Called when a client disconnects from the proxy. - """ - # print "clientdisconnect" - # print_attributes(context._master) - # print_attributes(conn_handler) - - def done(context): """ Called once on script shutdown, after any other events. @@ -182,13 +159,21 @@ def done(context): from pprint import pprint import json - pprint(json.loads(HARLog.json())) - print HARLog.json() - print HARLog.compress() - print "%s%%" % str(100. * len(HARLog.compress()) / len(HARLog.json())) + json_dump = HARLog.json() + compressed_json_dump = HARLog.compress() + + print "=" * 100 + pprint(json.loads(json_dump)) + print "=" * 100 + print "HAR log finished with %s bytes (%s bytes compressed)" % (len(json_dump), len(compressed_json_dump)) + print "Compression rate is %s%%" % str(100. * len(compressed_json_dump) / len(json_dump)) + print "=" * 100 def print_attributes(obj, filter=None): + """ + Useful helper method to quickly get all attributes of an object and its values. + """ for attr in dir(obj): # if "__" in attr: # continue @@ -198,6 +183,7 @@ def print_attributes(obj, filter=None): print "%s.%s" % ('obj', attr), value, type(value) +# Some initializations. Add any page you want to have its own pages object to HARPAGE_LIST HARPAGE_LIST = ['https://github.com/'] HARLog = _HARLog() From f3a78d4795a97f99194a46c764cfeb1fe6fd01f2 Mon Sep 17 00:00:00 2001 From: Justus Wingert Date: Sat, 15 Nov 2014 18:41:51 +0100 Subject: [PATCH 03/11] Improved helper method, marginally. 
--- examples/har_extractor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/har_extractor.py b/examples/har_extractor.py index bc67d2993..666dc03f0 100644 --- a/examples/har_extractor.py +++ b/examples/har_extractor.py @@ -170,14 +170,14 @@ def done(context): print "=" * 100 -def print_attributes(obj, filter=None): +def print_attributes(obj, filter_string=None, hide_privates=False): """ Useful helper method to quickly get all attributes of an object and its values. """ for attr in dir(obj): - # if "__" in attr: - # continue - if filter is not None and filter not in attr: + if hide_privates and "__" in attr: + continue + if filter_string is not None and filter_string not in attr: continue value = getattr(obj, attr) print "%s.%s" % ('obj', attr), value, type(value) From 18b803d03a31c12d0d73890bed98bc775ff31d33 Mon Sep 17 00:00:00 2001 From: Justus Wingert Date: Sat, 15 Nov 2014 18:45:28 +0100 Subject: [PATCH 04/11] Typo... --- examples/har_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/har_extractor.py b/examples/har_extractor.py index 666dc03f0..cc2cf5d7c 100644 --- a/examples/har_extractor.py +++ b/examples/har_extractor.py @@ -65,7 +65,7 @@ def response(context, flow): # Get the ssl_time for this server_conn as the difference between the start of the successful # tcp setup and the successful ssl setup. Afterwards add it to seen list, in order to avoid # the ssl_time being present in entries that use an existing connection. If no ssl setup has - # been made initiate it is also left as -1 since it doesn't apply to this connection. + # been made it is also left as -1 since it doesn't apply to this connection. 
ssl_time = flow.server_conn.timestamp_ssl_setup - flow.server_conn.timestamp_tcp_setup context.seen_server_ssl.add(flow.server_conn) From 57d980712286ce3184a2aad7bef1b63b2b26e95e Mon Sep 17 00:00:00 2001 From: Justus Wingert Date: Sat, 15 Nov 2014 19:05:36 +0100 Subject: [PATCH 05/11] Added script dependency on harparser. --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 08ccbbfd4..80bb78951 100644 --- a/setup.py +++ b/setup.py @@ -28,6 +28,7 @@ script_deps = { "urwid>=1.1", "lxml>=3.3.6", "Pillow>=2.3.0", + "-e git+https://github.com/JustusW/harparser.git#egg=harparser", }, "mitmdump": set() } From 31249b9e2471e05cf3e9eed7fce1ae72cf17451b Mon Sep 17 00:00:00 2001 From: Justus Wingert Date: Sat, 15 Nov 2014 19:28:10 +0100 Subject: [PATCH 06/11] Hopefully fixed dependency fuckup. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 80bb78951..41598b116 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ script_deps = { "urwid>=1.1", "lxml>=3.3.6", "Pillow>=2.3.0", - "-e git+https://github.com/JustusW/harparser.git#egg=harparser", + "harparser", }, "mitmdump": set() } From 4342d79dc073277b51effda92179ad7050bebf68 Mon Sep 17 00:00:00 2001 From: Justus Wingert Date: Sat, 15 Nov 2014 20:11:25 +0100 Subject: [PATCH 07/11] Removed the globals and replaced them with internal attributes of _HARLog. Minor bugfixes to make ssl timings work. 
--- examples/har_extractor.py | 80 ++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/examples/har_extractor.py b/examples/har_extractor.py index cc2cf5d7c..68fb1d0da 100644 --- a/examples/har_extractor.py +++ b/examples/har_extractor.py @@ -18,7 +18,15 @@ class UTC(tzinfo): class _HARLog(HAR.log): - def __init__(self): + __page_list__ = [] + __page_count__ = 0 + __page_ref__ = {} + + def __init__(self, page_list): + self.__page_list__ = page_list + self.__page_count__ = 0 + self.__page_ref__ = {} + HAR.log.__init__(self, {"version": "1.2", "creator": {"name": "MITMPROXY HARExtractor", "version": "0.1", @@ -35,14 +43,27 @@ class _HARLog(HAR.log): if isinstance(obj, HAR.entries): self['entries'].append(obj) + def create_page_id(self): + self.__page_count__ += 1 + return "autopage_%s" % str(self.__page_count__) + + def set_page_ref(self, page, ref): + self.__page_ref__[page] = ref + + def get_page_ref(self, page): + return self.__page_ref__.get(page, None) + + def get_page_list(self): + return self.__page_list__ + def start(context, argv): """ - On start we reset the HAR, it's not really necessary since it will have been - instantiated earlier during initial parsing of this file. You will have to - adapt this to suit your actual needs of HAR generation. + On start we create a HARLog instance. You will have to adapt this to suit your actual needs + of HAR generation. As it will probably be necessary to cluster logs by IPs or reset them + from time to time. """ - HARLog.reset() + context.HARLog = _HARLog(['https://github.com']) context.seen_server_connect = set() context.seen_server_ssl = set() @@ -52,15 +73,15 @@ def response(context, flow): Called when a server response has been received. At the time of this message both a request and a response are present and completely done. 
""" - connect_time = -1 + connect_time = -.001 if flow.server_conn not in context.seen_server_connect: # Calculate the connect_time for this server_conn. Afterwards add it to seen list, in # order to avoid the connect_time being present in entries that use an existing connection. connect_time = flow.server_conn.timestamp_tcp_setup - flow.server_conn.timestamp_start context.seen_server_connect.add(flow.server_conn) - ssl_time = -1 - if flow.server_conn not in context.seen_server_connect \ + ssl_time = -.001 + if flow.server_conn not in context.seen_server_ssl \ and flow.server_conn.timestamp_ssl_setup is not None: # Get the ssl_time for this server_conn as the difference between the start of the successful # tcp setup and the successful ssl setup. Afterwards add it to seen list, in order to avoid @@ -134,22 +155,23 @@ def response(context, flow): "cache": {}, "timings": timings, }) - # If the current url is in HARPAGE_LIST or does not have a referer we add it as a new pages object. - if flow.request.url in HARPAGE_LIST or flow.request.headers.get('Referer', None) is None: - PAGE_COUNT[1] += 1 - page_id = "_".join([str(v) for v in PAGE_COUNT]) - HARLog.add(HAR.pages({"startedDateTime": entry['startedDateTime'], - "id": page_id, - "title": flow.request.url, })) - PAGE_REF[flow.request.url] = page_id + # If the current url is in the page list of context.HARLog or does not have a referrer we add it as a new + # pages object. + if flow.request.url in context.HARLog.get_page_list() or flow.request.headers.get('Referer', None) is None: + page_id = context.HARLog.create_page_id() + context.HARLog.add(HAR.pages({"startedDateTime": entry['startedDateTime'], + "id": page_id, + "title": flow.request.url, })) + context.HARLog.set_page_ref(flow.request.url, page_id) entry['pageref'] = page_id - # Lookup the referer in our PAGE_REF dict to point this entries pageref attribute to the right pages object. 
- elif flow.request.headers.get('Referer', (None, ))[0] in PAGE_REF.keys(): - entry['pageref'] = PAGE_REF[flow.request.headers['Referer'][0]] - PAGE_REF[flow.request.url] = entry['pageref'] + # Lookup the referer in the page_ref of context.HARLog to point this entries pageref attribute to the right + # pages object, then set it as a new reference to build a reference tree. + elif context.HARLog.get_page_ref(flow.request.headers.get('Referer', (None, ))[0]) is not None: + entry['pageref'] = context.HARLog.get_page_ref(flow.request.headers['Referer'][0]) + context.HARLog.set_page_ref(flow.request.headers['Referer'][0], entry['pageref']) - HARLog.add(entry) + context.HARLog.add(entry) def done(context): @@ -159,8 +181,8 @@ def done(context): from pprint import pprint import json - json_dump = HARLog.json() - compressed_json_dump = HARLog.compress() + json_dump = context.HARLog.json() + compressed_json_dump = context.HARLog.compress() print "=" * 100 pprint(json.loads(json_dump)) @@ -180,14 +202,4 @@ def print_attributes(obj, filter_string=None, hide_privates=False): if filter_string is not None and filter_string not in attr: continue value = getattr(obj, attr) - print "%s.%s" % ('obj', attr), value, type(value) - - -# Some initializations. Add any page you want to have its own pages object to HARPAGE_LIST -HARPAGE_LIST = ['https://github.com/'] -HARLog = _HARLog() - -CONNECT_TIMES = {} -SSL_TIMES = {} -PAGE_REF = {} -PAGE_COUNT = ['autopage', 0] + print "%s.%s" % ('obj', attr), value, type(value) \ No newline at end of file From 4227feef37b9a9c0e835ebf179b5fb7a4509569e Mon Sep 17 00:00:00 2001 From: Justus Wingert Date: Sat, 15 Nov 2014 21:14:50 +0100 Subject: [PATCH 08/11] It seems get_decoded_content can actually be shorter than content due to encoding issues. Since I'm not crazy after all it seems safe to push. 
--- examples/har_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/har_extractor.py b/examples/har_extractor.py index 68fb1d0da..4de8ce6a3 100644 --- a/examples/har_extractor.py +++ b/examples/har_extractor.py @@ -126,7 +126,7 @@ def response(context, flow): response_headers = [{"name": k, "value": v} for k, v in flow.response.headers] response_headers_size = len(str(flow.response.headers)) response_body_size = len(flow.response.content) - response_body_decoded_size = len(flow.response.content) + response_body_decoded_size = len(flow.response.get_decoded_content()) response_body_compression = response_body_decoded_size - response_body_size response_mime_type = flow.response.headers.get('Content-Type', [''])[0] response_redirect_url = flow.response.headers.get('Location', [''])[0] From a7ab06d80eccbe3e58753da0917fca8d55a21c8e Mon Sep 17 00:00:00 2001 From: Justus Wingert Date: Sat, 15 Nov 2014 22:04:52 +0100 Subject: [PATCH 09/11] Switched to pytz. Added comment for clarification on behaviour of HAREncodable. Added missing parameter in reset(). Fixed accessing headers. --- examples/har_extractor.py | 23 ++++++++--------------- setup.py | 1 + 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/examples/har_extractor.py b/examples/har_extractor.py index 4de8ce6a3..8e97ee2d3 100644 --- a/examples/har_extractor.py +++ b/examples/har_extractor.py @@ -2,22 +2,15 @@ This inline script utilizes harparser.HAR from https://github.com/JustusW/harparser to generate a HAR log object. """ +from pytz import utc from harparser import HAR from datetime import datetime, timedelta, tzinfo -class UTC(tzinfo): - def utcoffset(self, dt): - return timedelta(0) - - def dst(self, dt): - return timedelta(0) - - def tzname(self, dt): - return "Z" - - class _HARLog(HAR.log): + # The attributes need to be registered here for them to actually be available later via self. This is + # due to HAREncodable linking __getattr__ to __getitem__. 
Anything that is set only in __init__ will + # just be added as key/value pair to self.__classes__. __page_list__ = [] __page_count__ = 0 __page_ref__ = {} @@ -35,7 +28,7 @@ class _HARLog(HAR.log): "entries": []}) def reset(self): - self.__init__() + self.__init__(self.__page_list__) def add(self, obj): if isinstance(obj, HAR.pages): @@ -110,7 +103,7 @@ def response(context, flow): if item > -1: full_time += item - started_date_time = datetime.fromtimestamp(flow.request.timestamp_start, tz=UTC()).isoformat() + started_date_time = datetime.fromtimestamp(flow.request.timestamp_start, tz=utc).isoformat() request_query_string = [{"name": k, "value": v} for k, v in flow.request.get_query()] request_http_version = ".".join([str(v) for v in flow.request.httpversion]) @@ -128,8 +121,8 @@ def response(context, flow): response_body_size = len(flow.response.content) response_body_decoded_size = len(flow.response.get_decoded_content()) response_body_compression = response_body_decoded_size - response_body_size - response_mime_type = flow.response.headers.get('Content-Type', [''])[0] - response_redirect_url = flow.response.headers.get('Location', [''])[0] + response_mime_type = flow.response.headers.get_first('Content-Type', '') + response_redirect_url = flow.response.headers.get_first('Location', '') entry = HAR.entries({"startedDateTime": started_date_time, "time": full_time, diff --git a/setup.py b/setup.py index 41598b116..7ea6d0cb5 100644 --- a/setup.py +++ b/setup.py @@ -28,6 +28,7 @@ script_deps = { "urwid>=1.1", "lxml>=3.3.6", "Pillow>=2.3.0", + "pytz", "harparser", }, "mitmdump": set() From c84ad384f660ba2c04aad3dfd3e7d3b961b013aa Mon Sep 17 00:00:00 2001 From: Justus Wingert Date: Sat, 15 Nov 2014 22:37:32 +0100 Subject: [PATCH 10/11] Updated setup.py and moved requirements to examples section. Included examples section in requirements.txt. Updated har_extractor to use command line arguments. 
--- examples/har_extractor.py | 36 ++++++++++++++++++++++-------------- requirements.txt | 2 +- setup.py | 6 ++++-- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/examples/har_extractor.py b/examples/har_extractor.py index 8e97ee2d3..c994f3718 100644 --- a/examples/har_extractor.py +++ b/examples/har_extractor.py @@ -56,9 +56,14 @@ def start(context, argv): of HAR generation. As it will probably be necessary to cluster logs by IPs or reset them from time to time. """ + context.dump_file = None + if len(argv) > 1: + context.dump_file = argv[1] + else: + raise ValueError('Usage: -s "har_extractor.py filename" ' + '(- will output to stdout, filenames ending with .zhar will result in compressed har)') context.HARLog = _HARLog(['https://github.com']) - context.seen_server_connect = set() - context.seen_server_ssl = set() + context.seen_server = set() def response(context, flow): @@ -66,22 +71,20 @@ def response(context, flow): Called when a server response has been received. At the time of this message both a request and a response are present and completely done. """ + # Values are converted from float seconds to int milliseconds later. + ssl_time = -.001 connect_time = -.001 - if flow.server_conn not in context.seen_server_connect: + if flow.server_conn not in context.seen_server: # Calculate the connect_time for this server_conn. Afterwards add it to seen list, in # order to avoid the connect_time being present in entries that use an existing connection. connect_time = flow.server_conn.timestamp_tcp_setup - flow.server_conn.timestamp_start - context.seen_server_connect.add(flow.server_conn) + context.seen_server.add(flow.server_conn) - ssl_time = -.001 - if flow.server_conn not in context.seen_server_ssl \ - and flow.server_conn.timestamp_ssl_setup is not None: - # Get the ssl_time for this server_conn as the difference between the start of the successful - # tcp setup and the successful ssl setup. 
Afterwards add it to seen list, in order to avoid - # the ssl_time being present in entries that use an existing connection. If no ssl setup has - # been made it is also left as -1 since it doesn't apply to this connection. - ssl_time = flow.server_conn.timestamp_ssl_setup - flow.server_conn.timestamp_tcp_setup - context.seen_server_ssl.add(flow.server_conn) + if flow.server_conn.timestamp_ssl_setup is not None: + # Get the ssl_time for this server_conn as the difference between the start of the successful + # tcp setup and the successful ssl setup. If no ssl setup has been made it is left as -1 since + # it doesn't apply to this connection. + ssl_time = flow.server_conn.timestamp_ssl_setup - flow.server_conn.timestamp_tcp_setup # Calculate the raw timings from the different timestamps present in the request and response object. # For lack of a way to measure it dns timings can not be calculated. The same goes for HAR blocked: @@ -178,7 +181,12 @@ def done(context): compressed_json_dump = context.HARLog.compress() print "=" * 100 - pprint(json.loads(json_dump)) + if context.dump_file == '-': + pprint(json.loads(json_dump)) + elif context.dump_file.endswith('.zhar'): + file(context.dump_file, "w").write(compressed_json_dump) + else: + file(context.dump_file, "w").write(json_dump) print "=" * 100 print "HAR log finished with %s bytes (%s bytes compressed)" % (len(json_dump), len(compressed_json_dump)) print "Compression rate is %s%%" % str(100. 
* len(compressed_json_dump) / len(json_dump)) diff --git a/requirements.txt b/requirements.txt index d84347b73..946e5ffe8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -e git+https://github.com/mitmproxy/netlib.git#egg=netlib -e git+https://github.com/mitmproxy/pathod.git#egg=pathod --e .[dev] \ No newline at end of file +-e .[dev,examples] \ No newline at end of file diff --git a/setup.py b/setup.py index 7ea6d0cb5..ace5e8174 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,6 @@ script_deps = { "urwid>=1.1", "lxml>=3.3.6", "Pillow>=2.3.0", - "pytz", - "harparser", }, "mitmdump": set() } @@ -80,6 +78,10 @@ setup( "pyamf>=0.6.1", "protobuf>=2.5.0", "cssutils>=1.0" + ], + 'examples': [ + "pytz", + "harparser", ] } ) From ddce662fe64a693f64f9fda4b5e406be8f1278d1 Mon Sep 17 00:00:00 2001 From: Justus Wingert Date: Sat, 15 Nov 2014 22:39:15 +0100 Subject: [PATCH 11/11] Added try/except block for import errors with harparser and pytz. --- examples/har_extractor.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/har_extractor.py b/examples/har_extractor.py index c994f3718..531f32aaa 100644 --- a/examples/har_extractor.py +++ b/examples/har_extractor.py @@ -2,8 +2,14 @@ This inline script utilizes harparser.HAR from https://github.com/JustusW/harparser to generate a HAR log object. """ -from pytz import utc -from harparser import HAR +try: + from harparser import HAR + from pytz import UTC +except ImportError as e: + import sys + print >> sys.stderr, "\r\nMissing dependencies: please run `pip install mitmproxy[examples]`.\r\n" + raise + from datetime import datetime, timedelta, tzinfo