Clean up har_extractor example

- Fix spacing, line length, unused imports, unusual import idioms
- Prevent it from printing directly into our test output (messages now go through context.log)
This commit is contained in:
Aldo Cortesi 2015-01-02 13:41:40 +13:00
parent 1b5f5021dc
commit bb5fb2dbe0

View File

@ -1,22 +1,18 @@
""" """
This inline script utilizes harparser.HAR from https://github.com/JustusW/harparser
to generate a HAR log object.
"""
try:
from harparser import HAR
from pytz import UTC
except ImportError as e:
import sys
print >> sys.stderr, "\r\nMissing dependencies: please run `pip install mitmproxy[examples]`.\r\n"
raise
from datetime import datetime, timedelta, tzinfo This inline script utilizes harparser.HAR from
https://github.com/JustusW/harparser to generate a HAR log object.
"""
from harparser import HAR
from datetime import datetime
class _HARLog(HAR.log): class _HARLog(HAR.log):
# The attributes need to be registered here for them to actually be available later via self. This is # The attributes need to be registered here for them to actually be
# due to HAREncodable linking __getattr__ to __getitem__. Anything that is set only in __init__ will # available later via self. This is due to HAREncodable linking __getattr__
# just be added as key/value pair to self.__classes__. # to __getitem__. Anything that is set only in __init__ will just be added
# as key/value pair to self.__classes__.
__page_list__ = [] __page_list__ = []
__page_count__ = 0 __page_count__ = 0
__page_ref__ = {} __page_ref__ = {}
@ -58,55 +54,66 @@ class _HARLog(HAR.log):
def start(context, argv): def start(context, argv):
""" """
On start we create a HARLog instance. You will have to adapt this to suit your actual needs On start we create a HARLog instance. You will have to adapt this to
of HAR generation. As it will probably be necessary to cluster logs by IPs or reset them suit your actual needs of HAR generation. As it will probably be
from time to time. necessary to cluster logs by IPs or reset them from time to time.
""" """
context.dump_file = None context.dump_file = None
if len(argv) > 1: if len(argv) > 1:
context.dump_file = argv[1] context.dump_file = argv[1]
else: else:
raise ValueError('Usage: -s "har_extractor.py filename" ' raise ValueError(
'(- will output to stdout, filenames ending with .zhar will result in compressed har)') 'Usage: -s "har_extractor.py filename" '
'(- will output to stdout, filenames ending with .zhar '
'will result in compressed har)'
)
context.HARLog = _HARLog(['https://github.com']) context.HARLog = _HARLog(['https://github.com'])
context.seen_server = set() context.seen_server = set()
def response(context, flow): def response(context, flow):
""" """
Called when a server response has been received. At the time of this message both Called when a server response has been received. At the time of this
a request and a response are present and completely done. message both a request and a response are present and completely done.
""" """
# Values are converted from float seconds to int milliseconds later. # Values are converted from float seconds to int milliseconds later.
ssl_time = -.001 ssl_time = -.001
connect_time = -.001 connect_time = -.001
if flow.server_conn not in context.seen_server: if flow.server_conn not in context.seen_server:
# Calculate the connect_time for this server_conn. Afterwards add it to seen list, in # Calculate the connect_time for this server_conn. Afterwards add it to
# order to avoid the connect_time being present in entries that use an existing connection. # seen list, in order to avoid the connect_time being present in entries
# that use an existing connection.
connect_time = flow.server_conn.timestamp_tcp_setup - flow.server_conn.timestamp_start connect_time = flow.server_conn.timestamp_tcp_setup - flow.server_conn.timestamp_start
context.seen_server.add(flow.server_conn) context.seen_server.add(flow.server_conn)
if flow.server_conn.timestamp_ssl_setup is not None: if flow.server_conn.timestamp_ssl_setup is not None:
# Get the ssl_time for this server_conn as the difference between the start of the successful # Get the ssl_time for this server_conn as the difference between
# tcp setup and the successful ssl setup. If no ssl setup has been made it is left as -1 since # the start of the successful tcp setup and the successful ssl
# it doesn't apply to this connection. # setup. If no ssl setup has been made it is left as -1 since it
# doesn't apply to this connection.
ssl_time = flow.server_conn.timestamp_ssl_setup - flow.server_conn.timestamp_tcp_setup ssl_time = flow.server_conn.timestamp_ssl_setup - flow.server_conn.timestamp_tcp_setup
# Calculate the raw timings from the different timestamps present in the request and response object. # Calculate the raw timings from the different timestamps present in the
# For lack of a way to measure it dns timings can not be calculated. The same goes for HAR blocked: # request and response object. For lack of a way to measure it dns timings
# MITMProxy will open a server connection as soon as it receives the host and port from the client # can not be calculated. The same goes for HAR blocked: MITMProxy will open
# connection. So the time spent waiting is actually spent waiting between request.timestamp_end and # a server connection as soon as it receives the host and port from the
# response.timestamp_start thus it correlates to HAR wait instead. # client connection. So the time spent waiting is actually spent waiting
timings_raw = {'send': flow.request.timestamp_end - flow.request.timestamp_start, # between request.timestamp_end and response.timestamp_start thus it
'wait': flow.response.timestamp_start - flow.request.timestamp_end, # correlates to HAR wait instead.
'receive': flow.response.timestamp_end - flow.response.timestamp_start, timings_raw = {
'connect': connect_time, 'send': flow.request.timestamp_end - flow.request.timestamp_start,
'ssl': ssl_time} 'wait': flow.response.timestamp_start - flow.request.timestamp_end,
'receive': flow.response.timestamp_end - flow.response.timestamp_start,
'connect': connect_time,
'ssl': ssl_time
}
# HAR timings are integers in ms, so we have to re-encode the raw timings to that format. # HAR timings are integers in ms, so we have to re-encode the raw timings to
# that format.
timings = dict([(key, int(1000 * value)) for key, value in timings_raw.iteritems()]) timings = dict([(key, int(1000 * value)) for key, value in timings_raw.iteritems()])
# The full_time is the sum of all timings. Timings set to -1 will be ignored as per spec. # The full_time is the sum of all timings. Timings set to -1 will be ignored
# as per spec.
full_time = 0 full_time = 0
for item in timings.values(): for item in timings.values():
if item > -1: if item > -1:
@ -157,21 +164,30 @@ def response(context, flow):
"cache": {}, "cache": {},
"timings": timings, }) "timings": timings, })
# If the current url is in the page list of context.HARLog or does not have a referrer we add it as a new # If the current url is in the page list of context.HARLog or does not have
# pages object. # a referrer we add it as a new pages object.
if flow.request.url in context.HARLog.get_page_list() or flow.request.headers.get('Referer', None) is None: if flow.request.url in context.HARLog.get_page_list() or flow.request.headers.get('Referer', None) is None:
page_id = context.HARLog.create_page_id() page_id = context.HARLog.create_page_id()
context.HARLog.add(HAR.pages({"startedDateTime": entry['startedDateTime'], context.HARLog.add(
"id": page_id, HAR.pages({
"title": flow.request.url, })) "startedDateTime": entry['startedDateTime'],
"id": page_id,
"title": flow.request.url,
})
)
context.HARLog.set_page_ref(flow.request.url, page_id) context.HARLog.set_page_ref(flow.request.url, page_id)
entry['pageref'] = page_id entry['pageref'] = page_id
# Lookup the referer in the page_ref of context.HARLog to point this entries pageref attribute to the right # Lookup the referer in the page_ref of context.HARLog to point this entries
# pages object, then set it as a new reference to build a reference tree. # pageref attribute to the right pages object, then set it as a new
# reference to build a reference tree.
elif context.HARLog.get_page_ref(flow.request.headers.get('Referer', (None, ))[0]) is not None: elif context.HARLog.get_page_ref(flow.request.headers.get('Referer', (None, ))[0]) is not None:
entry['pageref'] = context.HARLog.get_page_ref(flow.request.headers['Referer'][0]) entry['pageref'] = context.HARLog.get_page_ref(
context.HARLog.set_page_ref(flow.request.headers['Referer'][0], entry['pageref']) flow.request.headers['Referer'][0]
)
context.HARLog.set_page_ref(
flow.request.headers['Referer'][0], entry['pageref']
)
context.HARLog.add(entry) context.HARLog.add(entry)
@ -186,22 +202,28 @@ def done(context):
json_dump = context.HARLog.json() json_dump = context.HARLog.json()
compressed_json_dump = context.HARLog.compress() compressed_json_dump = context.HARLog.compress()
print "=" * 100
if context.dump_file == '-': if context.dump_file == '-':
pprint(json.loads(json_dump)) context.log(pprint.pformat(json.loads(json_dump)))
elif context.dump_file.endswith('.zhar'): elif context.dump_file.endswith('.zhar'):
file(context.dump_file, "w").write(compressed_json_dump) file(context.dump_file, "w").write(compressed_json_dump)
else: else:
file(context.dump_file, "w").write(json_dump) file(context.dump_file, "w").write(json_dump)
print "=" * 100 context.log(
print "HAR log finished with %s bytes (%s bytes compressed)" % (len(json_dump), len(compressed_json_dump)) "HAR log finished with %s bytes (%s bytes compressed)" % (
print "Compression rate is %s%%" % str(100. * len(compressed_json_dump) / len(json_dump)) len(json_dump), len(compressed_json_dump)
print "=" * 100 )
)
context.log(
"Compression rate is %s%%" % str(
100. * len(compressed_json_dump) / len(json_dump)
)
)
def print_attributes(obj, filter_string=None, hide_privates=False): def print_attributes(obj, filter_string=None, hide_privates=False):
""" """
Useful helper method to quickly get all attributes of an object and its values. Useful helper method to quickly get all attributes of an object and its
values.
""" """
for attr in dir(obj): for attr in dir(obj):
if hide_privates and "__" in attr: if hide_privates and "__" in attr: