Merge pull request #989 from dufferzafar/har-extractor

Improve HAR Extractor Script
This commit is contained in:
Thomas Kriechbaumer 2016-03-04 19:55:52 +01:00
commit 428da2c4b1
4 changed files with 176 additions and 72 deletions

0
examples/__init__.py Normal file
View File

View File

@ -1,5 +1,4 @@
""" """
This inline script utilizes harparser.HAR from This inline script utilizes harparser.HAR from
https://github.com/JustusW/harparser to generate a HAR log object. https://github.com/JustusW/harparser to generate a HAR log object.
""" """
@ -17,7 +16,7 @@ class _HARLog(HAR.log):
__page_count__ = 0 __page_count__ = 0
__page_ref__ = {} __page_ref__ = {}
def __init__(self, page_list): def __init__(self, page_list=[]):
self.__page_list__ = page_list self.__page_list__ = page_list
self.__page_count__ = 0 self.__page_count__ = 0
self.__page_ref__ = {} self.__page_ref__ = {}
@ -67,7 +66,7 @@ def start(context, argv):
'(- will output to stdout, filenames ending with .zhar ' '(- will output to stdout, filenames ending with .zhar '
'will result in compressed har)' 'will result in compressed har)'
) )
context.HARLog = _HARLog(['https://github.com']) context.HARLog = _HARLog()
context.seen_server = set() context.seen_server = set()
@ -83,17 +82,17 @@ def response(context, flow):
# Calculate the connect_time for this server_conn. Afterwards add it to # Calculate the connect_time for this server_conn. Afterwards add it to
# seen list, in order to avoid the connect_time being present in entries # seen list, in order to avoid the connect_time being present in entries
# that use an existing connection. # that use an existing connection.
connect_time = flow.server_conn.timestamp_tcp_setup - \ connect_time = (flow.server_conn.timestamp_tcp_setup -
flow.server_conn.timestamp_start flow.server_conn.timestamp_start)
context.seen_server.add(flow.server_conn) context.seen_server.add(flow.server_conn)
if flow.server_conn.timestamp_ssl_setup is not None: if flow.server_conn.timestamp_ssl_setup is not None:
# Get the ssl_time for this server_conn as the difference between # Get the ssl_time for this server_conn as the difference between
# the start of the successful tcp setup and the successful ssl # the start of the successful tcp setup and the successful ssl
# setup. If no ssl setup has been made it is left as -1 since it # setup. If no ssl setup has been made it is left as -1 since it
# doesn't apply to this connection. # doesn't apply to this connection.
ssl_time = flow.server_conn.timestamp_ssl_setup - \ ssl_time = (flow.server_conn.timestamp_ssl_setup -
flow.server_conn.timestamp_tcp_setup flow.server_conn.timestamp_tcp_setup)
# Calculate the raw timings from the different timestamps present in the # Calculate the raw timings from the different timestamps present in the
# request and response object. For lack of a way to measure it dns timings # request and response object. For lack of a way to measure it dns timings
@ -112,80 +111,58 @@ def response(context, flow):
# HAR timings are integers in ms, so we have to re-encode the raw timings to # HAR timings are integers in ms, so we have to re-encode the raw timings to
# that format. # that format.
timings = dict([(key, int(1000 * value)) timings = dict([(k, int(1000 * v)) for k, v in timings_raw.iteritems()])
for key, value in timings_raw.iteritems()])
# The full_time is the sum of all timings. Timings set to -1 will be ignored # The full_time is the sum of all timings.
# as per spec. # Timings set to -1 will be ignored as per spec.
full_time = 0 full_time = sum(v for v in timings.values() if v > -1)
for item in timings.values():
if item > -1:
full_time += item
started_date_time = datetime.fromtimestamp( started_date_time = datetime.utcfromtimestamp(
flow.request.timestamp_start, flow.request.timestamp_start).isoformat()
tz=utc).isoformat()
request_query_string = [{"name": k, "value": v} request_query_string = [{"name": k, "value": v}
for k, v in flow.request.query] for k, v in flow.request.query or {}]
request_http_version = flow.request.http_version
# Cookies are shaped as tuples by MITMProxy.
request_cookies = [{"name": k.strip(), "value": v[0]}
for k, v in flow.request.cookies.items()]
request_headers = [{"name": k, "value": v} for k, v in flow.request.headers]
request_headers_size = len(str(flow.request.headers))
request_body_size = len(flow.request.content)
response_http_version = flow.response.http_version
# Cookies are shaped as tuples by MITMProxy.
response_cookies = [{"name": k.strip(), "value": v[0]}
for k, v in flow.response.cookies.items()]
response_headers = [{"name": k, "value": v}
for k, v in flow.response.headers]
response_headers_size = len(str(flow.response.headers))
response_body_size = len(flow.response.content) response_body_size = len(flow.response.content)
response_body_decoded_size = len(flow.response.get_decoded_content()) response_body_decoded_size = len(flow.response.get_decoded_content())
response_body_compression = response_body_decoded_size - response_body_size response_body_compression = response_body_decoded_size - response_body_size
response_mime_type = flow.response.headers.get('Content-Type', '')
response_redirect_url = flow.response.headers.get('Location', '')
entry = HAR.entries( entry = HAR.entries({
{ "startedDateTime": started_date_time,
"startedDateTime": started_date_time, "time": full_time,
"time": full_time, "request": {
"request": { "method": flow.request.method,
"method": flow.request.method, "url": flow.request.url,
"url": flow.request.url, "httpVersion": flow.request.http_version,
"httpVersion": request_http_version, "cookies": format_cookies(flow.request.cookies),
"cookies": request_cookies, "headers": format_headers(flow.request.headers),
"headers": request_headers, "queryString": request_query_string,
"queryString": request_query_string, "headersSize": len(str(flow.request.headers)),
"headersSize": request_headers_size, "bodySize": len(flow.request.content),
"bodySize": request_body_size, },
"response": {
"status": flow.response.status_code,
"statusText": flow.response.msg,
"httpVersion": flow.response.http_version,
"cookies": format_cookies(flow.response.cookies),
"headers": format_headers(flow.response.headers),
"content": {
"size": response_body_size,
"compression": response_body_compression,
"mimeType": flow.response.headers.get('Content-Type', '')
}, },
"response": { "redirectURL": flow.response.headers.get('Location', ''),
"status": flow.response.status_code, "headersSize": len(str(flow.response.headers)),
"statusText": flow.response.msg, "bodySize": response_body_size,
"httpVersion": response_http_version, },
"cookies": response_cookies, "cache": {},
"headers": response_headers, "timings": timings,
"content": { })
"size": response_body_size,
"compression": response_body_compression,
"mimeType": response_mime_type},
"redirectURL": response_redirect_url,
"headersSize": response_headers_size,
"bodySize": response_body_size,
},
"cache": {},
"timings": timings,
})
# If the current url is in the page list of context.HARLog or does not have # If the current url is in the page list of context.HARLog or
# a referrer we add it as a new pages object. # does not have a referrer, we add it as a new pages object.
if flow.request.url in context.HARLog.get_page_list() or flow.request.headers.get( if (flow.request.url in context.HARLog.get_page_list() or
'Referer', flow.request.headers.get('Referer') is None):
None) is None:
page_id = context.HARLog.create_page_id() page_id = context.HARLog.create_page_id()
context.HARLog.add( context.HARLog.add(
HAR.pages({ HAR.pages({
@ -215,7 +192,7 @@ def done(context):
""" """
Called once on script shutdown, after any other events. Called once on script shutdown, after any other events.
""" """
from pprint import pprint import pprint
import json import json
json_dump = context.HARLog.json() json_dump = context.HARLog.json()
@ -239,6 +216,18 @@ def done(context):
) )
def format_cookies(obj):
    """
    Convert a cookie mapping into a list of HAR ``name``/``value`` dicts.

    :param obj: object exposing ``items()`` that yields ``(name, value)``
        pairs, where ``value`` is indexable and its first element is used
        as the cookie value (presumably mitmproxy's tuple-shaped cookies;
        confirm against the caller). May be ``None`` or empty.
    :returns: a list with one ``{"name": ..., "value": ...}`` dict per
        cookie; an empty list when there is nothing to format.
    """
    if not obj:
        # Always hand back a list: HAR models cookies as an array, so an
        # empty string here would give callers two return types to handle.
        return []
    # Names are stripped of surrounding whitespace; only the first element
    # of each value is kept.
    return [{"name": k.strip(), "value": v[0]} for k, v in obj.items()]
def format_headers(obj):
    """
    Convert a headers object into a list of HAR ``name``/``value`` dicts.

    :param obj: object whose ``fields`` attribute is an iterable of
        ``(name, value)`` pairs. May be ``None`` or otherwise falsy.
    :returns: a list with one ``{"name": ..., "value": ...}`` dict per
        field, in the order ``fields`` yields them; an empty list when
        ``obj`` is falsy.
    """
    if not obj:
        # Always hand back a list: HAR models headers as an array, so an
        # empty string here would give callers two return types to handle.
        return []
    return [{"name": k, "value": v} for k, v in obj.fields]
def print_attributes(obj, filter_string=None, hide_privates=False): def print_attributes(obj, filter_string=None, hide_privates=False):
""" """
Useful helper method to quickly get all attributes of an object and its Useful helper method to quickly get all attributes of an object and its

View File

@ -0,0 +1,78 @@
{
"test_response": {
"log": {
"__page_count__": 1,
"version": "1.2",
"creator": {
"comment": "",
"version": "0.1",
"name": "MITMPROXY HARExtractor"
},
"pages": [
{
"startedDateTime": "1993-08-24T14:41:12",
"id": "autopage_1",
"title": "http://address:22/path"
}
],
"entries": [
{
"pageref": "autopage_1",
"startedDateTime": "1993-08-24T14:41:12",
"cache": {},
"request": {
"cookies": [],
"url": "http://address:22/path",
"queryString": [],
"headers": [
{
"name": "header",
"value": "qvalue"
},
{
"name": "content-length",
"value": "7"
}
],
"headersSize": 35,
"httpVersion": "HTTP/1.1",
"method": "GET",
"bodySize": 7
},
"timings": {
"receive": 0,
"ssl": 1000,
"connect": 1000,
"send": 0,
"wait": 0
},
"time": 2000,
"response": {
"status": 200,
"cookies": [],
"statusText": "OK",
"content": {
"mimeType": "",
"compression": 0,
"size": 7
},
"headers": [
{
"name": "content-length",
"value": "7"
},
{
"name": "header-response",
"value": "svalue"
}
],
"headersSize": 44,
"redirectURL": "",
"httpVersion": "HTTP/1.1",
"bodySize": 7
}
}
]
}
}
}

View File

@ -0,0 +1,37 @@
import json
import netlib.tutils
from . import tutils
from examples import har_extractor
class Context(object):
    """Bare attribute holder passed to the script hooks as their context."""
# Request and response fixtures share one fixed instant for both start and
# end. 746203272 is 1993-08-24T14:41:12 in UTC, which appears to be the
# startedDateTime stored in data/har_extractor.har.
_FIXED_TIMESTAMP = 746203272

trequest = netlib.tutils.treq(
    timestamp_start=_FIXED_TIMESTAMP,
    timestamp_end=_FIXED_TIMESTAMP,
)

tresponse = netlib.tutils.tresp(
    timestamp_start=_FIXED_TIMESTAMP,
    timestamp_end=_FIXED_TIMESTAMP,
)
def test_start():
    """``start`` called with an empty argv must raise ``ValueError``."""
    context = Context()
    empty_argv = []
    tutils.raises(ValueError, har_extractor.start, context, empty_argv)
def test_response():
    """
    Feed one synthetic flow through ``response`` and compare the resulting
    HAR log with the ``test_response`` entry of the stored fixture file.
    """
    context = Context()
    context.HARLog = har_extractor._HARLog([])
    context.seen_server = set()

    flow = tutils.tflow(req=trequest, resp=tresponse)
    har_extractor.response(context, flow)

    fixture_path = tutils.test_data.path("data/har_extractor.har")
    with open(fixture_path) as fixture_file:
        fixture = json.load(fixture_file)

    # Round-trip through JSON so the comparison is between plain dicts/lists.
    produced = json.loads(context.HARLog.json())
    assert produced == fixture["test_response"]