Merge pull request #989 from dufferzafar/har-extractor

Improve HAR Extractor Script
Thomas Kriechbaumer 2016-03-04 19:55:52 +01:00
commit 428da2c4b1
4 changed files with 176 additions and 72 deletions

examples/__init__.py (new, empty file)

examples/har_extractor.py
@@ -1,5 +1,4 @@
 """
 This inline script utilizes harparser.HAR from
 https://github.com/JustusW/harparser to generate a HAR log object.
 """
@@ -17,7 +16,7 @@ class _HARLog(HAR.log):
     __page_count__ = 0
     __page_ref__ = {}

-    def __init__(self, page_list):
+    def __init__(self, page_list=[]):
         self.__page_list__ = page_list
         self.__page_count__ = 0
         self.__page_ref__ = {}
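A side note on the new signature: a mutable default such as page_list=[] is created once, when the function is defined. That is harmless here, since __init__ only reads the argument, but the classic pitfall is worth knowing; a minimal sketch with a hypothetical class:

class Log(object):
    def __init__(self, pages=[]):
        self.pages = pages  # every instance ends up holding the same list

a = Log()
b = Log()
a.pages.append("x")
print(b.pages)  # ['x'] - b sees a's append, because the default list is shared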
@@ -67,7 +66,7 @@ def start(context, argv):
             '(- will output to stdout, filenames ending with .zhar '
             'will result in compressed har)'
         )
-    context.HARLog = _HARLog(['https://github.com'])
+    context.HARLog = _HARLog()
     context.seen_server = set()
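For context: the error text above belongs to the usage message raised when no output filename is passed to the script. If memory serves, inline scripts of this era take their argument inside the quoted -s string, roughly mitmdump -s "har_extractor.py dump.har", with - for stdout and a .zhar suffix for a compressed log, as the message says.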
@@ -83,8 +82,8 @@ def response(context, flow):
         # Calculate the connect_time for this server_conn. Afterwards add it to
         # seen list, in order to avoid the connect_time being present in entries
         # that use an existing connection.
-        connect_time = flow.server_conn.timestamp_tcp_setup - \
-            flow.server_conn.timestamp_start
+        connect_time = (flow.server_conn.timestamp_tcp_setup -
+                        flow.server_conn.timestamp_start)
         context.seen_server.add(flow.server_conn)

     if flow.server_conn.timestamp_ssl_setup is not None:
@@ -92,8 +91,8 @@ def response(context, flow):
         # the start of the successful tcp setup and the successful ssl
         # setup. If no ssl setup has been made it is left as -1 since it
         # doesn't apply to this connection.
-        ssl_time = flow.server_conn.timestamp_ssl_setup - \
-            flow.server_conn.timestamp_tcp_setup
+        ssl_time = (flow.server_conn.timestamp_ssl_setup -
+                    flow.server_conn.timestamp_tcp_setup)

     # Calculate the raw timings from the different timestamps present in the
     # request and response object. For lack of a way to measure it dns timings
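For readers unfamiliar with mitmproxy's connection timestamps: the two hunks above turn float epoch timestamps into durations, which the script later re-encodes as the integer milliseconds HAR expects, with -1 marking timings that do not apply. A standalone sketch with made-up numbers:

# Made-up float epoch timestamps, mimicking flow.server_conn attributes.
timestamp_start = 100.00      # TCP connection initiated
timestamp_tcp_setup = 100.25  # TCP handshake finished
timestamp_ssl_setup = 100.75  # TLS handshake finished

connect_time = timestamp_tcp_setup - timestamp_start  # 0.25 s
ssl_time = timestamp_ssl_setup - timestamp_tcp_setup  # 0.50 s

# HAR timings are integer milliseconds; -1 means "not applicable" (e.g. dns
# here, which the script cannot measure).
timings = {"connect": int(1000 * connect_time),
           "ssl": int(1000 * ssl_time),
           "dns": -1}
full_time = sum(v for v in timings.values() if v > -1)
print(full_time)  # 750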
@@ -112,80 +111,58 @@ def response(context, flow):
     # HAR timings are integers in ms, so we have to re-encode the raw timings to
     # that format.
-    timings = dict([(key, int(1000 * value))
-                    for key, value in timings_raw.iteritems()])
+    timings = dict([(k, int(1000 * v)) for k, v in timings_raw.iteritems()])

-    # The full_time is the sum of all timings. Timings set to -1 will be ignored
-    # as per spec.
-    full_time = 0
-    for item in timings.values():
-        if item > -1:
-            full_time += item
+    # The full_time is the sum of all timings.
+    # Timings set to -1 will be ignored as per spec.
+    full_time = sum(v for v in timings.values() if v > -1)

-    started_date_time = datetime.fromtimestamp(
-        flow.request.timestamp_start,
-        tz=utc).isoformat()
+    started_date_time = datetime.utcfromtimestamp(
+        flow.request.timestamp_start).isoformat()

     request_query_string = [{"name": k, "value": v}
-                            for k, v in flow.request.query]
-    request_http_version = flow.request.http_version
-    # Cookies are shaped as tuples by MITMProxy.
-    request_cookies = [{"name": k.strip(), "value": v[0]}
-                       for k, v in flow.request.cookies.items()]
-    request_headers = [{"name": k, "value": v} for k, v in flow.request.headers]
-    request_headers_size = len(str(flow.request.headers))
-    request_body_size = len(flow.request.content)
+                            for k, v in flow.request.query or {}]

-    response_http_version = flow.response.http_version
-    # Cookies are shaped as tuples by MITMProxy.
-    response_cookies = [{"name": k.strip(), "value": v[0]}
-                        for k, v in flow.response.cookies.items()]
-    response_headers = [{"name": k, "value": v}
-                        for k, v in flow.response.headers]
-    response_headers_size = len(str(flow.response.headers))
     response_body_size = len(flow.response.content)
     response_body_decoded_size = len(flow.response.get_decoded_content())
     response_body_compression = response_body_decoded_size - response_body_size
-    response_mime_type = flow.response.headers.get('Content-Type', '')
-    response_redirect_url = flow.response.headers.get('Location', '')

-    entry = HAR.entries(
-        {
+    entry = HAR.entries({
         "startedDateTime": started_date_time,
         "time": full_time,
         "request": {
             "method": flow.request.method,
             "url": flow.request.url,
-            "httpVersion": request_http_version,
-            "cookies": request_cookies,
-            "headers": request_headers,
+            "httpVersion": flow.request.http_version,
+            "cookies": format_cookies(flow.request.cookies),
+            "headers": format_headers(flow.request.headers),
             "queryString": request_query_string,
-            "headersSize": request_headers_size,
-            "bodySize": request_body_size,
+            "headersSize": len(str(flow.request.headers)),
+            "bodySize": len(flow.request.content),
         },
         "response": {
             "status": flow.response.status_code,
             "statusText": flow.response.msg,
-            "httpVersion": response_http_version,
-            "cookies": response_cookies,
-            "headers": response_headers,
+            "httpVersion": flow.response.http_version,
+            "cookies": format_cookies(flow.response.cookies),
+            "headers": format_headers(flow.response.headers),
             "content": {
                 "size": response_body_size,
                 "compression": response_body_compression,
-                "mimeType": response_mime_type},
-            "redirectURL": response_redirect_url,
-            "headersSize": response_headers_size,
+                "mimeType": flow.response.headers.get('Content-Type', '')
+            },
+            "redirectURL": flow.response.headers.get('Location', ''),
+            "headersSize": len(str(flow.response.headers)),
             "bodySize": response_body_size,
         },
         "cache": {},
         "timings": timings,
     })

-    # If the current url is in the page list of context.HARLog or does not have
-    # a referrer we add it as a new pages object.
-    if flow.request.url in context.HARLog.get_page_list() or flow.request.headers.get(
-            'Referer',
-            None) is None:
+    # If the current url is in the page list of context.HARLog or
+    # does not have a referrer, we add it as a new pages object.
+    if (flow.request.url in context.HARLog.get_page_list() or
+            flow.request.headers.get('Referer') is None):
         page_id = context.HARLog.create_page_id()
         context.HARLog.add(
             HAR.pages({
@@ -215,7 +192,7 @@ def done(context):
     """
     Called once on script shutdown, after any other events.
    """
-    from pprint import pprint
+    import pprint
     import json

     json_dump = context.HARLog.json()
@@ -239,6 +216,18 @@ def done(context):
         )


+def format_cookies(obj):
+    if obj:
+        return [{"name": k.strip(), "value": v[0]} for k, v in obj.items()]
+    return ""
+
+
+def format_headers(obj):
+    if obj:
+        return [{"name": k, "value": v} for k, v in obj.fields]
+    return ""
+
+
 def print_attributes(obj, filter_string=None, hide_privates=False):
     """
     Useful helper method to quickly get all attributes of an object and its
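The new format_cookies and format_headers helpers normalise mitmproxy's internal shapes into the HAR name/value records used throughout the entry. A standalone illustration, with the helper logic copied from the diff and made-up stand-ins for the inputs (the shapes are assumptions based on the old code's "Cookies are shaped as tuples" comment, not mitmproxy's documented API):

def format_cookies(obj):
    if obj:
        return [{"name": k.strip(), "value": v[0]} for k, v in obj.items()]
    return ""

def format_headers(obj):
    if obj:
        return [{"name": k, "value": v} for k, v in obj.fields]
    return ""

# Made-up inputs: cookies map a name to a (value, attributes) tuple,
# headers expose .fields as a sequence of (name, value) pairs.
cookies = {"session ": ("abc123", {})}

class FakeHeaders(object):
    fields = [("Host", "example.com"), ("Accept", "*/*")]

print(format_cookies(cookies))        # [{'name': 'session', 'value': 'abc123'}]
print(format_headers(FakeHeaders()))  # [{'name': 'Host', 'value': 'example.com'}, ...]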

test/data/har_extractor.har (new file)
@@ -0,0 +1,78 @@
{
"test_response": {
"log": {
"__page_count__": 1,
"version": "1.2",
"creator": {
"comment": "",
"version": "0.1",
"name": "MITMPROXY HARExtractor"
},
"pages": [
{
"startedDateTime": "1993-08-24T14:41:12",
"id": "autopage_1",
"title": "http://address:22/path"
}
],
"entries": [
{
"pageref": "autopage_1",
"startedDateTime": "1993-08-24T14:41:12",
"cache": {},
"request": {
"cookies": [],
"url": "http://address:22/path",
"queryString": [],
"headers": [
{
"name": "header",
"value": "qvalue"
},
{
"name": "content-length",
"value": "7"
}
],
"headersSize": 35,
"httpVersion": "HTTP/1.1",
"method": "GET",
"bodySize": 7
},
"timings": {
"receive": 0,
"ssl": 1000,
"connect": 1000,
"send": 0,
"wait": 0
},
"time": 2000,
"response": {
"status": 200,
"cookies": [],
"statusText": "OK",
"content": {
"mimeType": "",
"compression": 0,
"size": 7
},
"headers": [
{
"name": "content-length",
"value": "7"
},
{
"name": "header-response",
"value": "svalue"
}
],
"headersSize": 44,
"redirectURL": "",
"httpVersion": "HTTP/1.1",
"bodySize": 7
}
}
]
}
}
}
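The fixture above is ordinary JSON, so it can be sanity-checked outside the test suite; a quick sketch, assuming the file lives at test/data/har_extractor.har relative to the repository root:

import json

with open("test/data/har_extractor.har") as fp:
    log = json.load(fp)["test_response"]["log"]

entry = log["entries"][0]
# Per the HAR spec, "time" is the sum of all non-negative timings:
# connect (1000) + ssl (1000) + send/wait/receive (0) == 2000.
assert entry["time"] == sum(v for v in entry["timings"].values() if v > -1)
print(entry["request"]["url"])  # http://address:22/path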

test/test_har_extractor.py (new file)
@@ -0,0 +1,37 @@
import json
import netlib.tutils
from . import tutils
from examples import har_extractor


class Context(object):
    pass

trequest = netlib.tutils.treq(
    timestamp_start=746203272,
    timestamp_end=746203272,
)
tresponse = netlib.tutils.tresp(
    timestamp_start=746203272,
    timestamp_end=746203272,
)


def test_start():
    tutils.raises(ValueError, har_extractor.start, Context(), [])


def test_response():
    ctx = Context()
    ctx.HARLog = har_extractor._HARLog([])
    ctx.seen_server = set()
    fl = tutils.tflow(req=trequest, resp=tresponse)
    har_extractor.response(ctx, fl)

    with open(tutils.test_data.path("data/har_extractor.har")) as fp:
        test_data = json.load(fp)
        assert json.loads(ctx.HARLog.json()) == test_data["test_response"]
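A closing note on the magic number 746203272: it is a plain epoch timestamp, and it decodes to exactly the startedDateTime strings in the fixture above, which is what keeps the test flow and the .har file in sync:

from datetime import datetime

print(datetime.utcfromtimestamp(746203272).isoformat())
# 1993-08-24T14:41:12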