Clean up har_extractor example

- Fix spacing, line length, unused imports, unusual import idioms
- Prevent it from barfing into our test output
2025-01-30 23:09:44 +00:00 · 2015-01-02 13:41:40 +13:00 · 2015-01-02 13:41:40 +13:00 · bb5fb2dbe0
commit bb5fb2dbe0
parent 1b5f5021dc
1 changed files with 76 additions and 54 deletions
--- a/examples/har_extractor.py
+++ b/examples/har_extractor.py
@@ -1,22 +1,18 @@
 """
-    This inline script utilizes harparser.HAR from https://github.com/JustusW/harparser
-    to generate a HAR log object.
-"""
-try:
-    from harparser import HAR
-    from pytz import UTC
-except ImportError as e:
-    import sys
-    print >> sys.stderr, "\r\nMissing dependencies: please run `pip install mitmproxy[examples]`.\r\n"
-    raise

-from datetime import datetime, timedelta, tzinfo
+    This inline script utilizes harparser.HAR from
+    https://github.com/JustusW/harparser to generate a HAR log object.
+"""
+from harparser import HAR
+
+from datetime import datetime


 class _HARLog(HAR.log):
-    # The attributes need to be registered here for them to actually be available later via self. This is
-    # due to HAREncodable linking __getattr__ to __getitem__. Anything that is set only in __init__ will
-    # just be added as key/value pair to self.__classes__.
+    # The attributes need to be registered here for them to actually be
+    # available later via self. This is due to HAREncodable linking __getattr__
+    # to __getitem__. Anything that is set only in __init__ will just be added
+    # as key/value pair to self.__classes__.
    __page_list__ = []
    __page_count__ = 0
    __page_ref__ = {}
@@ -58,55 +54,66 @@ class _HARLog(HAR.log):

 def start(context, argv):
    """
-        On start we create a HARLog instance. You will have to adapt this to suit your actual needs
-        of HAR generation. As it will probably be necessary to cluster logs by IPs or reset them
-        from time to time.
+        On start we create a HARLog instance. You will have to adapt this to
+        suit your actual needs of HAR generation. As it will probably be
+        necessary to cluster logs by IPs or reset them from time to time.
    """
    context.dump_file = None
    if len(argv) > 1:
        context.dump_file = argv[1]
    else:
-        raise ValueError('Usage: -s "har_extractor.py filename" '
-                         '(- will output to stdout, filenames ending with .zhar will result in compressed har)')
+        raise ValueError(
+            'Usage: -s "har_extractor.py filename" '
+            '(- will output to stdout, filenames ending with .zhar '
+            'will result in compressed har)'
+        )
    context.HARLog = _HARLog(['https://github.com'])
    context.seen_server = set()


 def response(context, flow):
    """
-       Called when a server response has been received. At the time of this message both
-       a request and a response are present and completely done.
+       Called when a server response has been received. At the time of this
+       message both a request and a response are present and completely done.
    """
    # Values are converted from float seconds to int milliseconds later.
    ssl_time = -.001
    connect_time = -.001
    if flow.server_conn not in context.seen_server:
-        # Calculate the connect_time for this server_conn. Afterwards add it to seen list, in
-        # order to avoid the connect_time being present in entries that use an existing connection.
+        # Calculate the connect_time for this server_conn. Afterwards add it to
+        # seen list, in order to avoid the connect_time being present in entries
+        # that use an existing connection.
        connect_time = flow.server_conn.timestamp_tcp_setup - flow.server_conn.timestamp_start
        context.seen_server.add(flow.server_conn)

        if flow.server_conn.timestamp_ssl_setup is not None:
-            # Get the ssl_time for this server_conn as the difference between the start of the successful
-            # tcp setup and the successful ssl setup. If  no ssl setup has been made it is left as -1 since
-            # it doesn't apply to this connection.
+            # Get the ssl_time for this server_conn as the difference between
+            # the start of the successful tcp setup and the successful ssl
+            # setup. If  no ssl setup has been made it is left as -1 since it
+            # doesn't apply to this connection.
            ssl_time = flow.server_conn.timestamp_ssl_setup - flow.server_conn.timestamp_tcp_setup

-    # Calculate the raw timings from the different timestamps present in the request and response object.
-    # For lack of a way to measure it dns timings can not be calculated. The same goes for HAR blocked:
-    # MITMProxy will open a server connection as soon as it receives the host and port from the client
-    # connection. So the time spent waiting is actually spent waiting between request.timestamp_end and
-    # response.timestamp_start thus it correlates to HAR wait instead.
-    timings_raw = {'send': flow.request.timestamp_end - flow.request.timestamp_start,
-                   'wait': flow.response.timestamp_start - flow.request.timestamp_end,
-                   'receive': flow.response.timestamp_end - flow.response.timestamp_start,
-                   'connect': connect_time,
-                   'ssl': ssl_time}
+    # Calculate the raw timings from the different timestamps present in the
+    # request and response object. For lack of a way to measure it dns timings
+    # can not be calculated. The same goes for HAR blocked: MITMProxy will open
+    # a server connection as soon as it receives the host and port from the
+    # client connection. So the time spent waiting is actually spent waiting
+    # between request.timestamp_end and response.timestamp_start thus it
+    # correlates to HAR wait instead.
+    timings_raw = {
+        'send': flow.request.timestamp_end - flow.request.timestamp_start,
+        'wait': flow.response.timestamp_start - flow.request.timestamp_end,
+        'receive': flow.response.timestamp_end - flow.response.timestamp_start,
+        'connect': connect_time,
+        'ssl': ssl_time
+    }

-    # HAR timings are integers in ms, so we have to re-encode the raw timings to that format.
+    # HAR timings are integers in ms, so we have to re-encode the raw timings to
+    # that format.
    timings = dict([(key, int(1000 * value)) for key, value in timings_raw.iteritems()])

-    # The full_time is the sum of all timings. Timings set to -1 will be ignored as per spec.
+    # The full_time is the sum of all timings. Timings set to -1 will be ignored
+    # as per spec.
    full_time = 0
    for item in timings.values():
        if item > -1:
@@ -157,21 +164,30 @@ def response(context, flow):
                         "cache": {},
                         "timings": timings, })

-    # If the current url is in the page list of context.HARLog or does not have a referrer we add it as a new
-    # pages object.
+    # If the current url is in the page list of context.HARLog or does not have
+    # a referrer we add it as a new pages object.
    if flow.request.url in context.HARLog.get_page_list() or flow.request.headers.get('Referer', None) is None:
        page_id = context.HARLog.create_page_id()
-        context.HARLog.add(HAR.pages({"startedDateTime": entry['startedDateTime'],
-                                      "id": page_id,
-                                      "title": flow.request.url, }))
+        context.HARLog.add(
+            HAR.pages({
+                "startedDateTime": entry['startedDateTime'],
+                "id": page_id,
+                "title": flow.request.url,
+            })
+        )
        context.HARLog.set_page_ref(flow.request.url, page_id)
        entry['pageref'] = page_id

-    # Lookup the referer in the page_ref of context.HARLog to point this entries pageref attribute to the right
-    # pages object, then set it as a new reference to build a reference tree.
+    # Lookup the referer in the page_ref of context.HARLog to point this entries
+    # pageref attribute to the right pages object, then set it as a new
+    # reference to build a reference tree.
    elif context.HARLog.get_page_ref(flow.request.headers.get('Referer', (None, ))[0]) is not None:
-        entry['pageref'] = context.HARLog.get_page_ref(flow.request.headers['Referer'][0])
-        context.HARLog.set_page_ref(flow.request.headers['Referer'][0], entry['pageref'])
+        entry['pageref'] = context.HARLog.get_page_ref(
+            flow.request.headers['Referer'][0]
+        )
+        context.HARLog.set_page_ref(
+            flow.request.headers['Referer'][0], entry['pageref']
+        )

    context.HARLog.add(entry)

@@ -186,22 +202,28 @@ def done(context):
    json_dump = context.HARLog.json()
    compressed_json_dump = context.HARLog.compress()

-    print "=" * 100
    if context.dump_file == '-':
-        pprint(json.loads(json_dump))
+        context.log(pprint.pformat(json.loads(json_dump)))
    elif context.dump_file.endswith('.zhar'):
        file(context.dump_file, "w").write(compressed_json_dump)
    else:
        file(context.dump_file, "w").write(json_dump)
-    print "=" * 100
-    print "HAR log finished with %s bytes (%s bytes compressed)" % (len(json_dump), len(compressed_json_dump))
-    print "Compression rate is %s%%" % str(100. * len(compressed_json_dump) / len(json_dump))
-    print "=" * 100
+    context.log(
+        "HAR log finished with %s bytes (%s bytes compressed)" % (
+            len(json_dump), len(compressed_json_dump)
+        )
+    )
+    context.log(
+        "Compression rate is %s%%" % str(
+            100. * len(compressed_json_dump) / len(json_dump)
+        )
+    )


 def print_attributes(obj, filter_string=None, hide_privates=False):
    """
-        Useful helper method to quickly get all attributes of an object and its values.
+        Useful helper method to quickly get all attributes of an object and its
+        values.
    """
    for attr in dir(obj):
        if hide_privates and "__" in attr: