mitmproxy/examples/har_dump.py
Slobodan Mišković 39d7ba852c Include boundary=... in multipart postData
The HAR spec is not very explicit here, and its only example for `postData` shows just this:
```json
"postData": {
    "mimeType": "multipart/form-data"
}
```
Would it not make sense to include all the information necessary to parse out the post data `text`? E.g.
```json
"postData": {
    "text": "--xYzZY\r\nContent-Disposition: form-data; name=\"sort1\"\r\n\r\noldest date first\r\n--xYzZY--\r\n",
    "mimeType": "multipart/form-data; boundary=xYzZY"
},
```
Currently, the full mimeType is included only in the `Content-Type` request header.

Elsewhere in the HAR spec such 'extras' are included, e.g.
```json
"content": {
    "mimeType": "text/html; charset=utf-8"
}
```
So one could argue that `mimeType` should include all the information necessary to interpret the data. In the case of `multipart/form-data`, RFC 2046 (http://www.ietf.org/rfc/rfc2046.txt) states:
```
The Content-Type field for multipart entities requires one parameter, "boundary".
```
I believe earlier incarnations, e.g. `har_exporter.py`, included it in the mimeType.
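
For illustration, here is a minimal sketch (not part of this change, standard library only) of how a HAR consumer could recover the boundary once the full header value is carried in `mimeType`:
```python
from email.message import EmailMessage

# Assuming postData["mimeType"] carries the full header value, e.g.
# "multipart/form-data; boundary=xYzZY", the boundary needed to split
# postData["text"] into its parts can be parsed back out:
msg = EmailMessage()
msg["Content-Type"] = "multipart/form-data; boundary=xYzZY"
assert msg.get_boundary() == "xYzZY"
```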
2016-10-24 14:34:04 -07:00


"""
This inline script can be used to dump flows as HAR files.
"""
import json
import sys
import base64
import zlib
from datetime import datetime

import pytz

import mitmproxy
from mitmproxy import version
from mitmproxy.utils import strutils
from mitmproxy.net.http import cookies

HAR = {}

# A list of servers seen till now is maintained so we can avoid
# using 'connect' time for entries that use an existing connection.
SERVERS_SEEN = set()


def start():
    """
    Called once on script startup before any other events.
    """
    if len(sys.argv) != 2:
        raise ValueError(
            'Usage: -s "har_dump.py filename" '
            '(- will output to stdout, filenames ending with .zhar '
            'will result in compressed har)'
        )

    HAR.update({
        "log": {
            "version": "1.2",
            "creator": {
                "name": "mitmproxy har_dump",
                "version": "0.1",
                "comment": "mitmproxy version %s" % version.MITMPROXY
            },
            "entries": []
        }
    })


def response(flow):
    """
    Called when a server response has been received.
    """
    # -1 indicates that these values do not apply to current request
    ssl_time = -1
    connect_time = -1

    if flow.server_conn and flow.server_conn not in SERVERS_SEEN:
        connect_time = (flow.server_conn.timestamp_tcp_setup -
                        flow.server_conn.timestamp_start)

        if flow.server_conn.timestamp_ssl_setup is not None:
            ssl_time = (flow.server_conn.timestamp_ssl_setup -
                        flow.server_conn.timestamp_tcp_setup)

        SERVERS_SEEN.add(flow.server_conn)

    # Calculate raw timings from timestamps. DNS timings can not be calculated
    # for lack of a way to measure it. The same goes for HAR blocked.
    # mitmproxy will open a server connection as soon as it receives the host
    # and port from the client connection. So, the time spent waiting is actually
    # spent waiting between request.timestamp_end and response.timestamp_start
    # thus it correlates to HAR wait instead.
    timings_raw = {
        'send': flow.request.timestamp_end - flow.request.timestamp_start,
        'receive': flow.response.timestamp_end - flow.response.timestamp_start,
        'wait': flow.response.timestamp_start - flow.request.timestamp_end,
        'connect': connect_time,
        'ssl': ssl_time,
    }

    # HAR timings are integers in ms, so we re-encode the raw timings to that format.
    timings = dict([(k, int(1000 * v)) for k, v in timings_raw.items()])

    # full_time is the sum of all timings.
    # Timings set to -1 will be ignored as per spec.
    full_time = sum(v for v in timings.values() if v > -1)

    started_date_time = format_datetime(datetime.utcfromtimestamp(flow.request.timestamp_start))

    # Response body size and encoding
    response_body_size = len(flow.response.raw_content)
    response_body_decoded_size = len(flow.response.content)
    response_body_compression = response_body_decoded_size - response_body_size

    entry = {
        "startedDateTime": started_date_time,
        "time": full_time,
        "request": {
            "method": flow.request.method,
            "url": flow.request.url,
            "httpVersion": flow.request.http_version,
            "cookies": format_request_cookies(flow.request.cookies.fields),
            "headers": name_value(flow.request.headers),
            "queryString": name_value(flow.request.query or {}),
            "headersSize": len(str(flow.request.headers)),
            "bodySize": len(flow.request.content),
        },
        "response": {
            "status": flow.response.status_code,
            "statusText": flow.response.reason,
            "httpVersion": flow.response.http_version,
            "cookies": format_response_cookies(flow.response.cookies.fields),
            "headers": name_value(flow.response.headers),
            "content": {
                "size": response_body_size,
                "compression": response_body_compression,
                "mimeType": flow.response.headers.get('Content-Type', '')
            },
            "redirectURL": flow.response.headers.get('Location', ''),
            "headersSize": len(str(flow.response.headers)),
            "bodySize": response_body_size,
        },
        "cache": {},
        "timings": timings,
    }

    # Store binary data as base64
    if strutils.is_mostly_bin(flow.response.content):
        entry["response"]["content"]["text"] = base64.b64encode(flow.response.content).decode()
        entry["response"]["content"]["encoding"] = "base64"
    else:
        entry["response"]["content"]["text"] = flow.response.get_text(strict=False)

    if flow.request.method in ["POST", "PUT", "PATCH"]:
        params = [
            {"name": a.decode("utf8", "surrogateescape"), "value": b.decode("utf8", "surrogateescape")}
            for a, b in flow.request.urlencoded_form.items(multi=True)
        ]
        entry["request"]["postData"] = {
            "mimeType": flow.request.headers.get("Content-Type", ""),
            "text": flow.request.get_text(strict=False),
            "params": params
        }

    if flow.server_conn:
        entry["serverIPAddress"] = str(flow.server_conn.ip_address.address[0])

    HAR["log"]["entries"].append(entry)


def done():
    """
    Called once on script shutdown, after any other events.
    """
    dump_file = sys.argv[1]

    json_dump = json.dumps(HAR, indent=2)  # type: str

    if dump_file == '-':
        mitmproxy.ctx.log(json_dump)
    else:
        raw = json_dump.encode()  # type: bytes
        if dump_file.endswith('.zhar'):
            raw = zlib.compress(raw, 9)
        with open(dump_file, "wb") as f:
            f.write(raw)
        mitmproxy.ctx.log("HAR dump finished (wrote %s bytes to file)" % len(json_dump))


def format_datetime(dt):
    return dt.replace(tzinfo=pytz.timezone("UTC")).isoformat()


def format_cookies(cookie_list):
    rv = []

    for name, value, attrs in cookie_list:
        cookie_har = {
            "name": name,
            "value": value,
        }

        # HAR only needs some attributes
        for key in ["path", "domain", "comment"]:
            if key in attrs:
                cookie_har[key] = attrs[key]

        # These keys need to be boolean!
        for key in ["httpOnly", "secure"]:
            cookie_har[key] = bool(key in attrs)

        # Expiration time needs to be formatted
        expire_ts = cookies.get_expiration_ts(attrs)
        if expire_ts is not None:
            cookie_har["expires"] = format_datetime(datetime.fromtimestamp(expire_ts))

        rv.append(cookie_har)

    return rv


def format_request_cookies(fields):
    return format_cookies(cookies.group_cookies(fields))


def format_response_cookies(fields):
    return format_cookies((c[0], c[1].value, c[1].attrs) for c in fields)


def name_value(obj):
    """
    Convert (key, value) pairs to HAR format.
    """
    return [{"name": k, "value": v} for k, v in obj.items()]