mitmproxy/libmproxy/utils.py

from __future__ import absolute_import
import os, datetime, urllib, re
import time, functools, cgi
import json

def timestamp():
    """
        Return the current time as a float number of seconds since the
        epoch, exactly as reported by time.time(). Being a plain float, the
        value can be serialized directly.
    """
    now = time.time()
    return now


def format_timestamp(s):
    """
        Render s, a timestamp in seconds since the epoch, as a local-time
        string of the form "YYYY-MM-DD HH:MM:SS". The output format has no
        fractional field, so sub-second precision is not shown.
    """
    # Round-trip through a struct_time before building the datetime.
    local_tuple = time.localtime(s)
    epoch_seconds = time.mktime(local_tuple)
    moment = datetime.datetime.fromtimestamp(epoch_seconds)
    return moment.strftime("%Y-%m-%d %H:%M:%S")


def format_timestamp_with_milli(s):
    """
        Render s, a timestamp in seconds since the epoch, as a local-time
        string of the form "YYYY-MM-DD HH:MM:SS.mmm".
    """
    moment = datetime.datetime.fromtimestamp(s)
    with_micros = moment.strftime("%Y-%m-%d %H:%M:%S.%f")
    # %f yields six digits; dropping the last three leaves milliseconds
    # (truncated, not rounded).
    return with_micros[:-3]


def isBin(s):
    """
        Return True if s contains at least one character whose code point
        is below 9, between 14 and 31 inclusive, or above 126. Code points
        9-13 (tab through carriage return) and 32-126 (printable ASCII) are
        accepted. An empty string yields False.
    """
    codes = (ord(ch) for ch in s)
    return any(c < 9 or 13 < c < 32 or c > 126 for c in codes)


def isXML(s):
    """
        Cheap heuristic: does s look like XML? Leading newlines, spaces and
        tabs are skipped, and the result is True exactly when the first
        remaining character is "<".

        Returns False for empty or whitespace-only input. Only as much of s
        as needed to find the first significant character is examined.
    """
    for ch in s:
        if ch in "\n \t":
            continue
        # The first significant character decides the answer.
        return ch == "<"
    # Nothing but skipped whitespace (or nothing at all).
    return False


def pretty_json(s):
    """
        Re-serialize the JSON document in s with sorted keys and an indent
        of 4, and return the result as a list of lines. Returns None if
        json.loads rejects s with a ValueError.
    """
    try:
        parsed = json.loads(s)
    except ValueError:
        return None
    formatted = json.dumps(parsed, sort_keys=True, indent=4)
    return formatted.split("\n")


def urldecode(s):
    """
        Takes a urlencoded string and returns a list of (key, value) tuples.

        Keys with blank values are kept (keep_blank_values=True) and appear
        with an empty-string value.
    """
    # cgi.parse_qsl is a deprecated alias (since Python 2.6) of
    # urlparse.parse_qsl, and is gone from later Python versions. Resolve
    # the real implementation from wherever this interpreter provides it.
    try:
        from urlparse import parse_qsl
    except ImportError:
        from urllib.parse import parse_qsl
    return parse_qsl(s, keep_blank_values=True)


def urlencode(s):
    """
        Takes a list of (key, value) tuples and returns a urlencoded string.

        Each item of s is converted to a tuple first, so two-element lists
        are accepted as well as tuples.
    """
    # urlencode lives in urllib on Python 2 and in urllib.parse on Python 3;
    # use whichever this interpreter provides.
    try:
        from urllib import urlencode as _urlencode
    except ImportError:
        from urllib.parse import urlencode as _urlencode
    pairs = [tuple(i) for i in s]
    # doseq=False: a sequence value is encoded via str(), not expanded into
    # repeated keys.
    return _urlencode(pairs, doseq=False)


def multipartdecode(hdrs, content):
    """
        Takes a multipart boundary encoded string and returns list of (key,
        value) tuples.

        hdrs: a headers object; only hdrs.get_first("content-type") is used.
        content: the multipart body to decode.

        Returns an empty list when there is no content-type header, when it
        cannot be parsed by parse_content_type, or when it carries no
        boundary parameter.

        A part is skipped when the line after its delimiter has no
        name="..." attribute, or when no blank line terminates its headers.

        NOTE: a value is rebuilt by joining its lines with "", so line
        breaks inside a multi-line value are not preserved.
    """
    ctype = hdrs.get_first("content-type")
    if not ctype:
        return []
    parsed = parse_content_type(ctype)
    if not parsed:
        return []
    boundary = parsed[2].get("boundary")
    if not boundary:
        return []

    rx = re.compile(r'\bname="([^"]+)"')
    r = []

    for chunk in content.split("--" + boundary):
        parts = chunk.splitlines()
        # parts[0] is whatever trailed the delimiter on its own line; a
        # leading "--" there marks the closing delimiter.
        if len(parts) < 2 or parts[0][0:2] == "--":
            continue
        match = rx.search(parts[1])
        if not match:
            continue
        try:
            # First blank line at or after index 2 ends the part's headers.
            blank = parts.index("", 2)
        except ValueError:
            # Malformed part with no header/body separator: skip it rather
            # than letting the ValueError escape to the caller.
            continue
        r.append((match.group(1), "".join(parts[blank + 1:])))
    return r


def pretty_duration(secs):
    """
        Format a duration given in seconds as a short string. The precision
        shrinks as the value grows:

            >= 100: whole seconds, e.g. "151s"
            >= 10:  one decimal, e.g. "12.3s"
            >= 1:   two decimals, e.g. "1.00s"
            else:   whole milliseconds, e.g. "500ms"
    """
    if secs >= 100:
        return "{:.0f}s".format(secs)
    if secs >= 10:
        return "{:2.1f}s".format(secs)
    if secs >= 1:
        return "{:1.2f}s".format(secs)
    # Below one second: report milliseconds instead.
    millis = secs*1000
    return "{:.0f}ms".format(millis)


class Data(object):
    """
        Resolves paths to data files stored next to an importable module.

        dirname holds the absolute directory of the module object returned
        by __import__(name).
    """
    def __init__(self, name):
        # NB: for a dotted name, __import__ returns the top-level package,
        # so dirname is then that package's directory.
        m = __import__(name)
        self.dirname = os.path.abspath(os.path.dirname(m.__file__))

    def path(self, path):
        """
            Returns a path to the package data housed at 'path' under this
            module. Path can be a path to a file, or to a directory.

            This function will raise ValueError if the path does not exist.
        """
        fullpath = os.path.join(self.dirname, path)
        if not os.path.exists(fullpath):
            # Call form of raise: the "raise E, msg" statement form is
            # Python-2-only syntax.
            raise ValueError("dataPath: %s does not exist." % fullpath)
        return fullpath
# Module-level Data instance, constructed from this module's own __name__.
pkg_data = Data(__name__)


class LRUCache(object):
    """
        A simple LRU cache for generated values.

        At most `size` entries are kept; when a new entry pushes the cache
        over that limit, the least recently used entry is discarded.
    """
    def __init__(self, size=100):
        self.size = size
        # Maps each argument tuple to the value generated for it.
        self.cache = {}
        # The same argument tuples, most recently used first.
        self.cacheList = []

    def get(self, gen, *args):
        """
            gen: A (presumably expensive) generator function. The identity of
            gen is NOT taken into account by the cache.
            *args: A list of immutable arguments, used to establish identity
            by the cache, and passed to gen to generate values.

            Returns the cached value for args if there is one, otherwise the
            result of gen(*args), which is then cached.
        """
        if args in self.cache:
            # Hit: move the key to the front to mark it most recently used.
            self.cacheList.remove(args)
            self.cacheList.insert(0, args)
            return self.cache[args]

        # Miss: generate, record as most recent, then evict the least
        # recently used entry if the cache has outgrown its size.
        ret = gen(*args)
        self.cacheList.insert(0, args)
        self.cache[args] = ret
        if len(self.cacheList) > self.size:
            evicted = self.cacheList.pop()
            self.cache.pop(evicted)
        return ret


def parse_content_type(c):
    """
        Minimal content-type parser. Splits a header value into a
        (type, subtype, parameters) tuple: type and subtype are lower-cased
        strings, parameters is a dict of the key=value clauses following
        the first ";". Returns None if the value has no "/" before the
        first ";".

        For example:

            text/html; charset=UTF-8

        yields:

            ("text", "html", {"charset": "UTF-8"})

        Clauses without an "=" are ignored. Parameter names and values are
        whitespace-stripped but otherwise left as written.
    """
    mimetype, _, paramstr = c.partition(";")
    maintype, slash, subtype = mimetype.partition("/")
    if not slash:
        return None
    params = {}
    for clause in paramstr.split(";"):
        key, equals, value = clause.partition("=")
        if equals:
            params[key.strip()] = value.strip()
    return maintype.lower(), subtype.lower(), params


def hostport(scheme, host, port):
    """
        Return host on its own when port is the standard one for scheme
        (80 for "http", 443 for "https"); otherwise return "host:port".
    """
    uses_default_port = (
        (port == 80 and scheme == "http") or
        (port == 443 and scheme == "https")
    )
    if uses_default_port:
        return host
    return "%s:%s"%(host, port)


def unparse_url(scheme, host, port, path=""):
    """
        Build a URL string of the form scheme://host[:port]path from its
        components. The host[:port] part is produced by hostport().
    """
    authority = hostport(scheme, host, port)
    return "%s://%s%s"%(scheme, authority, path)


def clean_hanging_newline(t):
    """
        Strip a single trailing newline from t, if there is one.

        Editors frequently append a newline to the last line of a document
        without being asked. Removing it undoes that, at the cost of also
        removing a final newline in the rare case the user meant to have
        one. Empty or None input is returned unchanged.
    """
    if not t:
        return t
    return t[:-1] if t[-1] == "\n" else t


def parse_size(s):
    """
        Parses a size specification. Valid specifications are:

            123: bytes
            123k: kilobytes
            123m: megabytes
            123g: gigabytes

        The unit suffix is case-insensitive and units are powers of 1024.

        Returns the size in bytes as an int, or None if s is empty or None.
        Raises ValueError, quoting the specification as given, if the
        numeric part is not a valid integer.
    """
    if not s:
        return None
    multipliers = {
        "k": 1024**1,
        "m": 1024**2,
        "g": 1024**3,
    }
    suffix = s[-1].lower()
    mult = multipliers.get(suffix, 1)
    # Only strip the last character when it really is a unit suffix.
    digits = s[:-1] if suffix in multipliers else s
    try:
        return int(digits) * mult
    except ValueError:
        # Report the caller's original input, not the suffix-stripped form.
        raise ValueError("Invalid size specification: %s"%s)


def safe_subn(pattern, repl, target, *args, **kwargs):
    """
        Wrapper around re.subn that passes pattern and repl through str()
        first, to smooth over Unicode conversion problems in re.subn. Any
        extra positional and keyword arguments are forwarded unchanged.

        This is a stop-gap; a proper solution would be aware of the actual
        content encoding.
    """
    pattern_str = str(pattern)
    repl_str = str(repl)
    return re.subn(pattern_str, repl_str, target, *args, **kwargs)
always use relative imports, fix status bar bug 2014-03-10 21:36:47 +00:00			`from __future__ import absolute_import`
Silence pyflakes And pick up one bug as a consequence. 2014-01-19 05:16:24 +00:00			`import os, datetime, urllib, re`
Minor cruft removal. 2012-04-08 23:42:56 +00:00			`import time, functools, cgi`
JSON pretty-printing. Also rename the display modes ("pretty" instead of "indent"), and expand the built-in documentation. 2011-06-30 01:27:27 +00:00			`import json`
Store timestamps on flow components as a UTC time tuple. Format is: (tm_year,tm_mon,tm_mday,tm_hour,tm_min, tm_sec,tm_wday,tm_yday,tm_isdst) 2011-03-07 00:46:02 +00:00
			`def timestamp():`
Think harder about timestamps. Just save seconds since the epoch as a float. 2011-03-08 23:18:08 +00:00			`"""`
			`Returns a serializable UTC timestamp.`
			`"""`
			`return time.time()`
Store timestamps on flow components as a UTC time tuple. Format is: (tm_year,tm_mon,tm_mday,tm_hour,tm_min, tm_sec,tm_wday,tm_yday,tm_isdst) 2011-03-07 00:46:02 +00:00
Initial checkin. 2010-02-16 04:09:07 +00:00
removed unused parameter 2015-03-03 20:49:47 +00:00			`def format_timestamp(s):`
fixed formatting and added a 'test' (sort of) 2015-03-03 02:22:44 +00:00			`s = time.localtime(s)`
			`d = datetime.datetime.fromtimestamp(time.mktime(s))`
			`return d.strftime("%Y-%m-%d %H:%M:%S")`
Add timestamps to flows. For now, these are only displayed on the connection view screen, with second granularity. 2011-02-03 00:30:47 +00:00
Minor refactor to PR #496 2015-03-04 17:02:01 +00:00
fixed formatting and added a 'test' (sort of) 2015-03-03 02:22:44 +00:00			`def format_timestamp_with_milli(s):`
			`d = datetime.datetime.fromtimestamp(s)`
Minor refactor to PR #496 2015-03-04 17:02:01 +00:00			`return d.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]`


Initial checkin. 2010-02-16 04:09:07 +00:00			`def isBin(s):`
			`"""`
			`Does this string have any non-ASCII characters?`
			`"""`
			`for i in s:`
			`i = ord(i)`
			`if i < 9:`
			`return True`
			`elif i > 13 and i < 32:`
			`return True`
			`elif i > 126:`
			`return True`
			`return False`


Try not to hang when user views large request & response bodies Two different strategies here: - Use a simple heuristic to detect if we're looking at XML data when indent mode is used. On non-XML data we can hang even on small documents. - Only view partial data for large bodies. At the moment the cutoff is 100k. I might finetune this later. 2011-06-27 03:59:17 +00:00			`def isXML(s):`
			`for i in s:`
			`if i in "\n \t":`
			`continue`
			`elif i == "<":`
			`return True`
			`else:`
			`return False`


JSON pretty-printing. Also rename the display modes ("pretty" instead of "indent"), and expand the built-in documentation. 2011-06-30 01:27:27 +00:00			`def pretty_json(s):`
			`try:`
			`p = json.loads(s)`
			`except ValueError:`
			`return None`
			`return json.dumps(p, sort_keys=True, indent=4).split("\n")`


Add a pretty-printing mode for urlencoded form data. 2011-07-15 04:16:43 +00:00			`def urldecode(s):`
Add get_query and set_query methods to Request. 2012-02-09 03:40:31 +00:00			`"""`
			`Takes a urlencoded string and returns a list of (key, value) tuples.`
			`"""`
Keep blank URL parameters TODO: This should probably be configurable 2013-02-11 02:22:25 +00:00			`return cgi.parse_qsl(s, keep_blank_values=True)`
Add a pretty-printing mode for urlencoded form data. 2011-07-15 04:16:43 +00:00

Add get_query and set_query methods to Request. 2012-02-09 03:40:31 +00:00			`def urlencode(s):`
			`"""`
			`Takes a list of (key, value) tuples and returns a urlencoded string.`
			`"""`
Specialize GridEditor into a number of subclasses. 2012-03-18 21:12:06 +00:00			`s = [tuple(i) for i in s]`
Add get_query and set_query methods to Request. 2012-02-09 03:40:31 +00:00			`return urllib.urlencode(s, False)`


[#514] Add support for ignoring payload params in multipart/form-data 2015-03-10 09:44:06 +00:00			`def multipartdecode(hdrs, content):`
			`"""`
			`Takes a multipart boundary encoded string and returns list of (key, value) tuples.`
			`"""`
			`v = hdrs.get_first("content-type")`
			`if v:`
			`v = parse_content_type(v)`
			`if not v:`
			`return []`
			`boundary = v[2].get("boundary")`
			`if not boundary:`
			`return []`

			`rx = re.compile(r'\bname="([^"]+)"')`
			`r = []`

			`for i in content.split("--" + boundary):`
			`parts = i.splitlines()`
			`if len(parts) > 1 and parts[0][0:2] != "--":`
			`match = rx.search(parts[1])`
			`if match:`
			`key = match.group(1)`
			`value = "".join(parts[3+parts[2:].index(""):])`
			`r.append((key, value))`
			`return r`
			`return []`

Initial checkin. 2010-02-16 04:09:07 +00:00
added to flowlist / flowdetail time elapsed between request sent and response received 2015-02-26 21:14:20 +00:00			`def pretty_duration(secs):`
			`formatters = [`
Minor refactor to PR #496 2015-03-04 17:02:01 +00:00			`(100, "{:.0f}s"),`
			`(10, "{:2.1f}s"),`
			`(1, "{:1.2f}s"),`
added to flowlist / flowdetail time elapsed between request sent and response received 2015-02-26 21:14:20 +00:00			`]`

			`for limit, formatter in formatters:`
			`if secs >= limit:`
			`return formatter.format(secs)`
Minor refactor to PR #496 2015-03-04 17:02:01 +00:00			`#less than 1 sec`
added to flowlist / flowdetail time elapsed between request sent and response received 2015-02-26 21:14:20 +00:00			`return "{:.0f}ms".format(secs*1000)`
Initial checkin. 2010-02-16 04:09:07 +00:00
pretty_size now lives in netlib.utils 2015-04-30 00:18:01 +00:00
Initial checkin. 2010-02-16 04:09:07 +00:00			`class Data:`
			`def __init__(self, name):`
			`m = __import__(name)`
			`dirname, _ = os.path.split(m.__file__)`
			`self.dirname = os.path.abspath(dirname)`

			`def path(self, path):`
			`"""`
			`Returns a path to the package data housed at 'path' under this`
			`module.Path can be a path to a file, or to a directory.`

			`This function will raise ValueError if the path does not exist.`
			`"""`
			`fullpath = os.path.join(self.dirname, path)`
			`if not os.path.exists(fullpath):`
			`raise ValueError, "dataPath: %s does not exist."%fullpath`
			`return fullpath`
General cleanup. Cut out unused variables and code, generally shut up pychecker as much as is reasonable. 2011-08-02 04:14:33 +00:00			`pkg_data = Data(__name__)`
Initial checkin. 2010-02-16 04:09:07 +00:00

Improve responsiveness of request and response viewing. - Computing the view of a large body is expensive, so we introduce an LRU cache to hold the latest 20 results. - Use ListView more correctly, passing it individual urwid.Text snippets, rather than a single large one. This hugely improves render time. 2011-03-15 00:05:33 +00:00			`class LRUCache:`
			`"""`
Replace far-too-clever decorator LRU cache with something simpler 2015-03-22 08:00:41 +00:00			`A simple LRU cache for generated values.`
Improve responsiveness of request and response viewing. - Computing the view of a large body is expensive, so we introduce an LRU cache to hold the latest 20 results. - Use ListView more correctly, passing it individual urwid.Text snippets, rather than a single large one. This hugely improves render time. 2011-03-15 00:05:33 +00:00			`"""`
			`def __init__(self, size=100):`
			`self.size = size`
Replace far-too-clever decorator LRU cache with something simpler 2015-03-22 08:00:41 +00:00			`self.cache = {}`
			`self.cacheList = []`

			`def get(self, gen, *args):`
			`"""`
			`gen: A (presumably expensive) generator function. The identity of`
			`gen is NOT taken into account by the cache.`
			`*args: A list of immutable arguments, used to establish identiy by`
			`*the cache, and passed to gen to generate values.`
			`"""`
			`if self.cache.has_key(args):`
			`self.cacheList.remove(args)`
			`self.cacheList.insert(0, args)`
			`return self.cache[args]`
			`else:`
			`ret = gen(*args)`
			`self.cacheList.insert(0, args)`
			`self.cache[args] = ret`
			`if len(self.cacheList) > self.size:`
			`d = self.cacheList.pop()`
			`self.cache.pop(d)`
			`return ret`
Improve responsiveness of request and response viewing. - Computing the view of a large body is expensive, so we introduce an LRU cache to hold the latest 20 results. - Use ListView more correctly, passing it individual urwid.Text snippets, rather than a single large one. This hugely improves render time. 2011-03-15 00:05:33 +00:00
Move all HTTP objects to flow.py That's Request, Response, ClientConnect, ClientDisconnect, Error, and Headers. 2011-08-03 10:38:23 +00:00
Add a simple parser for content type specifications. 2012-03-19 21:31:07 +00:00			`def parse_content_type(c):`
			`"""`
			`A simple parser for content-type values. Returns a (type, subtype,`
			`parameters) tuple, where type and subtype are strings, and parameters`
			`is a dict. If the string could not be parsed, return None.`

			`E.g. the following string:`

			`text/html; charset=UTF-8`

			`Returns:`

			`("text", "html", {"charset": "UTF-8"})`
			`"""`
			`parts = c.split(";", 1)`
			`ts = parts[0].split("/", 1)`
			`if len(ts) != 2:`
			`return None`
			`d = {}`
			`if len(parts) == 2:`
			`for i in parts[1].split(";"):`
			`clause = i.split("=", 1)`
			`if len(clause) == 2:`
			`d[clause[0].strip()] = clause[1].strip()`
Refactor pretty view forcing somewhat. - Use a lookup table of content types -> view modes. - Add a urlencoded forcing. Remove "html" - at the moment it's the same as "xmlish". - Display type when forced. 2012-03-19 21:58:43 +00:00			`return ts[0].lower(), ts[1].lower(), d`
Add a simple parser for content type specifications. 2012-03-19 21:31:07 +00:00

Firm up reverse proxy specification. - Extract proxy spec parsing and unparsing functions. - Add a status indicator in mitmproxy. - Add the "R" keybinding for changing the reverse proxy from within mitmproxy. 2012-02-18 03:27:09 +00:00			`def hostport(scheme, host, port):`
			`"""`
			`Returns the host component, with a port specifcation if needed.`
			`"""`
			`if (port, scheme) in [(80, "http"), (443, "https")]:`
			`return host`
			`else:`
			`return "%s:%s"%(host, port)`


			`def unparse_url(scheme, host, port, path=""):`
			`"""`
			`Returns a URL string, constructed from the specified compnents.`
			`"""`
			`return "%s://%s%s"%(scheme, hostport(scheme, host, port), path)`


KVEditor: "e" shortcut spawns an external editor on a field. 2012-02-08 05:25:00 +00:00			`def clean_hanging_newline(t):`
			`"""`
			`Many editors will silently add a newline to the final line of a`
			`document (I'm looking at you, Vim). This function fixes this common`
			`problem at the risk of removing a hanging newline in the rare cases`
			`where the user actually intends it.`
			`"""`
Add an "r" shortcut in grid editors to read value from file. 2012-08-25 00:21:45 +00:00			`if t and t[-1] == "\n":`
KVEditor: "e" shortcut spawns an external editor on a field. 2012-02-08 05:25:00 +00:00			`return t[:-1]`
			`return t`


Add HTTP body size limit specification to command-line tools. 2011-09-09 03:27:31 +00:00			`def parse_size(s):`
			`"""`
			`Parses a size specification. Valid specifications are:`
Fix an issue caused by some editors when editing a request/response body. Many editors make it hard save a file without a terminating newline on the last line. When editing message bodies, this can cause problems. For now, I just strip the newlines off the end of the body when we return from an editor. 2012-01-20 23:43:00 +00:00
Add HTTP body size limit specification to command-line tools. 2011-09-09 03:27:31 +00:00			`123: bytes`
			`123k: kilobytes`
			`123m: megabytes`
			`123g: gigabytes`
			`"""`
			`if not s:`
			`return None`
			`mult = None`
			`if s[-1].lower() == "k":`
			`mult = 1024**1`
			`elif s[-1].lower() == "m":`
			`mult = 1024**2`
			`elif s[-1].lower() == "g":`
			`mult = 1024**3`

			`if mult:`
			`s = s[:-1]`
			`else:`
			`mult = 1`
			`try:`
			`return int(s) * mult`
			`except ValueError:`
			`raise ValueError("Invalid size specification: %s"%s)`
Fix a crashing bug when replacing text in a flow with unicode bodies. 2012-05-26 01:10:31 +00:00

			`def safe_subn(pattern, repl, target, args, *kwargs):`
			`"""`
			`There are Unicode conversion problems with re.subn. We try to smooth`
			`that over by casting the pattern and replacement to strings. We really`
			`need a better solution that is aware of the actual content ecoding.`
			`"""`
			`return re.subn(str(pattern), str(repl), target, args, *kwargs)`