# mitmproxy/libmproxy/utils.py

# Copyright (C) 2010  Aldo Cortesi
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
import os, datetime, urlparse, string, urllib, re
import time, functools, cgi
import json
from netlib import http

def timestamp():
    """
        Returns the current time as a float count of seconds since the
        epoch. The value is a plain number, so it can be serialized as-is.
    """
    now = time.time()
    return now


def format_timestamp(s):
    """
        Renders a timestamp (seconds since the epoch) as a
        "YYYY-MM-DD HH:MM:SS" string in local time.
    """
    local_tuple = time.localtime(s)
    # Round-trip through mktime to get back a value that
    # datetime.fromtimestamp accepts.
    seconds = time.mktime(local_tuple)
    moment = datetime.datetime.fromtimestamp(seconds)
    return moment.strftime("%Y-%m-%d %H:%M:%S")


def isBin(s):
    """
        Heuristic test for binary content. Returns True as soon as a
        character is found whose code is below 9, between 14 and 31
        inclusive, or above 126. Codes 9 to 13 (tab, newline and friends)
        and 32 to 126 are treated as text. Returns False otherwise,
        including for an empty string.
    """
    return any(
        code < 9 or 13 < code < 32 or code > 126
        for code in (ord(ch) for ch in s)
    )


def isXML(s):
    """
        Cheap heuristic for XML-ish data: skips leading newlines, spaces
        and tabs, then reports whether the first remaining character is
        "<".

        Always returns a bool. Empty or whitespace-only input yields False.
    """
    for ch in s:
        if ch in "\n \t":
            continue
        # The first non-whitespace character decides the answer.
        return ch == "<"
    # Nothing but whitespace (or nothing at all): not XML.
    return False


def pretty_json(s):
    """
        Re-serializes a JSON document with sorted keys and a 4-space
        indent, and returns the result as a list of lines. Returns None if
        the input cannot be parsed as JSON.
    """
    try:
        parsed = json.loads(s)
    except ValueError:
        lines = None
    else:
        rendered = json.dumps(parsed, sort_keys=True, indent=4)
        lines = rendered.split("\n")
    return lines


def urldecode(s):
    """
        Takes a urlencoded string and returns a list of (key, value) tuples.

        Parameters with blank values are kept in the result.
    """
    # cgi.parse_qsl is a deprecated alias; urlparse.parse_qsl is the
    # maintained implementation and takes the same arguments.
    return urlparse.parse_qsl(s, keep_blank_values=True)


def urlencode(s):
    """
        Takes a sequence of (key, value) pairs and returns the matching
        urlencoded string. Each pair is converted to a tuple first, so
        pairs may be given as lists.
    """
    pairs = list(map(tuple, s))
    return urllib.urlencode(pairs, False)


def del_all(dict, keys):
    """
        Removes every key listed in 'keys' from 'dict', in place. Keys
        that are not present are skipped silently.
    """
    for name in keys:
        if name not in dict:
            continue
        del dict[name]


def pretty_size(size):
    """
        Formats a byte count as a short human-readable string, e.g.
        "100B", "1.5kB", "3M" or "2G".

        The value is scaled to the first unit whose upper limit it does not
        reach, rounded to two decimals, and shown without a fractional
        part when it is a whole number. Sizes beyond the largest limit are
        expressed in the largest unit instead of returning None.
    """
    # (suffix, exclusive upper limit in bytes) in ascending order.
    suffixes = [
        ("B",   2**10),
        ("kB",  2**20),
        ("M",   2**30),
        ("G",   2**40),
    ]
    for suf, lim in suffixes:
        if size < lim:
            break
    # If no limit matched, suf/lim still hold the last (largest) unit, so
    # very large sizes are reported in that unit.
    x = round(size/float(lim//2**10), 2)
    if x == int(x):
        x = int(x)
    return str(x) + suf


class Data:
    """
        Locates data files that live alongside an importable module.
    """
    def __init__(self, name):
        """
            Records the absolute directory of the module imported as
            'name'.

            Note that __import__ returns the top-level package when given a
            dotted name, so for "pkg.mod" the directory is that of "pkg".
        """
        m = __import__(name)
        dirname, _ = os.path.split(m.__file__)
        self.dirname = os.path.abspath(dirname)

    def path(self, path):
        """
            Returns a path to the package data housed at 'path' under this
            module. Path can be a path to a file, or to a directory.

            This function will raise ValueError if the path does not exist.
        """
        fullpath = os.path.join(self.dirname, path)
        if not os.path.exists(fullpath):
            # Call form of raise, matching the style used by parse_size.
            raise ValueError("dataPath: %s does not exist."%fullpath)
        return fullpath
pkg_data = Data(__name__)


class LRUCache:
    """
        A decorator that implements a self-expiring LRU cache for class
        methods (not functions!).

        Cache data is tracked as attributes on the object itself. There is
        therefore a separate cache for each object instance.

        Results are keyed on the tuple of positional arguments, so every
        argument must be hashable. Keyword arguments are not supported by
        the wrapped method.
    """
    def __init__(self, size=100):
        """
            size: maximum number of results retained per object instance.
        """
        self.size = size

    def __call__(self, f):
        # Per-method attribute names, so several cached methods on the
        # same object do not share storage.
        cacheName = "_cached_%s"%f.__name__
        cacheListName = "_cachelist_%s"%f.__name__
        size = self.size

        @functools.wraps(f)
        def wrap(self, *args):
            # Create the storage lazily on first use for this instance.
            if not hasattr(self, cacheName):
                setattr(self, cacheName, {})
                setattr(self, cacheListName, [])
            cache = getattr(self, cacheName)
            # Most recently used key first, least recently used last.
            cacheList = getattr(self, cacheListName)
            if args in cache:
                # Hit: move the key to the front of the recency list.
                cacheList.remove(args)
                cacheList.insert(0, args)
                return cache[args]
            else:
                ret = f(self, *args)
                cacheList.insert(0, args)
                cache[args] = ret
                if len(cacheList) > size:
                    # Over capacity: evict the least recently used entry.
                    d = cacheList.pop()
                    cache.pop(d)
                return ret
        return wrap


def parse_proxy_spec(url):
    """
        Parses a proxy specification URL with http.parse_url and returns
        the first three elements of the result. Returns None if parsing
        fails or if the second element of the result is empty.
    """
    parsed = http.parse_url(url)
    if parsed and parsed[1]:
        return parsed[:3]
    return None


def parse_content_type(c):
    """
        Minimal parser for content-type header values.

        Returns a (type, subtype, parameters) tuple. Type and subtype are
        lower-cased strings. Parameters is a dict built from the
        "key=value" clauses that follow the first ";", with surrounding
        whitespace stripped from keys and values. Clauses without an "="
        are ignored. Returns None if there is no "/" separating type from
        subtype.

        For example:

            text/html; charset=UTF-8

        yields:

            ("text", "html", {"charset": "UTF-8"})
    """
    mime, semicolon, rest = c.partition(";")
    pieces = mime.split("/", 1)
    if len(pieces) != 2:
        return None
    major, minor = pieces
    params = {}
    if semicolon:
        for clause in rest.split(";"):
            key, equals, value = clause.partition("=")
            if equals:
                params[key.strip()] = value.strip()
    return major.lower(), minor.lower(), params


def hostport(scheme, host, port):
    """
        Returns the host component, followed by ":port" unless the port
        is the default for the scheme (80 for http, 443 for https).
    """
    default_pairs = ((80, "http"), (443, "https"))
    if (port, scheme) not in default_pairs:
        return "%s:%s"%(host, port)
    return host


def unparse_url(scheme, host, port, path=""):
    """
        Returns a URL string assembled from the given components. The
        host and port part is produced by hostport().
    """
    netloc = hostport(scheme, host, port)
    return "%s://%s%s"%(scheme, netloc, path)


def clean_hanging_newline(t):
    """
        Strips a single trailing newline, if there is one.

        Editors such as Vim quietly append a newline to the last line of
        a document. Removing it undoes that, at the cost of also dropping
        a final newline in the rare case the user really wanted one.
        Empty or None input is returned unchanged.
    """
    if not t or t[-1] != "\n":
        return t
    return t[:-1]


def parse_size(s):
    """
        Parses a size specification and returns the size in bytes as an
        integer. Valid specifications are:

            123: bytes
            123k: kilobytes
            123m: megabytes
            123g: gigabytes

        The unit suffix is case-insensitive. Returns None if the
        specification is empty or None. Raises ValueError, quoting the
        specification as given, if the numeric part is not an integer.
    """
    if not s:
        return None
    multipliers = {
        "k": 1024**1,
        "m": 1024**2,
        "g": 1024**3,
    }
    mult = multipliers.get(s[-1].lower())
    if mult is None:
        # No recognised unit suffix: the whole string is a byte count.
        digits, mult = s, 1
    else:
        digits = s[:-1]
    try:
        return int(digits) * mult
    except ValueError:
        # Report the spec as the caller wrote it, suffix included.
        raise ValueError("Invalid size specification: %s"%s)


def safe_subn(pattern, repl, target, *args, **kwargs):
    """
        Wrapper around re.subn that first casts the pattern and the
        replacement to str, to smooth over Unicode conversion problems.
        Any further positional and keyword arguments are passed through
        to re.subn unchanged. A proper fix would need to know the actual
        encoding of the content.
    """
    pattern_text = str(pattern)
    repl_text = str(repl)
    return re.subn(pattern_text, repl_text, target, *args, **kwargs)
Initial checkin. 2010-02-16 04:09:07 +00:00			`# Copyright (C) 2010 Aldo Cortesi`
Changes replace logic to function in both Python 2.6.x and 2.7.x Tests now only assume Python 2.6.x rather than requiring 2.7.x. This does not preclude the use of flags as a kwarg in replace 2011-07-27 05:47:08 +00:00			`#`
Initial checkin. 2010-02-16 04:09:07 +00:00			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
Changes replace logic to function in both Python 2.6.x and 2.7.x Tests now only assume Python 2.6.x rather than requiring 2.7.x. This does not preclude the use of flags as a kwarg in replace 2011-07-27 05:47:08 +00:00			`#`
Initial checkin. 2010-02-16 04:09:07 +00:00			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
Changes replace logic to function in both Python 2.6.x and 2.7.x Tests now only assume Python 2.6.x rather than requiring 2.7.x. This does not preclude the use of flags as a kwarg in replace 2011-07-27 05:47:08 +00:00			`#`
Initial checkin. 2010-02-16 04:09:07 +00:00			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <http://www.gnu.org/licenses/>.`
Fix a crashing bug when replacing text in a flow with unicode bodies. 2012-05-26 01:10:31 +00:00			`import os, datetime, urlparse, string, urllib, re`
Minor cruft removal. 2012-04-08 23:42:56 +00:00			`import time, functools, cgi`
JSON pretty-printing. Also rename the display modes ("pretty" instead of "indent"), and expand the built-in documentation. 2011-06-30 01:27:27 +00:00			`import json`
Adapt for API changes in netlib. 2012-06-23 02:06:34 +00:00			`from netlib import http`
Store timestamps on flow components as a UTC time tuple. Format is: (tm_year,tm_mon,tm_mday,tm_hour,tm_min, tm_sec,tm_wday,tm_yday,tm_isdst) 2011-03-07 00:46:02 +00:00
			`def timestamp():`
Think harder about timestamps. Just save seconds since the epoch as a float. 2011-03-08 23:18:08 +00:00			`"""`
			`Returns a serializable UTC timestamp.`
			`"""`
			`return time.time()`
Store timestamps on flow components as a UTC time tuple. Format is: (tm_year,tm_mon,tm_mday,tm_hour,tm_min, tm_sec,tm_wday,tm_yday,tm_isdst) 2011-03-07 00:46:02 +00:00
Initial checkin. 2010-02-16 04:09:07 +00:00
Add timestamps to flows. For now, these are only displayed on the connection view screen, with second granularity. 2011-02-03 00:30:47 +00:00			`def format_timestamp(s):`
Think harder about timestamps. Just save seconds since the epoch as a float. 2011-03-08 23:18:08 +00:00			`s = time.localtime(s)`
Store timestamps on flow components as a UTC time tuple. Format is: (tm_year,tm_mon,tm_mday,tm_hour,tm_min, tm_sec,tm_wday,tm_yday,tm_isdst) 2011-03-07 00:46:02 +00:00			`d = datetime.datetime.fromtimestamp(time.mktime(s))`
Add timestamps to flows. For now, these are only displayed on the connection view screen, with second granularity. 2011-02-03 00:30:47 +00:00			`return d.strftime("%Y-%m-%d %H:%M:%S")`


Initial checkin. 2010-02-16 04:09:07 +00:00			`def isBin(s):`
			`"""`
			`Does this string have any non-ASCII characters?`
			`"""`
			`for i in s:`
			`i = ord(i)`
			`if i < 9:`
			`return True`
			`elif i > 13 and i < 32:`
			`return True`
			`elif i > 126:`
			`return True`
			`return False`


Try not to hang when user views large request & response bodies Two different strategies here: - Use a simple heuristic to detect if we're looking at XML data when indent mode is used. On non-XML data we can hang even on small documents. - Only view partial data for large bodies. At the moment the cutoff is 100k. I might finetune this later. 2011-06-27 03:59:17 +00:00			`def isXML(s):`
			`for i in s:`
			`if i in "\n \t":`
			`continue`
			`elif i == "<":`
			`return True`
			`else:`
			`return False`


JSON pretty-printing. Also rename the display modes ("pretty" instead of "indent"), and expand the built-in documentation. 2011-06-30 01:27:27 +00:00			`def pretty_json(s):`
			`try:`
			`p = json.loads(s)`
			`except ValueError:`
			`return None`
			`return json.dumps(p, sort_keys=True, indent=4).split("\n")`


Add a pretty-printing mode for urlencoded form data. 2011-07-15 04:16:43 +00:00			`def urldecode(s):`
Add get_query and set_query methods to Request. 2012-02-09 03:40:31 +00:00			`"""`
			`Takes a urlencoded string and returns a list of (key, value) tuples.`
			`"""`
Keep blank URL parameters TODO: This should probably be configurable 2013-02-11 02:22:25 +00:00			`return cgi.parse_qsl(s, keep_blank_values=True)`
Add a pretty-printing mode for urlencoded form data. 2011-07-15 04:16:43 +00:00

Add get_query and set_query methods to Request. 2012-02-09 03:40:31 +00:00			`def urlencode(s):`
			`"""`
			`Takes a list of (key, value) tuples and returns a urlencoded string.`
			`"""`
Specialize GridEditor into a number of subclasses. 2012-03-18 21:12:06 +00:00			`s = [tuple(i) for i in s]`
Add get_query and set_query methods to Request. 2012-02-09 03:40:31 +00:00			`return urllib.urlencode(s, False)`


Unit test++ 2011-08-02 04:52:47 +00:00			`def del_all(dict, keys):`
			`for key in keys:`
			`if key in dict:`
			`del dict[key]`
Move try_del to utils 2010-11-17 11:03:42 +00:00

Initial checkin. 2010-02-16 04:09:07 +00:00			`def pretty_size(size):`
			`suffixes = [`
			`("B", 2**10),`
			`("kB", 2**20),`
			`("M", 2**30),`
			`]`
			`for suf, lim in suffixes:`
			`if size >= lim:`
			`continue`
			`else:`
			`x = round(size/float(lim/2**10), 2)`
			`if x == int(x):`
			`x = int(x)`
			`return str(x) + suf`


			`class Data:`
			`def __init__(self, name):`
			`m = __import__(name)`
			`dirname, _ = os.path.split(m.__file__)`
			`self.dirname = os.path.abspath(dirname)`

			`def path(self, path):`
			`"""`
			`Returns a path to the package data housed at 'path' under this`
			`module.Path can be a path to a file, or to a directory.`

			`This function will raise ValueError if the path does not exist.`
			`"""`
			`fullpath = os.path.join(self.dirname, path)`
			`if not os.path.exists(fullpath):`
			`raise ValueError, "dataPath: %s does not exist."%fullpath`
			`return fullpath`
General cleanup. Cut out unused variables and code, generally shut up pychecker as much as is reasonable. 2011-08-02 04:14:33 +00:00			`pkg_data = Data(__name__)`
Initial checkin. 2010-02-16 04:09:07 +00:00

Improve responsiveness of request and response viewing. - Computing the view of a large body is expensive, so we introduce an LRU cache to hold the latest 20 results. - Use ListView more correctly, passing it individual urwid.Text snippets, rather than a single large one. This hugely improves render time. 2011-03-15 00:05:33 +00:00			`class LRUCache:`
			`"""`
			`A decorator that implements a self-expiring LRU cache for class`
			`methods (not functions!).`

			`Cache data is tracked as attributes on the object itself. There is`
			`therefore a separate cache for each object instance.`
			`"""`
			`def __init__(self, size=100):`
			`self.size = size`

			`def __call__(self, f):`
			`cacheName = "_cached_%s"%f.__name__`
			`cacheListName = "_cachelist_%s"%f.__name__`
			`size = self.size`

			`@functools.wraps(f)`
			`def wrap(self, *args):`
			`if not hasattr(self, cacheName):`
			`setattr(self, cacheName, {})`
			`setattr(self, cacheListName, [])`
			`cache = getattr(self, cacheName)`
			`cacheList = getattr(self, cacheListName)`
			`if cache.has_key(args):`
			`cacheList.remove(args)`
			`cacheList.insert(0, args)`
			`return cache[args]`
			`else:`
			`ret = f(self, *args)`
			`cacheList.insert(0, args)`
			`cache[args] = ret`
			`if len(cacheList) > size:`
			`d = cacheList.pop()`
			`cache.pop(d)`
			`return ret`
			`return wrap`
Move all HTTP objects to flow.py That's Request, Response, ClientConnect, ClientDisconnect, Error, and Headers. 2011-08-03 10:38:23 +00:00

Refactor reverse proxying - Retain the specification from the Host header as a Request's description. - Expand upstream proxy specifications to include the scheme. We now say https://hostname:port - Move the "R" revert keybinding to "v" to make room for a reverse proxy binding that matches the command-line flag. 2012-02-18 01:45:22 +00:00			`def parse_proxy_spec(url):`
Adapt for API changes in netlib. 2012-06-23 02:06:34 +00:00			`p = http.parse_url(url)`
Firm up what we consider to be a valid proxy spec. 2012-02-18 03:29:02 +00:00			`if not p or not p[1]:`
Refactor reverse proxying - Retain the specification from the Host header as a Request's description. - Expand upstream proxy specifications to include the scheme. We now say https://hostname:port - Move the "R" revert keybinding to "v" to make room for a reverse proxy binding that matches the command-line flag. 2012-02-18 01:45:22 +00:00			`return None`
			`return p[:3]`


Add a simple parser for content type specifications. 2012-03-19 21:31:07 +00:00			`def parse_content_type(c):`
			`"""`
			`A simple parser for content-type values. Returns a (type, subtype,`
			`parameters) tuple, where type and subtype are strings, and parameters`
			`is a dict. If the string could not be parsed, return None.`

			`E.g. the following string:`

			`text/html; charset=UTF-8`

			`Returns:`

			`("text", "html", {"charset": "UTF-8"})`
			`"""`
			`parts = c.split(";", 1)`
			`ts = parts[0].split("/", 1)`
			`if len(ts) != 2:`
			`return None`
			`d = {}`
			`if len(parts) == 2:`
			`for i in parts[1].split(";"):`
			`clause = i.split("=", 1)`
			`if len(clause) == 2:`
			`d[clause[0].strip()] = clause[1].strip()`
Refactor pretty view forcing somewhat. - Use a lookup table of content types -> view modes. - Add a urlencoded forcing. Remove "html" - at the moment it's the same as "xmlish". - Display type when forced. 2012-03-19 21:58:43 +00:00			`return ts[0].lower(), ts[1].lower(), d`
Add a simple parser for content type specifications. 2012-03-19 21:31:07 +00:00

Firm up reverse proxy specification. - Extract proxy spec parsing and unparsing functions. - Add a status indicator in mitmproxy. - Add the "R" keybinding for changing the reverse proxy from within mitmproxy. 2012-02-18 03:27:09 +00:00			`def hostport(scheme, host, port):`
			`"""`
			`Returns the host component, with a port specifcation if needed.`
			`"""`
			`if (port, scheme) in [(80, "http"), (443, "https")]:`
			`return host`
			`else:`
			`return "%s:%s"%(host, port)`


			`def unparse_url(scheme, host, port, path=""):`
			`"""`
			`Returns a URL string, constructed from the specified compnents.`
			`"""`
			`return "%s://%s%s"%(scheme, hostport(scheme, host, port), path)`


KVEditor: "e" shortcut spawns an external editor on a field. 2012-02-08 05:25:00 +00:00			`def clean_hanging_newline(t):`
			`"""`
			`Many editors will silently add a newline to the final line of a`
			`document (I'm looking at you, Vim). This function fixes this common`
			`problem at the risk of removing a hanging newline in the rare cases`
			`where the user actually intends it.`
			`"""`
Add an "r" shortcut in grid editors to read value from file. 2012-08-25 00:21:45 +00:00			`if t and t[-1] == "\n":`
KVEditor: "e" shortcut spawns an external editor on a field. 2012-02-08 05:25:00 +00:00			`return t[:-1]`
			`return t`


Add HTTP body size limit specification to command-line tools. 2011-09-09 03:27:31 +00:00			`def parse_size(s):`
			`"""`
			`Parses a size specification. Valid specifications are:`
Fix an issue caused by some editors when editing a request/response body. Many editors make it hard save a file without a terminating newline on the last line. When editing message bodies, this can cause problems. For now, I just strip the newlines off the end of the body when we return from an editor. 2012-01-20 23:43:00 +00:00
Add HTTP body size limit specification to command-line tools. 2011-09-09 03:27:31 +00:00			`123: bytes`
			`123k: kilobytes`
			`123m: megabytes`
			`123g: gigabytes`
			`"""`
			`if not s:`
			`return None`
			`mult = None`
			`if s[-1].lower() == "k":`
			`mult = 1024**1`
			`elif s[-1].lower() == "m":`
			`mult = 1024**2`
			`elif s[-1].lower() == "g":`
			`mult = 1024**3`

			`if mult:`
			`s = s[:-1]`
			`else:`
			`mult = 1`
			`try:`
			`return int(s) * mult`
			`except ValueError:`
			`raise ValueError("Invalid size specification: %s"%s)`
Fix a crashing bug when replacing text in a flow with unicode bodies. 2012-05-26 01:10:31 +00:00

			`def safe_subn(pattern, repl, target, args, *kwargs):`
			`"""`
			`There are Unicode conversion problems with re.subn. We try to smooth`
			`that over by casting the pattern and replacement to strings. We really`
			`need a better solution that is aware of the actual content ecoding.`
			`"""`
			`return re.subn(str(pattern), str(repl), target, args, *kwargs)`