mitmproxy/netlib/utils.py

from __future__ import absolute_import, print_function, division
import os.path
import re
import codecs
import unicodedata
import importlib
import inspect

import six

def always_bytes(unicode_or_bytes, *encode_args):
    """
    Return the argument as bytes.

    Text (``six.text_type``) is encoded, with ``encode_args`` forwarded
    unchanged to ``.encode()``. Any other value is handed back untouched.
    """
    if not isinstance(unicode_or_bytes, six.text_type):
        return unicode_or_bytes
    return unicode_or_bytes.encode(*encode_args)


def native(s, *encoding_opts):
    """
    Convert :py:class:`bytes` or :py:class:`unicode` to the native
    :py:class:`str` type.

    No codec is hard-coded here: ``encoding_opts`` are forwarded verbatim to
    ``bytes.decode()`` on Python 3 and to ``unicode.encode()`` on Python 2.
    Callers that want the latin1 behaviour described in PEP 3333 must pass
    it explicitly; with no options that method's own default applies.

    https://www.python.org/dev/peps/pep-3333/#a-note-on-string-types

    Args:
        s: A bytes or unicode object.
        *encoding_opts: Positional arguments for ``.decode()``/``.encode()``.

    Returns:
        ``s`` as the interpreter's native ``str`` type. Values that already
        are native strings are returned unchanged.

    Raises:
        TypeError: If ``s`` is neither bytes nor unicode.
    """
    if not isinstance(s, (six.binary_type, six.text_type)):
        # Wrap in a 1-tuple so that a tuple argument is shown via %r instead
        # of being unpacked as the format arguments.
        raise TypeError("%r is neither bytes nor unicode" % (s,))
    if six.PY3:
        if isinstance(s, six.binary_type):
            return s.decode(*encoding_opts)
    elif isinstance(s, six.text_type):
        return s.encode(*encoding_opts)
    return s


def clean_bin(s, keep_spacing=True):
    """
        Replace characters that are unsafe to display with a dot.

        Text input yields text, anything else is processed byte by byte and
        yields bytes.

        Args:
            s: The data to clean.
            keep_spacing: If True, tab, newline and carriage return are left
                in place. If False they are replaced as well.
    """
    if isinstance(s, six.text_type):
        allowed = u" \n\r\t" if keep_spacing else u" "
        cleaned = []
        for ch in s:
            # Unicode categories C* (control/other) and Z* (separators) are
            # the ones that get masked out.
            printable = unicodedata.category(ch)[0] not in "CZ"
            cleaned.append(ch if (printable or ch in allowed) else u".")
        return u"".join(cleaned)

    allowed = (9, 10, 13) if keep_spacing else ()  # \t, \n, \r
    cleaned = []
    for code in six.iterbytes(s):
        if 31 < code < 127 or code in allowed:
            cleaned.append(six.int2byte(code))
        else:
            cleaned.append(b".")
    return b"".join(cleaned)


def hexdump(s):
    """
        Walk over the input in 16-byte rows.

        Args:
            s: The data to dump (presumably bytes - verify against callers).

        Returns:
            A generator of (offset, hex, str) tuples: the row offset as ten
            hex digits, the space-separated hex bytes padded to a fixed
            width, and the row passed through clean_bin without spacing.
    """
    for start in range(0, len(s), 16):
        chunk = s[start:start + 16]
        offset = "{:0=10x}".format(start).encode()
        hex_pairs = [
            "{:0=2x}".format(byte).encode() for byte in six.iterbytes(chunk)
        ]
        # 16 two-digit values plus 15 separating spaces = 47 columns.
        hex_column = b" ".join(hex_pairs).ljust(47)
        yield (offset, hex_column, clean_bin(chunk, False))


def setbit(byte, offset, value):
    """
        Return byte with the bit at position offset switched on when value
        is truthy and switched off otherwise.
    """
    mask = 1 << offset
    return (byte | mask) if value else (byte & ~mask)


def getbit(byte, offset):
    """
        Return True if the bit at position offset is set in byte, else False.
    """
    return bool(byte & (1 << offset))


class BiDi(object):

    """
        A wee utility class for keeping bi-directional mappings, like field
        constants in protocols. Names are attributes on the object,
        get_name() maps values back to names:

        CONST = BiDi(a=1, b=2)
        assert CONST.a == 1
        assert CONST.get_name(1) == "a"
    """

    def __init__(self, **kwargs):
        """
            Build the mapping from name=value keyword arguments.

            Raises:
                ValueError: If two names share the same value, since the
                    reverse lookup would then be ambiguous.
        """
        self.names = kwargs
        self.values = {}
        for k, v in kwargs.items():
            self.values[v] = k
        if len(self.names) != len(self.values):
            raise ValueError("Duplicate values not allowed.")

    def __getattr__(self, k):
        """
            Resolve a constant name to its value.

            Raises:
                AttributeError: If k is not a known name.
        """
        # __getattr__ only runs after normal lookup has failed. Read the
        # mapping straight from __dict__: going through self.names would
        # re-enter this method forever on an instance whose __init__ has not
        # run (as happens during copy/unpickle probing).
        names = self.__dict__.get("names")
        if names is not None and k in names:
            return names[k]
        raise AttributeError("No such attribute: %s" % k)

    def get_name(self, n, default=None):
        """
            Return the name registered for value n, or default if there is
            none.
        """
        return self.values.get(n, default)


class Data(object):
    """
        Locate data files that live next to an importable module.
    """

    def __init__(self, name):
        """
            Import the module called name and remember the absolute
            directory that holds its source file.
        """
        module = importlib.import_module(name)
        source_file = inspect.getsourcefile(module)
        self.dirname = os.path.abspath(os.path.dirname(source_file))

    def path(self, path):
        """
            Returns a path to the package data housed at 'path' under this
            module. Path can be a path to a file, or to a directory.

            This function will raise ValueError if the path does not exist.
        """
        candidate = os.path.join(self.dirname, path)
        if os.path.exists(candidate):
            return candidate
        raise ValueError("dataPath: %s does not exist." % candidate)


# A single DNS label: 1-63 letters, digits or hyphens that neither starts nor
# ends with a hyphen. \Z anchors at the real end of the input; "$" would also
# match just before a trailing newline and let b"example.com\n" through.
_label_valid = re.compile(br"(?!-)[A-Z\d-]{1,63}(?<!-)\Z", re.IGNORECASE)


def is_valid_host(host):
    """
    Checks if a hostname is valid.

    Args:
      host (bytes): The hostname

    Returns:
      True if the host decodes as IDNA, is at most 255 bytes long and every
      dot-separated label is well-formed; False otherwise. A single trailing
      dot (fully-qualified form) is accepted. Empty input is invalid.
    """
    if not host:
        return False
    try:
        host.decode("idna")
    except ValueError:
        return False
    if len(host) > 255:
        return False
    # endswith() instead of host[-1] == b".": indexing bytes yields an int on
    # Python 3, so that comparison was always False there.
    if host.endswith(b"."):
        host = host[:-1]
    return all(_label_valid.match(label) for label in host.split(b"."))


def is_valid_port(port):
    """
    Return True if port lies within 0..65535 inclusive, else False.
    """
    return port >= 0 and port <= 65535


def hostport(scheme, host, port):
    """
        Returns the host component, with a port specification if needed.

        The bare host is returned when port is the default for scheme
        (80 for http, 443 for https, as text or bytes). Otherwise the result
        is "host:port", built as bytes when host is bytes and as a native
        string when it is not.
    """
    default_ports = [(80, "http"), (443, "https"), (80, b"http"), (443, b"https")]
    if (port, scheme) in default_ports:
        return host
    template = b"%s:%d" if isinstance(host, six.binary_type) else "%s:%d"
    return template % (host, port)


def safe_subn(pattern, repl, target, *args, **kwargs):
    """
        Thin wrapper around re.subn that works around Unicode conversion
        problems by passing both the pattern and the replacement through
        str() first. Remaining arguments are forwarded to re.subn unchanged.

        This is a stop-gap: a proper solution would be aware of the actual
        content encoding.

        Returns:
            The (new_string, number_of_substitutions) tuple from re.subn.
    """
    pattern_text = str(pattern)
    replacement_text = str(repl)
    return re.subn(pattern_text, replacement_text, target, *args, **kwargs)


def bytes_to_escaped_str(data):
    """
    Render bytes as a printable, escaped native string for display.

    Single quotes are always escaped, double quotes are never escaped:
        "'" + bytes_to_escaped_str(...) + "'"
    gives a valid Python string.

    Raises:
        ValueError: If data is not bytes.
    """
    # TODO: We may want to support multi-byte characters without escaping them.
    # One way to do would be calling .decode("utf8", "backslashreplace") first
    # and then escaping UTF8 control chars (see clean_bin).

    if not isinstance(data, bytes):
        raise ValueError("data must be bytes, but is {}".format(data.__class__.__name__))

    # Prepending a double quote forces repr() to pick single quotes as the
    # delimiter, which in turn makes it escape every single quote in data.
    # https://stackoverflow.com/questions/29019340/why-does-python-use-different-quotes-for-representing-strings-depending-on-their
    literal = repr(b'"' + data)
    # Drop the Python 3 bytes prefix, then the opening quote plus our marker
    # and finally the closing quote.
    literal = literal.lstrip("b")
    return literal[2:-1]


def escaped_str_to_bytes(data):
    """
    Turn an escaped string back into the raw bytes it describes.

    Raises:
        ValueError: If data is not a string type.
    """
    if not isinstance(data, six.string_types):
        type_name = data.__class__.__name__
        if six.PY2:
            raise ValueError("data must be str or unicode, but is {}".format(type_name))
        raise ValueError("data must be str, but is {}".format(type_name))

    if not six.PY2:
        # This one is difficult - we use an undocumented Python API here
        # as per http://stackoverflow.com/a/23151714/934719
        decoded = codecs.escape_decode(data)
        return decoded[0]

    # Python 2 only from here on: the string-escape codec works on byte
    # strings, so unicode input is encoded first.
    if isinstance(data, unicode):
        data = data.encode("utf8")
    return data.decode("string-escape")
Revert "Porting netlib to python3.4" This reverts commit 823718348598efb324298ca29ad4cb7d5097c084. 2015-11-12 01:41:42 +00:00			`from __future__ import absolute_import, print_function, division`
Remove dependence on pathod in test suite. 2015-06-23 10:16:03 +00:00			`import os.path`
add move tests and code from mitmproxy 2015-08-05 19:32:53 +00:00			`import re`
migrate to hyperframe 2016-01-31 11:15:44 +00:00			`import codecs`
make clean_bin unicode-aware 2015-09-12 15:03:09 +00:00			`import unicodedata`
fix tests 2016-02-15 15:34:38 +00:00			`import importlib`
			`import inspect`
add Serializable ABC 2016-02-08 03:16:58 +00:00
wip 2015-09-15 17:12:15 +00:00			`import six`

python 3++ 2015-09-20 22:44:17 +00:00			`def always_bytes(unicode_or_bytes, *encode_args):`
			`if isinstance(unicode_or_bytes, six.text_type):`
			`return unicode_or_bytes.encode(*encode_args)`
			`return unicode_or_bytes`


minor encoding fixes 2015-09-21 16:34:43 +00:00			`def native(s, *encoding_opts):`
python 3++ 2015-09-20 22:44:17 +00:00			`"""`
			Convert :py:class:`bytes` or :py:class:`unicode` to the native
			:py:class:`str` type, using latin1 encoding if conversion is necessary.

			`https://www.python.org/dev/peps/pep-3333/#a-note-on-string-types`
			`"""`
			`if not isinstance(s, (six.binary_type, six.text_type)):`
			`raise TypeError("%r is neither bytes nor unicode" % s)`
			`if six.PY3:`
			`if isinstance(s, six.binary_type):`
minor encoding fixes 2015-09-21 16:34:43 +00:00			`return s.decode(*encoding_opts)`
python 3++ 2015-09-20 22:44:17 +00:00			`else:`
			`if isinstance(s, six.text_type):`
minor encoding fixes 2015-09-21 16:34:43 +00:00			`return s.encode(*encoding_opts)`
python 3++ 2015-09-20 22:44:17 +00:00			`return s`


make clean_bin unicode-aware 2015-09-12 15:03:09 +00:00			`def clean_bin(s, keep_spacing=True):`
Create netlib.utils, move cleanBin and hexdump from libmproxy.utils. 2012-09-23 23:21:48 +00:00			`"""`
make clean_bin unicode-aware 2015-09-12 15:03:09 +00:00			`Cleans binary data to make it safe to display.`

			`Args:`
			`keep_spacing: If False, tabs and newlines will also be replaced.`
Create netlib.utils, move cleanBin and hexdump from libmproxy.utils. 2012-09-23 23:21:48 +00:00			`"""`
make clean_bin unicode-aware 2015-09-12 15:03:09 +00:00			`if isinstance(s, six.text_type):`
			`if keep_spacing:`
Revert "Porting netlib to python3.4" This reverts commit 823718348598efb324298ca29ad4cb7d5097c084. 2015-11-12 01:41:42 +00:00			`keep = u" \n\r\t"`
make clean_bin unicode-aware 2015-09-12 15:03:09 +00:00			`else:`
Revert "Porting netlib to python3.4" This reverts commit 823718348598efb324298ca29ad4cb7d5097c084. 2015-11-12 01:41:42 +00:00			`keep = u" "`
			`return u"".join(`
			`ch if (unicodedata.category(ch)[0] not in "CZ" or ch in keep) else u"."`
make clean_bin unicode-aware 2015-09-12 15:03:09 +00:00			`for ch in s`
			`)`
			`else:`
			`if keep_spacing:`
finish netlib.http.http1 refactor 2015-09-15 22:04:23 +00:00			`keep = (9, 10, 13) # \t, \n, \r,`
Create netlib.utils, move cleanBin and hexdump from libmproxy.utils. 2012-09-23 23:21:48 +00:00			`else:`
finish netlib.http.http1 refactor 2015-09-15 22:04:23 +00:00			`keep = ()`
make clean_bin unicode-aware 2015-09-12 15:03:09 +00:00			`return b"".join(`
wip 2015-09-15 17:12:15 +00:00			`six.int2byte(ch) if (31 < ch < 127 or ch in keep) else b"."`
			`for ch in six.iterbytes(s)`
make clean_bin unicode-aware 2015-09-12 15:03:09 +00:00			`)`
Create netlib.utils, move cleanBin and hexdump from libmproxy.utils. 2012-09-23 23:21:48 +00:00

			`def hexdump(s):`
			`"""`
clean up http message models 2015-09-17 13:16:12 +00:00			`Returns:`
			`A generator of (offset, hex, str) tuples`
Create netlib.utils, move cleanBin and hexdump from libmproxy.utils. 2012-09-23 23:21:48 +00:00			`"""`
			`for i in range(0, len(s), 16):`
Porting to Python 3.4 Fixed byte string formatting for hexdump. = test session starts = platform darwin -- Python 3.4.1, pytest-2.8.2, py-1.4.30, pluggy-0.3.1 rootdir: /Users/samc/src/python/netlib, inifile: collected 11 items netlib/test/test_utils.py ........... = 11 passed in 0.23 seconds = 2015-11-12 01:53:51 +00:00			`offset = "{:0=10x}".format(i).encode()`
add inet_ntop/inet_pton functions 2014-06-25 18:31:10 +00:00			`part = s[i:i + 16]`
Porting to Python 3.4 Fixed byte string formatting for hexdump. = test session starts = platform darwin -- Python 3.4.1, pytest-2.8.2, py-1.4.30, pluggy-0.3.1 rootdir: /Users/samc/src/python/netlib, inifile: collected 11 items netlib/test/test_utils.py ........... = 11 passed in 0.23 seconds = 2015-11-12 01:53:51 +00:00			`x = b" ".join("{:0=2x}".format(i).encode() for i in six.iterbytes(part))`
clean up http message models 2015-09-17 13:16:12 +00:00			`x = x.ljust(47) # 16*2 + 15`
			`yield (offset, x, clean_bin(part, False))`
websocket: interface refactoring - Separate out FrameHeader. We need to deal with this separately in many circumstances. - Simpler equality scheme. - Bits are now specified by truthiness - we don't care about the integer value. This means lots of validation is not needed any more. 2015-04-24 03:09:21 +00:00

			`def setbit(byte, offset, value):`
			`"""`
			`Set a bit in a byte to 1 if value is truthy, 0 if not.`
			`"""`
			`if value:`
			`return byte \| (1 << offset)`
			`else:`
			`return byte & ~(1 << offset)`


			`def getbit(byte, offset):`
			`mask = 1 << offset`
clean up http message models 2015-09-17 13:16:12 +00:00			`return bool(byte & mask)`
Add a tiny utility class for keeping bi-directional mappings. Use it in websocket and socks. 2015-04-29 21:04:22 +00:00

fix warnings and code smells use prospector to find them 2015-06-17 11:10:27 +00:00			`class BiDi(object):`
cleanup code with autopep8 run the following command: $ autopep8 -i -r -a -a . 2015-05-27 09:18:54 +00:00
Add a tiny utility class for keeping bi-directional mappings. Use it in websocket and socks. 2015-04-29 21:04:22 +00:00			`"""`
			`A wee utility class for keeping bi-directional mappings, like field`
websockets: more compact and legible human_readable 2015-04-30 00:10:08 +00:00			`constants in protocols. Names are attributes on the object, dict-like`
			`access maps values to names:`
Add a tiny utility class for keeping bi-directional mappings. Use it in websocket and socks. 2015-04-29 21:04:22 +00:00
			`CONST = BiDi(a=1, b=2)`
			`assert CONST.a == 1`
websockets: more compact and legible human_readable 2015-04-30 00:10:08 +00:00			`assert CONST.get_name(1) == "a"`
Add a tiny utility class for keeping bi-directional mappings. Use it in websocket and socks. 2015-04-29 21:04:22 +00:00			`"""`
cleanup code with autopep8 run the following command: $ autopep8 -i -r -a -a . 2015-05-27 09:18:54 +00:00
Add a tiny utility class for keeping bi-directional mappings. Use it in websocket and socks. 2015-04-29 21:04:22 +00:00			`def __init__(self, **kwargs):`
			`self.names = kwargs`
			`self.values = {}`
Revert "Porting netlib to python3.4" This reverts commit 823718348598efb324298ca29ad4cb7d5097c084. 2015-11-12 01:41:42 +00:00			`for k, v in kwargs.items():`
Add a tiny utility class for keeping bi-directional mappings. Use it in websocket and socks. 2015-04-29 21:04:22 +00:00			`self.values[v] = k`
			`if len(self.names) != len(self.values):`
			`raise ValueError("Duplicate values not allowed.")`

			`def __getattr__(self, k):`
			`if k in self.names:`
			`return self.names[k]`
			`raise AttributeError("No such attribute: %s", k)`

websockets: more compact and legible human_readable 2015-04-30 00:10:08 +00:00			`def get_name(self, n, default=None):`
			`return self.values.get(n, default)`


Remove dependence on pathod in test suite. 2015-06-23 10:16:03 +00:00			`class Data(object):`
cleanup whitespace 2015-08-10 18:44:36 +00:00
Remove dependence on pathod in test suite. 2015-06-23 10:16:03 +00:00			`def __init__(self, name):`
fix tests 2016-02-15 15:34:38 +00:00			`m = importlib.import_module(name)`
			`dirname = os.path.dirname(inspect.getsourcefile(m))`
Remove dependence on pathod in test suite. 2015-06-23 10:16:03 +00:00			`self.dirname = os.path.abspath(dirname)`

			`def path(self, path):`
			`"""`
			`Returns a path to the package data housed at 'path' under this`
			`module.Path can be a path to a file, or to a directory.`

			`This function will raise ValueError if the path does not exist.`
			`"""`
fix tests 2016-02-15 15:34:38 +00:00			`fullpath = os.path.join(self.dirname, path)`
Remove dependence on pathod in test suite. 2015-06-23 10:16:03 +00:00			`if not os.path.exists(fullpath):`
			`raise ValueError("dataPath: %s does not exist." % fullpath)`
			`return fullpath`
add on-the-wire representation methods 2015-07-29 09:27:43 +00:00

wip 2015-09-15 17:12:15 +00:00			`_label_valid = re.compile(b"(?!-)[A-Z\d-]{1,63}(?<!-)$", re.IGNORECASE)`
move code from mitmproxy to netlib 2015-08-01 08:39:14 +00:00

			`def is_valid_host(host):`
python3++ 2015-09-20 17:40:09 +00:00			`"""`
			`Checks if a hostname is valid.`

			`Args:`
			`host (bytes): The hostname`
			`"""`
move code from mitmproxy to netlib 2015-08-01 08:39:14 +00:00			`try:`
			`host.decode("idna")`
			`except ValueError:`
			`return False`
wip 2015-09-15 17:12:15 +00:00			`if len(host) > 255:`
			`return False`
clean up http message models 2015-09-17 13:16:12 +00:00			`if host[-1] == b".":`
wip 2015-09-15 17:12:15 +00:00			`host = host[:-1]`
			`return all(_label_valid.match(x) for x in host.split(b"."))`


			`def is_valid_port(port):`
			`return 0 <= port <= 65535`


add on-the-wire representation methods 2015-07-29 09:27:43 +00:00			`def hostport(scheme, host, port):`
			`"""`
			`Returns the host component, with a port specifcation if needed.`
			`"""`
refactor request model 2015-09-25 22:39:04 +00:00			`if (port, scheme) in [(80, "http"), (443, "https"), (80, b"http"), (443, b"https")]:`
add on-the-wire representation methods 2015-07-29 09:27:43 +00:00			`return host`
			`else:`
refactor request model 2015-09-25 22:39:04 +00:00			`if isinstance(host, six.binary_type):`
			`return b"%s:%d" % (host, port)`
			`else:`
			`return "%s:%d" % (host, port)`
move code from mitmproxy to netlib 2015-08-01 08:39:14 +00:00
cleanup whitespace 2015-08-10 18:44:36 +00:00
improve .replace() and move it into netlib 2016-04-02 12:38:33 +00:00			`def safe_subn(pattern, repl, target, args, *kwargs):`
			`"""`
			`There are Unicode conversion problems with re.subn. We try to smooth`
			`that over by casting the pattern and replacement to strings. We really`
			`need a better solution that is aware of the actual content ecoding.`
			`"""`
			`return re.subn(str(pattern), str(repl), target, args, *kwargs)`
Sanitize Print (#1135) * sanitize strings with shell control characters * netlib: add utilities to safe-print bytes * escaped str: add TODO for multi-byte chars 2016-05-12 17:03:57 +00:00

			`def bytes_to_escaped_str(data):`
			`"""`
			`Take bytes and return a safe string that can be displayed to the user.`
bytes_to_escaped_str: always escape single quotes 2016-05-26 03:11:34 +00:00
			`Single quotes are always escaped, double quotes are never escaped:`
			`"'" + bytes_to_escaped_str(...) + "'"`
			`gives a valid Python string.`
Sanitize Print (#1135) * sanitize strings with shell control characters * netlib: add utilities to safe-print bytes * escaped str: add TODO for multi-byte chars 2016-05-12 17:03:57 +00:00			`"""`
			`# TODO: We may want to support multi-byte characters without escaping them.`
			`# One way to do would be calling .decode("utf8", "backslashreplace") first`
			`# and then escaping UTF8 control chars (see clean_bin).`

			`if not isinstance(data, bytes):`
raise a more verbose error 2016-05-29 17:28:59 +00:00			`raise ValueError("data must be bytes, but is {}".format(data.__class__.__name__))`
bytes_to_escaped_str: always escape single quotes 2016-05-26 03:11:34 +00:00			`# We always insert a double-quote here so that we get a single-quoted string back`
			`# https://stackoverflow.com/questions/29019340/why-does-python-use-different-quotes-for-representing-strings-depending-on-their`
fix py3 tests 2016-05-26 03:31:32 +00:00			`return repr(b'"' + data).lstrip("b")[2:-1]`
Sanitize Print (#1135) * sanitize strings with shell control characters * netlib: add utilities to safe-print bytes * escaped str: add TODO for multi-byte chars 2016-05-12 17:03:57 +00:00

			`def escaped_str_to_bytes(data):`
			`"""`
			`Take an escaped string and return the unescaped bytes equivalent.`
			`"""`
escaped_str_to_bytes: support unicode on python 2 2016-05-26 02:16:02 +00:00			`if not isinstance(data, six.string_types):`
			`if six.PY2:`
raise a more verbose error 2016-05-29 17:28:59 +00:00			`raise ValueError("data must be str or unicode, but is {}".format(data.__class__.__name__))`
			`raise ValueError("data must be str, but is {}".format(data.__class__.__name__))`
Sanitize Print (#1135) * sanitize strings with shell control characters * netlib: add utilities to safe-print bytes * escaped str: add TODO for multi-byte chars 2016-05-12 17:03:57 +00:00
			`if six.PY2:`
escaped_str_to_bytes: support unicode on python 2 2016-05-26 02:16:02 +00:00			`if isinstance(data, unicode):`
			`data = data.encode("utf8")`
Sanitize Print (#1135) * sanitize strings with shell control characters * netlib: add utilities to safe-print bytes * escaped str: add TODO for multi-byte chars 2016-05-12 17:03:57 +00:00			`return data.decode("string-escape")`

			`# This one is difficult - we use an undocumented Python API here`
			`# as per http://stackoverflow.com/a/23151714/934719`
			`return codecs.escape_decode(data)[0]`