diff --git a/mitmproxy/contentviews/protobuf.py b/mitmproxy/contentviews/protobuf.py index 4bbb15809..abd3985a6 100644 --- a/mitmproxy/contentviews/protobuf.py +++ b/mitmproxy/contentviews/protobuf.py @@ -1,6 +1,63 @@ -import subprocess +import io +from kaitaistruct import KaitaiStream from . import base +from mitmproxy.contrib.kaitaistruct import google_protobuf + + +def write_buf(out, field_tag, body, indent_level): + if body is not None: + out.write("{: <{level}}{}: {}\n".format('', field_tag, body if isinstance(body, int) else str(body, 'utf-8'), + level=indent_level)) + elif field_tag is not None: + out.write(' ' * indent_level + str(field_tag) + " {\n") + else: + out.write(' ' * indent_level + "}\n") + + +def format_pbuf(raw): + out = io.StringIO() + stack = [] + + try: + buf = google_protobuf.GoogleProtobuf(KaitaiStream(io.BytesIO(raw))) + except: + return False + stack.extend([(pair, 0) for pair in buf.pairs[::-1]]) + + while len(stack): + pair, indent_level = stack.pop() + + if pair.wire_type == pair.WireTypes.group_start: + body = None + elif pair.wire_type == pair.WireTypes.group_end: + body = None + pair._m_field_tag = None + elif pair.wire_type == pair.WireTypes.len_delimited: + body = pair.value.body + elif pair.wire_type == pair.WireTypes.varint: + body = pair.value.value + else: + body = pair.value + + try: + next_buf = google_protobuf.GoogleProtobuf(KaitaiStream(io.BytesIO(body))) + stack.extend([(pair, indent_level + 2) for pair in next_buf.pairs[::-1]]) + write_buf(out, pair.field_tag, None, indent_level) + except: + write_buf(out, pair.field_tag, body, indent_level) + + if stack: + prev_level = stack[-1][1] + else: + prev_level = 0 + + if prev_level < indent_level: + levels = int((indent_level - prev_level) / 2) + for i in range(1, levels + 1): + write_buf(out, None, None, indent_level - i * 2) + + return out.getvalue() class ViewProtobuf(base.View): @@ -15,28 +72,9 @@ class ViewProtobuf(base.View): "application/x-protobuffer", ] - def is_available(self): - try: - p = subprocess.Popen( - ["protoc", "--version"], - stdout=subprocess.PIPE - ) - out, _ = p.communicate() - return out.startswith(b"libprotoc") - except: - return False - def __call__(self, data, **metadata): - if not self.is_available(): - raise NotImplementedError("protoc not found. Please make sure 'protoc' is available in $PATH.") - - # if Popen raises OSError, it will be caught in - # get_content_view and fall back to Raw - p = subprocess.Popen(['protoc', '--decode_raw'], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - decoded, _ = p.communicate(input=data) + decoded = format_pbuf(data) if not decoded: raise ValueError("Failed to parse input.") + return "Protobuf", base.format_text(decoded) diff --git a/mitmproxy/contrib/kaitaistruct/google_protobuf.py b/mitmproxy/contrib/kaitaistruct/google_protobuf.py new file mode 100644 index 000000000..fe2336cc9 --- /dev/null +++ b/mitmproxy/contrib/kaitaistruct/google_protobuf.py @@ -0,0 +1,124 @@ +# This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild + +from pkg_resources import parse_version +from kaitaistruct import __version__ as ks_version, KaitaiStruct, KaitaiStream, BytesIO +from enum import Enum + + +if parse_version(ks_version) < parse_version('0.7'): + raise Exception("Incompatible Kaitai Struct Python API: 0.7 or later is required, but you have %s" % (ks_version)) + +from .vlq_base128_le import VlqBase128Le +class GoogleProtobuf(KaitaiStruct): + """Google Protocol Buffers (AKA protobuf) is a popular data + serialization scheme used for communication protocols, data storage, + etc. There are implementations are available for almost every + popular language. The focus points of this scheme are brevity (data + is encoded in a very size-efficient manner) and extensibility (one + can add keys to the structure, while keeping it readable in previous + version of software). + + Protobuf uses semi-self-describing encoding scheme for its + messages. It means that it is possible to parse overall structure of + the message (skipping over fields one can't understand), but to + fully understand the message, one needs a protocol definition file + (`.proto`). To be specific: + + * "Keys" in key-value pairs provided in the message are identified + only with an integer "field tag". `.proto` file provides info on + which symbolic field names these field tags map to. + * "Keys" also provide something called "wire type". It's not a data + type in its common sense (i.e. you can't, for example, distinguish + `sint32` vs `uint32` vs some enum, or `string` from `bytes`), but + it's enough information to determine how many bytes to + parse. Interpretation of the value should be done according to the + type specified in `.proto` file. + * There's no direct information on which fields are optional / + required, which fields may be repeated or constitute a map, what + restrictions are placed on fields usage in a single message, what + are the fields' default values, etc, etc. + + .. seealso:: + Source - https://developers.google.com/protocol-buffers/docs/encoding + """ + def __init__(self, _io, _parent=None, _root=None): + self._io = _io + self._parent = _parent + self._root = _root if _root else self + self._read() + + def _read(self): + self.pairs = [] + while not self._io.is_eof(): + self.pairs.append(self._root.Pair(self._io, self, self._root)) + + + class Pair(KaitaiStruct): + """Key-value pair.""" + + class WireTypes(Enum): + varint = 0 + bit_64 = 1 + len_delimited = 2 + group_start = 3 + group_end = 4 + bit_32 = 5 + def __init__(self, _io, _parent=None, _root=None): + self._io = _io + self._parent = _parent + self._root = _root if _root else self + self._read() + + def _read(self): + self.key = VlqBase128Le(self._io) + _on = self.wire_type + if _on == self._root.Pair.WireTypes.varint: + self.value = VlqBase128Le(self._io) + elif _on == self._root.Pair.WireTypes.len_delimited: + self.value = self._root.DelimitedBytes(self._io, self, self._root) + elif _on == self._root.Pair.WireTypes.bit_64: + self.value = self._io.read_u8le() + elif _on == self._root.Pair.WireTypes.bit_32: + self.value = self._io.read_u4le() + + @property + def wire_type(self): + """"Wire type" is a part of the "key" that carries enough + information to parse value from the wire, i.e. read correct + amount of bytes, but there's not enough informaton to + interprete in unambiguously. For example, one can't clearly + distinguish 64-bit fixed-sized integers from 64-bit floats, + signed zigzag-encoded varints from regular unsigned varints, + arbitrary bytes from UTF-8 encoded strings, etc. + """ + if hasattr(self, '_m_wire_type'): + return self._m_wire_type if hasattr(self, '_m_wire_type') else None + + self._m_wire_type = self._root.Pair.WireTypes((self.key.value & 7)) + return self._m_wire_type if hasattr(self, '_m_wire_type') else None + + @property + def field_tag(self): + """Identifies a field of protocol. One can look up symbolic + field name in a `.proto` file by this field tag. + """ + if hasattr(self, '_m_field_tag'): + return self._m_field_tag if hasattr(self, '_m_field_tag') else None + + self._m_field_tag = (self.key.value >> 3) + return self._m_field_tag if hasattr(self, '_m_field_tag') else None + + + class DelimitedBytes(KaitaiStruct): + def __init__(self, _io, _parent=None, _root=None): + self._io = _io + self._parent = _parent + self._root = _root if _root else self + self._read() + + def _read(self): + self.len = VlqBase128Le(self._io) + self.body = self._io.read_bytes(self.len.value) + + + diff --git a/mitmproxy/contrib/kaitaistruct/make.sh b/mitmproxy/contrib/kaitaistruct/make.sh index 789829cf6..0a30358aa 100755 --- a/mitmproxy/contrib/kaitaistruct/make.sh +++ b/mitmproxy/contrib/kaitaistruct/make.sh @@ -7,5 +7,7 @@ wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/image/jpeg.ksy wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/image/png.ksy wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/image/ico.ksy +wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/common/vlq_base128_le.ksy +wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/serialization/google_protobuf.ksy kaitai-struct-compiler --target python --opaque-types=true *.ksy diff --git a/mitmproxy/contrib/kaitaistruct/vlq_base128_le.py b/mitmproxy/contrib/kaitaistruct/vlq_base128_le.py new file mode 100644 index 000000000..235759b7a --- /dev/null +++ b/mitmproxy/contrib/kaitaistruct/vlq_base128_le.py @@ -0,0 +1,94 @@ +# This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild + +from pkg_resources import parse_version +from kaitaistruct import __version__ as ks_version, KaitaiStruct, KaitaiStream, BytesIO + + +if parse_version(ks_version) < parse_version('0.7'): + raise Exception("Incompatible Kaitai Struct Python API: 0.7 or later is required, but you have %s" % (ks_version)) + +class VlqBase128Le(KaitaiStruct): + """A variable-length unsigned integer using base128 encoding. 1-byte groups + consists of 1-bit flag of continuation and 7-bit value, and are ordered + "least significant group first", i.e. in "little-endian" manner. + + This particular encoding is specified and used in: + + * DWARF debug file format, where it's dubbed "unsigned LEB128" or "ULEB128". + http://dwarfstd.org/doc/dwarf-2.0.0.pdf - page 139 + * Google Protocol Buffers, where it's called "Base 128 Varints". + https://developers.google.com/protocol-buffers/docs/encoding?csw=1#varints + * Apache Lucene, where it's called "VInt" + http://lucene.apache.org/core/3_5_0/fileformats.html#VInt + * Apache Avro uses this as a basis for integer encoding, adding ZigZag on + top of it for signed ints + http://avro.apache.org/docs/current/spec.html#binary_encode_primitive + + More information on this encoding is available at https://en.wikipedia.org/wiki/LEB128 + + This particular implementation supports serialized values to up 8 bytes long. + """ + def __init__(self, _io, _parent=None, _root=None): + self._io = _io + self._parent = _parent + self._root = _root if _root else self + self._read() + + def _read(self): + self.groups = [] + while True: + _ = self._root.Group(self._io, self, self._root) + self.groups.append(_) + if not (_.has_next): + break + + class Group(KaitaiStruct): + """One byte group, clearly divided into 7-bit "value" and 1-bit "has continuation + in the next byte" flag. + """ + def __init__(self, _io, _parent=None, _root=None): + self._io = _io + self._parent = _parent + self._root = _root if _root else self + self._read() + + def _read(self): + self.b = self._io.read_u1() + + @property + def has_next(self): + """If true, then we have more bytes to read.""" + if hasattr(self, '_m_has_next'): + return self._m_has_next if hasattr(self, '_m_has_next') else None + + self._m_has_next = (self.b & 128) != 0 + return self._m_has_next if hasattr(self, '_m_has_next') else None + + @property + def value(self): + """The 7-bit (base128) numeric value of this group.""" + if hasattr(self, '_m_value'): + return self._m_value if hasattr(self, '_m_value') else None + + self._m_value = (self.b & 127) + return self._m_value if hasattr(self, '_m_value') else None + + + @property + def len(self): + if hasattr(self, '_m_len'): + return self._m_len if hasattr(self, '_m_len') else None + + self._m_len = len(self.groups) + return self._m_len if hasattr(self, '_m_len') else None + + @property + def value(self): + """Resulting value as normal integer.""" + if hasattr(self, '_m_value'): + return self._m_value if hasattr(self, '_m_value') else None + + self._m_value = (((((((self.groups[0].value + ((self.groups[1].value << 7) if self.len >= 2 else 0)) + ((self.groups[2].value << 14) if self.len >= 3 else 0)) + ((self.groups[3].value << 21) if self.len >= 4 else 0)) + ((self.groups[4].value << 28) if self.len >= 5 else 0)) + ((self.groups[5].value << 35) if self.len >= 6 else 0)) + ((self.groups[6].value << 42) if self.len >= 7 else 0)) + ((self.groups[7].value << 49) if self.len >= 8 else 0)) + return self._m_value if hasattr(self, '_m_value') else None + + diff --git a/test/mitmproxy/contentviews/test_protobuf.py b/test/mitmproxy/contentviews/test_protobuf.py index 71e515769..6c6e37f2b 100644 --- a/test/mitmproxy/contentviews/test_protobuf.py +++ b/test/mitmproxy/contentviews/test_protobuf.py @@ -1,52 +1,31 @@ -from unittest import mock import pytest from mitmproxy.contentviews import protobuf from mitmproxy.test import tutils from . import full_eval +data = tutils.test_data.push("mitmproxy/contentviews/test_protobuf_data/") + def test_view_protobuf_request(): v = full_eval(protobuf.ViewProtobuf()) - p = tutils.test_data.path("mitmproxy/data/protobuf01") + p = data.path("protobuf01") - with mock.patch('mitmproxy.contentviews.protobuf.ViewProtobuf.is_available'): - with mock.patch('subprocess.Popen') as n: - m = mock.Mock() - attrs = {'communicate.return_value': (b'1: "3bbc333c-e61c-433b-819a-0b9a8cc103b8"', True)} - m.configure_mock(**attrs) - n.return_value = m - - with open(p, "rb") as f: - data = f.read() - content_type, output = v(data) - assert content_type == "Protobuf" - assert output[0] == [('text', b'1: "3bbc333c-e61c-433b-819a-0b9a8cc103b8"')] - - m.communicate = mock.MagicMock() - m.communicate.return_value = (None, None) - with pytest.raises(ValueError, matches="Failed to parse input."): - v(b'foobar') + with open(p, "rb") as f: + raw = f.read() + content_type, output = v(raw) + assert content_type == "Protobuf" + assert output == [[('text', '1: 3bbc333c-e61c-433b-819a-0b9a8cc103b8')]] + with pytest.raises(ValueError, matches="Failed to parse input."): + v(b'foobar') -def test_view_protobuf_availability(): - with mock.patch('subprocess.Popen') as n: - m = mock.Mock() - attrs = {'communicate.return_value': (b'libprotoc fake version', True)} - m.configure_mock(**attrs) - n.return_value = m - assert protobuf.ViewProtobuf().is_available() +@pytest.mark.parametrize("filename", ["protobuf02", "protobuf03"]) +def test_format_pbuf(filename): + path = data.path(filename) + with open(path, "rb") as f: + input = f.read() + with open(path + "-decoded") as f: + expected = f.read() - m = mock.Mock() - attrs = {'communicate.return_value': (b'command not found', True)} - m.configure_mock(**attrs) - n.return_value = m - assert not protobuf.ViewProtobuf().is_available() - - -def test_view_protobuf_fallback(): - with mock.patch('subprocess.Popen.communicate') as m: - m.side_effect = OSError() - v = full_eval(protobuf.ViewProtobuf()) - with pytest.raises(NotImplementedError, matches='protoc not found'): - v(b'foobar') + assert protobuf.format_pbuf(input) == expected diff --git a/test/mitmproxy/data/protobuf01 b/test/mitmproxy/contentviews/test_protobuf_data/protobuf01 similarity index 100% rename from test/mitmproxy/data/protobuf01 rename to test/mitmproxy/contentviews/test_protobuf_data/protobuf01 diff --git a/test/mitmproxy/contentviews/test_protobuf_data/protobuf02 b/test/mitmproxy/contentviews/test_protobuf_data/protobuf02 new file mode 100644 index 000000000..a47c45d51 Binary files /dev/null and b/test/mitmproxy/contentviews/test_protobuf_data/protobuf02 differ diff --git a/test/mitmproxy/contentviews/test_protobuf_data/protobuf02-decoded b/test/mitmproxy/contentviews/test_protobuf_data/protobuf02-decoded new file mode 100644 index 000000000..9be61e28e --- /dev/null +++ b/test/mitmproxy/contentviews/test_protobuf_data/protobuf02-decoded @@ -0,0 +1,65 @@ +1 { + 1: tpbuf + 4 { + 1: Person + 2 { + 1: name + 3: 1 + 4: 2 + 5: 9 + } + 2 { + 1: id + 3: 2 + 4: 2 + 5: 5 + } + 2 { + 1 { + 12: 1818845549 + } + 3: 3 + 4: 1 + 5: 9 + } + 2 { + 1: phone + 3: 4 + 4: 3 + 5: 11 + 6: .Person.PhoneNumber + } + 3 { + 1: PhoneNumber + 2 { + 1: number + 3: 1 + 4: 2 + 5: 9 + } + 2 { + 1: type + 3: 2 + 4: 1 + 5: 14 + 6: .Person.PhoneType + 7: HOME + } + } + 4 { + 1: PhoneType + 2 { + 1: MOBILE + 2: 0 + } + 2 { + 1: HOME + 2: 1 + } + 2 { + 1: WORK + 2: 2 + } + } + } +} diff --git a/test/mitmproxy/contentviews/test_protobuf_data/protobuf03 b/test/mitmproxy/contentviews/test_protobuf_data/protobuf03 new file mode 100644 index 000000000..9fb230b3a --- /dev/null +++ b/test/mitmproxy/contentviews/test_protobuf_data/protobuf03 @@ -0,0 +1 @@ +€ ð \ No newline at end of file diff --git a/test/mitmproxy/contentviews/test_protobuf_data/protobuf03-decoded b/test/mitmproxy/contentviews/test_protobuf_data/protobuf03-decoded new file mode 100644 index 000000000..3d3392e16 --- /dev/null +++ b/test/mitmproxy/contentviews/test_protobuf_data/protobuf03-decoded @@ -0,0 +1,4 @@ +2 { +3: 3840 +4: 2160 +}