Merge pull request #2427 from ujjwal96/protobuf

Kaitai parser for protobuf
2025-01-30 14:58:38 +00:00 · 2017-07-10 22:58:38 +02:00 · 2017-07-10 22:58:38 +02:00 · 3814f171dd
commit 3814f171dd
parent b115c25dcc 3f269d2b68
10 changed files with 368 additions and 61 deletions
--- a/mitmproxy/contentviews/protobuf.py
+++ b/mitmproxy/contentviews/protobuf.py
@ -1,6 +1,63 @@
-import subprocess
+import io

+from kaitaistruct import KaitaiStream
 from . import base
+from mitmproxy.contrib.kaitaistruct import google_protobuf
+
+
+def write_buf(out, field_tag, body, indent_level):
+    """Write one line of decoded protobuf text to ``out``.
+
+    The kind of line is selected by the arguments:
+
+    * ``body`` is not None: a scalar field, written as ``<tag>: <value>``.
+    * ``body`` is None and ``field_tag`` is not None: the opening line of a
+      nested message or group, written as ``<tag> {``.
+    * both are None: the closing line ``}``.
+
+    ``out`` is any object with a ``write(str)`` method and ``indent_level``
+    is the number of spaces written before the line. Integer bodies are
+    printed as-is; any other body is decoded as UTF-8 bytes.
+    """
+    indent = ' ' * indent_level
+    if body is not None:
+        if isinstance(body, int):
+            text = body
+        else:
+            # Length-delimited payloads are arbitrary bytes, not necessarily
+            # text. Escape undecodable bytes instead of raising
+            # UnicodeDecodeError, which would abort the whole rendering.
+            text = str(body, 'utf-8', 'backslashreplace')
+        out.write("{}{}: {}\n".format(indent, field_tag, text))
+    elif field_tag is not None:
+        out.write(indent + str(field_tag) + " {\n")
+    else:
+        out.write(indent + "}\n")
+
+
+def format_pbuf(raw):
+    """Decode raw protobuf bytes into indented, ``protoc``-like text.
+
+    The message is parsed without a schema. Every non-empty length-delimited
+    field is speculatively re-parsed as a nested message: if that succeeds it
+    is rendered as ``<tag> { ... }`` with its children indented two more
+    spaces, otherwise it is rendered as a scalar through ``write_buf``.
+    Group start/end markers are rendered as ``<tag> {`` and ``}`` at the
+    current indentation.
+
+    Returns the formatted text, or ``False`` if ``raw`` cannot be parsed as
+    protobuf at all. Input without any pairs yields an empty string.
+    """
+    try:
+        buf = google_protobuf.GoogleProtobuf(KaitaiStream(io.BytesIO(raw)))
+    except Exception:
+        return False
+
+    out = io.StringIO()
+    # Explicit (pair, indent) stack for a depth-first walk. Pairs are pushed
+    # in reverse so that they are popped in wire order.
+    stack = [(pair, 0) for pair in reversed(buf.pairs)]
+
+    while stack:
+        pair, indent_level = stack.pop()
+        wire_types = pair.WireTypes
+        field_tag = pair.field_tag
+        body = None
+        children = []
+
+        if pair.wire_type == wire_types.group_start:
+            pass  # no payload: rendered as "<tag> {"
+        elif pair.wire_type == wire_types.group_end:
+            # No payload: rendered as "}". Use a local instead of overwriting
+            # the parsed object's cached field tag.
+            field_tag = None
+        elif pair.wire_type == wire_types.len_delimited:
+            body = pair.value.body
+            # An empty payload would "parse" as a message without pairs and
+            # leave an unbalanced "<tag> {"; treat it as an empty scalar.
+            if body:
+                try:
+                    children = google_protobuf.GoogleProtobuf(
+                        KaitaiStream(io.BytesIO(body))
+                    ).pairs
+                except Exception:
+                    # Not a valid nested message: fall back to a scalar.
+                    children = []
+        elif pair.wire_type == wire_types.varint:
+            body = pair.value.value
+        else:
+            # bit_64 / bit_32: already plain integers.
+            body = pair.value
+
+        if children:
+            stack.extend((child, indent_level + 2) for child in reversed(children))
+            write_buf(out, field_tag, None, indent_level)
+        else:
+            write_buf(out, field_tag, body, indent_level)
+
+        # If the next pair to be written is shallower (or nothing is left),
+        # close every nested message that ends here, innermost first.
+        prev_level = stack[-1][1] if stack else 0
+        for level in range(indent_level - 2, prev_level - 2, -2):
+            write_buf(out, None, None, level)
+
+    return out.getvalue()


 class ViewProtobuf(base.View):
@ -15,28 +72,9 @@ class ViewProtobuf(base.View):
        "application/x-protobuffer",
    ]

-    def is_available(self):
-        try:
-            p = subprocess.Popen(
-                ["protoc", "--version"],
-                stdout=subprocess.PIPE
-            )
-            out, _ = p.communicate()
-            return out.startswith(b"libprotoc")
-        except:
-            return False
-
    def __call__(self, data, **metadata):
-        if not self.is_available():
-            raise NotImplementedError("protoc not found. Please make sure 'protoc' is available in $PATH.")
-
-        # if Popen raises OSError, it will be caught in
-        # get_content_view and fall back to Raw
-        p = subprocess.Popen(['protoc', '--decode_raw'],
-                             stdin=subprocess.PIPE,
-                             stdout=subprocess.PIPE,
-                             stderr=subprocess.PIPE)
-        decoded, _ = p.communicate(input=data)
+        decoded = format_pbuf(data)
        if not decoded:
            raise ValueError("Failed to parse input.")
+
        return "Protobuf", base.format_text(decoded)
--- a/mitmproxy/contrib/kaitaistruct/google_protobuf.py
+++ b/mitmproxy/contrib/kaitaistruct/google_protobuf.py
@ -0,0 +1,124 @@
+# This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild
+
+from pkg_resources import parse_version
+from kaitaistruct import __version__ as ks_version, KaitaiStruct, KaitaiStream, BytesIO
+from enum import Enum
+
+
+if parse_version(ks_version) < parse_version('0.7'):
+    raise Exception("Incompatible Kaitai Struct Python API: 0.7 or later is required, but you have %s" % (ks_version))
+
+from .vlq_base128_le import VlqBase128Le
+class GoogleProtobuf(KaitaiStruct):
+    """Google Protocol Buffers (AKA protobuf) is a popular data
+    serialization scheme used for communication protocols, data storage,
+    etc. Implementations are available for almost every
+    popular language. The focus points of this scheme are brevity (data
+    is encoded in a very size-efficient manner) and extensibility (one
+    can add keys to the structure, while keeping it readable in previous
+    version of software).
+    
+    Protobuf uses semi-self-describing encoding scheme for its
+    messages. It means that it is possible to parse overall structure of
+    the message (skipping over fields one can't understand), but to
+    fully understand the message, one needs a protocol definition file
+    (`.proto`). To be specific:
+    
+    * "Keys" in key-value pairs provided in the message are identified
+      only with an integer "field tag". `.proto` file provides info on
+      which symbolic field names these field tags map to.
+    * "Keys" also provide something called "wire type". It's not a data
+      type in its common sense (i.e. you can't, for example, distinguish
+      `sint32` vs `uint32` vs some enum, or `string` from `bytes`), but
+      it's enough information to determine how many bytes to
+      parse. Interpretation of the value should be done according to the
+      type specified in `.proto` file.
+    * There's no direct information on which fields are optional /
+      required, which fields may be repeated or constitute a map, what
+      restrictions are placed on fields usage in a single message, what
+      are the fields' default values, etc, etc.
+    
+    .. seealso::
+       Source - https://developers.google.com/protocol-buffers/docs/encoding
+    """
+    def __init__(self, _io, _parent=None, _root=None):
+        self._io = _io
+        self._parent = _parent
+        self._root = _root if _root else self
+        # Parsing is eager: the whole stream is consumed on construction.
+        self._read()
+
+    def _read(self):
+        # A message is read as a flat sequence of key-value pairs until the
+        # stream is exhausted.
+        self.pairs = []
+        while not self._io.is_eof():
+            self.pairs.append(self._root.Pair(self._io, self, self._root))
+
+
+    class Pair(KaitaiStruct):
+        """Key-value pair."""
+
+        class WireTypes(Enum):
+            varint = 0
+            bit_64 = 1
+            len_delimited = 2
+            group_start = 3
+            group_end = 4
+            bit_32 = 5
+        def __init__(self, _io, _parent=None, _root=None):
+            self._io = _io
+            self._parent = _parent
+            self._root = _root if _root else self
+            self._read()
+
+        def _read(self):
+            self.key = VlqBase128Le(self._io)
+            _on = self.wire_type
+            # group_start / group_end match no branch below, so `value` is
+            # left unset for those pairs.
+            if _on == self._root.Pair.WireTypes.varint:
+                self.value = VlqBase128Le(self._io)
+            elif _on == self._root.Pair.WireTypes.len_delimited:
+                self.value = self._root.DelimitedBytes(self._io, self, self._root)
+            elif _on == self._root.Pair.WireTypes.bit_64:
+                self.value = self._io.read_u8le()
+            elif _on == self._root.Pair.WireTypes.bit_32:
+                self.value = self._io.read_u4le()
+
+        @property
+        def wire_type(self):
+            """"Wire type" is a part of the "key" that carries enough
+            information to parse value from the wire, i.e. read correct
+            amount of bytes, but there's not enough information to
+            interpret it unambiguously. For example, one can't clearly
+            distinguish 64-bit fixed-sized integers from 64-bit floats,
+            signed zigzag-encoded varints from regular unsigned varints,
+            arbitrary bytes from UTF-8 encoded strings, etc.
+            """
+            # Computed once and cached on the instance as `_m_wire_type`.
+            if hasattr(self, '_m_wire_type'):
+                return self._m_wire_type if hasattr(self, '_m_wire_type') else None
+
+            # Low three bits of the key; the Enum lookup raises ValueError for
+            # values that are not members of WireTypes (6 and 7).
+            self._m_wire_type = self._root.Pair.WireTypes((self.key.value & 7))
+            return self._m_wire_type if hasattr(self, '_m_wire_type') else None
+
+        @property
+        def field_tag(self):
+            """Identifies a field of protocol. One can look up symbolic
+            field name in a `.proto` file by this field tag.
+            """
+            # Computed once and cached on the instance as `_m_field_tag`.
+            if hasattr(self, '_m_field_tag'):
+                return self._m_field_tag if hasattr(self, '_m_field_tag') else None
+
+            # Everything in the key above the three wire-type bits.
+            self._m_field_tag = (self.key.value >> 3)
+            return self._m_field_tag if hasattr(self, '_m_field_tag') else None
+
+
+    class DelimitedBytes(KaitaiStruct):
+        """Length-prefixed byte string: a varint `len` followed by `len.value` bytes of `body`."""
+        def __init__(self, _io, _parent=None, _root=None):
+            self._io = _io
+            self._parent = _parent
+            self._root = _root if _root else self
+            self._read()
+
+        def _read(self):
+            self.len = VlqBase128Le(self._io)
+            self.body = self._io.read_bytes(self.len.value)
+
+
+
--- a/mitmproxy/contrib/kaitaistruct/make.sh
+++ b/mitmproxy/contrib/kaitaistruct/make.sh
@ -7,5 +7,7 @@ wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master
 wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/image/jpeg.ksy
 wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/image/png.ksy
 wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/image/ico.ksy
+wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/common/vlq_base128_le.ksy
+wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/serialization/google_protobuf.ksy

 kaitai-struct-compiler --target python --opaque-types=true *.ksy
--- a/mitmproxy/contrib/kaitaistruct/vlq_base128_le.py
+++ b/mitmproxy/contrib/kaitaistruct/vlq_base128_le.py
@ -0,0 +1,94 @@
+# This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild
+
+from pkg_resources import parse_version
+from kaitaistruct import __version__ as ks_version, KaitaiStruct, KaitaiStream, BytesIO
+
+
+if parse_version(ks_version) < parse_version('0.7'):
+    raise Exception("Incompatible Kaitai Struct Python API: 0.7 or later is required, but you have %s" % (ks_version))
+
+class VlqBase128Le(KaitaiStruct):
+    """A variable-length unsigned integer using base128 encoding. 1-byte groups
+    consists of 1-bit flag of continuation and 7-bit value, and are ordered
+    "least significant group first", i.e. in "little-endian" manner.
+    
+    This particular encoding is specified and used in:
+    
+    * DWARF debug file format, where it's dubbed "unsigned LEB128" or "ULEB128".
+      http://dwarfstd.org/doc/dwarf-2.0.0.pdf - page 139
+    * Google Protocol Buffers, where it's called "Base 128 Varints".
+      https://developers.google.com/protocol-buffers/docs/encoding?csw=1#varints
+    * Apache Lucene, where it's called "VInt"
+      http://lucene.apache.org/core/3_5_0/fileformats.html#VInt
+    * Apache Avro uses this as a basis for integer encoding, adding ZigZag on
+      top of it for signed ints
+      http://avro.apache.org/docs/current/spec.html#binary_encode_primitive
+    
+    More information on this encoding is available at https://en.wikipedia.org/wiki/LEB128
+    
+    This particular implementation supports serialized values up to 8 bytes long.
+    """
+    def __init__(self, _io, _parent=None, _root=None):
+        self._io = _io
+        self._parent = _parent
+        self._root = _root if _root else self
+        self._read()
+
+    def _read(self):
+        # Read one single-byte group at a time and stop after the first group
+        # whose continuation flag is clear. No upper bound on the number of
+        # groups is enforced here.
+        self.groups = []
+        while True:
+            _ = self._root.Group(self._io, self, self._root)
+            self.groups.append(_)
+            if not (_.has_next):
+                break
+
+    class Group(KaitaiStruct):
+        """One byte group, clearly divided into 7-bit "value" and 1-bit "has continuation
+        in the next byte" flag.
+        """
+        def __init__(self, _io, _parent=None, _root=None):
+            self._io = _io
+            self._parent = _parent
+            self._root = _root if _root else self
+            self._read()
+
+        def _read(self):
+            self.b = self._io.read_u1()
+
+        @property
+        def has_next(self):
+            """If true, then we have more bytes to read."""
+            if hasattr(self, '_m_has_next'):
+                return self._m_has_next if hasattr(self, '_m_has_next') else None
+
+            # Top bit of the byte is the continuation flag.
+            self._m_has_next = (self.b & 128) != 0
+            return self._m_has_next if hasattr(self, '_m_has_next') else None
+
+        @property
+        def value(self):
+            """The 7-bit (base128) numeric value of this group."""
+            if hasattr(self, '_m_value'):
+                return self._m_value if hasattr(self, '_m_value') else None
+
+            self._m_value = (self.b & 127)
+            return self._m_value if hasattr(self, '_m_value') else None
+
+
+    @property
+    def len(self):
+        """Number of byte groups that were read for this integer."""
+        if hasattr(self, '_m_len'):
+            return self._m_len if hasattr(self, '_m_len') else None
+
+        self._m_len = len(self.groups)
+        return self._m_len if hasattr(self, '_m_len') else None
+
+    @property
+    def value(self):
+        """Resulting value as normal integer."""
+        if hasattr(self, '_m_value'):
+            return self._m_value if hasattr(self, '_m_value') else None
+
+        # Group i contributes its 7 bits shifted left by 7 * i. Only groups
+        # 0..7 are combined: if more than eight groups were read, the extra
+        # groups are ignored and the result is truncated.
+        # NOTE(review): a fix belongs in the source .ksy, not in this generated file.
+        self._m_value = (((((((self.groups[0].value + ((self.groups[1].value << 7) if self.len >= 2 else 0)) + ((self.groups[2].value << 14) if self.len >= 3 else 0)) + ((self.groups[3].value << 21) if self.len >= 4 else 0)) + ((self.groups[4].value << 28) if self.len >= 5 else 0)) + ((self.groups[5].value << 35) if self.len >= 6 else 0)) + ((self.groups[6].value << 42) if self.len >= 7 else 0)) + ((self.groups[7].value << 49) if self.len >= 8 else 0))
+        return self._m_value if hasattr(self, '_m_value') else None
+
+
--- a/test/mitmproxy/contentviews/test_protobuf.py
+++ b/test/mitmproxy/contentviews/test_protobuf.py
@ -1,52 +1,31 @@
-from unittest import mock
 import pytest

 from mitmproxy.contentviews import protobuf
 from mitmproxy.test import tutils
 from . import full_eval

+data = tutils.test_data.push("mitmproxy/contentviews/test_protobuf_data/")
+

 def test_view_protobuf_request():
    v = full_eval(protobuf.ViewProtobuf())
-    p = tutils.test_data.path("mitmproxy/data/protobuf01")
+    p = data.path("protobuf01")

-    with mock.patch('mitmproxy.contentviews.protobuf.ViewProtobuf.is_available'):
-        with mock.patch('subprocess.Popen') as n:
-            m = mock.Mock()
-            attrs = {'communicate.return_value': (b'1: "3bbc333c-e61c-433b-819a-0b9a8cc103b8"', True)}
-            m.configure_mock(**attrs)
-            n.return_value = m
-
-            with open(p, "rb") as f:
-                data = f.read()
-            content_type, output = v(data)
-            assert content_type == "Protobuf"
-            assert output[0] == [('text', b'1: "3bbc333c-e61c-433b-819a-0b9a8cc103b8"')]
-
-            m.communicate = mock.MagicMock()
-            m.communicate.return_value = (None, None)
-            with pytest.raises(ValueError, matches="Failed to parse input."):
-                v(b'foobar')
+    # The fixture holds a single string field; the view must render it as text.
+    with open(p, "rb") as f:
+        raw = f.read()
+    content_type, output = v(raw)
+    assert content_type == "Protobuf"
+    assert output == [[('text', '1: 3bbc333c-e61c-433b-819a-0b9a8cc103b8')]]
+    # pytest.raises takes ``match`` (a regex searched in the message);
+    # ``matches`` is not a supported keyword, so the message went unchecked.
+    with pytest.raises(ValueError, match="Failed to parse input."):
+        v(b'foobar')


-def test_view_protobuf_availability():
-    with mock.patch('subprocess.Popen') as n:
-        m = mock.Mock()
-        attrs = {'communicate.return_value': (b'libprotoc fake version', True)}
-        m.configure_mock(**attrs)
-        n.return_value = m
-        assert protobuf.ViewProtobuf().is_available()
+@pytest.mark.parametrize("filename", ["protobuf02", "protobuf03"])
+def test_format_pbuf(filename):
+    # Each binary fixture has a sibling "<name>-decoded" file holding the
+    # exact text format_pbuf is expected to produce.
+    path = data.path(filename)
+    with open(path, "rb") as f:
+        raw = f.read()  # named `raw` so the builtin `input` is not shadowed
+    with open(path + "-decoded") as f:
+        expected = f.read()

-        m = mock.Mock()
-        attrs = {'communicate.return_value': (b'command not found', True)}
-        m.configure_mock(**attrs)
-        n.return_value = m
-        assert not protobuf.ViewProtobuf().is_available()
-
-
-def test_view_protobuf_fallback():
-    with mock.patch('subprocess.Popen.communicate') as m:
-        m.side_effect = OSError()
-        v = full_eval(protobuf.ViewProtobuf())
-        with pytest.raises(NotImplementedError, matches='protoc not found'):
-            v(b'foobar')
+    assert protobuf.format_pbuf(raw) == expected
--- a/test/mitmproxy/contentviews/test_protobuf_data/protobuf01
+++ b/test/mitmproxy/contentviews/test_protobuf_data/protobuf01
--- a/test/mitmproxy/contentviews/test_protobuf_data/protobuf02
+++ b/test/mitmproxy/contentviews/test_protobuf_data/protobuf02
--- a/test/mitmproxy/contentviews/test_protobuf_data/protobuf02-decoded
+++ b/test/mitmproxy/contentviews/test_protobuf_data/protobuf02-decoded
@ -0,0 +1,65 @@
+1 {
+  1: tpbuf
+  4 {
+    1: Person
+    2 {
+      1: name
+      3: 1
+      4: 2
+      5: 9
+    }
+    2 {
+      1: id
+      3: 2
+      4: 2
+      5: 5
+    }
+    2 {
+      1 {
+        12: 1818845549
+      }
+      3: 3
+      4: 1
+      5: 9
+    }
+    2 {
+      1: phone
+      3: 4
+      4: 3
+      5: 11
+      6: .Person.PhoneNumber
+    }
+    3 {
+      1: PhoneNumber
+      2 {
+        1: number
+        3: 1
+        4: 2
+        5: 9
+      }
+      2 {
+        1: type
+        3: 2
+        4: 1
+        5: 14
+        6: .Person.PhoneType
+        7: HOME
+      }
+    }
+    4 {
+      1: PhoneType
+      2 {
+        1: MOBILE
+        2: 0
+      }
+      2 {
+        1: HOME
+        2: 1
+      }
+      2 {
+        1: WORK
+        2: 2
+      }
+    }
+  }
+}
--- a/test/mitmproxy/contentviews/test_protobuf_data/protobuf03
+++ b/test/mitmproxy/contentviews/test_protobuf_data/protobuf03
@ -0,0 +1 @@
+<18> <20>
--- a/test/mitmproxy/contentviews/test_protobuf_data/protobuf03-decoded
+++ b/test/mitmproxy/contentviews/test_protobuf_data/protobuf03-decoded
@ -0,0 +1,4 @@
+2 {
+3: 3840
+4: 2160
+}