Merge pull request #2427 from ujjwal96/protobuf

Kaitai parser for protobuf
This commit is contained in:
Thomas Kriechbaumer 2017-07-10 22:58:38 +02:00 committed by GitHub
commit 3814f171dd
10 changed files with 368 additions and 61 deletions

View File

@ -1,6 +1,63 @@
import subprocess
import io
from kaitaistruct import KaitaiStream
from . import base
from mitmproxy.contrib.kaitaistruct import google_protobuf
def write_buf(out, field_tag, body, indent_level):
    """Emit one line of pretty-printed protobuf to *out*.

    Three shapes, chosen by the arguments:
      * body present          -> "<indent><tag>: <value>"  (ints printed
        as-is, bytes decoded as UTF-8)
      * body None, tag set    -> "<indent><tag> {"         (open a group)
      * body None, tag None   -> "<indent>}"               (close a group)
    """
    pad = ' ' * indent_level
    if body is not None:
        rendered = body if isinstance(body, int) else str(body, 'utf-8')
        out.write("{}{}: {}\n".format(pad, field_tag, rendered))
    elif field_tag is not None:
        out.write("{}{} {{\n".format(pad, field_tag))
    else:
        out.write(pad + "}\n")
def format_pbuf(raw):
    """Pretty-print raw protobuf bytes as an indented text tree.

    Parses *raw* with the Kaitai-generated ``GoogleProtobuf`` parser and
    writes one line per key-value pair.  A length-delimited body that
    itself parses as a protobuf message is expanded recursively (handled
    iteratively with an explicit stack, two spaces per nesting level).

    Returns the formatted string, or ``False`` when *raw* is not a
    parseable protobuf stream.
    """
    out = io.StringIO()
    stack = []
    try:
        buf = google_protobuf.GoogleProtobuf(KaitaiStream(io.BytesIO(raw)))
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; any parse failure means "not protobuf".
        return False
    # Reversed so that pairs pop off the stack in original order.
    stack.extend([(pair, 0) for pair in buf.pairs[::-1]])
    while len(stack):
        pair, indent_level = stack.pop()
        if pair.wire_type == pair.WireTypes.group_start:
            body = None
        elif pair.wire_type == pair.WireTypes.group_end:
            body = None
            # Pre-seed the memoized `field_tag` property with None so
            # write_buf falls through to the bare closing-brace branch.
            pair._m_field_tag = None
        elif pair.wire_type == pair.WireTypes.len_delimited:
            body = pair.value.body
        elif pair.wire_type == pair.WireTypes.varint:
            body = pair.value.value
        else:
            body = pair.value
        try:
            # Try to interpret the body as a nested message; on success
            # open a "tag {" block and queue its children one level deeper.
            # Non-bytes bodies (ints, None) fail here and fall through.
            next_buf = google_protobuf.GoogleProtobuf(KaitaiStream(io.BytesIO(body)))
            stack.extend([(pair, indent_level + 2) for pair in next_buf.pairs[::-1]])
            write_buf(out, pair.field_tag, None, indent_level)
        except Exception:
            write_buf(out, pair.field_tag, body, indent_level)

        if stack:
            prev_level = stack[-1][1]
        else:
            prev_level = 0

        if prev_level < indent_level:
            # Stepping back out: close one brace per two-space level.
            levels = int((indent_level - prev_level) / 2)
            for i in range(1, levels + 1):
                write_buf(out, None, None, indent_level - i * 2)
    return out.getvalue()
class ViewProtobuf(base.View):
@ -15,28 +72,9 @@ class ViewProtobuf(base.View):
"application/x-protobuffer",
]
def is_available(self):
try:
p = subprocess.Popen(
["protoc", "--version"],
stdout=subprocess.PIPE
)
out, _ = p.communicate()
return out.startswith(b"libprotoc")
except:
return False
def __call__(self, data, **metadata):
if not self.is_available():
raise NotImplementedError("protoc not found. Please make sure 'protoc' is available in $PATH.")
# if Popen raises OSError, it will be caught in
# get_content_view and fall back to Raw
p = subprocess.Popen(['protoc', '--decode_raw'],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
decoded, _ = p.communicate(input=data)
decoded = format_pbuf(data)
if not decoded:
raise ValueError("Failed to parse input.")
return "Protobuf", base.format_text(decoded)

View File

@ -0,0 +1,124 @@
# This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild
from pkg_resources import parse_version
from kaitaistruct import __version__ as ks_version, KaitaiStruct, KaitaiStream, BytesIO
from enum import Enum
if parse_version(ks_version) < parse_version('0.7'):
raise Exception("Incompatible Kaitai Struct Python API: 0.7 or later is required, but you have %s" % (ks_version))
from .vlq_base128_le import VlqBase128Le
class GoogleProtobuf(KaitaiStruct):
    """Google Protocol Buffers (AKA protobuf) is a popular data
    serialization scheme used for communication protocols, data storage,
    etc. There are implementations are available for almost every
    popular language. The focus points of this scheme are brevity (data
    is encoded in a very size-efficient manner) and extensibility (one
    can add keys to the structure, while keeping it readable in previous
    version of software).

    Protobuf uses semi-self-describing encoding scheme for its
    messages. It means that it is possible to parse overall structure of
    the message (skipping over fields one can't understand), but to
    fully understand the message, one needs a protocol definition file
    (`.proto`). To be specific:

    * "Keys" in key-value pairs provided in the message are identified
      only with an integer "field tag". `.proto` file provides info on
      which symbolic field names these field tags map to.
    * "Keys" also provide something called "wire type". It's not a data
      type in its common sense (i.e. you can't, for example, distinguish
      `sint32` vs `uint32` vs some enum, or `string` from `bytes`), but
      it's enough information to determine how many bytes to
      parse. Interpretation of the value should be done according to the
      type specified in `.proto` file.
    * There's no direct information on which fields are optional /
      required, which fields may be repeated or constitute a map, what
      restrictions are placed on fields usage in a single message, what
      are the fields' default values, etc, etc.

    .. seealso::
       Source - https://developers.google.com/protocol-buffers/docs/encoding
    """
    def __init__(self, _io, _parent=None, _root=None):
        self._io = _io
        self._parent = _parent
        self._root = _root if _root else self
        self._read()

    def _read(self):
        # A protobuf message is a flat run of key-value pairs that
        # continues until the underlying stream is exhausted.
        self.pairs = []
        while not self._io.is_eof():
            self.pairs.append(self._root.Pair(self._io, self, self._root))

    class Pair(KaitaiStruct):
        """Key-value pair."""

        class WireTypes(Enum):
            varint = 0
            bit_64 = 1
            len_delimited = 2
            group_start = 3
            group_end = 4
            bit_32 = 5

        def __init__(self, _io, _parent=None, _root=None):
            self._io = _io
            self._parent = _parent
            self._root = _root if _root else self
            self._read()

        def _read(self):
            # The "key" varint packs both the field tag and the wire type;
            # the wire type then selects how the value is parsed.
            self.key = VlqBase128Le(self._io)
            _on = self.wire_type
            if _on == self._root.Pair.WireTypes.varint:
                self.value = VlqBase128Le(self._io)
            elif _on == self._root.Pair.WireTypes.len_delimited:
                self.value = self._root.DelimitedBytes(self._io, self, self._root)
            elif _on == self._root.Pair.WireTypes.bit_64:
                self.value = self._io.read_u8le()
            elif _on == self._root.Pair.WireTypes.bit_32:
                self.value = self._io.read_u4le()
            # NOTE: group_start/group_end carry no value, so `self.value`
            # is left unset for those wire types.

        @property
        def wire_type(self):
            """"Wire type" is a part of the "key" that carries enough
            information to parse value from the wire, i.e. read correct
            amount of bytes, but there's not enough informaton to
            interprete in unambiguously. For example, one can't clearly
            distinguish 64-bit fixed-sized integers from 64-bit floats,
            signed zigzag-encoded varints from regular unsigned varints,
            arbitrary bytes from UTF-8 encoded strings, etc.
            """
            # Generated memoization: cached in `_m_wire_type` on first
            # access (low 3 bits of the key varint).
            if hasattr(self, '_m_wire_type'):
                return self._m_wire_type if hasattr(self, '_m_wire_type') else None

            self._m_wire_type = self._root.Pair.WireTypes((self.key.value & 7))
            return self._m_wire_type if hasattr(self, '_m_wire_type') else None

        @property
        def field_tag(self):
            """Identifies a field of protocol. One can look up symbolic
            field name in a `.proto` file by this field tag.
            """
            # Generated memoization via `_m_field_tag` (key >> 3).  NOTE:
            # format_pbuf() in contentviews/protobuf.py pre-seeds this
            # attribute to None for group_end pairs — do not rename it.
            if hasattr(self, '_m_field_tag'):
                return self._m_field_tag if hasattr(self, '_m_field_tag') else None

            self._m_field_tag = (self.key.value >> 3)
            return self._m_field_tag if hasattr(self, '_m_field_tag') else None

    class DelimitedBytes(KaitaiStruct):
        # Length-prefixed byte string: a varint length followed by that
        # many raw bytes (used for the `len_delimited` wire type).
        def __init__(self, _io, _parent=None, _root=None):
            self._io = _io
            self._parent = _parent
            self._root = _root if _root else self
            self._read()

        def _read(self):
            self.len = VlqBase128Le(self._io)
            self.body = self._io.read_bytes(self.len.value)

View File

@ -7,5 +7,7 @@ wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master
# Image-format grammars used by the image content views.
wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/image/jpeg.ksy
wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/image/png.ksy
wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/image/ico.ksy
# VLQ base-128 integers are a dependency of the protobuf grammar.
wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/common/vlq_base128_le.ksy
wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/serialization/google_protobuf.ksy
# Regenerate the Python parser modules from all downloaded grammars.
kaitai-struct-compiler --target python --opaque-types=true *.ksy

View File

@ -0,0 +1,94 @@
# This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild
from pkg_resources import parse_version
from kaitaistruct import __version__ as ks_version, KaitaiStruct, KaitaiStream, BytesIO
if parse_version(ks_version) < parse_version('0.7'):
raise Exception("Incompatible Kaitai Struct Python API: 0.7 or later is required, but you have %s" % (ks_version))
class VlqBase128Le(KaitaiStruct):
    """A variable-length unsigned integer using base128 encoding. 1-byte groups
    consists of 1-bit flag of continuation and 7-bit value, and are ordered
    "least significant group first", i.e. in "little-endian" manner.

    This particular encoding is specified and used in:

    * DWARF debug file format, where it's dubbed "unsigned LEB128" or "ULEB128".
      http://dwarfstd.org/doc/dwarf-2.0.0.pdf - page 139
    * Google Protocol Buffers, where it's called "Base 128 Varints".
      https://developers.google.com/protocol-buffers/docs/encoding?csw=1#varints
    * Apache Lucene, where it's called "VInt"
      http://lucene.apache.org/core/3_5_0/fileformats.html#VInt
    * Apache Avro uses this as a basis for integer encoding, adding ZigZag on
      top of it for signed ints
      http://avro.apache.org/docs/current/spec.html#binary_encode_primitive

    More information on this encoding is available at https://en.wikipedia.org/wiki/LEB128

    This particular implementation supports serialized values to up 8 bytes long.
    """
    def __init__(self, _io, _parent=None, _root=None):
        self._io = _io
        self._parent = _parent
        self._root = _root if _root else self
        self._read()

    def _read(self):
        # Read one-byte groups until a group's continuation bit is clear.
        self.groups = []
        while True:
            _ = self._root.Group(self._io, self, self._root)
            self.groups.append(_)
            if not (_.has_next):
                break

    class Group(KaitaiStruct):
        """One byte group, clearly divided into 7-bit "value" and 1-bit "has continuation
        in the next byte" flag.
        """
        def __init__(self, _io, _parent=None, _root=None):
            self._io = _io
            self._parent = _parent
            self._root = _root if _root else self
            self._read()

        def _read(self):
            self.b = self._io.read_u1()

        @property
        def has_next(self):
            """If true, then we have more bytes to read."""
            # Generated memoization: high bit (0x80) is the continuation flag.
            if hasattr(self, '_m_has_next'):
                return self._m_has_next if hasattr(self, '_m_has_next') else None

            self._m_has_next = (self.b & 128) != 0
            return self._m_has_next if hasattr(self, '_m_has_next') else None

        @property
        def value(self):
            """The 7-bit (base128) numeric value of this group."""
            if hasattr(self, '_m_value'):
                return self._m_value if hasattr(self, '_m_value') else None

            self._m_value = (self.b & 127)
            return self._m_value if hasattr(self, '_m_value') else None

    @property
    def len(self):
        # Number of one-byte groups consumed for this integer.
        if hasattr(self, '_m_len'):
            return self._m_len if hasattr(self, '_m_len') else None

        self._m_len = len(self.groups)
        return self._m_len if hasattr(self, '_m_len') else None

    @property
    def value(self):
        """Resulting value as normal integer."""
        # Little-endian recombination: group i contributes its 7 bits at
        # offset 7*i; explicit terms cap support at 8 serialized bytes.
        if hasattr(self, '_m_value'):
            return self._m_value if hasattr(self, '_m_value') else None

        self._m_value = (((((((self.groups[0].value + ((self.groups[1].value << 7) if self.len >= 2 else 0)) + ((self.groups[2].value << 14) if self.len >= 3 else 0)) + ((self.groups[3].value << 21) if self.len >= 4 else 0)) + ((self.groups[4].value << 28) if self.len >= 5 else 0)) + ((self.groups[5].value << 35) if self.len >= 6 else 0)) + ((self.groups[6].value << 42) if self.len >= 7 else 0)) + ((self.groups[7].value << 49) if self.len >= 8 else 0))
        return self._m_value if hasattr(self, '_m_value') else None

View File

@ -1,52 +1,31 @@
from unittest import mock
import pytest
from mitmproxy.contentviews import protobuf
from mitmproxy.test import tutils
from . import full_eval
data = tutils.test_data.push("mitmproxy/contentviews/test_protobuf_data/")
def test_view_protobuf_request():
    """End-to-end: a known protobuf payload renders through the view."""
    v = full_eval(protobuf.ViewProtobuf())
    p = data.path("protobuf01")
    with open(p, "rb") as f:
        raw = f.read()
    content_type, output = v(raw)
    assert content_type == "Protobuf"
    assert output == [[('text', '1: 3bbc333c-e61c-433b-819a-0b9a8cc103b8')]]
    # pytest.raises takes `match=` (a regex against str(excinfo.value));
    # the previous `matches=` keyword is not accepted by pytest.
    with pytest.raises(ValueError, match="Failed to parse input."):
        v(b'foobar')
def test_view_protobuf_availability():
    """is_available() is True when `protoc --version` reports libprotoc."""
    with mock.patch('subprocess.Popen') as popen:
        fake_proc = mock.Mock()
        fake_proc.communicate.return_value = (b'libprotoc fake version', True)
        popen.return_value = fake_proc
        assert protobuf.ViewProtobuf().is_available()
@pytest.mark.parametrize("filename", ["protobuf02", "protobuf03"])
def test_format_pbuf(filename):
    """format_pbuf output matches the checked-in `<file>-decoded` fixture.

    The interleaved remnants of the removed protoc-mocking tests have
    been dropped; `input` is renamed to avoid shadowing the builtin.
    """
    path = data.path(filename)
    with open(path, "rb") as f:
        raw = f.read()
    with open(path + "-decoded") as f:
        expected = f.read()
    assert protobuf.format_pbuf(raw) == expected

View File

@ -0,0 +1,65 @@
1 {
1: tpbuf
4 {
1: Person
2 {
1: name
3: 1
4: 2
5: 9
}
2 {
1: id
3: 2
4: 2
5: 5
}
2 {
1 {
12: 1818845549
}
3: 3
4: 1
5: 9
}
2 {
1: phone
3: 4
4: 3
5: 11
6: .Person.PhoneNumber
}
3 {
1: PhoneNumber
2 {
1: number
3: 1
4: 2
5: 9
}
2 {
1: type
3: 2
4: 1
5: 14
6: .Person.PhoneType
7: HOME
}
}
4 {
1: PhoneType
2 {
1: MOBILE
2: 0
}
2 {
1: HOME
2: 1
}
2 {
1: WORK
2: 2
}
}
}
}

View File

@ -0,0 +1 @@
<18> <20>

View File

@ -0,0 +1,4 @@
2 {
3: 3840
4: 2160
}