mirror of
https://github.com/Grasscutters/mitmproxy.git
synced 2024-12-02 03:53:42 +00:00
Integrate lxml for pretty-printing HTML and XML.
Tackling the pretty-printing performance problem head-on, at the cost of a major dependency.
This commit is contained in:
parent
549512e93e
commit
f1dc3f2ab2
@ -2,11 +2,12 @@ import re, cStringIO
|
|||||||
import urwid
|
import urwid
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from PIL.ExifTags import TAGS
|
from PIL.ExifTags import TAGS
|
||||||
|
import lxml.html, lxml.etree
|
||||||
import common
|
import common
|
||||||
from .. import utils, encoding, flow
|
from .. import utils, encoding, flow
|
||||||
from ..contrib import jsbeautifier
|
from ..contrib import jsbeautifier
|
||||||
|
|
||||||
VIEW_CUTOFF = 1024*20
|
VIEW_CUTOFF = 1024*200
|
||||||
|
|
||||||
VIEW_AUTO = 0
|
VIEW_AUTO = 0
|
||||||
VIEW_JSON = 1
|
VIEW_JSON = 1
|
||||||
@ -17,6 +18,7 @@ VIEW_JAVASCRIPT = 5
|
|||||||
VIEW_IMAGE = 6
|
VIEW_IMAGE = 6
|
||||||
VIEW_RAW = 7
|
VIEW_RAW = 7
|
||||||
VIEW_HEX = 8
|
VIEW_HEX = 8
|
||||||
|
VIEW_HTML = 9
|
||||||
|
|
||||||
VIEW_NAMES = {
|
VIEW_NAMES = {
|
||||||
VIEW_AUTO: "Auto",
|
VIEW_AUTO: "Auto",
|
||||||
@ -28,35 +30,38 @@ VIEW_NAMES = {
|
|||||||
VIEW_IMAGE: "Image",
|
VIEW_IMAGE: "Image",
|
||||||
VIEW_RAW: "Raw",
|
VIEW_RAW: "Raw",
|
||||||
VIEW_HEX: "Hex",
|
VIEW_HEX: "Hex",
|
||||||
|
VIEW_HTML: "HTML",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
VIEW_PROMPT = (
|
VIEW_PROMPT = (
|
||||||
("auto detect", "a"),
|
("auto detect", "a"),
|
||||||
("hex", "h"),
|
("hex", "e"),
|
||||||
|
("html", "h"),
|
||||||
("image", "i"),
|
("image", "i"),
|
||||||
("javascript", "j"),
|
("javascript", "j"),
|
||||||
("json", "s"),
|
("json", "s"),
|
||||||
("raw", "r"),
|
("raw", "r"),
|
||||||
("multipart", "m"),
|
("multipart", "m"),
|
||||||
("urlencoded", "u"),
|
("urlencoded", "u"),
|
||||||
("xmlish", "x"),
|
("xml", "x"),
|
||||||
)
|
)
|
||||||
|
|
||||||
VIEW_SHORTCUTS = {
|
VIEW_SHORTCUTS = {
|
||||||
"a": VIEW_AUTO,
|
"a": VIEW_AUTO,
|
||||||
|
"x": VIEW_XML,
|
||||||
|
"h": VIEW_HTML,
|
||||||
"i": VIEW_IMAGE,
|
"i": VIEW_IMAGE,
|
||||||
"j": VIEW_JAVASCRIPT,
|
"j": VIEW_JAVASCRIPT,
|
||||||
"s": VIEW_JSON,
|
"s": VIEW_JSON,
|
||||||
"u": VIEW_URLENCODED,
|
"u": VIEW_URLENCODED,
|
||||||
"m": VIEW_MULTIPART,
|
"m": VIEW_MULTIPART,
|
||||||
"x": VIEW_XML,
|
|
||||||
"r": VIEW_RAW,
|
"r": VIEW_RAW,
|
||||||
"h": VIEW_HEX,
|
"e": VIEW_HEX,
|
||||||
}
|
}
|
||||||
|
|
||||||
CONTENT_TYPES_MAP = {
|
CONTENT_TYPES_MAP = {
|
||||||
"text/html": VIEW_XML,
|
"text/html": VIEW_HTML,
|
||||||
"application/json": VIEW_JSON,
|
"application/json": VIEW_JSON,
|
||||||
"text/xml": VIEW_XML,
|
"text/xml": VIEW_XML,
|
||||||
"multipart/form-data": VIEW_MULTIPART,
|
"multipart/form-data": VIEW_MULTIPART,
|
||||||
@ -116,9 +121,34 @@ def view_hex(hdrs, content):
|
|||||||
return "Hex", txt
|
return "Hex", txt
|
||||||
|
|
||||||
|
|
||||||
def view_xmlish(hdrs, content):
|
def view_xml(hdrs, content):
|
||||||
|
parser = lxml.etree.XMLParser(remove_blank_text=True, resolve_entities=False, strip_cdata=False, recover=False)
|
||||||
|
try:
|
||||||
|
document = lxml.etree.fromstring(content, parser)
|
||||||
|
except lxml.etree.XMLSyntaxError, v:
|
||||||
|
print v
|
||||||
|
return None
|
||||||
|
docinfo = document.getroottree().docinfo
|
||||||
|
|
||||||
|
prev = []
|
||||||
|
p = document.getroottree().getroot().getprevious()
|
||||||
|
while p is not None:
|
||||||
|
prev.insert(
|
||||||
|
0,
|
||||||
|
lxml.etree.tostring(p)
|
||||||
|
)
|
||||||
|
p = p.getprevious()
|
||||||
|
|
||||||
|
s = lxml.etree.tostring(
|
||||||
|
document,
|
||||||
|
pretty_print=True,
|
||||||
|
xml_declaration=True,
|
||||||
|
doctype=docinfo.doctype + "\n".join(prev),
|
||||||
|
encoding = docinfo.encoding
|
||||||
|
)
|
||||||
|
|
||||||
txt = []
|
txt = []
|
||||||
for i in utils.pretty_xmlish(content[:VIEW_CUTOFF]):
|
for i in s[:VIEW_CUTOFF].strip().split("\n"):
|
||||||
txt.append(
|
txt.append(
|
||||||
urwid.Text(("text", i)),
|
urwid.Text(("text", i)),
|
||||||
)
|
)
|
||||||
@ -126,6 +156,22 @@ def view_xmlish(hdrs, content):
|
|||||||
return "XML-like data", txt
|
return "XML-like data", txt
|
||||||
|
|
||||||
|
|
||||||
|
def view_html(hdrs, content):
|
||||||
|
if utils.isXML(content):
|
||||||
|
parser = lxml.etree.HTMLParser(strip_cdata=True, remove_blank_text=True)
|
||||||
|
d = lxml.html.fromstring(content, parser=parser)
|
||||||
|
docinfo = d.getroottree().docinfo
|
||||||
|
s = lxml.etree.tostring(d, pretty_print=True, doctype=docinfo.doctype)
|
||||||
|
|
||||||
|
txt = []
|
||||||
|
for i in s[:VIEW_CUTOFF].strip().split("\n"):
|
||||||
|
txt.append(
|
||||||
|
urwid.Text(("text", i)),
|
||||||
|
)
|
||||||
|
trailer(len(content), txt)
|
||||||
|
return "HTML", txt
|
||||||
|
|
||||||
|
|
||||||
def view_json(hdrs, content):
|
def view_json(hdrs, content):
|
||||||
lines = utils.pretty_json(content)
|
lines = utils.pretty_json(content)
|
||||||
if lines:
|
if lines:
|
||||||
@ -229,7 +275,8 @@ def view_image(hdrs, content):
|
|||||||
|
|
||||||
|
|
||||||
PRETTY_FUNCTION_MAP = {
|
PRETTY_FUNCTION_MAP = {
|
||||||
VIEW_XML: view_xmlish,
|
VIEW_XML: view_xml,
|
||||||
|
VIEW_HTML: view_html,
|
||||||
VIEW_JSON: view_json,
|
VIEW_JSON: view_json,
|
||||||
VIEW_URLENCODED: view_urlencoded,
|
VIEW_URLENCODED: view_urlencoded,
|
||||||
VIEW_MULTIPART: view_multipart,
|
VIEW_MULTIPART: view_multipart,
|
||||||
@ -274,7 +321,7 @@ def get_content_view(viewmode, hdrItems, content):
|
|||||||
if not ret:
|
if not ret:
|
||||||
viewmode = VIEW_RAW
|
viewmode = VIEW_RAW
|
||||||
ret = view_raw(hdrs, content)
|
ret = view_raw(hdrs, content)
|
||||||
msg.append("Fallback to Raw")
|
msg.append("Couldn't parse: falling back to Raw")
|
||||||
else:
|
else:
|
||||||
msg.append(ret[0])
|
msg.append(ret[0])
|
||||||
return " ".join(msg), ret[1]
|
return " ".join(msg), ret[1]
|
||||||
|
@ -72,51 +72,6 @@ def cleanBin(s, fixspacing=False):
|
|||||||
return "".join(parts)
|
return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
TAG = r"""
|
|
||||||
<\s*
|
|
||||||
(?!\s*[!"])
|
|
||||||
(?P<close>\s*\/)?
|
|
||||||
(?P<name>\w+)
|
|
||||||
(
|
|
||||||
[^'"\t >]+ |
|
|
||||||
"[^\"]*"['\"]* |
|
|
||||||
'[^']*'['\"]* |
|
|
||||||
\s+
|
|
||||||
)*
|
|
||||||
(?P<selfcont>\s*\/\s*)?
|
|
||||||
\s*>
|
|
||||||
"""
|
|
||||||
UNI = set(["br", "hr", "img", "input", "area", "link"])
|
|
||||||
INDENT = " "*4
|
|
||||||
def pretty_xmlish(s):
|
|
||||||
"""
|
|
||||||
A robust pretty-printer for XML-ish data.
|
|
||||||
Returns a list of lines.
|
|
||||||
"""
|
|
||||||
s = cleanBin(s)
|
|
||||||
data, offset, indent, prev = [], 0, 0, None
|
|
||||||
for i in re.finditer(TAG, s, re.VERBOSE|re.MULTILINE):
|
|
||||||
start, end = i.span()
|
|
||||||
name = i.group("name")
|
|
||||||
if start > offset:
|
|
||||||
txt = []
|
|
||||||
for x in textwrap.dedent(s[offset:start]).split("\n"):
|
|
||||||
if x.strip():
|
|
||||||
txt.append(indent*INDENT + x)
|
|
||||||
data.extend(txt)
|
|
||||||
if i.group("close") and not (name in UNI and name==prev):
|
|
||||||
indent = max(indent - 1, 0)
|
|
||||||
data.append(indent*INDENT + i.group().strip())
|
|
||||||
offset = end
|
|
||||||
if not any([i.group("close"), i.group("selfcont"), name in UNI]):
|
|
||||||
indent += 1
|
|
||||||
prev = name
|
|
||||||
trail = s[offset:]
|
|
||||||
if trail.strip():
|
|
||||||
data.append(s[offset:])
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def pretty_json(s):
|
def pretty_json(s):
|
||||||
try:
|
try:
|
||||||
p = json.loads(s)
|
p = json.loads(s)
|
||||||
|
@ -57,15 +57,32 @@ class uContentView(libpry.AutoTree):
|
|||||||
assert cv.view_urlencoded([], d)
|
assert cv.view_urlencoded([], d)
|
||||||
assert not cv.view_urlencoded([], "foo")
|
assert not cv.view_urlencoded([], "foo")
|
||||||
|
|
||||||
|
def test_view_html(self):
|
||||||
|
s = "<html><br><br></br><p>one</p></html>"
|
||||||
|
assert cv.view_html([], s)
|
||||||
|
|
||||||
|
s = "gobbledygook"
|
||||||
|
assert not cv.view_html([], s)
|
||||||
|
|
||||||
def test_view_json(self):
|
def test_view_json(self):
|
||||||
cv.VIEW_CUTOFF = 100
|
cv.VIEW_CUTOFF = 100
|
||||||
assert cv.view_json([], "{}")
|
assert cv.view_json([], "{}")
|
||||||
assert not cv.view_urlencoded([], "{")
|
assert not cv.view_urlencoded([], "{")
|
||||||
assert cv.view_json([], "[" + ",".join(["0"]*cv.VIEW_CUTOFF) + "]")
|
assert cv.view_json([], "[" + ",".join(["0"]*cv.VIEW_CUTOFF) + "]")
|
||||||
|
|
||||||
def test_view_xmlish(self):
|
def test_view_xml(self):
|
||||||
assert cv.view_xmlish([], "<foo></foo>")
|
#assert cv.view_xml([], "<foo></foo>")
|
||||||
assert cv.view_xmlish([], "<foo>")
|
#assert not cv.view_xml([], "<foo>")
|
||||||
|
|
||||||
|
s = """<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<?xml-stylesheet title="XSL_formatting"?>
|
||||||
|
<rss
|
||||||
|
xmlns:media="http://search.yahoo.com/mrss/"
|
||||||
|
xmlns:atom="http://www.w3.org/2005/Atom"
|
||||||
|
version="2.0">
|
||||||
|
</rss>
|
||||||
|
"""
|
||||||
|
print cv.view_xml([], s)
|
||||||
|
|
||||||
def test_view_raw(self):
|
def test_view_raw(self):
|
||||||
assert cv.view_raw([], "foo")
|
assert cv.view_raw([], "foo")
|
||||||
|
@ -58,63 +58,6 @@ class uData(libpry.AutoTree):
|
|||||||
libpry.raises("does not exist", utils.pkg_data.path, "nonexistent")
|
libpry.raises("does not exist", utils.pkg_data.path, "nonexistent")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class upretty_xmlish(libpry.AutoTree):
|
|
||||||
def test_tagre(self):
|
|
||||||
def f(s):
|
|
||||||
return re.search(utils.TAG, s, re.VERBOSE|re.MULTILINE)
|
|
||||||
assert f(r"<body>")
|
|
||||||
assert f(r"<body/>")
|
|
||||||
assert f(r"< body/>")
|
|
||||||
assert f(r"< body/ >")
|
|
||||||
assert f(r"< body / >")
|
|
||||||
assert f(r"<foo a=b>")
|
|
||||||
assert f(r"<foo a='b'>")
|
|
||||||
assert f(r"<foo a='b\"'>")
|
|
||||||
assert f(r'<a b=(a.b) href="foo">')
|
|
||||||
assert f('<td width=25%>')
|
|
||||||
assert f('<form name="search" action="/search.php" method="get" accept-charset="utf-8" class="search">')
|
|
||||||
assert f('<img src="gif" width="125" height="16" alt="" />')
|
|
||||||
|
|
||||||
|
|
||||||
def test_all(self):
|
|
||||||
def isbalanced(ret):
|
|
||||||
# The last tag should have no indent
|
|
||||||
assert ret[-1].strip() == ret[-1]
|
|
||||||
|
|
||||||
s = "<html><br><br></br><p>one</p></html>"
|
|
||||||
ret = utils.pretty_xmlish(s)
|
|
||||||
isbalanced(ret)
|
|
||||||
|
|
||||||
s = r"""
|
|
||||||
<body bgcolor=#ffffff text=#000000 link=#0000cc vlink=#551a8b alink=#ff0000 onload="document.f.q.focus();if(document.images)new Image().src='/images/srpr/nav_logo27.png'" ><textarea id=csi style=display:none></textarea></body>
|
|
||||||
"""
|
|
||||||
isbalanced(utils.pretty_xmlish(textwrap.dedent(s)))
|
|
||||||
|
|
||||||
s = r"""
|
|
||||||
<a href="http://foo.com" target="">
|
|
||||||
<img src="http://foo.gif" alt="bar" height="25" width="132">
|
|
||||||
</a>
|
|
||||||
"""
|
|
||||||
isbalanced(utils.pretty_xmlish(textwrap.dedent(s)))
|
|
||||||
|
|
||||||
s = r"""
|
|
||||||
<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"
|
|
||||||
\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">
|
|
||||||
<html></html>
|
|
||||||
"""
|
|
||||||
ret = utils.pretty_xmlish(textwrap.dedent(s))
|
|
||||||
isbalanced(ret)
|
|
||||||
|
|
||||||
s = "<html><br/><p>one</p></html>"
|
|
||||||
ret = utils.pretty_xmlish(s)
|
|
||||||
assert len(ret) == 6
|
|
||||||
isbalanced(ret)
|
|
||||||
|
|
||||||
s = "gobbledygook"
|
|
||||||
assert utils.pretty_xmlish(s) == ["gobbledygook"]
|
|
||||||
|
|
||||||
|
|
||||||
class upretty_json(libpry.AutoTree):
|
class upretty_json(libpry.AutoTree):
|
||||||
def test_one(self):
|
def test_one(self):
|
||||||
s = json.dumps({"foo": 1})
|
s = json.dumps({"foo": 1})
|
||||||
@ -242,7 +185,6 @@ tests = [
|
|||||||
uhexdump(),
|
uhexdump(),
|
||||||
upretty_size(),
|
upretty_size(),
|
||||||
uData(),
|
uData(),
|
||||||
upretty_xmlish(),
|
|
||||||
upretty_json(),
|
upretty_json(),
|
||||||
u_urldecode(),
|
u_urldecode(),
|
||||||
udel_all(),
|
udel_all(),
|
||||||
|
1
todo
1
todo
@ -4,7 +4,6 @@ of these and need some pointers.
|
|||||||
|
|
||||||
Targeted for 0.9:
|
Targeted for 0.9:
|
||||||
- Upstream proxy support.
|
- Upstream proxy support.
|
||||||
- Improve worst-case performance problem with XML-ish indenter
|
|
||||||
- Follow mode to keep most recent flow in view
|
- Follow mode to keep most recent flow in view
|
||||||
- Rewrite the core to be asynchronous. I've done some research, and
|
- Rewrite the core to be asynchronous. I've done some research, and
|
||||||
although it's a bit of a bloated monster, it looks like Twisted is the way
|
although it's a bit of a bloated monster, it looks like Twisted is the way
|
||||||
|
Loading…
Reference in New Issue
Block a user