From f1dc3f2ab2d78ce2bc1a0418239fa6fa1f6c4429 Mon Sep 17 00:00:00 2001 From: Aldo Cortesi Date: Sat, 7 Apr 2012 13:47:03 +1200 Subject: [PATCH] Integrate lxml for pretty-printing HTML and XML. Tackling the pretty-printing performance problem head-on, at the cost of a major dependency. --- libmproxy/console/contentview.py | 67 +++++++++++++++++++++++++++----- libmproxy/utils.py | 45 --------------------- test/test_console_contentview.py | 23 +++++++++-- test/test_utils.py | 58 --------------------------- todo | 1 - 5 files changed, 77 insertions(+), 117 deletions(-) diff --git a/libmproxy/console/contentview.py b/libmproxy/console/contentview.py index 0d725c9da..02394c6f5 100644 --- a/libmproxy/console/contentview.py +++ b/libmproxy/console/contentview.py @@ -2,11 +2,12 @@ import re, cStringIO import urwid from PIL import Image from PIL.ExifTags import TAGS +import lxml.html, lxml.etree import common from .. import utils, encoding, flow from ..contrib import jsbeautifier -VIEW_CUTOFF = 1024*20 +VIEW_CUTOFF = 1024*200 VIEW_AUTO = 0 VIEW_JSON = 1 @@ -17,6 +18,7 @@ VIEW_JAVASCRIPT = 5 VIEW_IMAGE = 6 VIEW_RAW = 7 VIEW_HEX = 8 +VIEW_HTML = 9 VIEW_NAMES = { VIEW_AUTO: "Auto", @@ -28,35 +30,38 @@ VIEW_NAMES = { VIEW_IMAGE: "Image", VIEW_RAW: "Raw", VIEW_HEX: "Hex", + VIEW_HTML: "HTML", } VIEW_PROMPT = ( ("auto detect", "a"), - ("hex", "h"), + ("hex", "e"), + ("html", "h"), ("image", "i"), ("javascript", "j"), ("json", "s"), ("raw", "r"), ("multipart", "m"), ("urlencoded", "u"), - ("xmlish", "x"), + ("xml", "x"), ) VIEW_SHORTCUTS = { "a": VIEW_AUTO, + "x": VIEW_XML, + "h": VIEW_HTML, "i": VIEW_IMAGE, "j": VIEW_JAVASCRIPT, "s": VIEW_JSON, "u": VIEW_URLENCODED, "m": VIEW_MULTIPART, - "x": VIEW_XML, "r": VIEW_RAW, - "h": VIEW_HEX, + "e": VIEW_HEX, } CONTENT_TYPES_MAP = { - "text/html": VIEW_XML, + "text/html": VIEW_HTML, "application/json": VIEW_JSON, "text/xml": VIEW_XML, "multipart/form-data": VIEW_MULTIPART, @@ -116,9 +121,34 @@ def view_hex(hdrs, content): return "Hex", 
txt -def view_xmlish(hdrs, content): +def view_xml(hdrs, content): + parser = lxml.etree.XMLParser(remove_blank_text=True, resolve_entities=False, strip_cdata=False, recover=False) + try: + document = lxml.etree.fromstring(content, parser) + except lxml.etree.XMLSyntaxError, v: + print v + return None + docinfo = document.getroottree().docinfo + + prev = [] + p = document.getroottree().getroot().getprevious() + while p is not None: + prev.insert( + 0, + lxml.etree.tostring(p) + ) + p = p.getprevious() + + s = lxml.etree.tostring( + document, + pretty_print=True, + xml_declaration=True, + doctype=docinfo.doctype + "\n".join(prev), + encoding = docinfo.encoding + ) + txt = [] - for i in utils.pretty_xmlish(content[:VIEW_CUTOFF]): + for i in s[:VIEW_CUTOFF].strip().split("\n"): txt.append( urwid.Text(("text", i)), ) @@ -126,6 +156,22 @@ def view_xmlish(hdrs, content): return "XML-like data", txt +def view_html(hdrs, content): + if utils.isXML(content): + parser = lxml.etree.HTMLParser(strip_cdata=True, remove_blank_text=True) + d = lxml.html.fromstring(content, parser=parser) + docinfo = d.getroottree().docinfo + s = lxml.etree.tostring(d, pretty_print=True, doctype=docinfo.doctype) + + txt = [] + for i in s[:VIEW_CUTOFF].strip().split("\n"): + txt.append( + urwid.Text(("text", i)), + ) + trailer(len(content), txt) + return "HTML", txt + + def view_json(hdrs, content): lines = utils.pretty_json(content) if lines: @@ -229,7 +275,8 @@ def view_image(hdrs, content): PRETTY_FUNCTION_MAP = { - VIEW_XML: view_xmlish, + VIEW_XML: view_xml, + VIEW_HTML: view_html, VIEW_JSON: view_json, VIEW_URLENCODED: view_urlencoded, VIEW_MULTIPART: view_multipart, @@ -274,7 +321,7 @@ def get_content_view(viewmode, hdrItems, content): if not ret: viewmode = VIEW_RAW ret = view_raw(hdrs, content) - msg.append("Fallback to Raw") + msg.append("Couldn't parse: falling back to Raw") else: msg.append(ret[0]) return " ".join(msg), ret[1] diff --git a/libmproxy/utils.py b/libmproxy/utils.py index 
b4e317c5f..d8345399c 100644 --- a/libmproxy/utils.py +++ b/libmproxy/utils.py @@ -72,51 +72,6 @@ def cleanBin(s, fixspacing=False): return "".join(parts) -TAG = r""" - <\s* - (?!\s*[!"]) - (?P<close>\s*\/)? - (?P<name>\w+) - ( - [^'"\t >]+ | - "[^\"]*"['\"]* | - '[^']*'['\"]* | - \s+ - )* - (?P<selfcont>\s*\/\s*)? - \s*> - """ -UNI = set(["br", "hr", "img", "input", "area", "link"]) -INDENT = " "*4 -def pretty_xmlish(s): - """ - A robust pretty-printer for XML-ish data. - Returns a list of lines. - """ - s = cleanBin(s) - data, offset, indent, prev = [], 0, 0, None - for i in re.finditer(TAG, s, re.VERBOSE|re.MULTILINE): - start, end = i.span() - name = i.group("name") - if start > offset: - txt = [] - for x in textwrap.dedent(s[offset:start]).split("\n"): - if x.strip(): - txt.append(indent*INDENT + x) - data.extend(txt) - if i.group("close") and not (name in UNI and name==prev): - indent = max(indent - 1, 0) - data.append(indent*INDENT + i.group().strip()) - offset = end - if not any([i.group("close"), i.group("selfcont"), name in UNI]): - indent += 1 - prev = name - trail = s[offset:] - if trail.strip(): - data.append(s[offset:]) - return data - - def pretty_json(s): try: p = json.loads(s) diff --git a/test/test_console_contentview.py b/test/test_console_contentview.py index babe59ea9..cf2ab1e57 100644 --- a/test/test_console_contentview.py +++ b/test/test_console_contentview.py @@ -57,15 +57,32 @@ class uContentView(libpry.AutoTree): assert cv.view_urlencoded([], d) assert not cv.view_urlencoded([], "foo") + def test_view_html(self): + s = "


one

" + assert cv.view_html([], s) + + s = "gobbledygook" + assert not cv.view_html([], s) + def test_view_json(self): cv.VIEW_CUTOFF = 100 assert cv.view_json([], "{}") assert not cv.view_urlencoded([], "{") assert cv.view_json([], "[" + ",".join(["0"]*cv.VIEW_CUTOFF) + "]") - def test_view_xmlish(self): - assert cv.view_xmlish([], "") - assert cv.view_xmlish([], "") + def test_view_xml(self): + #assert cv.view_xml([], "") + #assert not cv.view_xml([], "") + + s = """ + + + + """ + print cv.view_xml([], s) def test_view_raw(self): assert cv.view_raw([], "foo") diff --git a/test/test_utils.py b/test/test_utils.py index e445614aa..f279ce651 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -58,63 +58,6 @@ class uData(libpry.AutoTree): libpry.raises("does not exist", utils.pkg_data.path, "nonexistent") - -class upretty_xmlish(libpry.AutoTree): - def test_tagre(self): - def f(s): - return re.search(utils.TAG, s, re.VERBOSE|re.MULTILINE) - assert f(r"") - assert f(r"") - assert f(r"< body/>") - assert f(r"< body/ >") - assert f(r"< body / >") - assert f(r"") - assert f(r"") - assert f(r"") - assert f(r'') - assert f('') - assert f('