Integrate lxml for pretty-printing HTML and XML.

This tackles the pretty-printing performance problem head-on, at the cost of adding lxml as a major dependency.
2024-12-02 03:53:42 +00:00 · 2012-04-07 13:47:03 +12:00 · 2012-04-07 13:47:03 +12:00 · f1dc3f2ab2
commit f1dc3f2ab2
parent 549512e93e
5 changed files with 77 additions and 117 deletions
--- a/libmproxy/console/contentview.py
+++ b/libmproxy/console/contentview.py
@ -2,11 +2,12 @@ import re, cStringIO
 import urwid
 from PIL import Image
 from PIL.ExifTags import TAGS
 import lxml.html, lxml.etree
 import common
 from .. import utils, encoding, flow
 from ..contrib import jsbeautifier
-VIEW_CUTOFF = 1024*20
+VIEW_CUTOFF = 1024*200
 VIEW_AUTO = 0
 VIEW_JSON = 1
@ -17,6 +18,7 @@ VIEW_JAVASCRIPT = 5
 VIEW_IMAGE = 6
 VIEW_RAW = 7
 VIEW_HEX = 8
 VIEW_HTML = 9
 VIEW_NAMES = {
    VIEW_AUTO: "Auto",
@ -28,35 +30,38 @@ VIEW_NAMES = {
    VIEW_IMAGE: "Image",
    VIEW_RAW: "Raw",
    VIEW_HEX: "Hex",
    VIEW_HTML: "HTML",
 }
 VIEW_PROMPT = (
    ("auto detect", "a"),
-    ("hex", "h"),
+    ("hex", "e"),
    ("html", "h"),
    ("image", "i"),
    ("javascript", "j"),
    ("json", "s"),
    ("raw", "r"),
    ("multipart", "m"),
    ("urlencoded", "u"),
-    ("xmlish", "x"),
+    ("xml", "x"),
 )
 VIEW_SHORTCUTS = {
    "a": VIEW_AUTO,
    "x": VIEW_XML,
    "h": VIEW_HTML,
    "i": VIEW_IMAGE,
    "j": VIEW_JAVASCRIPT,
    "s": VIEW_JSON,
    "u": VIEW_URLENCODED,
    "m": VIEW_MULTIPART,
    "x": VIEW_XML,
    "r": VIEW_RAW,
-    "h": VIEW_HEX,
+    "e": VIEW_HEX,
 }
 CONTENT_TYPES_MAP = {
-    "text/html": VIEW_XML,
+    "text/html": VIEW_HTML,
    "application/json": VIEW_JSON,
    "text/xml": VIEW_XML,
    "multipart/form-data": VIEW_MULTIPART,
@ -116,9 +121,34 @@ def view_hex(hdrs, content):
    return "Hex", txt
-def view_xmlish(hdrs, content):
+def view_xml(hdrs, content):
    parser = lxml.etree.XMLParser(remove_blank_text=True, resolve_entities=False, strip_cdata=False, recover=False)
    try:
        document = lxml.etree.fromstring(content, parser)
    except lxml.etree.XMLSyntaxError, v:
        print v
        return None
    docinfo = document.getroottree().docinfo
    prev = []
    p = document.getroottree().getroot().getprevious()
    while p is not None:
        prev.insert(
            0,
            lxml.etree.tostring(p)
        )
        p = p.getprevious()
    s = lxml.etree.tostring(
            document,
            pretty_print=True,
            xml_declaration=True,
            doctype=docinfo.doctype + "\n".join(prev),
            encoding = docinfo.encoding
        )
    txt = []
-    for i in utils.pretty_xmlish(content[:VIEW_CUTOFF]):
+    for i in s[:VIEW_CUTOFF].strip().split("\n"):
        txt.append(
            urwid.Text(("text", i)),
        )
@ -126,6 +156,22 @@ def view_xmlish(hdrs, content):
    return "XML-like data", txt
 def view_html(hdrs, content):
    if utils.isXML(content):
        parser = lxml.etree.HTMLParser(strip_cdata=True, remove_blank_text=True)
        d = lxml.html.fromstring(content, parser=parser)
        docinfo = d.getroottree().docinfo
        s = lxml.etree.tostring(d, pretty_print=True, doctype=docinfo.doctype)
        txt = []
        for i in s[:VIEW_CUTOFF].strip().split("\n"):
            txt.append(
                urwid.Text(("text", i)),
            )
        trailer(len(content), txt)
        return "HTML", txt
 def view_json(hdrs, content):
    lines = utils.pretty_json(content)
    if lines:
@ -229,7 +275,8 @@ def view_image(hdrs, content):
 PRETTY_FUNCTION_MAP = {
-    VIEW_XML: view_xmlish,
+    VIEW_XML: view_xml,
    VIEW_HTML: view_html,
    VIEW_JSON: view_json,
    VIEW_URLENCODED: view_urlencoded,
    VIEW_MULTIPART: view_multipart,
@ -274,7 +321,7 @@ def get_content_view(viewmode, hdrItems, content):
    if not ret:
        viewmode = VIEW_RAW
        ret = view_raw(hdrs, content)
-        msg.append("Fallback to Raw")
+        msg.append("Couldn't parse: falling back to Raw")
    else:
        msg.append(ret[0])
    return " ".join(msg), ret[1]
--- a/libmproxy/utils.py
+++ b/libmproxy/utils.py
@ -72,51 +72,6 @@ def cleanBin(s, fixspacing=False):
    return "".join(parts)
 TAG = r"""
        <\s*
        (?!\s*[!"])
        (?P<close>\s*\/)?
        (?P<name>\w+)
        (
            [^'"\t >]+ |
            "[^\"]*"['\"]* |
            '[^']*'['\"]* |
            \s+
        )*
        (?P<selfcont>\s*\/\s*)?
        \s*>
      """
 UNI = set(["br", "hr", "img", "input", "area", "link"])
 INDENT = " "*4
 def pretty_xmlish(s):
    """
        A robust pretty-printer for XML-ish data.
        Returns a list of lines.
    """
    s = cleanBin(s)
    data, offset, indent, prev = [], 0, 0, None
    for i in re.finditer(TAG, s, re.VERBOSE|re.MULTILINE):
        start, end = i.span()
        name = i.group("name")
        if start > offset:
            txt = []
            for x in textwrap.dedent(s[offset:start]).split("\n"):
                if x.strip():
                    txt.append(indent*INDENT + x)
            data.extend(txt)
        if i.group("close") and not (name in UNI and name==prev):
            indent = max(indent - 1, 0)
        data.append(indent*INDENT + i.group().strip())
        offset = end
        if not any([i.group("close"), i.group("selfcont"), name in UNI]):
            indent += 1
        prev = name
    trail = s[offset:]
    if trail.strip():
        data.append(s[offset:])
    return data
 def pretty_json(s):
    try:
        p = json.loads(s)
--- a/test/test_console_contentview.py
+++ b/test/test_console_contentview.py
@ -57,15 +57,32 @@ class uContentView(libpry.AutoTree):
        assert cv.view_urlencoded([], d)
        assert not cv.view_urlencoded([], "foo")
    def test_view_html(self):
        s = "<html><br><br></br><p>one</p></html>"
        assert cv.view_html([], s)
        s = "gobbledygook"
        assert not cv.view_html([], s)
    def test_view_json(self):
        cv.VIEW_CUTOFF = 100
        assert cv.view_json([], "{}")
        assert not cv.view_urlencoded([], "{")
        assert cv.view_json([], "[" + ",".join(["0"]*cv.VIEW_CUTOFF) + "]")
-    def test_view_xmlish(self):
+    def test_view_xml(self):
-        assert cv.view_xmlish([], "<foo></foo>")
+        #assert cv.view_xml([], "<foo></foo>")
-        assert cv.view_xmlish([], "<foo>")
+        #assert not cv.view_xml([], "<foo>")
        s = """<?xml version="1.0" encoding="UTF-8"?>
            <?xml-stylesheet title="XSL_formatting"?>
            <rss 
                xmlns:media="http://search.yahoo.com/mrss/"
                xmlns:atom="http://www.w3.org/2005/Atom"
                version="2.0">
            </rss>
        """
        print cv.view_xml([], s)
    def test_view_raw(self):
        assert cv.view_raw([], "foo")
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -58,63 +58,6 @@ class uData(libpry.AutoTree):
        libpry.raises("does not exist", utils.pkg_data.path, "nonexistent")
 class upretty_xmlish(libpry.AutoTree):
    def test_tagre(self):
        def f(s):
            return re.search(utils.TAG, s, re.VERBOSE|re.MULTILINE)
        assert f(r"<body>")
        assert f(r"<body/>")
        assert f(r"< body/>")
        assert f(r"< body/ >")
        assert f(r"< body / >")
        assert f(r"<foo a=b>")
        assert f(r"<foo a='b'>")
        assert f(r"<foo a='b\"'>")
        assert f(r'<a b=(a.b) href="foo">')
        assert f('<td width=25%>')
        assert f('<form name="search" action="/search.php" method="get" accept-charset="utf-8" class="search">')
        assert f('<img src="gif" width="125" height="16" alt=&quot;&quot; />')
    def test_all(self):
        def isbalanced(ret):
            # The last tag should have no indent
            assert ret[-1].strip() == ret[-1]
        s = "<html><br><br></br><p>one</p></html>"
        ret = utils.pretty_xmlish(s)
        isbalanced(ret)
        s = r"""
 <body bgcolor=#ffffff text=#000000 link=#0000cc vlink=#551a8b alink=#ff0000 onload="document.f.q.focus();if(document.images)new Image().src='/images/srpr/nav_logo27.png'" ><textarea id=csi style=display:none></textarea></body>
        """
        isbalanced(utils.pretty_xmlish(textwrap.dedent(s)))
        s = r"""
                <a href="http://foo.com" target="">
                   <img src="http://foo.gif" alt="bar" height="25" width="132">
                </a>
            """
        isbalanced(utils.pretty_xmlish(textwrap.dedent(s)))
        s = r"""
            <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"
            \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">
            <html></html>
        """
        ret = utils.pretty_xmlish(textwrap.dedent(s))
        isbalanced(ret)
        s = "<html><br/><p>one</p></html>"
        ret = utils.pretty_xmlish(s)
        assert len(ret) == 6
        isbalanced(ret)
        s = "gobbledygook"
        assert utils.pretty_xmlish(s) == ["gobbledygook"]
 class upretty_json(libpry.AutoTree):
    def test_one(self):
        s = json.dumps({"foo": 1})
@ -242,7 +185,6 @@ tests = [
    uhexdump(),
    upretty_size(),
    uData(),
    upretty_xmlish(),
    upretty_json(),
    u_urldecode(),
    udel_all(),
--- a/1
+++ b/1
@ -4,7 +4,6 @@ of these and need some pointers.
 Targeted for 0.9:
    - Upstream proxy support.
    - Improve worst-case performance problem with XML-ish indenter
    - Follow mode to keep most recent flow in view
    - Rewrite the core to be asynchronous. I've done some research, and
    although it's a bit of a bloated monster, it looks like Twisted is the way