Rip out BeautifulSoup, and use a custom XML-ish prettyprinter.

2024-11-22 15:37:45 +00:00 · 2011-02-06 14:17:30 +13:00 · 2011-02-06 14:17:30 +13:00 · 7156d1a73a
commit 7156d1a73a
parent 44dc3a052e
5 changed files with 98 additions and 2036 deletions
--- a/libmproxy/console.py
+++ b/libmproxy/console.py
@@ -307,7 +307,7 @@ class ConnectionView(WWrap):
            ])

    def _view_pretty(self, conn, txt):
-        for i in utils.prettybody(conn.content):
+        for i in utils.pretty_xmlish(conn.content):
            txt.append(
                ("text", i),
            )
--- a/libmproxy/contrib/BeautifulSoup.py
+++ b/libmproxy/contrib/BeautifulSoup.py
--- a/libmproxy/utils.py
+++ b/libmproxy/utils.py
@@ -12,9 +12,7 @@
 # 
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-import re, os, subprocess, datetime
-from contrib import BeautifulSoup
+import re, os, subprocess, datetime, textwrap


 def format_timestamp(s):
@@ -48,14 +46,48 @@ def cleanBin(s):
    return "".join(parts)
    

-def prettybody(s):
+TAG = r"""
+        <\s*
+        (?!\s*[!"])
+        (?P<close>\s*\/)?
+        (?P<name>\w+)
+        (
+                [a-zA-Z0-9_#:=().%\/]+
+            |
+                "[^\"]*"['\"]*
+            |
+                '[^']*'['\"]*
+            | 
+                \s+
+        )*
+        (?P<selfcont>\s*\/\s*)?
+        \s*>
+      """
+UNI = set(["br", "hr", "img", "input", "area", "link"])
+INDENT = " "*4
+def pretty_xmlish(s):
    """
-        Return a list of pretty-printed lines.
+        This is a robust, general pretty-printer for XML-ish data. 
+        Returns a list of lines.
    """
-    s = BeautifulSoup.BeautifulStoneSoup(s)
-    s = s.prettify().strip()
-    parts = s.split("\n")
-    return [repr(i)[1:-1] for i in parts]
+    data, offset, indent, prev = [], 0, 0, None
+    for i in re.finditer(TAG, s, re.VERBOSE|re.MULTILINE):
+        start, end = i.span()
+        name = i.group("name")
+        if start > offset:
+            txt = []
+            for x in textwrap.dedent(s[offset:start]).split("\n"):
+                if x.strip():
+                    txt.append(indent*INDENT + x)
+            data.extend(txt)
+        if i.group("close") and not (name in UNI and name==prev):
+            indent = max(indent - 1, 0)
+        data.append(indent*INDENT + i.group().strip())
+        offset = end
+        if not any([i.group("close"), i.group("selfcont"), name in UNI]):
+            indent += 1
+        prev = name
+    return data


 def hexdump(s):
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1,4 +1,4 @@
-import textwrap, cStringIO, os, time
+import textwrap, cStringIO, os, time, re
 import libpry
 from libmproxy import utils

@@ -228,13 +228,59 @@ class umake_bogus_cert(libpry.AutoTree):
        assert "CERTIFICATE" in d


-class uprettybody(libpry.AutoTree):
-    def test_all(self):
-        s = "<html><p></p></html>"
-        assert utils.prettybody(s)
+class upretty_xmlish(libpry.AutoTree):
+    def test_tagre(self):
+        def f(s):
+            return re.search(utils.TAG, s, re.VERBOSE|re.MULTILINE)
+        assert f(r"<body>")
+        assert f(r"<body/>")
+        assert f(r"< body/>")
+        assert f(r"< body/ >")
+        assert f(r"< body / >")
+        assert f(r"<foo a=b>")
+        assert f(r"<foo a='b'>")
+        assert f(r"<foo a='b\"'>")
+        assert f(r'<a b=(a.b) href="foo">')
+        assert f('<td width=25%>')
+
+    def test_all(self):
+        def isbalanced(ret):
+            # The last tag should have no indent
+            assert ret[-1].strip() == ret[-1]
+
+        s = "<html><br><br></br><p>one</p></html>"
+        ret = utils.pretty_xmlish(s)
+        isbalanced(ret)
+
+        s = r"""
+<body bgcolor=#ffffff text=#000000 link=#0000cc vlink=#551a8b alink=#ff0000 onload="document.f.q.focus();if(document.images)new Image().src='/images/srpr/nav_logo27.png'" ><textarea id=csi style=display:none></textarea></body>
+        """
+        isbalanced(utils.pretty_xmlish(textwrap.dedent(s)))
+
+        s = r"""
+                <a href="http://foo.com" target="">
+                   <img src="http://foo.gif" alt="bar" height="25" width="132">
+                </a>
+            """
+        isbalanced(utils.pretty_xmlish(textwrap.dedent(s)))
+
+        s = r"""
+            <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"
+            \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">
+            <html></html>
+        """
+        ret = utils.pretty_xmlish(textwrap.dedent(s))
+        isbalanced(ret)
+
+        s = "<html><br/><p>one</p></html>"
+        ret = utils.pretty_xmlish(s)
+        assert len(ret) == 6
+        isbalanced(ret)
+
+        s = "gobbledygook"
+        print utils.pretty_xmlish(s)
+

-        s = "".join([chr(i) for i in range(256)])
-        assert utils.prettybody(s)


    
@@ -249,5 +295,5 @@ tests = [
    uMultiDict(),
    uHeaders(),
    uData(),
-    uprettybody(),
+    upretty_xmlish(),
 ]
--- a/6
+++ b/6
@@ -1,15 +1,11 @@

 Futures:

-    - Timestamps
-    
    - Strings view for binary responses.
    - Post and URL field parsing and editing.
    - On-the-fly generation of keys, signed with a CA
    - Pass-through fast-track for things that don't match filter?
-    - Reading contents from file
    - Shortcut for viewing in pager
-    - Serializing and de-serializing requests and responses.
    - Upstream proxies.
    - mitmdump
        - Filters
@@ -17,7 +13,9 @@ Futures:
        - Pipe to script
        - Command-line replay or serialized flows

+
 Bugs:
    
    - In some circumstances, long URLs in list view are line-broken oddly.
    - Termination sometimes hangs.
+    - When a bug in mitmproxy causes a stack trace, we hang on exit.