Rip out BeautifulSoup, and use a custom XML-ish prettyprinter.

This commit is contained in:
Aldo Cortesi 2011-02-06 14:17:30 +13:00
parent 44dc3a052e
commit 7156d1a73a
5 changed files with 98 additions and 2036 deletions

View File

@ -307,7 +307,7 @@ class ConnectionView(WWrap):
])
def _view_pretty(self, conn, txt):
for i in utils.prettybody(conn.content):
for i in utils.pretty_xmlish(conn.content):
txt.append(
("text", i),
)

File diff suppressed because it is too large Load Diff

View File

@ -12,9 +12,7 @@
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import re, os, subprocess, datetime
from contrib import BeautifulSoup
import re, os, subprocess, datetime, textwrap
def format_timestamp(s):
@ -48,14 +46,48 @@ def cleanBin(s):
return "".join(parts)
def prettybody(s):
# Regular expression, written for re.VERBOSE, that matches a single opening,
# closing or self-contained tag:
#   close    - set when the tag starts with "/" (e.g. "</p>")
#   name     - the tag name
#   selfcont - a trailing "/" directly before the ">"
# The negative lookahead rejects "<!" and '<"', so declarations such as
# <!DOCTYPE ...> are not treated as tags.
# NOTE(review): the attribute character class below contains "/", so a
# trailing slash is normally consumed there and "selfcont" stays unset;
# pretty_xmlish() therefore also inspects the matched text itself.
# NOTE(review): the "+" alternatives nested inside "(...)*" can backtrack
# heavily on a long, unterminated tag - worth confirming on hostile input.
TAG = r"""
    <\s*
    (?!\s*[!"])
    (?P<close>\s*\/)?
    (?P<name>\w+)
    (
        [a-zA-Z0-9_#:=().%\/]+
        |
        "[^\"]*"['\"]*
        |
        '[^']*'['\"]*
        |
        \s+
    )*
    (?P<selfcont>\s*\/\s*)?
    \s*>
"""
# Tags that never open a new nesting level.
UNI = set(["br", "hr", "img", "input", "area", "link"])
# One level of indentation in the pretty-printed output.
INDENT = " "*4


def _indented_text(text, depth):
    """
        Return the non-blank lines of text, dedented as a unit and then
        prefixed with depth levels of INDENT.
    """
    prefix = depth * INDENT
    return [
        prefix + line
        for line in textwrap.dedent(text).split("\n")
        if line.strip()
    ]


def _is_self_contained(match):
    """
        Return True if the tag matched by TAG ends in "/>".
    """
    if match.group("selfcont"):
        return True
    # Every TAG match ends in ">"; drop it and look for the slash that the
    # attribute character class swallowed.
    return match.group()[:-1].rstrip().endswith("/")


def pretty_xmlish(s):
    """
        This is a robust, general pretty-printer for XML-ish data.

        s: the string to format.

        Returns a list of lines. Each tag gets its own line, indented by
        nesting depth; text between tags is dedented, stripped of blank
        lines and indented to the current depth. Text following the last
        tag (including input that contains no tags at all) is kept and
        formatted the same way.
    """
    data, offset, indent, prev = [], 0, 0, None
    for i in re.finditer(TAG, s, re.VERBOSE | re.MULTILINE):
        start, end = i.span()
        name = i.group("name")
        if start > offset:
            # Text sitting between the previous tag and this one.
            data.extend(_indented_text(s[offset:start], indent))
        is_close = bool(i.group("close"))
        # An explicit close directly after a UNI tag of the same name
        # ("<br></br>") must not dedent: the opening tag never indented.
        if is_close and not (name in UNI and name == prev):
            indent = max(indent - 1, 0)
        data.append(indent * INDENT + i.group().strip())
        offset = end
        if not (is_close or _is_self_contained(i) or name in UNI):
            indent += 1
        prev = name
    # Keep whatever follows the final tag instead of silently dropping it.
    data.extend(_indented_text(s[offset:], indent))
    return data
def hexdump(s):

View File

@ -1,4 +1,4 @@
import textwrap, cStringIO, os, time
import textwrap, cStringIO, os, time, re
import libpry
from libmproxy import utils
@ -228,13 +228,59 @@ class umake_bogus_cert(libpry.AutoTree):
assert "CERTIFICATE" in d
# Smoke test for utils.prettybody.
class uprettybody(libpry.AutoTree):
def test_all(self):
# A small, well-formed document must yield a truthy result.
s = "<html><p></p></html>"
assert utils.prettybody(s)
# Tests for the utils.TAG regular expression and utils.pretty_xmlish.
class upretty_xmlish(libpry.AutoTree):
# Checks that utils.TAG finds a tag in a range of tag spellings.
def test_tagre(self):
# Search s with TAG using the VERBOSE|MULTILINE flags; the result is a
# match object, or None when nothing matches.
def f(s):
return re.search(utils.TAG, s, re.VERBOSE|re.MULTILINE)
# Plain and self-closing forms, with assorted whitespace placement.
assert f(r"<body>")
assert f(r"<body/>")
assert f(r"< body/>")
assert f(r"< body/ >")
assert f(r"< body / >")
# Unquoted and single-quoted attribute values.
assert f(r"<foo a=b>")
assert f(r"<foo a='b'>")
# Single-quoted value followed by a stray backslash and double quote.
assert f(r"<foo a='b\"'>")
# Parentheses and dots in an unquoted value, plus a double-quoted value.
assert f(r'<a b=(a.b) href="foo">')
# Percent sign in an unquoted value.
assert f('<td width=25%>')
# Runs utils.pretty_xmlish over several documents and checks the output.
def test_all(self):
def isbalanced(ret):
# The last tag should have no indent
assert ret[-1].strip() == ret[-1]
# <br> with and without an explicit closing tag.
s = "<html><br><br></br><p>one</p></html>"
ret = utils.pretty_xmlish(s)
isbalanced(ret)
# A tag with many unquoted attributes and a quoted one containing script.
s = r"""
<body bgcolor=#ffffff text=#000000 link=#0000cc vlink=#551a8b alink=#ff0000 onload="document.f.q.focus();if(document.images)new Image().src='/images/srpr/nav_logo27.png'" ><textarea id=csi style=display:none></textarea></body>
"""
isbalanced(utils.pretty_xmlish(textwrap.dedent(s)))
# An <img> without a closing tag nested inside a multi-line <a>.
s = r"""
<a href="http://foo.com" target="">
<img src="http://foo.gif" alt="bar" height="25" width="132">
</a>
"""
isbalanced(utils.pretty_xmlish(textwrap.dedent(s)))
# A DOCTYPE declaration ahead of the document.
s = r"""
<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"
\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">
<html></html>
"""
ret = utils.pretty_xmlish(textwrap.dedent(s))
isbalanced(ret)
# Self-closing <br/>: the result must contain exactly six lines.
s = "<html><br/><p>one</p></html>"
ret = utils.pretty_xmlish(s)
assert len(ret) == 6
isbalanced(ret)
# Input without any tags; the result is only printed, nothing is asserted.
s = "gobbledygook"
print utils.pretty_xmlish(s)
# A string holding every character code from 0 to 255.
# NOTE(review): this calls utils.prettybody, not utils.pretty_xmlish -
# confirm that name still exists in utils.
s = "".join([chr(i) for i in range(256)])
assert utils.prettybody(s)
@ -249,5 +295,5 @@ tests = [
uMultiDict(),
uHeaders(),
uData(),
uprettybody(),
upretty_xmlish(),
]

6
todo
View File

@ -1,15 +1,11 @@
Futures:
- Timestamps
- Strings view for binary responses.
- Post and URL field parsing and editing.
- On-the-fly generation of keys, signed with a CA
- Pass-through fast-track for things that don't match filter?
- Reading contents from file
- Shortcut for viewing in pager
- Serializing and de-serializing requests and responses.
- Upstream proxies.
- mitmdump
- Filters
@ -17,7 +13,9 @@ Futures:
- Pipe to script
- Command-line replay of serialized flows
Bugs:
- In some circumstances, long URLs in list view are line-broken oddly.
- Termination sometimes hangs.
- When a bug in mitmproxy causes a stack trace, we hang on exit.