Integrate lxml for pretty-printing HTML and XML.

Tackling the pretty-printing performance problem head-on, at the cost of a
major dependency.
This commit is contained in:
Aldo Cortesi 2012-04-07 13:47:03 +12:00
parent 549512e93e
commit f1dc3f2ab2
5 changed files with 77 additions and 117 deletions

View File

@ -2,11 +2,12 @@ import re, cStringIO
import urwid import urwid
from PIL import Image from PIL import Image
from PIL.ExifTags import TAGS from PIL.ExifTags import TAGS
import lxml.html, lxml.etree
import common import common
from .. import utils, encoding, flow from .. import utils, encoding, flow
from ..contrib import jsbeautifier from ..contrib import jsbeautifier
VIEW_CUTOFF = 1024*20 VIEW_CUTOFF = 1024*200
VIEW_AUTO = 0 VIEW_AUTO = 0
VIEW_JSON = 1 VIEW_JSON = 1
@ -17,6 +18,7 @@ VIEW_JAVASCRIPT = 5
VIEW_IMAGE = 6 VIEW_IMAGE = 6
VIEW_RAW = 7 VIEW_RAW = 7
VIEW_HEX = 8 VIEW_HEX = 8
VIEW_HTML = 9
VIEW_NAMES = { VIEW_NAMES = {
VIEW_AUTO: "Auto", VIEW_AUTO: "Auto",
@ -28,35 +30,38 @@ VIEW_NAMES = {
VIEW_IMAGE: "Image", VIEW_IMAGE: "Image",
VIEW_RAW: "Raw", VIEW_RAW: "Raw",
VIEW_HEX: "Hex", VIEW_HEX: "Hex",
VIEW_HTML: "HTML",
} }
VIEW_PROMPT = ( VIEW_PROMPT = (
("auto detect", "a"), ("auto detect", "a"),
("hex", "h"), ("hex", "e"),
("html", "h"),
("image", "i"), ("image", "i"),
("javascript", "j"), ("javascript", "j"),
("json", "s"), ("json", "s"),
("raw", "r"), ("raw", "r"),
("multipart", "m"), ("multipart", "m"),
("urlencoded", "u"), ("urlencoded", "u"),
("xmlish", "x"), ("xml", "x"),
) )
VIEW_SHORTCUTS = { VIEW_SHORTCUTS = {
"a": VIEW_AUTO, "a": VIEW_AUTO,
"x": VIEW_XML,
"h": VIEW_HTML,
"i": VIEW_IMAGE, "i": VIEW_IMAGE,
"j": VIEW_JAVASCRIPT, "j": VIEW_JAVASCRIPT,
"s": VIEW_JSON, "s": VIEW_JSON,
"u": VIEW_URLENCODED, "u": VIEW_URLENCODED,
"m": VIEW_MULTIPART, "m": VIEW_MULTIPART,
"x": VIEW_XML,
"r": VIEW_RAW, "r": VIEW_RAW,
"h": VIEW_HEX, "e": VIEW_HEX,
} }
CONTENT_TYPES_MAP = { CONTENT_TYPES_MAP = {
"text/html": VIEW_XML, "text/html": VIEW_HTML,
"application/json": VIEW_JSON, "application/json": VIEW_JSON,
"text/xml": VIEW_XML, "text/xml": VIEW_XML,
"multipart/form-data": VIEW_MULTIPART, "multipart/form-data": VIEW_MULTIPART,
@ -116,9 +121,34 @@ def view_hex(hdrs, content):
return "Hex", txt return "Hex", txt
def view_xmlish(hdrs, content): def view_xml(hdrs, content):
parser = lxml.etree.XMLParser(remove_blank_text=True, resolve_entities=False, strip_cdata=False, recover=False)
try:
document = lxml.etree.fromstring(content, parser)
except lxml.etree.XMLSyntaxError, v:
print v
return None
docinfo = document.getroottree().docinfo
prev = []
p = document.getroottree().getroot().getprevious()
while p is not None:
prev.insert(
0,
lxml.etree.tostring(p)
)
p = p.getprevious()
s = lxml.etree.tostring(
document,
pretty_print=True,
xml_declaration=True,
doctype=docinfo.doctype + "\n".join(prev),
encoding = docinfo.encoding
)
txt = [] txt = []
for i in utils.pretty_xmlish(content[:VIEW_CUTOFF]): for i in s[:VIEW_CUTOFF].strip().split("\n"):
txt.append( txt.append(
urwid.Text(("text", i)), urwid.Text(("text", i)),
) )
@ -126,6 +156,22 @@ def view_xmlish(hdrs, content):
return "XML-like data", txt return "XML-like data", txt
def view_html(hdrs, content):
    """
        Pretty-print HTML content using lxml.

        hdrs: message headers. Not consulted by this view.
        content: the raw body as a string.

        Returns a ("HTML", txt) tuple, where txt is a list of urwid.Text
        widgets, one per line of the re-serialized document (truncated to
        VIEW_CUTOFF characters). Returns None when the content does not pass
        utils.isXML or when lxml fails to parse it, so that the caller can
        fall back to another view.
    """
    if not utils.isXML(content):
        return None
    parser = lxml.etree.HTMLParser(strip_cdata=True, remove_blank_text=True)
    try:
        d = lxml.html.fromstring(content, parser=parser)
    except lxml.etree.LxmlError:
        # Same contract as view_xml, which returns None on XMLSyntaxError:
        # a parse failure must not propagate out of the view function.
        return None
    docinfo = d.getroottree().docinfo
    s = lxml.etree.tostring(d, pretty_print=True, doctype=docinfo.doctype)
    txt = [
        urwid.Text(("text", i))
        for i in s[:VIEW_CUTOFF].strip().split("\n")
    ]
    # trailer() is defined elsewhere in this module; it is handed the full
    # content length together with the list of lines built so far.
    trailer(len(content), txt)
    return "HTML", txt
def view_json(hdrs, content): def view_json(hdrs, content):
lines = utils.pretty_json(content) lines = utils.pretty_json(content)
if lines: if lines:
@ -229,7 +275,8 @@ def view_image(hdrs, content):
PRETTY_FUNCTION_MAP = { PRETTY_FUNCTION_MAP = {
VIEW_XML: view_xmlish, VIEW_XML: view_xml,
VIEW_HTML: view_html,
VIEW_JSON: view_json, VIEW_JSON: view_json,
VIEW_URLENCODED: view_urlencoded, VIEW_URLENCODED: view_urlencoded,
VIEW_MULTIPART: view_multipart, VIEW_MULTIPART: view_multipart,
@ -274,7 +321,7 @@ def get_content_view(viewmode, hdrItems, content):
if not ret: if not ret:
viewmode = VIEW_RAW viewmode = VIEW_RAW
ret = view_raw(hdrs, content) ret = view_raw(hdrs, content)
msg.append("Fallback to Raw") msg.append("Couldn't parse: falling back to Raw")
else: else:
msg.append(ret[0]) msg.append(ret[0])
return " ".join(msg), ret[1] return " ".join(msg), ret[1]

View File

@ -72,51 +72,6 @@ def cleanBin(s, fixspacing=False):
return "".join(parts) return "".join(parts)
# Verbose-mode regex matching a single opening, closing or self-contained tag.
# Tags whose first non-space character after "<" is "!" or a double quote are
# excluded by the negative lookahead.
TAG = r"""
<\s*
(?!\s*[!"])
(?P<close>\s*\/)?
(?P<name>\w+)
(
[^'"\t >]+ |
"[^\"]*"['\"]* |
'[^']*'['\"]* |
\s+
)*
(?P<selfcont>\s*\/\s*)?
\s*>
"""
# Tag names that never open a new indentation level.
UNI = set(["br", "hr", "img", "input", "area", "link"])
INDENT = " "*4


def pretty_xmlish(s):
    """
        A robust pretty-printer for XML-ish data.

        The input is passed through cleanBin, then every tag matched by TAG
        is emitted on its own line, indented by its nesting depth. Non-blank
        text found between tags is dedented and emitted at the current
        depth. Text remaining after the last tag is appended unmodified.

        Returns a list of lines.
    """
    s = cleanBin(s)
    lines = []
    pos = 0
    depth = 0
    last_name = None
    for m in re.finditer(TAG, s, re.VERBOSE|re.MULTILINE):
        tag_start, tag_end = m.span()
        name = m.group("name")
        closing = m.group("close")
        unindented = name in UNI

        # Text sitting between the previous tag and this one.
        if tag_start > pos:
            pad = depth*INDENT
            lines.extend(
                pad + x
                for x in textwrap.dedent(s[pos:tag_start]).split("\n")
                if x.strip()
            )

        # A closing tag steps back out one level, unless it merely closes a
        # UNI tag of the same name that was seen immediately before it.
        if closing and not (unindented and name == last_name):
            depth = max(depth - 1, 0)
        lines.append(depth*INDENT + m.group().strip())
        pos = tag_end

        # Only a plain opening tag of a non-UNI element nests deeper.
        if not (closing or m.group("selfcont") or unindented):
            depth += 1
        last_name = name

    if s[pos:].strip():
        lines.append(s[pos:])
    return lines
def pretty_json(s): def pretty_json(s):
try: try:
p = json.loads(s) p = json.loads(s)

View File

@ -57,15 +57,32 @@ class uContentView(libpry.AutoTree):
assert cv.view_urlencoded([], d) assert cv.view_urlencoded([], d)
assert not cv.view_urlencoded([], "foo") assert not cv.view_urlencoded([], "foo")
def test_view_html(self):
    # Markup, even with an oddly closed <br>, must produce a view.
    markup = "<html><br><br></br><p>one</p></html>"
    assert cv.view_html([], markup)

    # Content that is not markup at all must yield a falsy result.
    junk = "gobbledygook"
    assert not cv.view_html([], junk)
def test_view_json(self): def test_view_json(self):
cv.VIEW_CUTOFF = 100 cv.VIEW_CUTOFF = 100
assert cv.view_json([], "{}") assert cv.view_json([], "{}")
assert not cv.view_urlencoded([], "{") assert not cv.view_urlencoded([], "{")
assert cv.view_json([], "[" + ",".join(["0"]*cv.VIEW_CUTOFF) + "]") assert cv.view_json([], "[" + ",".join(["0"]*cv.VIEW_CUTOFF) + "]")
def test_view_xmlish(self): def test_view_xml(self):
assert cv.view_xmlish([], "<foo></foo>") #assert cv.view_xml([], "<foo></foo>")
assert cv.view_xmlish([], "<foo>") #assert not cv.view_xml([], "<foo>")
s = """<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet title="XSL_formatting"?>
<rss
xmlns:media="http://search.yahoo.com/mrss/"
xmlns:atom="http://www.w3.org/2005/Atom"
version="2.0">
</rss>
"""
print cv.view_xml([], s)
def test_view_raw(self): def test_view_raw(self):
assert cv.view_raw([], "foo") assert cv.view_raw([], "foo")

View File

@ -58,63 +58,6 @@ class uData(libpry.AutoTree):
libpry.raises("does not exist", utils.pkg_data.path, "nonexistent") libpry.raises("does not exist", utils.pkg_data.path, "nonexistent")
class upretty_xmlish(libpry.AutoTree):
    """
        Tests for the utils.TAG regex and for utils.pretty_xmlish.
    """
    def test_tagre(self):
        # Every sample must be recognised as a tag by the TAG regex.
        samples = [
            r"<body>",
            r"<body/>",
            r"< body/>",
            r"< body/ >",
            r"< body / >",
            r"<foo a=b>",
            r"<foo a='b'>",
            r"<foo a='b\"'>",
            r'<a b=(a.b) href="foo">',
            '<td width=25%>',
            '<form name="search" action="/search.php" method="get" accept-charset="utf-8" class="search">',
            '<img src="gif" width="125" height="16" alt=&quot;&quot; />',
        ]
        for sample in samples:
            assert re.search(utils.TAG, sample, re.VERBOSE|re.MULTILINE)

    def test_all(self):
        def check_balanced(lines):
            # The last tag should have no indent
            assert lines[-1].strip() == lines[-1]

        documents = [
            "<html><br><br></br><p>one</p></html>",
            textwrap.dedent(r"""
<body bgcolor=#ffffff text=#000000 link=#0000cc vlink=#551a8b alink=#ff0000 onload="document.f.q.focus();if(document.images)new Image().src='/images/srpr/nav_logo27.png'" ><textarea id=csi style=display:none></textarea></body>
"""),
            textwrap.dedent(r"""
<a href="http://foo.com" target="">
<img src="http://foo.gif" alt="bar" height="25" width="132">
</a>
"""),
            textwrap.dedent(r"""
<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"
\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">
<html></html>
"""),
        ]
        for document in documents:
            check_balanced(utils.pretty_xmlish(document))

        # A self-contained <br/> must not disturb the line count or balance.
        lines = utils.pretty_xmlish("<html><br/><p>one</p></html>")
        assert len(lines) == 6
        check_balanced(lines)

        # Input without any tags comes back as a single unmodified line.
        assert utils.pretty_xmlish("gobbledygook") == ["gobbledygook"]
class upretty_json(libpry.AutoTree): class upretty_json(libpry.AutoTree):
def test_one(self): def test_one(self):
s = json.dumps({"foo": 1}) s = json.dumps({"foo": 1})
@ -242,7 +185,6 @@ tests = [
uhexdump(), uhexdump(),
upretty_size(), upretty_size(),
uData(), uData(),
upretty_xmlish(),
upretty_json(), upretty_json(),
u_urldecode(), u_urldecode(),
udel_all(), udel_all(),

1
todo
View File

@ -4,7 +4,6 @@ of these and need some pointers.
Targeted for 0.9: Targeted for 0.9:
- Upstream proxy support. - Upstream proxy support.
- Improve worst-case performance problem with XML-ish indenter
- Follow mode to keep most recent flow in view - Follow mode to keep most recent flow in view
- Rewrite the core to be asynchronous. I've done some research, and - Rewrite the core to be asynchronous. I've done some research, and
although it's a bit of a bloated monster, it looks like Twisted is the way although it's a bit of a bloated monster, it looks like Twisted is the way