diff --git a/libmproxy/console/contentview.py b/libmproxy/console/contentview.py index a6e6ee91b..2443c188c 100644 --- a/libmproxy/console/contentview.py +++ b/libmproxy/console/contentview.py @@ -1,11 +1,11 @@ -import re, cStringIO +import re, cStringIO, traceback import urwid from PIL import Image from PIL.ExifTags import TAGS import lxml.html, lxml.etree import common from .. import utils, encoding, flow -from ..contrib import jsbeautifier +from ..contrib import jsbeautifier, html2text VIEW_CUTOFF = 1024*50 @@ -19,6 +19,7 @@ VIEW_IMAGE = 6 VIEW_RAW = 7 VIEW_HEX = 8 VIEW_HTML = 9 +VIEW_OUTLINE = 10 VIEW_NAMES = { VIEW_AUTO: "Auto", @@ -31,6 +32,7 @@ VIEW_NAMES = { VIEW_RAW: "Raw", VIEW_HEX: "Hex", VIEW_HTML: "HTML", + VIEW_OUTLINE: "HTML Outline", } @@ -40,6 +42,7 @@ VIEW_PROMPT = ( ("html", "h"), ("image", "i"), ("javascript", "j"), + ("html outline", "o"), ("json", "s"), ("raw", "r"), ("multipart", "m"), @@ -56,6 +59,7 @@ VIEW_SHORTCUTS = { "s": VIEW_JSON, "u": VIEW_URLENCODED, "m": VIEW_MULTIPART, + "o": VIEW_OUTLINE, "r": VIEW_RAW, "e": VIEW_HEX, } @@ -170,6 +174,16 @@ def view_html(hdrs, content, limit): return "HTML", _view_text(s[:limit], len(s), limit) +def view_outline(hdrs, content, limit): + content = content.decode("utf-8") + h = html2text.HTML2Text(baseurl="") + h.ignore_images = True + h.body_width = 0 + content = h.handle(content) + txt = _view_text(content[:limit], len(content), limit) + return "HTML Outline", txt + + def view_json(hdrs, content, limit): lines = utils.pretty_json(content) if lines: @@ -282,6 +296,7 @@ PRETTY_FUNCTION_MAP = { VIEW_IMAGE: view_image, VIEW_HEX: view_hex, VIEW_RAW: view_raw, + VIEW_OUTLINE: view_outline, } def get_view_func(viewmode, hdrs, content): @@ -318,7 +333,9 @@ def get_content_view(viewmode, hdrItems, content, limit): try: ret = func(hdrs, content, limit) # Third-party viewers can fail in unexpected ways... - except: + except Exception, e: + s = traceback.format_exc() + return "", _view_text(s, len(s), len(s)) ret = None if not ret: viewmode = VIEW_RAW diff --git a/libmproxy/contrib/README b/libmproxy/contrib/README index f2e9907ec..61ae29e2e 100644 --- a/libmproxy/contrib/README +++ b/libmproxy/contrib/README @@ -1,6 +1,7 @@ Contribs: + pyparsing 1.5.2, MIT license jsbeautifier, git checkout 25/03/12, MIT license @@ -8,3 +9,6 @@ jsbeautifier, git checkout 25/03/12, MIT license - Disabled packers through a single-line modification (see "# CORTESI" comment) +html2text, git checkout 18/08/12, GPLv3 + + diff --git a/libmproxy/contrib/html2text.py b/libmproxy/contrib/html2text.py new file mode 100644 index 000000000..76ea91a34 --- /dev/null +++ b/libmproxy/contrib/html2text.py @@ -0,0 +1,834 @@ +#!/usr/bin/env python +"""html2text: Turn HTML into equivalent Markdown-structured text.""" +__version__ = "3.200.3" +__author__ = "Aaron Swartz (me@aaronsw.com)" +__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." +__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] + +# TODO: +# Support decoded entities with unifiable. + +try: + True +except NameError: + setattr(__builtins__, 'True', 1) + setattr(__builtins__, 'False', 0) + +def has_key(x, y): + if hasattr(x, 'has_key'): return x.has_key(y) + else: return y in x + +try: + import htmlentitydefs + import urlparse + import HTMLParser +except ImportError: #Python3 + import html.entities as htmlentitydefs + import urllib.parse as urlparse + import html.parser as HTMLParser +try: #Python3 + import urllib.request as urllib +except: + import urllib +import optparse, re, sys, codecs, types + +try: from textwrap import wrap +except: pass + +# Use Unicode characters instead of their ascii psuedo-replacements +UNICODE_SNOB = 0 + +# Put the links after each paragraph instead of at the end. +LINKS_EACH_PARAGRAPH = 0 + +# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) +BODY_WIDTH = 78 + +# Don't show internal links (href="#local-anchor") -- corresponding link targets +# won't be visible in the plain text file anyway. +SKIP_INTERNAL_LINKS = True + +# Use inline, rather than reference, formatting for images and links +INLINE_LINKS = True + +# Number of pixels Google indents nested lists +GOOGLE_LIST_INDENT = 36 + +IGNORE_ANCHORS = False +IGNORE_IMAGES = False +IGNORE_EMPHASIS = False + +### Entity Nonsense ### + +def name2cp(k): + if k == 'apos': return ord("'") + if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3 + return htmlentitydefs.name2codepoint[k] + else: + k = htmlentitydefs.entitydefs[k] + if k.startswith("") and k.endswith(";"): return int(k[2:-1]) # not in latin-1 + return ord(codecs.latin_1_decode(k)[0]) + +unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', +'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*', +'ndash':'-', 'oelig':'oe', 'aelig':'ae', +'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', +'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', +'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i', +'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', +'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u', +'lrm':'', 'rlm':''} + +unifiable_n = {} + +for k in unifiable.keys(): + unifiable_n[name2cp(k)] = unifiable[k] + +### End Entity Nonsense ### + +def onlywhite(line): + """Return true if the line does only consist of whitespace characters.""" + for c in line: + if c is not ' ' and c is not ' ': + return c is ' ' + return line + +def hn(tag): + if tag[0] == 'h' and len(tag) == 2: + try: + n = int(tag[1]) + if n in range(1, 10): return n + except ValueError: return 0 + +def dumb_property_dict(style): + """returns a hash of css attributes""" + return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]); + +def dumb_css_parser(data): + """returns a hash of css selectors, each of which contains a hash of css attributes""" + # remove @import sentences + importIndex = data.find('@import') + while importIndex != -1: + data = data[0:importIndex] + data[data.find(';', importIndex) + 1:] + importIndex = data.find('@import') + + # parse the css. reverted from dictionary compehension in order to support older pythons + elements = [x.split('{') for x in data.split('}') if '{' in x.strip()] + try: + elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements]) + except ValueError: + elements = {} # not that important + + return elements + +def element_style(attrs, style_def, parent_style): + """returns a hash of the 'final' style attributes of the element""" + style = parent_style.copy() + if 'class' in attrs: + for css_class in attrs['class'].split(): + css_style = style_def['.' + css_class] + style.update(css_style) + if 'style' in attrs: + immediate_style = dumb_property_dict(attrs['style']) + style.update(immediate_style) + return style + +def google_list_style(style): + """finds out whether this is an ordered or unordered list""" + if 'list-style-type' in style: + list_style = style['list-style-type'] + if list_style in ['disc', 'circle', 'square', 'none']: + return 'ul' + return 'ol' + +def google_has_height(style): + """check if the style of the element has the 'height' attribute explicitly defined""" + if 'height' in style: + return True + return False + +def google_text_emphasis(style): + """return a list of all emphasis modifiers of the element""" + emphasis = [] + if 'text-decoration' in style: + emphasis.append(style['text-decoration']) + if 'font-style' in style: + emphasis.append(style['font-style']) + if 'font-weight' in style: + emphasis.append(style['font-weight']) + return emphasis + +def google_fixed_width_font(style): + """check if the css of the current element defines a fixed width font""" + font_family = '' + if 'font-family' in style: + font_family = style['font-family'] + if 'Courier New' == font_family or 'Consolas' == font_family: + return True + return False + +def list_numbering_start(attrs): + """extract numbering from list element attributes""" + if 'start' in attrs: + return int(attrs['start']) - 1 + else: + return 0 + +class HTML2Text(HTMLParser.HTMLParser): + def __init__(self, out=None, baseurl=''): + HTMLParser.HTMLParser.__init__(self) + + # Config options + self.unicode_snob = UNICODE_SNOB + self.links_each_paragraph = LINKS_EACH_PARAGRAPH + self.body_width = BODY_WIDTH + self.skip_internal_links = SKIP_INTERNAL_LINKS + self.inline_links = INLINE_LINKS + self.google_list_indent = GOOGLE_LIST_INDENT + self.ignore_links = IGNORE_ANCHORS + self.ignore_images = IGNORE_IMAGES + self.ignore_emphasis = IGNORE_EMPHASIS + self.google_doc = False + self.ul_item_mark = '*' + + if out is None: + self.out = self.outtextf + else: + self.out = out + + self.outtextlist = [] # empty list to store output characters before they are "joined" + + try: + self.outtext = unicode() + except NameError: # Python3 + self.outtext = str() + + self.quiet = 0 + self.p_p = 0 # number of newline character to print before next output + self.outcount = 0 + self.start = 1 + self.space = 0 + self.a = [] + self.astack = [] + self.acount = 0 + self.list = [] + self.blockquote = 0 + self.pre = 0 + self.startpre = 0 + self.code = False + self.br_toggle = '' + self.lastWasNL = 0 + self.lastWasList = False + self.style = 0 + self.style_def = {} + self.tag_stack = [] + self.emphasis = 0 + self.drop_white_space = 0 + self.inheader = False + self.abbr_title = None # current abbreviation definition + self.abbr_data = None # last inner HTML (for abbr being defined) + self.abbr_list = {} # stack of abbreviations to write later + self.baseurl = baseurl + + try: del unifiable_n[name2cp('nbsp')] + except KeyError: pass + unifiable['nbsp'] = ' _place_holder;' + + + def feed(self, data): + data = data.replace("' + 'script>", "") + HTMLParser.HTMLParser.feed(self, data) + + def handle(self, data): + self.feed(data) + self.feed("") + return self.optwrap(self.close()) + + def outtextf(self, s): + self.outtextlist.append(s) + if s: self.lastWasNL = s[-1] == '\n' + + def close(self): + HTMLParser.HTMLParser.close(self) + + self.pbr() + self.o('', 0, 'end') + + self.outtext = self.outtext.join(self.outtextlist) + if self.unicode_snob: + nbsp = unichr(name2cp('nbsp')) + else: + nbsp = u' ' + self.outtext = self.outtext.replace(u' _place_holder;', nbsp) + + return self.outtext + + def handle_charref(self, c): + self.o(self.charref(c), 1) + + def handle_entityref(self, c): + self.o(self.entityref(c), 1) + + def handle_starttag(self, tag, attrs): + self.handle_tag(tag, attrs, 1) + + def handle_endtag(self, tag): + self.handle_tag(tag, None, 0) + + def previousIndex(self, attrs): + """ returns the index of certain set of attributes (of a link) in the + self.a list + + If the set of attributes is not found, returns None + """ + if not has_key(attrs, 'href'): return None + + i = -1 + for a in self.a: + i += 1 + match = 0 + + if has_key(a, 'href') and a['href'] == attrs['href']: + if has_key(a, 'title') or has_key(attrs, 'title'): + if (has_key(a, 'title') and has_key(attrs, 'title') and + a['title'] == attrs['title']): + match = True + else: + match = True + + if match: return i + + def drop_last(self, nLetters): + if not self.quiet: + self.outtext = self.outtext[:-nLetters] + + def handle_emphasis(self, start, tag_style, parent_style): + """handles various text emphases""" + tag_emphasis = google_text_emphasis(tag_style) + parent_emphasis = google_text_emphasis(parent_style) + + # handle Google's text emphasis + strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough + bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis + italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis + fixed = google_fixed_width_font(tag_style) and not \ + google_fixed_width_font(parent_style) and not self.pre + + if start: + # crossed-out text must be handled before other attributes + # in order not to output qualifiers unnecessarily + if bold or italic or fixed: + self.emphasis += 1 + if strikethrough: + self.quiet += 1 + if italic: + self.o("_") + self.drop_white_space += 1 + if bold: + self.o("**") + self.drop_white_space += 1 + if fixed: + self.o('`') + self.drop_white_space += 1 + self.code = True + else: + if bold or italic or fixed: + # there must not be whitespace before closing emphasis mark + self.emphasis -= 1 + self.space = 0 + self.outtext = self.outtext.rstrip() + if fixed: + if self.drop_white_space: + # empty emphasis, drop it + self.drop_last(1) + self.drop_white_space -= 1 + else: + self.o('`') + self.code = False + if bold: + if self.drop_white_space: + # empty emphasis, drop it + self.drop_last(2) + self.drop_white_space -= 1 + else: + self.o("**") + if italic: + if self.drop_white_space: + # empty emphasis, drop it + self.drop_last(1) + self.drop_white_space -= 1 + else: + self.o("_") + # space is only allowed after *all* emphasis marks + if (bold or italic) and not self.emphasis: + self.o(" ") + if strikethrough: + self.quiet -= 1 + + def handle_tag(self, tag, attrs, start): + #attrs = fixattrs(attrs) + if attrs is None: + attrs = {} + else: + attrs = dict(attrs) + + if self.google_doc: + # the attrs parameter is empty for a closing tag. in addition, we + # need the attributes of the parent nodes in order to get a + # complete style description for the current element. we assume + # that google docs export well formed html. + parent_style = {} + if start: + if self.tag_stack: + parent_style = self.tag_stack[-1][2] + tag_style = element_style(attrs, self.style_def, parent_style) + self.tag_stack.append((tag, attrs, tag_style)) + else: + dummy, attrs, tag_style = self.tag_stack.pop() + if self.tag_stack: + parent_style = self.tag_stack[-1][2] + + if hn(tag): + self.p() + if start: + self.inheader = True + self.o(hn(tag)*"#" + ' ') + else: + self.inheader = False + return # prevent redundant emphasis marks on headers + + if tag in ['p', 'div']: + if self.google_doc: + if start and google_has_height(tag_style): + self.p() + else: + self.soft_br() + else: + self.p() + + if tag == "br" and start: self.o(" \n") + + if tag == "hr" and start: + self.p() + self.o("* * *") + self.p() + + if tag in ["head", "style", 'script']: + if start: self.quiet += 1 + else: self.quiet -= 1 + + if tag == "style": + if start: self.style += 1 + else: self.style -= 1 + + if tag in ["body"]: + self.quiet = 0 # sites like 9rules.com never close
+ + if tag == "blockquote": + if start: + self.p(); self.o('> ', 0, 1); self.start = 1 + self.blockquote += 1 + else: + self.blockquote -= 1 + self.p() + + if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o("_") + if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o("**") + if tag in ['del', 'strike', 's']: + if start: + self.o("<"+tag+">") + else: + self.o(""+tag+">") + + if self.google_doc: + if not self.inheader: + # handle some font attributes, but leave headers clean + self.handle_emphasis(start, tag_style, parent_style) + + if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` `` + if tag == "abbr": + if start: + self.abbr_title = None + self.abbr_data = '' + if has_key(attrs, 'title'): + self.abbr_title = attrs['title'] + else: + if self.abbr_title != None: + self.abbr_list[self.abbr_data] = self.abbr_title + self.abbr_title = None + self.abbr_data = '' + + if tag == "a" and not self.ignore_links: + if start: + if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')): + self.astack.append(attrs) + self.o("[") + else: + self.astack.append(None) + else: + if self.astack: + a = self.astack.pop() + if a: + if self.inline_links: + self.o("](" + escape_md(a['href']) + ")") + else: + i = self.previousIndex(a) + if i is not None: + a = self.a[i] + else: + self.acount += 1 + a['count'] = self.acount + a['outcount'] = self.outcount + self.a.append(a) + self.o("][" + str(a['count']) + "]") + + if tag == "img" and start and not self.ignore_images: + if has_key(attrs, 'src'): + attrs['href'] = attrs['src'] + alt = attrs.get('alt', '') + self.o("![" + escape_md(alt) + "]") + + if self.inline_links: + self.o("(" + escape_md(attrs['href']) + ")") + else: + i = self.previousIndex(attrs) + if i is not None: + attrs = self.a[i] + else: + self.acount += 1 + attrs['count'] = self.acount + attrs['outcount'] = self.outcount + self.a.append(attrs) + self.o("[" + str(attrs['count']) + "]") + + if tag == 'dl' and start: self.p() + if tag == 'dt' and not start: self.pbr() + if tag == 'dd' and start: self.o(' ') + if tag == 'dd' and not start: self.pbr() + + if tag in ["ol", "ul"]: + # Google Docs create sub lists as top level lists + if (not self.list) and (not self.lastWasList): + self.p() + if start: + if self.google_doc: + list_style = google_list_style(tag_style) + else: + list_style = tag + numbering_start = list_numbering_start(attrs) + self.list.append({'name':list_style, 'num':numbering_start}) + else: + if self.list: self.list.pop() + self.lastWasList = True + else: + self.lastWasList = False + + if tag == 'li': + self.pbr() + if start: + if self.list: li = self.list[-1] + else: li = {'name':'ul', 'num':0} + if self.google_doc: + nest_count = self.google_nest_count(tag_style) + else: + nest_count = len(self.list) + self.o(" " * nest_count) #TODO: line up