diff --git a/libmproxy/console/contentview.py b/libmproxy/console/contentview.py index fd28a83ab..e4ffcd472 100644 --- a/libmproxy/console/contentview.py +++ b/libmproxy/console/contentview.py @@ -9,13 +9,14 @@ from PIL.ExifTags import TAGS import subprocess import traceback import urwid +import html2text import netlib.utils from netlib import odict from . import common, signals from .. import utils, encoding -from ..contrib import jsbeautifier, html2text +from ..contrib import jsbeautifier from ..contrib.wbxml.ASCommandResponse import ASCommandResponse try: diff --git a/libmproxy/contrib/README b/libmproxy/contrib/README index 656df3dd8..3b0f75124 100644 --- a/libmproxy/contrib/README +++ b/libmproxy/contrib/README @@ -1,14 +1,10 @@ Contribs: - -pyparsing 1.5.2, MIT license - jsbeautifier, git checkout 25/03/12, MIT license - Removed test directories - Disabled packers through a single-line modification (see "# CORTESI" comment) -html2text, git checkout 18/08/12, GPLv3 - -WinDivert 1.1.4, LGPL license, http://reqrypt.org/windivert.html \ No newline at end of file +wbxml + - https://github.com/davidpshaw/PyWBXMLDecoder diff --git a/libmproxy/contrib/html2text.py b/libmproxy/contrib/html2text.py deleted file mode 100644 index 035a596bd..000000000 --- a/libmproxy/contrib/html2text.py +++ /dev/null @@ -1,834 +0,0 @@ -#!/usr/bin/env python -"""html2text: Turn HTML into equivalent Markdown-structured text.""" -__version__ = "3.200.3" -__author__ = "Aaron Swartz (me@aaronsw.com)" -__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." -__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] - -# TODO: -# Support decoded entities with unifiable. - -try: - True -except NameError: - setattr(__builtins__, 'True', 1) - setattr(__builtins__, 'False', 0) - -def has_key(x, y): - if hasattr(x, 'has_key'): return x.has_key(y) - else: return y in x - -try: - import htmlentitydefs - import urlparse - import HTMLParser -except ImportError: #Python3 - import html.entities as htmlentitydefs - import urllib.parse as urlparse - import html.parser as HTMLParser -try: #Python3 - import urllib.request as urllib -except: - import urllib -import optparse, re, sys, codecs, types - -try: from textwrap import wrap -except: pass - -# Use Unicode characters instead of their ascii psuedo-replacements -UNICODE_SNOB = 0 - -# Put the links after each paragraph instead of at the end. -LINKS_EACH_PARAGRAPH = 0 - -# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) -BODY_WIDTH = 78 - -# Don't show internal links (href="#local-anchor") -- corresponding link targets -# won't be visible in the plain text file anyway. -SKIP_INTERNAL_LINKS = True - -# Use inline, rather than reference, formatting for images and links -INLINE_LINKS = True - -# Number of pixels Google indents nested lists -GOOGLE_LIST_INDENT = 36 - -IGNORE_ANCHORS = False -IGNORE_IMAGES = False -IGNORE_EMPHASIS = False - -### Entity Nonsense ### - -def name2cp(k): - if k == 'apos': return ord("'") - if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3 - return htmlentitydefs.name2codepoint[k] - else: - k = htmlentitydefs.entitydefs[k] - if k.startswith("") and k.endswith(";"): return int(k[2:-1]) # not in latin-1 - return ord(codecs.latin_1_decode(k)[0]) - -unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', -'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*', -'ndash':'-', 'oelig':'oe', 'aelig':'ae', -'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', -'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', -'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i', -'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', -'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u', -'lrm':'', 'rlm':''} - -unifiable_n = {} - -for k in unifiable.keys(): - unifiable_n[name2cp(k)] = unifiable[k] - -### End Entity Nonsense ### - -def onlywhite(line): - """Return true if the line does only consist of whitespace characters.""" - for c in line: - if c is not ' ' and c is not ' ': - return c is ' ' - return line - -def hn(tag): - if tag[0] == 'h' and len(tag) == 2: - try: - n = int(tag[1]) - if n in range(1, 10): return n - except ValueError: return 0 - -def dumb_property_dict(style): - """returns a hash of css attributes""" - return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]); - -def dumb_css_parser(data): - """returns a hash of css selectors, each of which contains a hash of css attributes""" - # remove @import sentences - importIndex = data.find('@import') - while importIndex != -1: - data = data[0:importIndex] + data[data.find(';', importIndex) + 1:] - importIndex = data.find('@import') - - # parse the css. reverted from dictionary compehension in order to support older pythons - elements = [x.split('{') for x in data.split('}') if '{' in x.strip()] - try: - elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements]) - except ValueError: - elements = {} # not that important - - return elements - -def element_style(attrs, style_def, parent_style): - """returns a hash of the 'final' style attributes of the element""" - style = parent_style.copy() - if 'class' in attrs: - for css_class in attrs['class'].split(): - css_style = style_def['.' + css_class] - style.update(css_style) - if 'style' in attrs: - immediate_style = dumb_property_dict(attrs['style']) - style.update(immediate_style) - return style - -def google_list_style(style): - """finds out whether this is an ordered or unordered list""" - if 'list-style-type' in style: - list_style = style['list-style-type'] - if list_style in ['disc', 'circle', 'square', 'none']: - return 'ul' - return 'ol' - -def google_has_height(style): - """check if the style of the element has the 'height' attribute explicitly defined""" - if 'height' in style: - return True - return False - -def google_text_emphasis(style): - """return a list of all emphasis modifiers of the element""" - emphasis = [] - if 'text-decoration' in style: - emphasis.append(style['text-decoration']) - if 'font-style' in style: - emphasis.append(style['font-style']) - if 'font-weight' in style: - emphasis.append(style['font-weight']) - return emphasis - -def google_fixed_width_font(style): - """check if the css of the current element defines a fixed width font""" - font_family = '' - if 'font-family' in style: - font_family = style['font-family'] - if 'Courier New' == font_family or 'Consolas' == font_family: - return True - return False - -def list_numbering_start(attrs): - """extract numbering from list element attributes""" - if 'start' in attrs: - return int(attrs['start']) - 1 - else: - return 0 - -class HTML2Text(HTMLParser.HTMLParser): - def __init__(self, out=None, baseurl=''): - HTMLParser.HTMLParser.__init__(self) - - # Config options - self.unicode_snob = UNICODE_SNOB - self.links_each_paragraph = LINKS_EACH_PARAGRAPH - self.body_width = BODY_WIDTH - self.skip_internal_links = SKIP_INTERNAL_LINKS - self.inline_links = INLINE_LINKS - self.google_list_indent = GOOGLE_LIST_INDENT - self.ignore_links = IGNORE_ANCHORS - self.ignore_images = IGNORE_IMAGES - self.ignore_emphasis = IGNORE_EMPHASIS - self.google_doc = False - self.ul_item_mark = '*' - - if out is None: - self.out = self.outtextf - else: - self.out = out - - self.outtextlist = [] # empty list to store output characters before they are "joined" - - try: - self.outtext = unicode() - except NameError: # Python3 - self.outtext = str() - - self.quiet = 0 - self.p_p = 0 # number of newline character to print before next output - self.outcount = 0 - self.start = 1 - self.space = 0 - self.a = [] - self.astack = [] - self.acount = 0 - self.list = [] - self.blockquote = 0 - self.pre = 0 - self.startpre = 0 - self.code = False - self.br_toggle = '' - self.lastWasNL = 0 - self.lastWasList = False - self.style = 0 - self.style_def = {} - self.tag_stack = [] - self.emphasis = 0 - self.drop_white_space = 0 - self.inheader = False - self.abbr_title = None # current abbreviation definition - self.abbr_data = None # last inner HTML (for abbr being defined) - self.abbr_list = {} # stack of abbreviations to write later - self.baseurl = baseurl - - try: del unifiable_n[name2cp('nbsp')] - except KeyError: pass - unifiable['nbsp'] = ' _place_holder;' - - - def feed(self, data): - data = data.replace("' + 'script>", "") - HTMLParser.HTMLParser.feed(self, data) - - def handle(self, data): - self.feed(data) - self.feed("") - return self.optwrap(self.close()) - - def outtextf(self, s): - self.outtextlist.append(s) - if s: self.lastWasNL = s[-1] == '\n' - - def close(self): - HTMLParser.HTMLParser.close(self) - - self.pbr() - self.o('', 0, 'end') - - self.outtext = self.outtext.join(self.outtextlist) - if self.unicode_snob: - nbsp = unichr(name2cp('nbsp')) - else: - nbsp = u' ' - self.outtext = self.outtext.replace(u' _place_holder;', nbsp) - - return self.outtext - - def handle_charref(self, c): - self.o(self.charref(c), 1) - - def handle_entityref(self, c): - self.o(self.entityref(c), 1) - - def handle_starttag(self, tag, attrs): - self.handle_tag(tag, attrs, 1) - - def handle_endtag(self, tag): - self.handle_tag(tag, None, 0) - - def previousIndex(self, attrs): - """ returns the index of certain set of attributes (of a link) in the - self.a list - - If the set of attributes is not found, returns None - """ - if not has_key(attrs, 'href'): return None - - i = -1 - for a in self.a: - i += 1 - match = 0 - - if has_key(a, 'href') and a['href'] == attrs['href']: - if has_key(a, 'title') or has_key(attrs, 'title'): - if (has_key(a, 'title') and has_key(attrs, 'title') and - a['title'] == attrs['title']): - match = True - else: - match = True - - if match: return i - - def drop_last(self, nLetters): - if not self.quiet: - self.outtext = self.outtext[:-nLetters] - - def handle_emphasis(self, start, tag_style, parent_style): - """handles various text emphases""" - tag_emphasis = google_text_emphasis(tag_style) - parent_emphasis = google_text_emphasis(parent_style) - - # handle Google's text emphasis - strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough - bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis - italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis - fixed = google_fixed_width_font(tag_style) and not \ - google_fixed_width_font(parent_style) and not self.pre - - if start: - # crossed-out text must be handled before other attributes - # in order not to output qualifiers unnecessarily - if bold or italic or fixed: - self.emphasis += 1 - if strikethrough: - self.quiet += 1 - if italic: - self.o("_") - self.drop_white_space += 1 - if bold: - self.o("**") - self.drop_white_space += 1 - if fixed: - self.o('`') - self.drop_white_space += 1 - self.code = True - else: - if bold or italic or fixed: - # there must not be whitespace before closing emphasis mark - self.emphasis -= 1 - self.space = 0 - self.outtext = self.outtext.rstrip() - if fixed: - if self.drop_white_space: - # empty emphasis, drop it - self.drop_last(1) - self.drop_white_space -= 1 - else: - self.o('`') - self.code = False - if bold: - if self.drop_white_space: - # empty emphasis, drop it - self.drop_last(2) - self.drop_white_space -= 1 - else: - self.o("**") - if italic: - if self.drop_white_space: - # empty emphasis, drop it - self.drop_last(1) - self.drop_white_space -= 1 - else: - self.o("_") - # space is only allowed after *all* emphasis marks - if (bold or italic) and not self.emphasis: - self.o(" ") - if strikethrough: - self.quiet -= 1 - - def handle_tag(self, tag, attrs, start): - #attrs = fixattrs(attrs) - if attrs is None: - attrs = {} - else: - attrs = dict(attrs) - - if self.google_doc: - # the attrs parameter is empty for a closing tag. in addition, we - # need the attributes of the parent nodes in order to get a - # complete style description for the current element. we assume - # that google docs export well formed html. - parent_style = {} - if start: - if self.tag_stack: - parent_style = self.tag_stack[-1][2] - tag_style = element_style(attrs, self.style_def, parent_style) - self.tag_stack.append((tag, attrs, tag_style)) - else: - dummy, attrs, tag_style = self.tag_stack.pop() - if self.tag_stack: - parent_style = self.tag_stack[-1][2] - - if hn(tag): - self.p() - if start: - self.inheader = True - self.o(hn(tag)*"#" + ' ') - else: - self.inheader = False - return # prevent redundant emphasis marks on headers - - if tag in ['p', 'div']: - if self.google_doc: - if start and google_has_height(tag_style): - self.p() - else: - self.soft_br() - else: - self.p() - - if tag == "br" and start: self.o(" \n") - - if tag == "hr" and start: - self.p() - self.o("* * *") - self.p() - - if tag in ["head", "style", 'script']: - if start: self.quiet += 1 - else: self.quiet -= 1 - - if tag == "style": - if start: self.style += 1 - else: self.style -= 1 - - if tag in ["body"]: - self.quiet = 0 # sites like 9rules.com never close
- - if tag == "blockquote": - if start: - self.p(); self.o('> ', 0, 1); self.start = 1 - self.blockquote += 1 - else: - self.blockquote -= 1 - self.p() - - if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o("_") - if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o("**") - if tag in ['del', 'strike', 's']: - if start: - self.o("<"+tag+">") - else: - self.o(""+tag+">") - - if self.google_doc: - if not self.inheader: - # handle some font attributes, but leave headers clean - self.handle_emphasis(start, tag_style, parent_style) - - if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` `` - if tag == "abbr": - if start: - self.abbr_title = None - self.abbr_data = '' - if has_key(attrs, 'title'): - self.abbr_title = attrs['title'] - else: - if self.abbr_title != None: - self.abbr_list[self.abbr_data] = self.abbr_title - self.abbr_title = None - self.abbr_data = '' - - if tag == "a" and not self.ignore_links: - if start: - if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')): - self.astack.append(attrs) - self.o("[") - else: - self.astack.append(None) - else: - if self.astack: - a = self.astack.pop() - if a: - if self.inline_links: - self.o("](" + escape_md(a['href']) + ")") - else: - i = self.previousIndex(a) - if i is not None: - a = self.a[i] - else: - self.acount += 1 - a['count'] = self.acount - a['outcount'] = self.outcount - self.a.append(a) - self.o("][" + str(a['count']) + "]") - - if tag == "img" and start and not self.ignore_images: - if has_key(attrs, 'src'): - attrs['href'] = attrs['src'] - alt = attrs.get('alt', '') - self.o("![" + escape_md(alt) + "]") - - if self.inline_links: - self.o("(" + escape_md(attrs['href']) + ")") - else: - i = self.previousIndex(attrs) - if i is not None: - attrs = self.a[i] - else: - self.acount += 1 - attrs['count'] = self.acount - attrs['outcount'] = self.outcount - self.a.append(attrs) - self.o("[" + str(attrs['count']) + "]") - - if tag == 'dl' and start: self.p() - if tag == 'dt' and not start: self.pbr() - if tag == 'dd' and start: self.o(' ') - if tag == 'dd' and not start: self.pbr() - - if tag in ["ol", "ul"]: - # Google Docs create sub lists as top level lists - if (not self.list) and (not self.lastWasList): - self.p() - if start: - if self.google_doc: - list_style = google_list_style(tag_style) - else: - list_style = tag - numbering_start = list_numbering_start(attrs) - self.list.append({'name':list_style, 'num':numbering_start}) - else: - if self.list: self.list.pop() - self.lastWasList = True - else: - self.lastWasList = False - - if tag == 'li': - self.pbr() - if start: - if self.list: li = self.list[-1] - else: li = {'name':'ul', 'num':0} - if self.google_doc: - nest_count = self.google_nest_count(tag_style) - else: - nest_count = len(self.list) - self.o(" " * nest_count) #TODO: line up