mirror of
https://github.com/Grasscutters/mitmproxy.git
synced 2024-11-23 00:01:36 +00:00
charset in meta tags (#3411)
original contribution from @0xHJK in https://github.com/mitmproxy/mitmproxy/pull/3150
This commit is contained in:
parent
db658b12ed
commit
e2bcca47b1
@ -68,7 +68,7 @@ class Message(serializable.Serializable):
|
||||
@property
|
||||
def raw_content(self) -> bytes:
|
||||
"""
|
||||
The raw (encoded) HTTP message body
|
||||
The raw (potentially compressed) HTTP message body as bytes.
|
||||
|
||||
See also: :py:attr:`content`, :py:class:`text`
|
||||
"""
|
||||
@ -80,10 +80,10 @@ class Message(serializable.Serializable):
|
||||
|
||||
def get_content(self, strict: bool=True) -> bytes:
|
||||
"""
|
||||
The HTTP message body decoded with the content-encoding header (e.g. gzip)
|
||||
The uncompressed HTTP message body as bytes.
|
||||
|
||||
Raises:
|
||||
ValueError, when the content-encoding is invalid and strict is True.
|
||||
ValueError, when the HTTP content-encoding is invalid and strict is True.
|
||||
|
||||
See also: :py:class:`raw_content`, :py:attr:`text`
|
||||
"""
|
||||
@ -165,22 +165,26 @@ class Message(serializable.Serializable):
|
||||
return ct[2].get("charset")
|
||||
return None
|
||||
|
||||
def _guess_encoding(self) -> str:
|
||||
def _guess_encoding(self, content=b"") -> str:
|
||||
enc = self._get_content_type_charset()
|
||||
if enc:
|
||||
return enc
|
||||
|
||||
if not enc:
|
||||
if "json" in self.headers.get("content-type", ""):
|
||||
return "utf8"
|
||||
else:
|
||||
# We may also want to check for HTML meta tags here at some point.
|
||||
# REGEX_ENCODING = re.compile(rb"""<meta[^>]+charset=['"]?([^'"]+)""")
|
||||
return "latin-1"
|
||||
enc = "utf8"
|
||||
if not enc:
|
||||
meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content)
|
||||
if meta_charset:
|
||||
enc = meta_charset.group(1).decode("ascii", "ignore")
|
||||
if not enc:
|
||||
enc = "latin-1"
|
||||
# Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
|
||||
if enc.lower() in ("gb2312", "gbk"):
|
||||
enc = "gb18030"
|
||||
|
||||
return enc
|
||||
|
||||
def get_text(self, strict: bool=True) -> Optional[str]:
|
||||
"""
|
||||
The HTTP message body decoded with both content-encoding header (e.g. gzip)
|
||||
and content-type header charset.
|
||||
The uncompressed and decoded HTTP message body as text.
|
||||
|
||||
Raises:
|
||||
ValueError, when either content-encoding or charset is invalid and strict is True.
|
||||
@ -189,9 +193,9 @@ class Message(serializable.Serializable):
|
||||
"""
|
||||
if self.raw_content is None:
|
||||
return None
|
||||
enc = self._guess_encoding()
|
||||
|
||||
content = self.get_content(strict)
|
||||
enc = self._guess_encoding(content)
|
||||
try:
|
||||
return encoding.decode(content, enc)
|
||||
except ValueError:
|
||||
|
@ -229,6 +229,16 @@ class TestMessageText:
|
||||
r.headers["content-type"] = "application/json"
|
||||
assert r.text == u'"ü"'
|
||||
|
||||
def test_guess_meta_charset(self):
|
||||
r = tutils.tresp(content=b'<meta http-equiv="content-type" '
|
||||
b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf')
|
||||
# "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
|
||||
assert u"鏄庝集" in r.text
|
||||
|
||||
def test_guess_latin_1(self):
|
||||
r = tutils.tresp(content=b"\xF0\xE2")
|
||||
assert r.text == u"ðâ"
|
||||
|
||||
def test_none(self):
|
||||
r = tutils.tresp(content=None)
|
||||
assert r.text is None
|
||||
|
Loading…
Reference in New Issue
Block a user