diff --git a/mitmproxy/net/http/message.py b/mitmproxy/net/http/message.py index 06d00377b..86782e8ac 100644 --- a/mitmproxy/net/http/message.py +++ b/mitmproxy/net/http/message.py @@ -68,7 +68,7 @@ class Message(serializable.Serializable): @property def raw_content(self) -> bytes: """ - The raw (encoded) HTTP message body + The raw (potentially compressed) HTTP message body as bytes. See also: :py:attr:`content`, :py:class:`text` """ @@ -80,10 +80,10 @@ class Message(serializable.Serializable): def get_content(self, strict: bool=True) -> bytes: """ - The HTTP message body decoded with the content-encoding header (e.g. gzip) + The uncompressed HTTP message body as bytes. Raises: - ValueError, when the content-encoding is invalid and strict is True. + ValueError, when the HTTP content-encoding is invalid and strict is True. See also: :py:class:`raw_content`, :py:attr:`text` """ @@ -165,22 +165,26 @@ class Message(serializable.Serializable): return ct[2].get("charset") return None - def _guess_encoding(self) -> str: + def _guess_encoding(self, content=b"") -> str: enc = self._get_content_type_charset() - if enc: - return enc + if not enc: + if "json" in self.headers.get("content-type", ""): + enc = "utf8" + if not enc: + meta_charset = re.search(rb"""]+charset=['"]?([^'">]+)""", content) + if meta_charset: + enc = meta_charset.group(1).decode("ascii", "ignore") + if not enc: + enc = "latin-1" + # Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites. + if enc.lower() in ("gb2312", "gbk"): + enc = "gb18030" - if "json" in self.headers.get("content-type", ""): - return "utf8" - else: - # We may also want to check for HTML meta tags here at some point. - # REGEX_ENCODING = re.compile(rb"""]+charset=['"]?([^'"]+)""") - return "latin-1" + return enc def get_text(self, strict: bool=True) -> Optional[str]: """ - The HTTP message body decoded with both content-encoding header (e.g. gzip) - and content-type header charset. + The uncompressed and decoded HTTP message body as text. Raises: ValueError, when either content-encoding or charset is invalid and strict is True. @@ -189,9 +193,9 @@ class Message(serializable.Serializable): """ if self.raw_content is None: return None - enc = self._guess_encoding() content = self.get_content(strict) + enc = self._guess_encoding(content) try: return encoding.decode(content, enc) except ValueError: diff --git a/test/mitmproxy/net/http/test_message.py b/test/mitmproxy/net/http/test_message.py index 512f31991..7ad7890c8 100644 --- a/test/mitmproxy/net/http/test_message.py +++ b/test/mitmproxy/net/http/test_message.py @@ -229,6 +229,16 @@ class TestMessageText: r.headers["content-type"] = "application/json" assert r.text == u'"ü"' + def test_guess_meta_charset(self): + r = tutils.tresp(content=b'\xe6\x98\x8e\xe4\xbc\xaf') + # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030 + assert u"鏄庝集" in r.text + + def test_guess_latin_1(self): + r = tutils.tresp(content=b"\xF0\xE2") + assert r.text == u"ðâ" + def test_none(self): r = tutils.tresp(content=None) assert r.text is None