charset in meta tags (#3411)

original contribution from @0xHJK in https://github.com/mitmproxy/mitmproxy/pull/3150
2024-11-23 00:01:36 +00:00 · 2018-12-13 20:04:12 +05:30 · 2018-12-13 20:04:12 +05:30 · e2bcca47b1
commit e2bcca47b1
parent db658b12ed
2 changed files with 29 additions and 15 deletions
--- a/mitmproxy/net/http/message.py
+++ b/mitmproxy/net/http/message.py
@@ -68,7 +68,7 @@ class Message(serializable.Serializable):
    @property
    def raw_content(self) -> bytes:
        """
-        The raw (encoded) HTTP message body
+        The raw (potentially compressed) HTTP message body as bytes.

        See also: :py:attr:`content`, :py:class:`text`
        """
@@ -80,10 +80,10 @@ class Message(serializable.Serializable):

    def get_content(self, strict: bool=True) -> bytes:
        """
-        The HTTP message body decoded with the content-encoding header (e.g. gzip)
+        The uncompressed HTTP message body as bytes.

        Raises:
-            ValueError, when the content-encoding is invalid and strict is True.
+            ValueError, when the HTTP content-encoding is invalid and strict is True.

        See also: :py:class:`raw_content`, :py:attr:`text`
        """
@@ -165,22 +165,26 @@ class Message(serializable.Serializable):
            return ct[2].get("charset")
        return None

-    def _guess_encoding(self) -> str:
+    def _guess_encoding(self, content=b"") -> str:
        enc = self._get_content_type_charset()
-        if enc:
-            return enc
-
+        if not enc:
            if "json" in self.headers.get("content-type", ""):
-            return "utf8"
-        else:
-            # We may also want to check for HTML meta tags here at some point.
-            # REGEX_ENCODING = re.compile(rb"""<meta[^>]+charset=['"]?([^'"]+)""")
-            return "latin-1"
+                enc = "utf8"
+        if not enc:
+            meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content)
+            if meta_charset:
+                enc = meta_charset.group(1).decode("ascii", "ignore")
+        if not enc:
+            enc = "latin-1"
+        # Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
+        if enc.lower() in ("gb2312", "gbk"):
+            enc = "gb18030"
+
+        return enc

    def get_text(self, strict: bool=True) -> Optional[str]:
        """
-        The HTTP message body decoded with both content-encoding header (e.g. gzip)
-        and content-type header charset.
+        The uncompressed and decoded HTTP message body as text.

        Raises:
            ValueError, when either content-encoding or charset is invalid and strict is True.
@@ -189,9 +193,9 @@ class Message(serializable.Serializable):
        """
        if self.raw_content is None:
            return None
-        enc = self._guess_encoding()

        content = self.get_content(strict)
+        enc = self._guess_encoding(content)
        try:
            return encoding.decode(content, enc)
        except ValueError:
--- a/test/mitmproxy/net/http/test_message.py
+++ b/test/mitmproxy/net/http/test_message.py
@@ -229,6 +229,16 @@ class TestMessageText:
        r.headers["content-type"] = "application/json"
        assert r.text == u'"ü"'

+    def test_guess_meta_charset(self):
+        r = tutils.tresp(content=b'<meta http-equiv="content-type" '
+                                 b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf')
+        # "鏄庝集" is the decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
+        assert u"鏄庝集" in r.text
+
+    def test_guess_latin_1(self):
+        r = tutils.tresp(content=b"\xF0\xE2")
+        assert r.text == u"ðâ"
+
    def test_none(self):
        r = tutils.tresp(content=None)
        assert r.text is None