Fixed encoding guessing: only search for meta tags in HTML bodies (#4566)

2024-11-25 09:37:37 +00:00 · 2022-03-17 15:25:26 +01:00 · 2022-03-17 15:25:26 +01:00 · e8ae38c8b6
commit e8ae38c8b6
parent a9283befad
3 changed files with 6 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -37,6 +37,7 @@
 * Fix processing of `--set` options (#5067, @marwinxxii) 
 * Lowercase user-added header names and emit a log message to notify the user when using HTTP/2 (#4746, @mhils)
 * Exit early if there are errors on startup (#4544, @mhils)
 * Fixed encoding guessing: only search for meta tags in HTML bodies (#4566, @Prinzhorn)
 ## 28 September 2021: mitmproxy 7.0.4
--- a/mitmproxy/http.py
+++ b/mitmproxy/http.py
@ -414,9 +414,10 @@ class Message(serializable.Serializable):
            if "json" in self.headers.get("content-type", ""):
                enc = "utf8"
        if not enc:
-            meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE)
+            if "html" in self.headers.get("content-type", ""):
-            if meta_charset:
+                meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE)
-                enc = meta_charset.group(1).decode("ascii", "ignore")
+                if meta_charset:
                    enc = meta_charset.group(1).decode("ascii", "ignore")
        if not enc:
            if "text/css" in self.headers.get("content-type", ""):
                # @charset rule must be the very first thing.
--- a/test/mitmproxy/test_http.py
+++ b/test/mitmproxy/test_http.py
@ -1098,6 +1098,7 @@ class TestMessageText:
    def test_guess_meta_charset(self):
        r = tresp(content=b'<meta http-equiv="content-type" '
                          b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf')
        r.headers["content-type"] = "text/html"
        # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
        assert "鏄庝集" in r.text