From e8ae38c8b6bcaf2ce438c6b515b94951f4b6e724 Mon Sep 17 00:00:00 2001 From: Alexander Prinzhorn Date: Thu, 17 Mar 2022 15:25:26 +0100 Subject: [PATCH] Fixed encoding guessing: only search for meta tags in HTML bodies (#4566) --- CHANGELOG.md | 1 + mitmproxy/http.py | 7 ++++--- test/mitmproxy/test_http.py | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fd45dd33..dce13fadb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ * Fix processing of `--set` options (#5067, @marwinxxii) * Lowercase user-added header names and emit a log message to notify the user when using HTTP/2 (#4746, @mhils) * Exit early if there are errors on startup (#4544, @mhils) +* Fixed encoding guessing: only search for meta tags in HTML bodies (#4566, @Prinzhorn) ## 28 September 2021: mitmproxy 7.0.4 diff --git a/mitmproxy/http.py b/mitmproxy/http.py index 11af7b3e3..743c46c99 100644 --- a/mitmproxy/http.py +++ b/mitmproxy/http.py @@ -414,9 +414,10 @@ class Message(serializable.Serializable): if "json" in self.headers.get("content-type", ""): enc = "utf8" if not enc: - meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE) - if meta_charset: - enc = meta_charset.group(1).decode("ascii", "ignore") + if "html" in self.headers.get("content-type", ""): + meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE) + if meta_charset: + enc = meta_charset.group(1).decode("ascii", "ignore") if not enc: if "text/css" in self.headers.get("content-type", ""): # @charset rule must be the very first thing. 
diff --git a/test/mitmproxy/test_http.py b/test/mitmproxy/test_http.py index ee8c9600d..eb64821f3 100644 --- a/test/mitmproxy/test_http.py +++ b/test/mitmproxy/test_http.py @@ -1098,6 +1098,7 @@ class TestMessageText: def test_guess_meta_charset(self): r = tresp(content=b'\xe6\x98\x8e\xe4\xbc\xaf') + r.headers["content-type"] = "text/html" # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030 assert "鏄庝集" in r.text