Fixed encoding guessing: only search for meta tags in HTML bodies (#4566)

This commit is contained in:
Alexander Prinzhorn 2022-03-17 15:25:26 +01:00 committed by GitHub
parent a9283befad
commit e8ae38c8b6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 6 additions and 3 deletions

View File

@ -37,6 +37,7 @@
* Fix processing of `--set` options (#5067, @marwinxxii) * Fix processing of `--set` options (#5067, @marwinxxii)
* Lowercase user-added header names and emit a log message to notify the user when using HTTP/2 (#4746, @mhils) * Lowercase user-added header names and emit a log message to notify the user when using HTTP/2 (#4746, @mhils)
* Exit early if there are errors on startup (#4544, @mhils) * Exit early if there are errors on startup (#4544, @mhils)
* Fixed encoding guessing: only search for meta tags in HTML bodies (#4566, @Prinzhorn)
## 28 September 2021: mitmproxy 7.0.4 ## 28 September 2021: mitmproxy 7.0.4

View File

@ -414,9 +414,10 @@ class Message(serializable.Serializable):
if "json" in self.headers.get("content-type", ""): if "json" in self.headers.get("content-type", ""):
enc = "utf8" enc = "utf8"
if not enc: if not enc:
meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE) if "html" in self.headers.get("content-type", ""):
if meta_charset: meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE)
enc = meta_charset.group(1).decode("ascii", "ignore") if meta_charset:
enc = meta_charset.group(1).decode("ascii", "ignore")
if not enc: if not enc:
if "text/css" in self.headers.get("content-type", ""): if "text/css" in self.headers.get("content-type", ""):
# @charset rule must be the very first thing. # @charset rule must be the very first thing.

View File

@ -1098,6 +1098,7 @@ class TestMessageText:
def test_guess_meta_charset(self): def test_guess_meta_charset(self):
r = tresp(content=b'<meta http-equiv="content-type" ' r = tresp(content=b'<meta http-equiv="content-type" '
b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf') b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf')
r.headers["content-type"] = "text/html"
# "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030 # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
assert "鏄庝集" in r.text assert "鏄庝集" in r.text