mirror of
https://github.com/Grasscutters/mitmproxy.git
synced 2024-11-21 22:58:24 +00:00
Fixed encoding guessing: only search for meta tags in HTML bodies (#4566)
This commit is contained in:
parent
a9283befad
commit
e8ae38c8b6
@ -37,6 +37,7 @@
|
||||
* Fix processing of `--set` options (#5067, @marwinxxii)
|
||||
* Lowercase user-added header names and emit a log message to notify the user when using HTTP/2 (#4746, @mhils)
|
||||
* Exit early if there are errors on startup (#4544, @mhils)
|
||||
* Fixed encoding guessing: only search for meta tags in HTML bodies (#4566, @Prinzhorn)
|
||||
|
||||
## 28 September 2021: mitmproxy 7.0.4
|
||||
|
||||
|
@ -414,9 +414,10 @@ class Message(serializable.Serializable):
|
||||
if "json" in self.headers.get("content-type", ""):
|
||||
enc = "utf8"
|
||||
if not enc:
|
||||
meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE)
|
||||
if meta_charset:
|
||||
enc = meta_charset.group(1).decode("ascii", "ignore")
|
||||
if "html" in self.headers.get("content-type", ""):
|
||||
meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE)
|
||||
if meta_charset:
|
||||
enc = meta_charset.group(1).decode("ascii", "ignore")
|
||||
if not enc:
|
||||
if "text/css" in self.headers.get("content-type", ""):
|
||||
# @charset rule must be the very first thing.
|
||||
|
@ -1098,6 +1098,7 @@ class TestMessageText:
|
||||
def test_guess_meta_charset(self):
|
||||
r = tresp(content=b'<meta http-equiv="content-type" '
|
||||
b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf')
|
||||
r.headers["content-type"] = "text/html"
|
||||
# "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
|
||||
assert "鏄庝集" in r.text
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user