charset detection: ignore case when searching in HTML (#4785)

This commit is contained in:
Maximilian Hils 2021-08-31 11:13:28 +02:00 committed by GitHub
parent d5bba9878b
commit 4e5a0ae71d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -408,13 +408,13 @@ class Message(serializable.Serializable):
if "json" in self.headers.get("content-type", ""):
enc = "utf8"
if not enc:
meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content)
meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE)
if meta_charset:
enc = meta_charset.group(1).decode("ascii", "ignore")
if not enc:
if "text/css" in self.headers.get("content-type", ""):
# @charset rule must be the very first thing.
css_charset = re.match(rb"""@charset "([^"]+)";""", content)
css_charset = re.match(rb"""@charset "([^"]+)";""", content, re.IGNORECASE)
if css_charset:
enc = css_charset.group(1).decode("ascii", "ignore")
if not enc: