charset detection: ignore case when searching in HTML (#4785)

This commit is contained in:
Maximilian Hils 2021-08-31 11:13:28 +02:00 committed by GitHub
parent d5bba9878b
commit 4e5a0ae71d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -408,13 +408,13 @@ class Message(serializable.Serializable):
if "json" in self.headers.get("content-type", ""): if "json" in self.headers.get("content-type", ""):
enc = "utf8" enc = "utf8"
if not enc: if not enc:
meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content) meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE)
if meta_charset: if meta_charset:
enc = meta_charset.group(1).decode("ascii", "ignore") enc = meta_charset.group(1).decode("ascii", "ignore")
if not enc: if not enc:
if "text/css" in self.headers.get("content-type", ""): if "text/css" in self.headers.get("content-type", ""):
# @charset rule must be the very first thing. # @charset rule must be the very first thing.
css_charset = re.match(rb"""@charset "([^"]+)";""", content) css_charset = re.match(rb"""@charset "([^"]+)";""", content, re.IGNORECASE)
if css_charset: if css_charset:
enc = css_charset.group(1).decode("ascii", "ignore") enc = css_charset.group(1).decode("ascii", "ignore")
if not enc: if not enc: