faster extract_canonical_link

2025-01-30 17:58:36 +00:00 · 2022-02-27 10:49:04 +08:00 · 2022-02-27 10:49:04 +08:00 · a57d204b11
commit a57d204b11
parent 30195dba3f
1 changed files with 8 additions and 0 deletions
--- a/ytdlbot/limit.py
+++ b/ytdlbot/limit.py
@@ -8,6 +8,7 @@
 __author__ = "Benny <benny.think@gmail.com>"

 import hashlib
+import http
 import logging
 import math
 import os
@@ -109,6 +110,13 @@ class VIP(Redis, MySQL):
        props = ["canonical", "alternate", "shortlinkUrl"]
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}
+        # send head request first
+        r = requests.head(url, headers=headers)
+        if r.status_code != http.HTTPStatus.METHOD_NOT_ALLOWED and r.headers.get("content-type") != "text/html":
+            # get content-type, if it's not text/html, there's no need to issue a GET request
+            logging.warning("%s Content-type is not text/html, no need to GET for extract_canonical_link", url)
+            return url
+
        html_doc = requests.get(url, headers=headers, timeout=5).text
        soup = BeautifulSoup(html_doc, "html.parser")
        for prop in props: