better get canonical

2024-11-16 11:52:14 +00:00 · 2022-02-05 16:57:28 +08:00 · 2022-02-05 16:57:28 +08:00 · d6fff5033b
commit d6fff5033b
parent 6528a45e81
1 changed file with 16 additions and 9 deletions
--- a/ytdlbot/limit.py
+++ b/ytdlbot/limit.py
@ -12,7 +12,6 @@ import logging
 import math
 import os
 import re
 import tempfile
 import time
 from unittest.mock import MagicMock
@ -107,12 +106,20 @@ class VIP(Redis, MySQL):
    @staticmethod
    def extract_canonical_link(url):
        # canonic link works for many websites. It will strip out unnecessary stuff
        # Fetches the page (browser-like User-Agent, 5 second timeout) and looks for a
        # <link> tag whose rel is one of `props`, in that order. Returns the first href
        # that is not "null", empty or None; returns the original url when none is found.
-        try:
+        props = ["canonical", "alternate", "shortlinkUrl"]
-            html_doc = requests.get(url).text
+        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}
        html_doc = requests.get(url, headers=headers, timeout=5).text
        soup = BeautifulSoup(html_doc, "html.parser")
-            element = soup.find("link", rel="canonical")
+        for prop in props:
-            return element['href']
+            element = soup.find("link", rel=prop)
            try:
                href = element["href"]
                if href not in ["null", "", None]:
                    return href
            except Exception:
                # Reached when the lookup above fails (e.g. no matching <link>, so element
                # is None, or the tag has no href). Log only names that exist in this scope
                # so the handler itself cannot raise, then move on to the next rel value.
                logging.warning("Canonical exception %s soup.find(%s, %s=%s) --> %s", url, "link", "rel", prop, element)
        return url
    def get_channel_info(self, url: "str"):
@ -289,5 +296,5 @@ def subscribe_query():
 if __name__ == '__main__':
    # Manual check: resolve the canonical link for one sample URL and print the result.
-    a = VIP.extract_canonical_link("https://youtu.be/FUACKXI-1BA?t=71")
+    a = VIP.extract_canonical_link("https://www.youtube.com/shorts/YrnvPPGznXM")
    print(a)