better get canonical

This commit is contained in:
BennyThink 2022-02-05 16:57:28 +08:00
parent 6528a45e81
commit d6fff5033b
No known key found for this signature in database
GPG Key ID: 6CD0DBDA5235D481

View File

@ -12,7 +12,6 @@ import logging
import math import math
import os import os
import re import re
import tempfile
import time import time
from unittest.mock import MagicMock from unittest.mock import MagicMock
@ -107,12 +106,20 @@ class VIP(Redis, MySQL):
@staticmethod
def extract_canonical_link(url):
    """Return the canonical address advertised by the page at ``url``.

    The page is downloaded and searched for the first ``<link>`` element
    whose ``rel`` is ``canonical``, ``alternate`` or ``shortlinkUrl`` (tried
    in that order) and whose ``href`` is usable.

    Args:
        url: Address of the page to inspect.

    Returns:
        The ``href`` of the first matching ``<link>`` element, or ``url``
        unchanged when the page cannot be fetched or no usable link exists.
    """
    # canonical link works for many websites. It will strip out unnecessary stuff
    props = ["canonical", "alternate", "shortlinkUrl"]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}
    try:
        html_doc = requests.get(url, headers=headers, timeout=5).text
    except requests.RequestException as exc:
        # Best effort: a timeout or connection failure must not break the
        # caller, so fall back to the address we were given.
        logging.warning("Canonical exception %s: %s", url, exc)
        return url

    soup = BeautifulSoup(html_doc, "html.parser")
    for prop in props:
        element = soup.find("link", rel=prop)
        try:
            href = element["href"]
        except (TypeError, KeyError):
            # TypeError: no such <link> (element is None).
            # KeyError: the <link> exists but has no href attribute.
            logging.warning("Canonical exception %s soup.find(%s, %s=%s) --> %s",
                            url, "link", "rel", prop, element)
            continue
        # Some sites emit a placeholder instead of a real address.
        if href not in ("null", "", None):
            return href
    return url
def get_channel_info(self, url: "str"): def get_channel_info(self, url: "str"):
@ -289,5 +296,5 @@ def subscribe_query():
if __name__ == '__main__':
    # When run as a script, resolve one sample URL and print the result.
    canonical = VIP.extract_canonical_link("https://www.youtube.com/shorts/YrnvPPGznXM")
    print(canonical)